2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
19 import Control.Lens ((^.), set)
20 import Data.Maybe (fromMaybe, catMaybes)
21 import Data.Ord (Down(..))
24 import Data.Text (Text)
25 import qualified Data.Char as Char
26 import qualified Data.List as List
27 import qualified Data.Map as Map
28 import qualified Data.Set as Set
29 import qualified Data.Text as Text
31 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
32 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
33 import Gargantext.API.Ngrams.Types (RepoCmdM)
34 import Gargantext.Core.Text.List.Social (flowSocialList, flowSocialList', FlowSocialListPriority(..), invertForw)
35 import Gargantext.Core.Text.List.Social.Scores (FlowListScores)
36 import Gargantext.Core.Text.List.Group
37 import Gargantext.Core.Text.List.Group.WithStem
38 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
39 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
40 import Gargantext.Core.Types.Individu (User(..))
41 import Gargantext.Database.Admin.Types.Node (NodeId)
42 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
43 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
44 import Gargantext.Database.Prelude (CmdM)
45 import Gargantext.Database.Query.Table.Node (defaultList)
46 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
47 import Gargantext.Database.Query.Tree.Error (HasTreeError)
48 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
49 import Gargantext.Prelude
52 -- | TODO improve grouping functions of Authors, Sources, Institutes..
53 buildNgramsLists :: ( RepoCmdM env err m
62 -> m (Map NgramsType [NgramsElement])
63 buildNgramsLists user gp uCid mCid = do
64 ngTerms <- buildNgramsTermsList user uCid mCid gp
65 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
66 [ (Authors , MapListSize 9)
67 , (Sources , MapListSize 9)
68 , (Institutes, MapListSize 9)
71 pure $ Map.unions $ [ngTerms] <> othersTerms
74 data MapListSize = MapListSize { unMapListSize :: !Int }
76 buildNgramsOthersList ::( HasNodeError err
84 -> (NgramsType, MapListSize)
85 -> m (Map NgramsType [NgramsElement])
86 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
87 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
89 socialLists' :: Map Text FlowListScores
90 <- flowSocialList' MySelfFirst user nt (Set.fromList $ Map.keys ngs')
91 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
93 -- printDebug "flowSocialList'" socialLists'
96 groupParams = GroupedTextParams groupIt (Set.size . snd) fst snd {-(size . fst)-}
97 groupedWithList = toGroupedText groupParams socialLists' ngs'
99 printDebug "groupedWithList" groupedWithList
102 (stopTerms, tailTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
103 (mapTerms, tailTerms') = Map.partition (\t -> t ^. gt_listType == Just MapTerm) tailTerms
105 listSize = mapListSize - (List.length mapTerms)
106 (mapTerms', candiTerms) = List.splitAt listSize
107 $ List.sortOn (Down . _gt_score)
108 $ Map.elems tailTerms'
110 pure $ Map.fromList [( nt, (List.concat $ map toNgramsElement stopTerms)
111 <> (List.concat $ map toNgramsElement mapTerms )
112 <> (List.concat $ map toNgramsElement
113 $ map (set gt_listType (Just MapTerm )) mapTerms' )
114 <> (List.concat $ map toNgramsElement
115 $ map (set gt_listType (Just CandidateTerm)) candiTerms)
120 buildNgramsTermsList :: ( HasNodeError err
129 -> m (Map NgramsType [NgramsElement])
130 buildNgramsTermsList user uCid mCid groupParams = do
132 -- Computing global speGen score
133 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
135 -- printDebug "head candidates" (List.take 10 $ allTerms)
136 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
138 -- First remove stops terms
139 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
140 -- printDebug "\n * socialLists * \n" socialLists
142 -- Grouping the ngrams and keeping the maximum score for label
143 let grouped = groupedTextWithStem ( GroupedTextParams (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
145 groupedWithList = map (addListType (invertForw socialLists)) grouped
147 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
148 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
150 -- printDebug "\n * stopTerms * \n" stopTerms
151 -- splitting monterms and multiterms to take proportional candidates
153 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
154 monoSize = 0.4 :: Double
155 multSize = 1 - monoSize
157 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
159 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
160 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
162 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
163 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
164 -- printDebug "groupedMultHead" (List.length groupedMultHead)
165 -- printDebug "groupedMultTail" (List.length groupedMultTail)
168 -- Get Local Scores now for selected grouped ngrams
169 selectedTerms = Set.toList $ List.foldl'
170 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
174 (groupedMonoHead <> groupedMultHead)
176 -- TO remove (and remove HasNodeError instance)
177 userListId <- defaultList uCid
178 masterListId <- defaultList mCid
180 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
183 mapGroups = Map.fromList
184 $ map (\g -> (g ^. gt_stem, g))
185 $ groupedMonoHead <> groupedMultHead
187 -- grouping with Set NodeId
188 contextsAdded = foldl' (\mapGroups' k ->
189 let k' = ngramsGroup groupParams k in
190 case Map.lookup k' mapGroups' of
191 Nothing -> mapGroups'
192 Just g -> case Map.lookup k mapTextDocIds of
193 Nothing -> mapGroups'
194 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
197 $ Map.keys mapTextDocIds
199 -- compute cooccurrences
200 mapCooc = Map.filter (>2)
201 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
202 | (t1, s1) <- mapStemNodeIds
203 , (t2, s2) <- mapStemNodeIds
204 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
207 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
208 -- printDebug "mapCooc" mapCooc
212 mapScores f = Map.fromList
213 $ map (\(Scored t g s') -> (t, f (g,s')))
218 groupsWithScores = catMaybes
220 -> case Map.lookup stem mapScores' of
222 Just s' -> Just $ g { _gt_score = s'}
223 ) $ Map.toList contextsAdded
225 mapScores' = mapScores identity
226 -- adapt2 TOCHECK with DC
227 -- printDebug "groupsWithScores" groupsWithScores
229 -- sort / partition / split
230 -- filter mono/multi again
231 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
232 -- filter with max score
233 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
235 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
236 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
240 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
241 inclSize = 0.4 :: Double
242 exclSize = 1 - inclSize
243 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
245 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
246 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
248 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
249 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
252 -- Final Step building the Typed list
253 termListHead = maps <> cands
255 maps = set gt_listType (Just MapTerm)
256 <$> monoScoredInclHead
257 <> monoScoredExclHead
258 <> multScoredInclHead
259 <> multScoredExclHead
261 cands = set gt_listType (Just CandidateTerm)
262 <$> monoScoredInclTail
263 <> monoScoredExclTail
264 <> multScoredInclTail
265 <> multScoredExclTail
267 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
269 -- printDebug "monoScoredInclHead" monoScoredInclHead
270 -- printDebug "monoScoredExclHead" monoScoredExclTail
271 -- printDebug "multScoredInclHead" multScoredInclHead
272 -- printDebug "multScoredExclTail" multScoredExclTail
274 let result = Map.unionsWith (<>)
275 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
276 <> (List.concat $ map toNgramsElement $ termListTail)
277 <> (List.concat $ map toNgramsElement $ stopTerms)
280 -- printDebug "\n result \n" r
285 toNgramsElement :: GroupedText a -> [NgramsElement]
286 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
287 [parentElem] <> childrenElems
290 children = Set.toList setNgrams
291 parentElem = mkNgramsElement (NgramsTerm parent)
292 (fromMaybe CandidateTerm listType)
294 (mSetFromList (NgramsTerm <$> children))
295 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
296 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
298 ) (NgramsTerm <$> children)
301 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
302 toGargList l n = (l,n)
305 isStopTerm :: StopSize -> Text -> Bool
306 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
308 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
310 ------------------------------------------------------------------------------