2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
14 module Gargantext.Core.Text.List
18 import Control.Lens ((^.), set)
19 import Data.Maybe (fromMaybe, catMaybes)
20 import Data.Ord (Down(..))
22 import Data.Text (Text)
23 import qualified Data.Char as Char
24 import qualified Data.List as List
25 import qualified Data.Map as Map
26 import qualified Data.Set as Set
27 import qualified Data.Text as Text
29 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
30 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
31 import Gargantext.API.Ngrams.Types (RepoCmdM)
32 import Gargantext.Core.Text.List.Social (flowSocialList, invertForw)
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
34 import Gargantext.Core.Text.Group
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Node (defaultList)
41 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
42 import Gargantext.Database.Query.Tree.Error (HasTreeError)
43 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
44 import Gargantext.Prelude
47 -- | TODO improve grouping functions of Authors, Sources, Institutes..
48 buildNgramsLists :: ( RepoCmdM env err m
57 -> m (Map NgramsType [NgramsElement])
58 buildNgramsLists user gp uCid mCid = do
59 ngTerms <- buildNgramsTermsList user uCid mCid gp
60 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
61 [ (Authors, MapListSize 9)
62 , (Sources, MapListSize 9)
63 , (Institutes, MapListSize 9)
66 pure $ Map.unions $ [ngTerms] <> othersTerms
69 data MapListSize = MapListSize Int
71 buildNgramsOthersList ::( HasNodeError err
79 -> (NgramsType, MapListSize)
80 -> m (Map NgramsType [NgramsElement])
81 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
83 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
86 grouped = toGroupedText groupIt (Set.size . snd) fst snd (Map.toList $ Map.mapWithKey (\k (a,b) -> (Set.delete k a, b)) $ ngs)
88 socialLists <- flowSocialList user nt (Set.fromList $ Map.keys ngs)
91 groupedWithList = map (addListType (invertForw socialLists)) grouped
92 (stopTerms, tailTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
93 (mapTerms, tailTerms') = Map.partition (\t -> t ^. gt_listType == Just MapTerm) tailTerms
95 listSize = mapListSize - (List.length mapTerms)
96 (mapTerms', candiTerms) = List.splitAt listSize $ List.sortOn (Down . _gt_score) $ Map.elems tailTerms'
98 pure $ Map.fromList [( nt, (List.concat $ map toNgramsElement stopTerms)
99 <> (List.concat $ map toNgramsElement mapTerms)
100 <> (List.concat $ map toNgramsElement $ map (set gt_listType (Just MapTerm)) mapTerms')
101 <> (List.concat $ map toNgramsElement $ map (set gt_listType (Just CandidateTerm)) candiTerms)
105 buildNgramsTermsList :: ( HasNodeError err
114 -> m (Map NgramsType [NgramsElement])
115 buildNgramsTermsList user uCid mCid groupParams = do
117 -- Computing global speGen score
118 allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
120 -- printDebug "head candidates" (List.take 10 $ allTerms)
121 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
123 -- First remove stops terms
124 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst allTerms)
125 -- printDebug "\n * socialLists * \n" socialLists
128 -- Grouping the ngrams and keeping the maximum score for label
129 let grouped = toGroupedText (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) allTerms
131 groupedWithList = map (addListType (invertForw socialLists)) grouped
133 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
134 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
136 -- printDebug "\n * stopTerms * \n" stopTerms
137 -- splitting monterms and multiterms to take proportional candidates
139 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
140 monoSize = 0.4 :: Double
141 multSize = 1 - monoSize
143 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
145 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
146 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
148 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
149 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
150 -- printDebug "groupedMultHead" (List.length groupedMultHead)
151 -- printDebug "groupedMultTail" (List.length groupedMultTail)
154 -- Get Local Scores now for selected grouped ngrams
155 selectedTerms = Set.toList $ List.foldl'
156 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
160 (groupedMonoHead <> groupedMultHead)
162 -- TO remove (and remove HasNodeError instance)
163 userListId <- defaultList uCid
164 masterListId <- defaultList mCid
166 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
169 mapGroups = Map.fromList
170 $ map (\g -> (g ^. gt_stem, g))
171 $ groupedMonoHead <> groupedMultHead
173 -- grouping with Set NodeId
174 contextsAdded = foldl' (\mapGroups' k -> let k' = ngramsGroup groupParams k
175 in case Map.lookup k' mapGroups' of
176 Nothing -> mapGroups'
177 Just g -> case Map.lookup k mapTextDocIds of
178 Nothing -> mapGroups'
179 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
182 $ Map.keys mapTextDocIds
184 -- compute cooccurrences
185 mapCooc = Map.filter (>2)
186 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
187 | (t1, s1) <- mapStemNodeIds
188 , (t2, s2) <- mapStemNodeIds
191 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
192 -- printDebug "mapCooc" mapCooc
196 mapScores f = Map.fromList
197 $ map (\(Scored t g s') -> (t, f (g,s')))
202 groupsWithScores = catMaybes
204 -> case Map.lookup stem mapScores' of
206 Just s' -> Just $ g { _gt_score = s'}
207 ) $ Map.toList contextsAdded
209 mapScores' = mapScores identity
210 -- adapt2 TOCHECK with DC
211 -- printDebug "groupsWithScores" groupsWithScores
213 -- sort / partition / split
214 -- filter mono/multi again
215 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
216 -- filter with max score
217 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
219 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
220 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
224 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
225 inclSize = 0.4 :: Double
226 exclSize = 1 - inclSize
227 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
229 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
230 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
232 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
233 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
236 -- Final Step building the Typed list
237 termListHead = maps <> cands
239 maps = set gt_listType (Just MapTerm)
240 <$> monoScoredInclHead
241 <> monoScoredExclHead
242 <> multScoredInclHead
243 <> multScoredExclHead
245 cands = set gt_listType (Just CandidateTerm)
246 <$> monoScoredInclTail
247 <> monoScoredExclTail
248 <> multScoredInclTail
249 <> multScoredExclTail
251 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
253 -- printDebug "monoScoredInclHead" monoScoredInclHead
254 -- printDebug "monoScoredExclHead" monoScoredExclTail
256 -- printDebug "multScoredInclHead" multScoredInclHead
257 -- printDebug "multScoredExclTail" multScoredExclTail
259 let result = Map.unionsWith (<>)
260 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
261 <> (List.concat $ map toNgramsElement $ termListTail)
262 <> (List.concat $ map toNgramsElement $ stopTerms)
265 -- printDebug "\n result \n" r
270 toNgramsElement :: GroupedText a -> [NgramsElement]
271 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
272 [parentElem] <> childrenElems
275 children = Set.toList setNgrams
276 parentElem = mkNgramsElement (NgramsTerm parent)
277 (fromMaybe CandidateTerm listType)
279 (mSetFromList (NgramsTerm <$> children))
280 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
281 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
283 ) (NgramsTerm <$> children)
286 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
287 toGargList l n = (l,n)
290 isStopTerm :: StopSize -> Text -> Bool
291 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
293 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
295 ------------------------------------------------------------------------------