2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
14 module Gargantext.Core.Text.List
18 import Control.Lens ((^.), set)
19 import Data.Maybe (fromMaybe, catMaybes)
20 import Data.Ord (Down(..))
22 import Data.Text (Text)
23 import qualified Data.Char as Char
24 import qualified Data.List as List
25 import qualified Data.Map as Map
26 import qualified Data.Set as Set
27 import qualified Data.Text as Text
29 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
30 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
31 import Gargantext.API.Ngrams.Types (RepoCmdM)
32 import Gargantext.Core.Text.List.Social (flowSocialList, invertForw)
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
34 import Gargantext.Core.Text.Group
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Node (defaultList)
41 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
42 import Gargantext.Database.Query.Tree.Error (HasTreeError)
43 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
44 import Gargantext.Prelude
47 -- | TODO improve grouping functions of Authors, Sources, Institutes..
48 buildNgramsLists :: ( RepoCmdM env err m
57 -> m (Map NgramsType [NgramsElement])
58 buildNgramsLists user gp uCid mCid = do
59 ngTerms <- buildNgramsTermsList user uCid mCid gp
60 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
61 [ (Authors , MapListSize 9)
62 , (Sources , MapListSize 9)
63 , (Institutes, MapListSize 9)
66 pure $ Map.unions $ [ngTerms] <> othersTerms
69 data MapListSize = MapListSize Int
71 buildNgramsOthersList ::( HasNodeError err
79 -> (NgramsType, MapListSize)
80 -> m (Map NgramsType [NgramsElement])
81 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
83 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
86 grouped = toGroupedText groupIt (Set.size . snd) fst snd
87 (Map.toList $ Map.mapWithKey (\k (a,b) -> (Set.delete k a, b)) $ ngs)
89 socialLists <- flowSocialList user nt (Set.fromList $ Map.keys ngs)
92 groupedWithList = map (addListType (invertForw socialLists)) grouped
93 (stopTerms, tailTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
94 (mapTerms, tailTerms') = Map.partition (\t -> t ^. gt_listType == Just MapTerm) tailTerms
96 listSize = mapListSize - (List.length mapTerms)
97 (mapTerms', candiTerms) = List.splitAt listSize $ List.sortOn (Down . _gt_score) $ Map.elems tailTerms'
99 pure $ Map.fromList [( nt, (List.concat $ map toNgramsElement stopTerms)
100 <> (List.concat $ map toNgramsElement mapTerms)
101 <> (List.concat $ map toNgramsElement $ map (set gt_listType (Just MapTerm)) mapTerms')
102 <> (List.concat $ map toNgramsElement $ map (set gt_listType (Just CandidateTerm)) candiTerms)
106 buildNgramsTermsList :: ( HasNodeError err
115 -> m (Map NgramsType [NgramsElement])
116 buildNgramsTermsList user uCid mCid groupParams = do
118 -- Computing global speGen score
119 allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
121 -- printDebug "head candidates" (List.take 10 $ allTerms)
122 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
124 -- First remove stops terms
125 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst allTerms)
126 -- printDebug "\n * socialLists * \n" socialLists
129 -- Grouping the ngrams and keeping the maximum score for label
130 let grouped = toGroupedText (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) allTerms
132 groupedWithList = map (addListType (invertForw socialLists)) grouped
134 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
135 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
137 -- printDebug "\n * stopTerms * \n" stopTerms
138 -- splitting monterms and multiterms to take proportional candidates
140 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
141 monoSize = 0.4 :: Double
142 multSize = 1 - monoSize
144 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
146 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
147 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
149 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
150 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
151 -- printDebug "groupedMultHead" (List.length groupedMultHead)
152 -- printDebug "groupedMultTail" (List.length groupedMultTail)
155 -- Get Local Scores now for selected grouped ngrams
156 selectedTerms = Set.toList $ List.foldl'
157 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
161 (groupedMonoHead <> groupedMultHead)
163 -- TO remove (and remove HasNodeError instance)
164 userListId <- defaultList uCid
165 masterListId <- defaultList mCid
167 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
170 mapGroups = Map.fromList
171 $ map (\g -> (g ^. gt_stem, g))
172 $ groupedMonoHead <> groupedMultHead
174 -- grouping with Set NodeId
175 contextsAdded = foldl' (\mapGroups' k -> let k' = ngramsGroup groupParams k
176 in case Map.lookup k' mapGroups' of
177 Nothing -> mapGroups'
178 Just g -> case Map.lookup k mapTextDocIds of
179 Nothing -> mapGroups'
180 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
183 $ Map.keys mapTextDocIds
185 -- compute cooccurrences
186 mapCooc = Map.filter (>2)
187 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
188 | (t1, s1) <- mapStemNodeIds
189 , (t2, s2) <- mapStemNodeIds
192 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
193 -- printDebug "mapCooc" mapCooc
197 mapScores f = Map.fromList
198 $ map (\(Scored t g s') -> (t, f (g,s')))
203 groupsWithScores = catMaybes
205 -> case Map.lookup stem mapScores' of
207 Just s' -> Just $ g { _gt_score = s'}
208 ) $ Map.toList contextsAdded
210 mapScores' = mapScores identity
211 -- adapt2 TOCHECK with DC
212 -- printDebug "groupsWithScores" groupsWithScores
214 -- sort / partition / split
215 -- filter mono/multi again
216 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
217 -- filter with max score
218 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
220 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
221 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
225 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
226 inclSize = 0.4 :: Double
227 exclSize = 1 - inclSize
228 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
230 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
231 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
233 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
234 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
237 -- Final Step building the Typed list
238 termListHead = maps <> cands
240 maps = set gt_listType (Just MapTerm)
241 <$> monoScoredInclHead
242 <> monoScoredExclHead
243 <> multScoredInclHead
244 <> multScoredExclHead
246 cands = set gt_listType (Just CandidateTerm)
247 <$> monoScoredInclTail
248 <> monoScoredExclTail
249 <> multScoredInclTail
250 <> multScoredExclTail
252 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
254 -- printDebug "monoScoredInclHead" monoScoredInclHead
255 -- printDebug "monoScoredExclHead" monoScoredExclTail
257 -- printDebug "multScoredInclHead" multScoredInclHead
258 -- printDebug "multScoredExclTail" multScoredExclTail
260 let result = Map.unionsWith (<>)
261 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
262 <> (List.concat $ map toNgramsElement $ termListTail)
263 <> (List.concat $ map toNgramsElement $ stopTerms)
266 -- printDebug "\n result \n" r
271 toNgramsElement :: GroupedText a -> [NgramsElement]
272 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
273 [parentElem] <> childrenElems
276 children = Set.toList setNgrams
277 parentElem = mkNgramsElement (NgramsTerm parent)
278 (fromMaybe CandidateTerm listType)
280 (mSetFromList (NgramsTerm <$> children))
281 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
282 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
284 ) (NgramsTerm <$> children)
287 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
288 toGargList l n = (l,n)
291 isStopTerm :: StopSize -> Text -> Bool
292 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
294 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
296 ------------------------------------------------------------------------------