2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
14 module Gargantext.Core.Text.List
18 import Control.Lens ((^.), set)
19 import Data.Maybe (fromMaybe, catMaybes)
20 import Data.Ord (Down(..))
22 import Data.Text (Text)
23 import qualified Data.Char as Char
24 import qualified Data.List as List
25 import qualified Data.Map as Map
26 import qualified Data.Set as Set
27 import qualified Data.Text as Text
29 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
30 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
31 import Gargantext.API.Ngrams.Types (RepoCmdM)
32 import Gargantext.Core.Text.List.Social (flowSocialList, invertForw)
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
34 import Gargantext.Core.Text.Group
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Node (defaultList)
41 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
42 import Gargantext.Database.Query.Tree.Error (HasTreeError)
43 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
44 import Gargantext.Prelude
47 -- | TODO improve grouping functions of Authors, Sources, Institutes..
48 buildNgramsLists :: ( RepoCmdM env err m
57 -> m (Map NgramsType [NgramsElement])
58 buildNgramsLists user gp uCid mCid = do
59 ngTerms <- buildNgramsTermsList user uCid mCid gp
60 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
61 [ (Authors , MapListSize 9)
62 , (Sources , MapListSize 9)
63 , (Institutes, MapListSize 9)
66 pure $ Map.unions $ [ngTerms] <> othersTerms
69 data MapListSize = MapListSize Int
71 buildNgramsOthersList ::( HasNodeError err
79 -> (NgramsType, MapListSize)
80 -> m (Map NgramsType [NgramsElement])
81 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
83 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
87 grouped = toGroupedText groupIt (Set.size . snd) fst snd
88 (Map.toList $ Map.mapWithKey (\k (a,b) -> (Set.delete k a, b)) $ ngs)
90 socialLists <- flowSocialList user nt (Set.fromList $ Map.keys ngs)
93 groupedWithList = map (addListType (invertForw socialLists)) grouped
94 (stopTerms, tailTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
95 (mapTerms, tailTerms') = Map.partition (\t -> t ^. gt_listType == Just MapTerm) tailTerms
97 listSize = mapListSize - (List.length mapTerms)
98 (mapTerms', candiTerms) = List.splitAt listSize $ List.sortOn (Down . _gt_score) $ Map.elems tailTerms'
100 pure $ Map.fromList [( nt, (List.concat $ map toNgramsElement stopTerms)
101 <> (List.concat $ map toNgramsElement mapTerms)
102 <> (List.concat $ map toNgramsElement $ map (set gt_listType (Just MapTerm)) mapTerms')
103 <> (List.concat $ map toNgramsElement $ map (set gt_listType (Just CandidateTerm)) candiTerms)
107 buildNgramsTermsList :: ( HasNodeError err
116 -> m (Map NgramsType [NgramsElement])
117 buildNgramsTermsList user uCid mCid groupParams = do
119 -- Computing global speGen score
120 allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
122 -- printDebug "head candidates" (List.take 10 $ allTerms)
123 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
125 -- First remove stops terms
126 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst allTerms)
127 -- printDebug "\n * socialLists * \n" socialLists
129 printDebug "\n * socialLists * \n" socialLists
132 socialStop = fromMaybe Set.empty $ Map.lookup StopTerm socialLists
133 _socialMap = fromMaybe Set.empty $ Map.lookup MapTerm socialLists
134 _socialCand = fromMaybe Set.empty $ Map.lookup CandidateTerm socialLists
135 -- stopTerms ignored for now (need to be tagged already)
136 -- (stopTerms, candidateTerms) = List.partition ((\t -> Set.member t socialStop) . fst) allTerms
137 -- (mapTerms, candidateTerms) = List.partition ((\t -> Set.member t socialMap ) . fst) allTerms
139 -- printDebug "stopTerms" stopTerms
141 -- Grouping the ngrams and keeping the maximum score for label
142 let grouped = toGroupedText (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) allTerms
144 groupedWithList = map (addListType (invertForw socialLists)) grouped
146 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
147 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
149 -- printDebug "\n * stopTerms * \n" stopTerms
150 -- splitting monterms and multiterms to take proportional candidates
152 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
153 monoSize = 0.4 :: Double
154 multSize = 1 - monoSize
156 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
158 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
159 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
161 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
162 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
163 -- printDebug "groupedMultHead" (List.length groupedMultHead)
164 -- printDebug "groupedMultTail" (List.length groupedMultTail)
167 -- Get Local Scores now for selected grouped ngrams
168 selectedTerms = Set.toList $ List.foldl'
169 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
173 (groupedMonoHead <> groupedMultHead)
175 -- TO remove (and remove HasNodeError instance)
176 userListId <- defaultList uCid
177 masterListId <- defaultList mCid
179 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
182 mapGroups = Map.fromList
183 $ map (\g -> (g ^. gt_stem, g))
184 $ groupedMonoHead <> groupedMultHead
186 -- grouping with Set NodeId
187 contextsAdded = foldl' (\mapGroups' k -> let k' = ngramsGroup groupParams k
188 in case Map.lookup k' mapGroups' of
189 Nothing -> mapGroups'
190 Just g -> case Map.lookup k mapTextDocIds of
191 Nothing -> mapGroups'
192 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
195 $ Map.keys mapTextDocIds
197 -- compute cooccurrences
198 mapCooc = Map.filter (>2)
199 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
200 | (t1, s1) <- mapStemNodeIds
201 , (t2, s2) <- mapStemNodeIds
204 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
205 -- printDebug "mapCooc" mapCooc
209 mapScores f = Map.fromList
210 $ map (\(Scored t g s') -> (t, f (g,s')))
215 groupsWithScores = catMaybes
217 -> case Map.lookup stem mapScores' of
219 Just s' -> Just $ g { _gt_score = s'}
220 ) $ Map.toList contextsAdded
222 mapScores' = mapScores identity
223 -- adapt2 TOCHECK with DC
224 -- printDebug "groupsWithScores" groupsWithScores
226 -- sort / partition / split
227 -- filter mono/multi again
228 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
229 -- filter with max score
230 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
232 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
233 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
237 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
238 inclSize = 0.4 :: Double
239 exclSize = 1 - inclSize
240 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
242 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
243 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
245 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
246 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
249 -- Final Step building the Typed list
250 termListHead = maps <> cands
252 maps = set gt_listType (Just MapTerm)
253 <$> monoScoredInclHead
254 <> monoScoredExclHead
255 <> multScoredInclHead
256 <> multScoredExclHead
258 cands = set gt_listType (Just CandidateTerm)
259 <$> monoScoredInclTail
260 <> monoScoredExclTail
261 <> multScoredInclTail
262 <> multScoredExclTail
264 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
266 -- printDebug "monoScoredInclHead" monoScoredInclHead
267 -- printDebug "monoScoredExclHead" monoScoredExclTail
269 -- printDebug "multScoredInclHead" multScoredInclHead
270 -- printDebug "multScoredExclTail" multScoredExclTail
272 let result = Map.unionsWith (<>)
273 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
274 <> (List.concat $ map toNgramsElement $ termListTail)
275 <> (List.concat $ map toNgramsElement $ stopTerms)
278 -- printDebug "\n result \n" r
283 toNgramsElement :: GroupedText a -> [NgramsElement]
284 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
285 [parentElem] <> childrenElems
288 children = Set.toList setNgrams
289 parentElem = mkNgramsElement (NgramsTerm parent)
290 (fromMaybe CandidateTerm listType)
292 (mSetFromList (NgramsTerm <$> children))
293 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
294 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
296 ) (NgramsTerm <$> children)
299 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
300 toGargList l n = (l,n)
303 isStopTerm :: StopSize -> Text -> Bool
304 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
306 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
308 ------------------------------------------------------------------------------