]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[FIX] memory leak, useable ngrams table version (WIP)
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18
19 import Control.Lens ((^.), set, view)
20 import Data.Maybe (fromMaybe, catMaybes)
21 import Data.Ord (Down(..))
22 import Data.Map (Map)
23 import Data.Set (Set)
24 import Data.Text (Text)
25 import qualified Data.Char as Char
26 import qualified Data.List as List
27 import qualified Data.Map as Map
28 import qualified Data.Set as Set
29 import qualified Data.Text as Text
30
31 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
32 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
33 import Gargantext.API.Ngrams.Types (RepoCmdM)
34 import Gargantext.Core.Text.List.Social (flowSocialList, flowSocialList', FlowSocialListPriority(..), invertForw)
35 import Gargantext.Core.Text.List.Social.Scores -- (FlowListScores)
36 import Gargantext.Core.Text.List.Group
37 import Gargantext.Core.Text.List.Group.WithStem
38 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
39 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
40 import Gargantext.Core.Types.Individu (User(..))
41 import Gargantext.Database.Admin.Types.Node (NodeId)
42 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
43 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
44 import Gargantext.Database.Prelude (CmdM)
45 import Gargantext.Database.Query.Table.Node (defaultList)
46 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
47 import Gargantext.Database.Query.Tree.Error (HasTreeError)
48 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
49 import Gargantext.Prelude
50
51
52 -- | TODO improve grouping functions of Authors, Sources, Institutes..
53 buildNgramsLists :: ( RepoCmdM env err m
54 , CmdM env err m
55 , HasTreeError err
56 , HasNodeError err
57 )
58 => User
59 -> GroupParams
60 -> UserCorpusId
61 -> MasterCorpusId
62 -> m (Map NgramsType [NgramsElement])
63 buildNgramsLists user gp uCid mCid = do
64 ngTerms <- buildNgramsTermsList user uCid mCid gp
65 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
66 [ (Authors , MapListSize 9)
67 , (Sources , MapListSize 9)
68 , (Institutes, MapListSize 9)
69 ]
70
71 pure $ Map.unions $ [ngTerms] <> othersTerms
72
73
74 data MapListSize = MapListSize { unMapListSize :: !Int }
75
76 buildNgramsOthersList ::( HasNodeError err
77 , CmdM env err m
78 , RepoCmdM env err m
79 , HasTreeError err
80 )
81 => User
82 -> UserCorpusId
83 -> (Text -> Text)
84 -> (NgramsType, MapListSize)
85 -> m (Map NgramsType [NgramsElement])
86 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
87 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
88
89 socialLists' :: Map Text FlowListScores
90 <- flowSocialList' MySelfFirst user nt (Set.fromList $ Map.keys ngs')
91 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
92
93 printDebug "flowSocialList'" (Map.filter (not . ((==) Map.empty) . view fls_parents) socialLists')
94
95 let
96 groupParams = GroupedTextParams groupIt (Set.size . snd) fst snd {-(size . fst)-}
97 groupedWithList = toGroupedText groupParams socialLists' ngs'
98
99 printDebug "groupedWithList" (Map.map (\v -> (view gt_label v, view gt_children v)) $ Map.filter (\v -> (Set.size $ view gt_children v) > 0) groupedWithList)
100
101 let
102 (stopTerms, tailTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
103 (mapTerms, tailTerms') = Map.partition (\t -> t ^. gt_listType == Just MapTerm) tailTerms
104
105 listSize = mapListSize - (List.length mapTerms)
106 (mapTerms', candiTerms) = List.splitAt listSize
107 $ List.sortOn (Down . _gt_score)
108 $ Map.elems tailTerms'
109
110 pure $ Map.fromList [( nt, (List.concat $ map toNgramsElement stopTerms)
111 <> (List.concat $ map toNgramsElement mapTerms )
112 <> (List.concat $ map toNgramsElement
113 $ map (set gt_listType (Just MapTerm )) mapTerms' )
114 <> (List.concat $ map toNgramsElement
115 $ map (set gt_listType (Just CandidateTerm)) candiTerms)
116 )]
117
118
119 -- TODO use ListIds
120 buildNgramsTermsList :: ( HasNodeError err
121 , CmdM env err m
122 , RepoCmdM env err m
123 , HasTreeError err
124 )
125 => User
126 -> UserCorpusId
127 -> MasterCorpusId
128 -> GroupParams
129 -> m (Map NgramsType [NgramsElement])
130 buildNgramsTermsList user uCid mCid groupParams = do
131
132 -- Computing global speGen score
133 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
134
135 -- printDebug "head candidates" (List.take 10 $ allTerms)
136 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
137
138 -- First remove stops terms
139 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
140 -- printDebug "\n * socialLists * \n" socialLists
141
142 -- Grouping the ngrams and keeping the maximum score for label
143 let grouped = groupedTextWithStem ( GroupedTextParams (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
144
145 groupedWithList = map (addListType (invertForw socialLists)) grouped
146
147 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
148 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
149
150 -- printDebug "\n * stopTerms * \n" stopTerms
151 -- splitting monterms and multiterms to take proportional candidates
152 let
153 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
154 monoSize = 0.4 :: Double
155 multSize = 1 - monoSize
156
157 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
158
159 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
160 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
161
162 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
163 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
164 -- printDebug "groupedMultHead" (List.length groupedMultHead)
165 -- printDebug "groupedMultTail" (List.length groupedMultTail)
166
167 let
168 -- Get Local Scores now for selected grouped ngrams
169 selectedTerms = Set.toList $ List.foldl'
170 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
171 $ Set.insert l' g
172 )
173 Set.empty
174 (groupedMonoHead <> groupedMultHead)
175
176 -- TO remove (and remove HasNodeError instance)
177 userListId <- defaultList uCid
178 masterListId <- defaultList mCid
179
180 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
181
182 let
183 mapGroups = Map.fromList
184 $ map (\g -> (g ^. gt_stem, g))
185 $ groupedMonoHead <> groupedMultHead
186
187 -- grouping with Set NodeId
188 contextsAdded = foldl' (\mapGroups' k ->
189 let k' = ngramsGroup groupParams k in
190 case Map.lookup k' mapGroups' of
191 Nothing -> mapGroups'
192 Just g -> case Map.lookup k mapTextDocIds of
193 Nothing -> mapGroups'
194 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
195 )
196 mapGroups
197 $ Map.keys mapTextDocIds
198
199 -- compute cooccurrences
200 mapCooc = Map.filter (>2)
201 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
202 | (t1, s1) <- mapStemNodeIds
203 , (t2, s2) <- mapStemNodeIds
204 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
205 ]
206 where
207 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
208 -- printDebug "mapCooc" mapCooc
209
210 let
211 -- computing scores
212 mapScores f = Map.fromList
213 $ map (\(Scored t g s') -> (t, f (g,s')))
214 $ normalizeGlobal
215 $ map normalizeLocal
216 $ scored' mapCooc
217
218 groupsWithScores = catMaybes
219 $ map (\(stem, g)
220 -> case Map.lookup stem mapScores' of
221 Nothing -> Nothing
222 Just s' -> Just $ g { _gt_score = s'}
223 ) $ Map.toList contextsAdded
224 where
225 mapScores' = mapScores identity
226 -- adapt2 TOCHECK with DC
227 -- printDebug "groupsWithScores" groupsWithScores
228 let
229 -- sort / partition / split
230 -- filter mono/multi again
231 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
232 -- filter with max score
233 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
234
235 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
236 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
237
238 -- splitAt
239 let
240 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
241 inclSize = 0.4 :: Double
242 exclSize = 1 - inclSize
243 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
244
245 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
246 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
247
248 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
249 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
250
251
252 -- Final Step building the Typed list
253 termListHead = maps <> cands
254 where
255 maps = set gt_listType (Just MapTerm)
256 <$> monoScoredInclHead
257 <> monoScoredExclHead
258 <> multScoredInclHead
259 <> multScoredExclHead
260
261 cands = set gt_listType (Just CandidateTerm)
262 <$> monoScoredInclTail
263 <> monoScoredExclTail
264 <> multScoredInclTail
265 <> multScoredExclTail
266
267 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
268
269 -- printDebug "monoScoredInclHead" monoScoredInclHead
270 -- printDebug "monoScoredExclHead" monoScoredExclTail
271 -- printDebug "multScoredInclHead" multScoredInclHead
272 -- printDebug "multScoredExclTail" multScoredExclTail
273
274 let result = Map.unionsWith (<>)
275 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
276 <> (List.concat $ map toNgramsElement $ termListTail)
277 <> (List.concat $ map toNgramsElement $ stopTerms)
278 )]
279 ]
280 -- printDebug "\n result \n" r
281 pure result
282
283
284
285 toNgramsElement :: GroupedText a -> [NgramsElement]
286 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
287 [parentElem] <> childrenElems
288 where
289 parent = label
290 children = Set.toList setNgrams
291 parentElem = mkNgramsElement (NgramsTerm parent)
292 (fromMaybe CandidateTerm listType)
293 Nothing
294 (mSetFromList (NgramsTerm <$> children))
295 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
296 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
297 (mSetFromList [])
298 ) (NgramsTerm <$> children)
299
300
301 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
302 toGargList l n = (l,n)
303
304
305 isStopTerm :: StopSize -> Text -> Bool
306 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
307 where
308 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
309
310 ------------------------------------------------------------------------------