{-|
Module      : Gargantext.Core.Text.List
Description : Tools to build lists
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}

{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE BangPatterns #-}

module Gargantext.Core.Text.List
  where

import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
import Data.HashMap.Strict (HashMap)
import Data.HashSet (HashSet)
import Data.Map.Strict (Map)
import Data.Monoid (mempty)
import Data.Ord (Down(..))
import Data.Set (Set)
-- import Data.Text (Text)
import Data.Tuple.Extra (both)
import Gargantext.API.Ngrams.Types (NgramsElement, NgramsTerm(..))
import Gargantext.Core.NodeStory
import Gargantext.Core.Text (size)
import Gargantext.Core.Text.List.Group
import Gargantext.Core.Text.List.Group.Prelude
import Gargantext.Core.Text.List.Group.WithStem
import Gargantext.Core.Text.List.Social
import Gargantext.Core.Text.List.Social.Prelude
import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.Metrics.NgramsByContext (getContextsByNgramsUser, getContextsByNgramsOnlyUser)
import Gargantext.Database.Action.Metrics.TFICF (getTficf_withSample)
import Gargantext.Database.Admin.Types.Node (NodeId)
import Gargantext.Database.Prelude (CmdM)
import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
import Gargantext.Database.Query.Table.Node (defaultList)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
import Gargantext.Database.Query.Tree.Error (HasTreeError)
import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
import Gargantext.Prelude
import qualified Data.HashMap.Strict as HashMap
import qualified Data.HashSet as HashSet
import qualified Data.List as List
import qualified Data.Map.Strict as Map
import qualified Data.Set as Set
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap

{-
-- TODO maybe useful for later
isStopTerm :: StopSize -> Text -> Bool
isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
  where
    isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
-}


-- | TODO improve grouping functions of Authors, Sources, Institutes..
buildNgramsLists :: ( HasNodeStory env err m
                    , CmdM env err m
                    , HasTreeError err
                    , HasNodeError err
                    )
                 => User
                 -> UserCorpusId
                 -> MasterCorpusId
                 -> Maybe FlowSocialListWith
                 -> GroupParams
                 -> m (Map NgramsType [NgramsElement])
buildNgramsLists user uCid mCid mfslw gp = do
  ngTerms     <- buildNgramsTermsList user uCid mCid mfslw gp (NgramsTerms, MapListSize 350)
  othersTerms <- mapM (buildNgramsOthersList user uCid mfslw GroupIdentity)
                      [ (Authors   , MapListSize 9, MaxListSize 1000)
                      , (Sources   , MapListSize 9, MaxListSize 1000)
                      , (Institutes, MapListSize 9, MaxListSize 1000)
                      ]

  pure $ Map.unions $ [ngTerms] <> othersTerms

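{-
A minimal usage sketch (added comment, not part of the original API): how this
entry point might be called once a corpus has been indexed. 'userCorpusId' and
'masterCorpusId' are hypothetical arguments, 'UserName' is assumed to be the
constructor from Gargantext.Core.Types.Individu, and 'GroupIdentity' disables
stem-based grouping.

buildAllLists :: ( HasNodeStory env err m, CmdM env err m
                 , HasTreeError err, HasNodeError err )
              => UserCorpusId -> MasterCorpusId -> m (Map NgramsType [NgramsElement])
buildAllLists userCorpusId masterCorpusId =
  buildNgramsLists (UserName "alice") userCorpusId masterCorpusId Nothing GroupIdentity
-}
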

data MapListSize = MapListSize { unMapListSize :: !Int }
data MaxListSize = MaxListSize { unMaxListSize :: !Int }

buildNgramsOthersList :: ( HasNodeError err
                         , CmdM env err m
                         , HasNodeStory env err m
                         , HasTreeError err
                         )
                      => User
                      -> UserCorpusId
                      -> Maybe FlowSocialListWith
                      -> GroupParams
                      -> (NgramsType, MapListSize, MaxListSize)
                      -> m (Map NgramsType [NgramsElement])
buildNgramsOthersList user uCid mfslw _groupParams (nt, MapListSize mapListSize, MaxListSize maxListSize) = do
  allTerms :: HashMap NgramsTerm (Set NodeId) <- getContextsByNgramsUser uCid nt

  -- PrivateFirst is used for early development, since the Public NodeMode is not implemented yet
  socialLists :: FlowCont NgramsTerm FlowListScores
    <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
                                    $ HashMap.fromList
                                    $ List.zip (HashMap.keys allTerms)
                                               (List.cycle [mempty])
                                    )
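  -- (added note) the FlowCont is seeded with every extracted term mapped to 'mempty'
  -- scores; 'flowSocialList' then (presumably) overlays on this neutral starting point
  -- the list memberships found in the user's social lists.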
  let
    groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms

    (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
                           $ view flc_scores groupedWithList

    (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms

    listSize = mapListSize - (List.length mapTerms)
    (mapTerms', candiTerms) = both HashMap.fromList
                            $ List.splitAt listSize
                            $ List.take maxListSize
                            $ List.sortOn (Down . viewScore . snd)
                            $ HashMap.toList tailTerms'

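    -- (added worked example, illustrative numbers only) with MapListSize 9 and, say,
    -- 3 terms already flagged MapTerm by the social lists, listSize = 9 - 3 = 6: the
    -- remaining terms are sorted by descending score, capped at maxListSize, the first
    -- 6 are promoted to MapTerm and the rest are kept as CandidateTerm below.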

  pure $ Map.fromList [( nt, List.take maxListSize $ (toNgramsElement stopTerms)
                                                  <> (toNgramsElement mapTerms )
                                                  <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
                                                  <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
                       )]


getGroupParams :: ( HasNodeError err
                  , CmdM env err m
                  , HasNodeStory env err m
                  , HasTreeError err
                  )
               => GroupParams -> HashSet Ngrams -> m GroupParams
getGroupParams gp@(GroupWithPosTag l nsc _m) ng = do
  !hashMap <- HashMap.fromList <$> selectLems l nsc (HashSet.toList ng)
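  -- (added note) 'selectLems' presumably returns (surface form, lemma) pairs for the
  -- given ngrams; merging them into 'gwl_map' lets terms sharing a lemma be grouped
  -- together by the downstream grouping step.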
  -- printDebug "hashMap" hashMap
  pure $ over gwl_map (\x -> x <> hashMap) gp
getGroupParams gp _ = pure gp

-- TODO use ListIds
buildNgramsTermsList :: ( HasNodeError err
                        , CmdM env err m
                        , HasNodeStory env err m
                        , HasTreeError err
                        )
                     => User
                     -> UserCorpusId
                     -> MasterCorpusId
                     -> Maybe FlowSocialListWith
                     -> GroupParams
                     -> (NgramsType, MapListSize)
                     -> m (Map NgramsType [NgramsElement])
buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSize) = do

  -- Filter 0 With Double
  -- Computing global speGen score
  -- printDebug "[buildNgramsTermsList: Sample List] / start" nt
  !(allTerms :: HashMap NgramsTerm Double) <- getTficf_withSample uCid mCid nt

  -- printDebug "[buildNgramsTermsList: Sample List / end]" (nt, HashMap.size allTerms)

  -- printDebug "[buildNgramsTermsList: Flow Social List / start]" nt

  -- PrivateFirst is used for early development, since the Public NodeMode is not implemented yet
  !(socialLists :: FlowCont NgramsTerm FlowListScores)
    <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
                                    $ HashMap.fromList
                                    $ List.zip (HashMap.keys allTerms)
                                               (List.cycle [mempty])
                                    )
  -- printDebug "[buildNgramsTermsList: Flow Social List / end]" nt

  let !ngramsKeys = HashSet.fromList
                  $ List.take mapListSize
                  $ HashSet.toList
                  $ HashMap.keysSet allTerms
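  -- (added note) the key set handed to lemmatisation below is capped at mapListSize;
  -- HashSet enumeration order is unspecified, so this is effectively a sample of the
  -- extracted terms rather than the top-scored ones.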

  -- printDebug "[buildNgramsTermsList: ngramsKeys]" (HashSet.size ngramsKeys)

  !groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)

  -- printDebug "[buildNgramsTermsList: groupParams']" ("" :: Text)

  let
    !socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
    !groupedWithList = toGroupedTree socialLists_Stemmed allTerms
    !(stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
                                 $ HashMap.filter (\g -> (view gts'_score g) > 1)
                                 $ view flc_scores groupedWithList

    !(groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms

  -- printDebug "[buildNgramsTermsList] socialLists" socialLists
  -- printDebug "[buildNgramsTermsList] socialLists with scores" socialLists_Stemmed
  -- printDebug "[buildNgramsTermsList] groupedWithList" groupedWithList
  -- printDebug "[buildNgramsTermsList] stopTerms" stopTerms

  -- splitting monoterms and multiterms to take proportional candidates
  -- use a % of the list if too big, or an Int if too small
  let
    !listSizeGlobal = 2000 :: Double
    !monoSize = 0.4 :: Double
    !multSize = 1 - monoSize

    splitAt n' ns = both (HashMap.fromListWith (<>))
                  $ List.splitAt (round $ n' * listSizeGlobal)
                  $ List.sortOn (viewScore . snd)
                  $ HashMap.toList ns

    !(groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
    !(groupedMultHead, groupedMultTail)  = splitAt multSize groupedMult
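    -- (added worked example) the candidates are sorted by ascending score and cut at
    -- round (proportion * listSizeGlobal): with listSizeGlobal = 2000, the mono head
    -- keeps the first round (0.4 * 2000) = 800 entries and the mult head the first
    -- round (0.6 * 2000) = 1200 entries.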

    -------------------------
    -- Filter 1 With Set NodeId and SpeGen
    !selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)

  -- printDebug "[buildNgramsTermsList: selectedTerms]" selectedTerms

  -- TODO remove (and remove HasNodeError instance)
  !userListId   <- defaultList uCid
  !masterListId <- defaultList mCid

  !mapTextDocIds <- getContextsByNgramsOnlyUser uCid
                                                [userListId, masterListId]
                                                nt
                                                selectedTerms


  -- printDebug "[buildNgramsTermsList: mapTextDocIds]" mapTextDocIds

  let
    groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
    !groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
                                 $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)


  -- printDebug "[buildNgramsTermsList: groupedTreeScores_SetNodeId]" groupedTreeScores_SetNodeId

  -- Cooccurrences computation
  --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
  let !mapCooc = HashMap.filter (>1) -- removing cooc of 1
        $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
                           | (t1, s1) <- mapStemNodeIds
                           , (t2, s2) <- mapStemNodeIds
                           ]
        where
          mapStemNodeIds = HashMap.toList
                         $ HashMap.map viewScores
                         $ groupedTreeScores_SetNodeId
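  -- (added note) each pair of grouped terms is scored by how many contexts (documents)
  -- their NodeId sets share, and pairs co-occurring in at most one context are dropped.
  -- For instance, if s1 = {1,2,3} and s2 = {2,3,5} then
  -- Set.size (Set.intersection s1 s2) == 2, so the pair is kept.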
  let
    -- computing scores
    mapScores f = HashMap.fromList
                $ map (\g -> (view scored_terms g, f g))
                $ normalizeGlobal
                $ map normalizeLocal
                $ scored'
                $ Map.fromList -- TODO remove this
                $ HashMap.toList mapCooc
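    -- (added note) 'scored'' turns the cooccurrence map into 'Scored' values carrying a
    -- genericity-like score ('scored_genInc') and a specificity-like score ('scored_speExc')
    -- per term; these are normalised per term ('normalizeLocal') and then over the whole
    -- list ('normalizeGlobal') before being keyed back by term.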

  let
    groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
    !groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId

  let
    -- sort / partition / split
    -- filter mono/multi again
    !(monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen

    -- filter with max score
    partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
                                                   > (view scored_speExc $ view gts'_score g)
                                              )

    !(monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
    !(multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored

  -- splitAt
  let
    -- use a % of the list if too big, or an Int if too small
    !mapSize = 1000 :: Double
    !canSize = mapSize * 2 :: Double

    !inclSize = 0.4 :: Double
    !exclSize = 1 - inclSize

    splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
    sortOn f         = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList

    monoInc_size n = splitAt' n $ monoSize * inclSize / 2
    multExc_size n = splitAt' n $ multSize * exclSize / 2

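    -- (added worked example) cut points implied by the constants above, the maps being
    -- sorted by descending genInc / speExc before splitting:
    --   monoInc_size mapSize splits at round (1000 * 0.4 * 0.4 / 2) = 80 terms
    --   multExc_size mapSize splits at round (1000 * 0.6 * 0.6 / 2) = 180 terms
    --   monoInc_size canSize and multExc_size canSize split at 160 and 360 terms respectively.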

    !(mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
    !(mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl

    !(mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
    !(mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl


    !(canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
    !(canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail

    !(canMulScoredInclHead, _)  = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
    !(canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail

    ------------------------------------------------------------
    -- Final step: building the typed list
    -- Candidate terms need to be filtered
  let
    !maps = setListType (Just MapTerm)
          $  mapMonoScoredInclHead
          <> mapMonoScoredExclHead
          <> mapMultScoredInclHead
          <> mapMultScoredExclHead

    -- An original way to filter to start with
    !cands = setListType (Just CandidateTerm)
           $  canMonoScoredIncHead
           <> canMonoScoredExclHead
           <> canMulScoredInclHead
           <> canMultScoredExclHead

    -- TODO count it too
    !cands' = setListType (Just CandidateTerm)
            {-$ groupedMonoTail
            <>-} groupedMultTail

    -- Quick FIX
    !candNgramsElement = List.take 1000
                       $ toNgramsElement cands <> toNgramsElement cands'

    !result = Map.unionsWith (<>)
      [ Map.fromList [( nt, toNgramsElement maps
                         <> toNgramsElement stopTerms
                         <> candNgramsElement
                      )]
      ]

  pure result