{-|
Module      : Gargantext.Core.Text.List
Description : Tools to build lists
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}

{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE BangPatterns #-}

module Gargantext.Core.Text.List
  where

import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
import Data.HashMap.Strict (HashMap)
import Data.HashSet (HashSet)
import Data.Map (Map)
import Data.Monoid (mempty)
import Data.Ord (Down(..))
import Data.Set (Set)
import Data.Text (Text)
import Data.Tuple.Extra (both)
import Gargantext.API.Ngrams.Types (NgramsElement, NgramsTerm(..))
import Gargantext.Core.NodeStory
import Gargantext.Core.Text (size)
import Gargantext.Core.Text.List.Group
import Gargantext.Core.Text.List.Group.Prelude
import Gargantext.Core.Text.List.Group.WithStem
import Gargantext.Core.Text.List.Social
import Gargantext.Core.Text.List.Social.Prelude
import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.Metrics.NgramsByContext (getContextsByNgramsUser, getContextsByNgramsOnlyUser)
import Gargantext.Database.Action.Metrics.TFICF (getTficf_withSample)
import Gargantext.Database.Admin.Types.Node (NodeId)
import Gargantext.Database.Prelude (CmdM)
import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
import Gargantext.Database.Query.Table.Node (defaultList)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
import Gargantext.Database.Query.Tree.Error (HasTreeError)
import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
import Gargantext.Prelude
import qualified Data.HashMap.Strict as HashMap
import qualified Data.HashSet as HashSet
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap

{-
-- TODO maybe useful for later
isStopTerm :: StopSize -> Text -> Bool
isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
  where
    isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
-}


-- | TODO improve grouping functions of Authors, Sources, Institutes..
buildNgramsLists :: ( HasNodeStory env err m
                    , CmdM env err m
                    , HasTreeError err
                    , HasNodeError err
                    )
                 => User
                 -> UserCorpusId
                 -> MasterCorpusId
                 -> Maybe FlowSocialListWith
                 -> GroupParams
                 -> m (Map NgramsType [NgramsElement])
buildNgramsLists user uCid mCid mfslw gp = do
  ngTerms     <- buildNgramsTermsList user uCid mCid mfslw gp (NgramsTerms, MapListSize 350)
  othersTerms <- mapM (buildNgramsOthersList user uCid mfslw GroupIdentity)
                      [ (Authors   , MapListSize 9, MaxListSize 1000)
                      , (Sources   , MapListSize 9, MaxListSize 1000)
                      , (Institutes, MapListSize 9, MaxListSize 1000)
                      ]

  pure $ Map.unions $ [ngTerms] <> othersTerms
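-- A rough sketch of the result shape (illustration only, not actual output):
-- the returned Map has one entry per NgramsType built above, e.g.
-- Map.fromList [ (NgramsTerms, [..NgramsElement..]), (Authors, [..])
--              , (Sources, [..]), (Institutes, [..]) ]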


data MapListSize = MapListSize { unMapListSize :: !Int }
data MaxListSize = MaxListSize { unMaxListSize :: !Int }

buildNgramsOthersList :: ( HasNodeError err
                         , CmdM env err m
                         , HasNodeStory env err m
                         , HasTreeError err
                         )
                      => User
                      -> UserCorpusId
                      -> Maybe FlowSocialListWith
                      -> GroupParams
                      -> (NgramsType, MapListSize, MaxListSize)
                      -> m (Map NgramsType [NgramsElement])
buildNgramsOthersList user uCid mfslw _groupParams (nt, MapListSize mapListSize, MaxListSize maxListSize) = do
  allTerms :: HashMap NgramsTerm (Set NodeId) <- getContextsByNgramsUser uCid nt

  -- PrivateFirst for now, since the Public NodeMode is not implemented yet
  socialLists :: FlowCont NgramsTerm FlowListScores
    <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
                                    $ HashMap.fromList
                                    $ List.zip (HashMap.keys allTerms)
                                               (List.cycle [mempty])
                                    )
  let
    groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms

    (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
                           $ view flc_scores groupedWithList

    (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms

    listSize = mapListSize - (List.length mapTerms)
    (mapTerms', candiTerms) = both HashMap.fromList
                            $ List.splitAt listSize
                            $ List.take maxListSize
                            $ List.sortOn (Down . viewScore . snd)
                            $ HashMap.toList tailTerms'
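    -- Illustration of the split above (hypothetical numbers, not from real
    -- data): with MapListSize 9 and 3 terms already flagged MapTerm by the
    -- social lists, listSize = 9 - 3 = 6, so the 6 highest-scored remaining
    -- terms are promoted to MapTerm and the following ones (capped by
    -- MaxListSize) become CandidateTerm.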


  pure $ Map.fromList [( nt, List.take maxListSize $ (toNgramsElement stopTerms)
                                                  <> (toNgramsElement mapTerms )
                                                  <> (toNgramsElement $ setListType (Just MapTerm      ) mapTerms' )
                                                  <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
                       )]


getGroupParams :: ( HasNodeError err
                  , CmdM env err m
                  , HasNodeStory env err m
                  , HasTreeError err
                  )
               => GroupParams -> HashSet Ngrams -> m GroupParams
getGroupParams gp@(GroupWithPosTag l a _m) ng = do
  !hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
  -- printDebug "hashMap" hashMap
  pure $ over gwl_map (\x -> x <> hashMap) gp
getGroupParams gp _ = pure gp


-- TODO use ListIds
buildNgramsTermsList :: ( HasNodeError err
                        , CmdM env err m
                        , HasNodeStory env err m
                        , HasTreeError err
                        )
                     => User
                     -> UserCorpusId
                     -> MasterCorpusId
                     -> Maybe FlowSocialListWith
                     -> GroupParams
                     -> (NgramsType, MapListSize)
                     -> m (Map NgramsType [NgramsElement])
buildNgramsTermsList user uCid mCid mfslw groupParams (nt, _mapListSize) = do

  -- Filter 0 With Double
  -- Computing global speGen score
  printDebug "[buildNgramsTermsList: Sample List] / start" nt
  !(allTerms :: HashMap NgramsTerm Double) <- getTficf_withSample uCid mCid nt

  printDebug "[buildNgramsTermsList: Sample List / end]" (nt, HashMap.size allTerms)

  printDebug "[buildNgramsTermsList: Flow Social List / start]" nt

  -- PrivateFirst for now, since the Public NodeMode is not implemented yet
  !(socialLists :: FlowCont NgramsTerm FlowListScores)
    <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
                                    $ HashMap.fromList
                                    $ List.zip (HashMap.keys allTerms)
                                               (List.cycle [mempty])
                                    )
  printDebug "[buildNgramsTermsList: Flow Social List / end]" nt

  let !ngramsKeys = HashSet.fromList $ List.take 1000 $ HashSet.toList $ HashMap.keysSet allTerms

  printDebug "[buildNgramsTermsList: ngramsKeys]" (HashSet.size ngramsKeys)

  !groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)

  printDebug "[buildNgramsTermsList: groupParams']" (""::Text)

  let
    !socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
    --printDebug "socialLists_Stemmed" socialLists_Stemmed
    !groupedWithList = toGroupedTree socialLists_Stemmed allTerms
    !(stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
                                 $ HashMap.filter (\g -> (view gts'_score g) > 1)
                                 $ view flc_scores groupedWithList

    !(groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms

  printDebug "[buildNgramsTermsList] stopTerms" stopTerms

  -- splitting mono-terms and multi-terms to take a proportional number of candidates
  -- use a % of the list if too big, or an Int if too small
  let
    !listSizeGlobal = 2000 :: Double
    !monoSize = 0.4 :: Double
    !multSize = 1 - monoSize

    splitAt n' ns = both (HashMap.fromListWith (<>))
                  $ List.splitAt (round $ n' * listSizeGlobal)
                  $ List.sortOn (viewScore . snd)
                  $ HashMap.toList ns

    !(groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
    !(groupedMultHead, groupedMultTail)  = splitAt multSize groupedMult
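    -- Worked example of the proportional split (a sketch; actual counts depend
    -- on how many candidate terms survive the earlier filters): with
    -- listSizeGlobal = 2000, the mono-term head keeps round (0.4 * 2000) = 800
    -- entries and the multi-term head keeps round (0.6 * 2000) = 1200 entries;
    -- groupedMultTail is kept aside and is tagged CandidateTerm at the end.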

-------------------------
-- Filter 1 With Set NodeId and SpeGen
    !selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)

  printDebug "[buildNgramsTermsList: selectedTerms]" selectedTerms

  -- TODO remove (and remove HasNodeError instance)
  !userListId   <- defaultList uCid
  !masterListId <- defaultList mCid

  !mapTextDocIds <- getContextsByNgramsOnlyUser uCid
                                                [userListId, masterListId]
                                                nt
                                                selectedTerms


  printDebug "[buildNgramsTermsList: mapTextDocIds]" mapTextDocIds

  let
    groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
    !groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
                                 $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)


  printDebug "[buildNgramsTermsList: groupedTreeScores_SetNodeId]" groupedTreeScores_SetNodeId

  -- Cooccurrences computation
  --, t1 >= t2 -- permute byAxis diag -- since the matrix is symmetric
  let !mapCooc = HashMap.filter (>1) -- removing cooc of 1
               $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
                                  | (t1, s1) <- mapStemNodeIds
                                  , (t2, s2) <- mapStemNodeIds
                                  ]
        where
          mapStemNodeIds = HashMap.toList
                         $ HashMap.map viewScores
                         $ groupedTreeScores_SetNodeId
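  -- Sketch of what mapCooc holds (illustrative values only): if t1 occurs in
  -- documents {1,2,3} and t2 in {2,3,5}, the pair (t1, t2) gets the score
  -- Set.size (Set.intersection s1 s2) = 2; pairs sharing at most one document
  -- are dropped by the HashMap.filter (>1).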
  let
    -- computing scores
    mapScores f = HashMap.fromList
                $ map (\g -> (view scored_terms g, f g))
                $ normalizeGlobal
                $ map normalizeLocal
                $ scored'
                $ Map.fromList -- TODO remove this
                $ HashMap.toList mapCooc
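    -- Reading the pipeline bottom-up (descriptive note): scored' turns the
    -- cooccurrence map into Scored values carrying genInc / speExc per term,
    -- normalizeLocal and normalizeGlobal rescale them, and the final map
    -- re-indexes the result by term, so that mapScores identity (used below)
    -- yields a HashMap NgramsTerm (Scored NgramsTerm).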

  let
    groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
    !groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId

  let
    -- sort / partition / split
    -- filter mono/multi again
    !(monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen

    -- filter with max score
    partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
                                                   > (view scored_speExc $ view gts'_score g)
                                              )

    !(monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
    !(multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
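    -- Descriptive note: a term whose genericity/inclusion score (scored_genInc)
    -- is strictly greater than its specificity/exclusion score (scored_speExc)
    -- lands in the *Incl bucket, otherwise in the *Excl one; this is done
    -- separately for mono-terms and multi-terms.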

  -- splitAt
  let
    -- use a % of the list if too big, or an Int if too small
    !mapSize = 1000 :: Double
    !canSize = mapSize * 2 :: Double

    !inclSize = 0.4 :: Double
    !exclSize = 1 - inclSize

    splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
    sortOn  f        = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList

    monoInc_size n = splitAt' n $ monoSize * inclSize / 2
    multExc_size n = splitAt' n $ multSize * exclSize / 2
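    -- Worked example with the constants above (a sketch; real counts depend on
    -- the corpus): monoInc_size mapSize splits after round (1000 * 0.4 * 0.4 / 2)
    -- = 80 terms, multExc_size mapSize after round (1000 * 0.6 * 0.6 / 2) = 180
    -- terms; with canSize = 2000 the candidate heads keep 160 and 360 terms.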


    !(mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
    !(mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl

    !(mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
    !(mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl


    !(canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
    !(canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail

    !(canMulScoredInclHead,  _) = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
    !(canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail

------------------------------------------------------------
-- Final Step: building the Typed list
-- Candidate terms need to be filtered
  let
    !maps = setListType (Just MapTerm)
          $  mapMonoScoredInclHead
          <> mapMonoScoredExclHead
          <> mapMultScoredInclHead
          <> mapMultScoredExclHead

    -- An original way to filter to start with
    !cands = setListType (Just CandidateTerm)
           $  canMonoScoredIncHead
           <> canMonoScoredExclHead
           <> canMulScoredInclHead
           <> canMultScoredExclHead

    -- TODO count it too
    !cands' = setListType (Just CandidateTerm)
            {-\$ groupedMonoTail
             <>-} groupedMultTail

    -- Quick FIX
    !candNgramsElement = List.take 1000
                       $ toNgramsElement cands <> toNgramsElement cands'

    !result = Map.unionsWith (<>)
       [ Map.fromList [( nt, toNgramsElement maps
                          <> toNgramsElement stopTerms
                          <> candNgramsElement
                       )]
       ]
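    -- Descriptive note on the final shape: result is a single-entry Map from
    -- the requested NgramsType to the map terms, the stop terms and at most
    -- 1000 candidate terms, each NgramsElement already carrying its ListType.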

  pure result