{-|
Module      : Gargantext.Core.Text.List
Description : Tools to build lists
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}

{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell     #-}

module Gargantext.Core.Text.List
  where

import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
import Data.HashMap.Strict (HashMap)
import Data.Map (Map)
import Data.Monoid (mempty)
import Data.Ord (Down(..))
import Data.Set (Set)
import Data.Tuple.Extra (both)
import Gargantext.API.Ngrams.Types (NgramsElement, RepoCmdM, NgramsTerm(..))
import Gargantext.Core.Text (size)
import Gargantext.Core.Text.List.Group
import Gargantext.Core.Text.List.Group.Prelude
import Gargantext.Core.Text.List.Group.WithStem
import Gargantext.Core.Text.List.Social
import Gargantext.Core.Text.List.Social.Prelude
import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
import Gargantext.Database.Action.Metrics.TFICF (getTficf)
import Gargantext.Database.Admin.Types.Node (NodeId)
import Gargantext.Database.Prelude (CmdM)
import Gargantext.Database.Query.Table.Node (defaultList)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
import Gargantext.Database.Query.Tree.Error (HasTreeError)
import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Prelude
import qualified Data.HashMap.Strict as HashMap
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap

{-
-- TODO maybe useful for later
isStopTerm :: StopSize -> Text -> Bool
isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
  where
    isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
-}


-- | TODO improve grouping functions of Authors, Sources, Institutes..
buildNgramsLists :: ( RepoCmdM env err m
                    , CmdM     env err m
                    , HasTreeError err
                    , HasNodeError err
                    )
                 => User
                 -> GroupParams
                 -> UserCorpusId
                 -> MasterCorpusId
                 -> m (Map NgramsType [NgramsElement])
buildNgramsLists user gp uCid mCid = do
  ngTerms     <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
  othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
                      [ (Authors   , MapListSize 9)
                      , (Sources   , MapListSize 9)
                      , (Institutes, MapListSize 9)
                      ]

  pure $ Map.unions $ [ngTerms] <> othersTerms

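-- A minimal usage sketch (illustrative only: @uCid@ and @mCid@ stand for an
-- existing user corpus and its master corpus, and the user name is a
-- placeholder). 'MapListSize' caps how many terms are promoted to the Map
-- list (350 for NgramsTerms above, 9 for Authors, Sources and Institutes):
--
-- > lists <- buildNgramsLists (UserName "alice") GroupIdentity uCid mCid
-- > let termElements = Map.lookup NgramsTerms lists
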

data MapListSize = MapListSize { unMapListSize :: !Int }

buildNgramsOthersList :: ( HasNodeError err
                         , CmdM     env err m
                         , RepoCmdM env err m
                         , HasTreeError err
                         )
                      => User
                      -> UserCorpusId
                      -> GroupParams
                      -> (NgramsType, MapListSize)
                      -> m (Map NgramsType [NgramsElement])
buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do
  allTerms :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt

  -- PrivateFirst for now, since the Public NodeMode is not implemented yet
  socialLists :: FlowCont NgramsTerm FlowListScores
    <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
                                          $ HashMap.fromList
                                          $ List.zip (HashMap.keys allTerms)
                                                     (List.cycle [mempty])
                                          )
  {-
  if nt == Sources -- Authors
    then printDebug "flowSocialList" socialLists
    else printDebug "flowSocialList" ""
  -}
  let
    groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
  {-
  if nt == Sources -- Authors
    then printDebug "groupedWithList" groupedWithList
    else printDebug "groupedWithList" ""
  -}

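  -- Partition the grouped terms by the list type inherited from the social
  -- lists: StopTerms are set aside, terms already flagged MapTerm stay in the
  -- map list, and the remaining slots (mapListSize minus the number of social
  -- map terms) are filled with the highest-scored leftover terms; everything
  -- else becomes a CandidateTerm. E.g. with @MapListSize 9@ and 3 social map
  -- terms, the 6 best-scored remaining terms are promoted.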
  let
    (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
                           $ view flc_scores groupedWithList

    (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms

    listSize = mapListSize - (List.length mapTerms)
    (mapTerms', candiTerms) = both HashMap.fromList
                            $ List.splitAt listSize
                            $ List.sortOn (Down . viewScore . snd)
                            $ HashMap.toList tailTerms'

  pure $ Map.fromList [( nt,    (toNgramsElement stopTerms)
                             <> (toNgramsElement mapTerms )
                             <> (toNgramsElement $ setListType (Just MapTerm      ) mapTerms' )
                             <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
                       )]


-- TODO use ListIds
buildNgramsTermsList :: ( HasNodeError err
                        , CmdM     env err m
                        , RepoCmdM env err m
                        , HasTreeError err
                        )
                     => User
                     -> UserCorpusId
                     -> MasterCorpusId
                     -> GroupParams
                     -> (NgramsType, MapListSize)
                     -> m (Map NgramsType [NgramsElement])
buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize) = do

  -- Filter 0: terms scored with a Double
  -- Computing the global TFICF score (the speGen scores come later, from cooccurrences)
  allTerms :: HashMap NgramsTerm Double <- getTficf uCid mCid nt

  -- PrivateFirst for now, since the Public NodeMode is not implemented yet
  socialLists :: FlowCont NgramsTerm FlowListScores
    <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
                                          $ HashMap.fromList
                                          $ List.zip (HashMap.keys allTerms)
                                                     (List.cycle [mempty])
                                          )
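  -- Each TFICF term is seeded with an empty FlowListScores (mempty);
  -- flowSocialList then fills in the scores coming from the user's other
  -- (social) lists.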

  let socialLists_Stemmed = addScoreStem groupParams (HashMap.keysSet allTerms) socialLists
  printDebug "socialLists_Stemmed" socialLists_Stemmed
  let groupedWithList = toGroupedTree {- groupParams -} socialLists_Stemmed allTerms
      (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
                                  $ view flc_scores groupedWithList

      (groupedMono, groupedMult)  = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
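  -- size t < 2 keeps the one-word terms on the mono side: groupedMono holds
  -- the mono-term groups and groupedMult the multi-word ones.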

  -- printDebug "stopTerms" stopTerms

  -- splitting mono-terms and multi-terms to take a proportional number of candidates
  let
    -- use a % of the list if it is too big, or an Int if it is too small
    listSizeGlobal = 2000 :: Double
    monoSize = 0.4  :: Double
    multSize = 1 - monoSize

    splitAt n' ns = both (HashMap.fromListWith (<>))
                  $ List.splitAt (round $ n' * listSizeGlobal)
                  $ List.sortOn (viewScore . snd)
                  $ HashMap.toList ns

    (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
    (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
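    -- With listSizeGlobal = 2000, the head holds round (0.4 * 2000) = 800
    -- mono-term groups and round (0.6 * 2000) = 1200 multi-term groups; the
    -- heads go through the SpeGen scoring below, the tails stay candidates.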

    -------------------------
    -- Filter 1 With Set NodeId and SpeGen
    selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)


  -- TODO: remove (and remove the HasNodeError instance)
  userListId   <- defaultList uCid
  masterListId <- defaultList mCid


  mapTextDocIds <- getNodesByNgramsOnlyUser uCid
                                            [userListId, masterListId]
                                            nt
                                            selectedTerms
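  -- mapTextDocIds: for each selected term, the set of documents (NodeIds) in
  -- which it occurs, restricted to the user and master lists.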

  let
    groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
    groupedTreeScores_SetNodeId = setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)

  -- Cooccurrences computation
  --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
  let mapCooc = HashMap.filter (>2)
              $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
                                 | (t1, s1) <- mapStemNodeIds
                                 , (t2, s2) <- mapStemNodeIds
                                 ]
        where
          mapStemNodeIds = HashMap.toList
                         $ HashMap.map viewScores
                         $ groupedTreeScores_SetNodeId
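  -- Each pair of selected terms is weighted by the number of documents they
  -- share; e.g. if t1 occurs in docs {1,2,3,4} and t2 in {2,3,4,5}, the pair
  -- gets weight 3 and survives the (> 2) filter.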
  let
    -- computing scores
    mapScores f = HashMap.fromList
                $ map (\g -> (view scored_terms g, f g))
                $ normalizeGlobal
                $ map normalizeLocal
                $ scored'
                $ Map.fromList -- TODO remove this
                $ HashMap.toList mapCooc
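  -- scored' turns the cooccurrence matrix into Scored values (genericity /
  -- inclusion vs. specificity / exclusion per term), normalized locally and
  -- then globally before being indexed by term.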

  let
    groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
    groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity)
                                                ( groupedMonoHead
                                                <> groupedMultHead
                                                )

  let
    -- sort / partition / split
    -- filter mono/multi again
    (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen

    -- filter with max score
    partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
                                                   > (view scored_speExc $ view gts'_score g)
                                              )

    (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
    (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
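    -- A group lands on the "inclusive" side when its genericity/inclusion
    -- score is strictly greater than its specificity/exclusion score,
    -- otherwise on the "exclusive" side.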

  -- splitAt
  let
    -- use a % of the list if it is too big, or an Int if it is too small
    listSizeLocal = 1000 :: Double
    inclSize = 0.4  :: Double
    exclSize = 1 - inclSize

    splitAt' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * listSizeLocal))
    sortOn  f   = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList


    monoInc_size = splitAt' $ monoSize * inclSize / 2
    (monoScoredInclHead, monoScoredInclTail) = monoInc_size $ (sortOn scored_genInc) monoScoredIncl
    (monoScoredExclHead, monoScoredExclTail) = monoInc_size $ (sortOn scored_speExc) monoScoredExcl

    multExc_size = splitAt' $ multSize * exclSize / 2
    (multScoredInclHead, multScoredInclTail) = multExc_size $ (sortOn scored_genInc) multScoredIncl
    (multScoredExclHead, multScoredExclTail) = multExc_size $ (sortOn scored_speExc) multScoredExcl
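    -- With listSizeLocal = 1000, each mono bucket keeps a head of
    -- round (0.4 * 0.4 / 2 * 1000) = 80 terms and each multi bucket a head of
    -- round (0.6 * 0.6 / 2 * 1000) = 180 terms, sorted by descending genInc
    -- (inclusive buckets) or speExc (exclusive buckets); heads become Map
    -- terms, tails become Candidates below.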

    ------------------------------------------------------------
    -- Final Step building the Typed list
    termListHead = maps <> cands
      where
        maps = setListType (Just MapTerm)
             $  monoScoredInclHead
             <> monoScoredExclHead
             <> multScoredInclHead
             <> multScoredExclHead

        cands = setListType (Just CandidateTerm)
              $  monoScoredInclTail
              <> monoScoredExclTail
              <> multScoredInclTail
              <> multScoredExclTail

    termListTail = (setListType (Just CandidateTerm)) (groupedMonoTail <> groupedMultTail)

  let result = Map.unionsWith (<>)
        [ Map.fromList [( nt, toNgramsElement termListHead
                           <> toNgramsElement termListTail
                           <> toNgramsElement stopTerms
                        )]
        ]

  -- printDebug "result" result

  pure result