]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[OPTIM] Flow List / Metrics TFICF with sample
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
19 import Data.HashMap.Strict (HashMap)
20 import Data.HashSet (HashSet)
21 import Data.Map (Map)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
24 import Data.Set (Set)
25 import Data.Tuple.Extra (both)
26 import Gargantext.API.Ngrams.Types (NgramsElement, NgramsTerm(..))
27 import Gargantext.Core.NodeStory
28 import Gargantext.Core.Text (size)
29 import Gargantext.Core.Text.List.Group
30 import Gargantext.Core.Text.List.Group.Prelude
31 import Gargantext.Core.Text.List.Group.WithStem
32 import Gargantext.Core.Text.List.Social
33 import Gargantext.Core.Text.List.Social.Prelude
34 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf_withSample)
39 import Gargantext.Database.Admin.Types.Node (NodeId)
40 import Gargantext.Database.Prelude (CmdM)
41 import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
42 import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
43 import Gargantext.Database.Query.Table.Node (defaultList)
44 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
45 import Gargantext.Database.Query.Tree.Error (HasTreeError)
46 import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
47 import Gargantext.Prelude
48 import qualified Data.HashMap.Strict as HashMap
49 import qualified Data.HashSet as HashSet
50 import qualified Data.List as List
51 import qualified Data.Map as Map
52 import qualified Data.Set as Set
53 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
54
55 {-
56 -- TODO maybe useful for later
57 isStopTerm :: StopSize -> Text -> Bool
58 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
59 where
60 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
61 -}
62
63
64 -- | TODO improve grouping functions of Authors, Sources, Institutes..
65 buildNgramsLists :: ( HasNodeStory env err m
66 , CmdM env err m
67 , HasTreeError err
68 , HasNodeError err
69 )
70 => GroupParams
71 -> User
72 -> UserCorpusId
73 -> MasterCorpusId
74 -> m (Map NgramsType [NgramsElement])
75 buildNgramsLists gp user uCid mCid = do
76 ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
77 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
78 [ (Authors , MapListSize 9)
79 , (Sources , MapListSize 9)
80 , (Institutes, MapListSize 9)
81 ]
82
83 pure $ Map.unions $ [ngTerms] <> othersTerms
84
85
86 data MapListSize = MapListSize { unMapListSize :: !Int }
87
88 buildNgramsOthersList ::( HasNodeError err
89 , CmdM env err m
90 , HasNodeStory env err m
91 , HasTreeError err
92 )
93 => User
94 -> UserCorpusId
95 -> GroupParams
96 -> (NgramsType, MapListSize)
97 -> m (Map NgramsType [NgramsElement])
98 buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do
99 allTerms :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt
100
101 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
102 socialLists :: FlowCont NgramsTerm FlowListScores
103 <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
104 $ HashMap.fromList
105 $ List.zip (HashMap.keys allTerms)
106 (List.cycle [mempty])
107 )
108 let
109 groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
110
111 (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
112 $ view flc_scores groupedWithList
113
114 (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms
115
116 listSize = mapListSize - (List.length mapTerms)
117 (mapTerms', candiTerms) = both HashMap.fromList
118 $ List.splitAt listSize
119 $ List.sortOn (Down . viewScore . snd)
120 $ HashMap.toList tailTerms'
121
122
123 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
124 <> (toNgramsElement mapTerms )
125 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
126 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
127 )]
128
129
130 getGroupParams :: ( HasNodeError err
131 , CmdM env err m
132 , HasNodeStory env err m
133 , HasTreeError err
134 )
135 => GroupParams -> HashSet Ngrams -> m GroupParams
136 getGroupParams gp@(GroupWithPosTag l a _m) ng = do
137 hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
138 -- printDebug "hashMap" hashMap
139 pure $ over gwl_map (\x -> x <> hashMap) gp
140 getGroupParams gp _ = pure gp
141
142
143 -- TODO use ListIds
144 buildNgramsTermsList :: ( HasNodeError err
145 , CmdM env err m
146 , HasNodeStory env err m
147 , HasTreeError err
148 )
149 => User
150 -> UserCorpusId
151 -> MasterCorpusId
152 -> GroupParams
153 -> (NgramsType, MapListSize)
154 -> m (Map NgramsType [NgramsElement])
155 buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
156
157 -- Filter 0 With Double
158 -- Computing global speGen score
159 printDebug "[buldNgramsTermsList: Sample List] / start" nt
160 allTerms :: HashMap NgramsTerm Double <- getTficf_withSample uCid mCid nt
161 printDebug "[buldNgramsTermsList: Sample List / end]" nt
162
163 printDebug "[buldNgramsTermsList: Flow Social List / start]" nt
164 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
165 socialLists :: FlowCont NgramsTerm FlowListScores
166 <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
167 $ HashMap.fromList
168 $ List.zip (HashMap.keys allTerms)
169 (List.cycle [mempty])
170 )
171 printDebug "[buldNgramsTermsList: Flow Social List / end]" nt
172
173 let ngramsKeys = HashMap.keysSet allTerms
174
175 groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
176
177 let
178 socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
179 --printDebug "socialLists_Stemmed" socialLists_Stemmed
180 groupedWithList = toGroupedTree socialLists_Stemmed allTerms
181 (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
182 $ HashMap.filter (\g -> (view gts'_score g) > 1)
183 $ view flc_scores groupedWithList
184
185 (groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
186
187 -- printDebug "stopTerms" stopTerms
188
189 -- splitting monterms and multiterms to take proportional candidates
190 -- use % of list if to big, or Int if too small
191 listSizeGlobal = 2000 :: Double
192 monoSize = 0.4 :: Double
193 multSize = 1 - monoSize
194
195 splitAt n' ns = both (HashMap.fromListWith (<>))
196 $ List.splitAt (round $ n' * listSizeGlobal)
197 $ List.sortOn (viewScore . snd)
198 $ HashMap.toList ns
199
200 (groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
201 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
202
203 -------------------------
204 -- Filter 1 With Set NodeId and SpeGen
205 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
206
207
208 -- TODO remove (and remove HasNodeError instance)
209 userListId <- defaultList uCid
210 masterListId <- defaultList mCid
211
212 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
213 [userListId, masterListId]
214 nt
215 selectedTerms
216
217 let
218 groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
219 groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
220 $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
221
222
223 printDebug "groupedTreeScores_SetNodeId" groupedTreeScores_SetNodeId
224
225 -- Coocurrences computation
226 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
227 let mapCooc = HashMap.filter (>1) -- removing cooc of 1
228 $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
229 | (t1, s1) <- mapStemNodeIds
230 , (t2, s2) <- mapStemNodeIds
231 ]
232 where
233 mapStemNodeIds = HashMap.toList
234 $ HashMap.map viewScores
235 $ groupedTreeScores_SetNodeId
236 let
237 -- computing scores
238 mapScores f = HashMap.fromList
239 $ map (\g -> (view scored_terms g, f g))
240 $ normalizeGlobal
241 $ map normalizeLocal
242 $ scored'
243 $ Map.fromList -- TODO remove this
244 $ HashMap.toList mapCooc
245
246 let
247 groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
248 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId
249
250 let
251 -- sort / partition / split
252 -- filter mono/multi again
253 (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen
254
255 -- filter with max score
256 partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
257 > (view scored_speExc $ view gts'_score g)
258 )
259
260 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
261 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
262
263 -- splitAt
264 let
265 -- use % of list if to big, or Int if to small
266 mapSize = 1000 :: Double
267 canSize = mapSize * 5 :: Double
268
269 inclSize = 0.4 :: Double
270 exclSize = 1 - inclSize
271
272 splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
273 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
274
275 monoInc_size n = splitAt' n $ monoSize * inclSize / 2
276 multExc_size n = splitAt' n $ multSize * exclSize / 2
277
278
279 (mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
280 (mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl
281
282 (mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
283 (mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl
284
285
286 (canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
287 (canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail
288
289 (canMulScoredInclHead, _) = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
290 (canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail
291
292 ------------------------------------------------------------
293 -- Final Step building the Typed list
294 -- Candidates Terms need to be filtered
295 let
296 maps = setListType (Just MapTerm)
297 $ mapMonoScoredInclHead
298 <> mapMonoScoredExclHead
299 <> mapMultScoredInclHead
300 <> mapMultScoredExclHead
301
302 -- An original way to filter to start with
303 cands = setListType (Just CandidateTerm)
304 $ canMonoScoredIncHead
305 <> canMonoScoredExclHead
306 <> canMulScoredInclHead
307 <> canMultScoredExclHead
308
309 -- TODO count it too
310 cands' = setListType (Just CandidateTerm)
311 {-\$ groupedMonoTail
312 <>-} groupedMultTail
313
314 -- Quick FIX
315 candNgramsElement = List.take 5000
316 $ toNgramsElement cands <> toNgramsElement cands'
317
318 result = Map.unionsWith (<>)
319 [ Map.fromList [( nt, toNgramsElement maps
320 <> toNgramsElement stopTerms
321 <> candNgramsElement
322 )]
323 ]
324
325 pure result