]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[VERSION] +1 to 0.0.5.5.1
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
19 import Data.HashMap.Strict (HashMap)
20 import Data.HashSet (HashSet)
21 import Data.Map (Map)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
24 import Data.Set (Set)
25 import Data.Tuple.Extra (both)
26 import Gargantext.API.Ngrams.Types (NgramsElement, NgramsTerm(..))
27 import Gargantext.Core.NodeStory
28 import Gargantext.Core.Text (size)
29 import Gargantext.Core.Text.List.Group
30 import Gargantext.Core.Text.List.Group.Prelude
31 import Gargantext.Core.Text.List.Group.WithStem
32 import Gargantext.Core.Text.List.Social
33 import Gargantext.Core.Text.List.Social.Prelude
34 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByContext (getContextsByNgramsUser, getContextsByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf_withSample)
39 import Gargantext.Database.Admin.Types.Node (NodeId)
40 import Gargantext.Database.Prelude (CmdM)
41 import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
42 import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
43 import Gargantext.Database.Query.Table.Node (defaultList)
44 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
45 import Gargantext.Database.Query.Tree.Error (HasTreeError)
46 import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
47 import Gargantext.Prelude
48 import qualified Data.HashMap.Strict as HashMap
49 import qualified Data.HashSet as HashSet
50 import qualified Data.List as List
51 import qualified Data.Map as Map
52 import qualified Data.Set as Set
53 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
54
55 {-
56 -- TODO maybe useful for later
57 isStopTerm :: StopSize -> Text -> Bool
58 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
59 where
60 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
61 -}
62
63
64 -- | TODO improve grouping functions of Authors, Sources, Institutes..
65 buildNgramsLists :: ( HasNodeStory env err m
66 , CmdM env err m
67 , HasTreeError err
68 , HasNodeError err
69 )
70 => User
71 -> UserCorpusId
72 -> MasterCorpusId
73 -> Maybe FlowSocialListWith
74 -> GroupParams
75 -> m (Map NgramsType [NgramsElement])
76 buildNgramsLists user uCid mCid mfslw gp = do
77 ngTerms <- buildNgramsTermsList user uCid mCid mfslw gp (NgramsTerms, MapListSize 350)
78 othersTerms <- mapM (buildNgramsOthersList user uCid mfslw GroupIdentity)
79 [ (Authors , MapListSize 9)
80 , (Sources , MapListSize 9)
81 , (Institutes, MapListSize 9)
82 ]
83
84 pure $ Map.unions $ [ngTerms] <> othersTerms
85
86
87 data MapListSize = MapListSize { unMapListSize :: !Int }
88
89 buildNgramsOthersList :: ( HasNodeError err
90 , CmdM env err m
91 , HasNodeStory env err m
92 , HasTreeError err
93 )
94 => User
95 -> UserCorpusId
96 -> Maybe FlowSocialListWith
97 -> GroupParams
98 -> (NgramsType, MapListSize)
99 -> m (Map NgramsType [NgramsElement])
100 buildNgramsOthersList user uCid mfslw _groupParams (nt, MapListSize mapListSize) = do
101 allTerms :: HashMap NgramsTerm (Set NodeId) <- getContextsByNgramsUser uCid nt
102
103 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
104 socialLists :: FlowCont NgramsTerm FlowListScores
105 <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
106 $ HashMap.fromList
107 $ List.zip (HashMap.keys allTerms)
108 (List.cycle [mempty])
109 )
110 let
111 groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
112
113 (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
114 $ view flc_scores groupedWithList
115
116 (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms
117
118 listSize = mapListSize - (List.length mapTerms)
119 (mapTerms', candiTerms) = both HashMap.fromList
120 $ List.splitAt listSize
121 $ List.sortOn (Down . viewScore . snd)
122 $ HashMap.toList tailTerms'
123
124
125 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
126 <> (toNgramsElement mapTerms )
127 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
128 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
129 )]
130
131
132 getGroupParams :: ( HasNodeError err
133 , CmdM env err m
134 , HasNodeStory env err m
135 , HasTreeError err
136 )
137 => GroupParams -> HashSet Ngrams -> m GroupParams
138 getGroupParams gp@(GroupWithPosTag l a _m) ng = do
139 hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
140 -- printDebug "hashMap" hashMap
141 pure $ over gwl_map (\x -> x <> hashMap) gp
142 getGroupParams gp _ = pure gp
143
144
145 -- TODO use ListIds
146 buildNgramsTermsList :: ( HasNodeError err
147 , CmdM env err m
148 , HasNodeStory env err m
149 , HasTreeError err
150 )
151 => User
152 -> UserCorpusId
153 -> MasterCorpusId
154 -> Maybe FlowSocialListWith
155 -> GroupParams
156 -> (NgramsType, MapListSize)
157 -> m (Map NgramsType [NgramsElement])
158 buildNgramsTermsList user uCid mCid mfslw groupParams (nt, _mapListSize)= do
159
160 -- Filter 0 With Double
161 -- Computing global speGen score
162 printDebug "[buildNgramsTermsList: Sample List] / start" nt
163 allTerms :: HashMap NgramsTerm Double <- getTficf_withSample uCid mCid nt
164 printDebug "[buildNgramsTermsList: Sample List / end]" (nt, HashMap.size allTerms)
165
166 printDebug "[buildNgramsTermsList: Flow Social List / start]" nt
167 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
168 socialLists :: FlowCont NgramsTerm FlowListScores
169 <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
170 $ HashMap.fromList
171 $ List.zip (HashMap.keys allTerms)
172 (List.cycle [mempty])
173 )
174 printDebug "[buildNgramsTermsList: Flow Social List / end]" nt
175
176 let ngramsKeys = HashMap.keysSet allTerms
177
178 groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
179
180 let
181 socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
182 --printDebug "socialLists_Stemmed" socialLists_Stemmed
183 groupedWithList = toGroupedTree socialLists_Stemmed allTerms
184 (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
185 $ HashMap.filter (\g -> (view gts'_score g) > 1)
186 $ view flc_scores groupedWithList
187
188 (groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
189
190 -- printDebug "stopTerms" stopTerms
191
192 -- splitting monterms and multiterms to take proportional candidates
193 -- use % of list if to big, or Int if too small
194 listSizeGlobal = 2000 :: Double
195 monoSize = 0.4 :: Double
196 multSize = 1 - monoSize
197
198 splitAt n' ns = both (HashMap.fromListWith (<>))
199 $ List.splitAt (round $ n' * listSizeGlobal)
200 $ List.sortOn (viewScore . snd)
201 $ HashMap.toList ns
202
203 (groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
204 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
205
206 -------------------------
207 -- Filter 1 With Set NodeId and SpeGen
208 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
209
210
211 -- TODO remove (and remove HasNodeError instance)
212 userListId <- defaultList uCid
213 masterListId <- defaultList mCid
214
215 mapTextDocIds <- getContextsByNgramsOnlyUser uCid
216 [userListId, masterListId]
217 nt
218 selectedTerms
219
220
221 -- printDebug "mapTextDocIds" mapTextDocIds
222
223 let
224 groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
225 groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
226 $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
227
228
229 --printDebug "groupedTreeScores_SetNodeId" groupedTreeScores_SetNodeId
230
231 -- Coocurrences computation
232 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
233 let mapCooc = HashMap.filter (>1) -- removing cooc of 1
234 $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
235 | (t1, s1) <- mapStemNodeIds
236 , (t2, s2) <- mapStemNodeIds
237 ]
238 where
239 mapStemNodeIds = HashMap.toList
240 $ HashMap.map viewScores
241 $ groupedTreeScores_SetNodeId
242 let
243 -- computing scores
244 mapScores f = HashMap.fromList
245 $ map (\g -> (view scored_terms g, f g))
246 $ normalizeGlobal
247 $ map normalizeLocal
248 $ scored'
249 $ Map.fromList -- TODO remove this
250 $ HashMap.toList mapCooc
251
252 let
253 groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
254 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId
255
256 let
257 -- sort / partition / split
258 -- filter mono/multi again
259 (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen
260
261 -- filter with max score
262 partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
263 > (view scored_speExc $ view gts'_score g)
264 )
265
266 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
267 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
268
269 -- splitAt
270 let
271 -- use % of list if to big, or Int if to small
272 mapSize = 1000 :: Double
273 canSize = mapSize * 5 :: Double
274
275 inclSize = 0.4 :: Double
276 exclSize = 1 - inclSize
277
278 splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
279 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
280
281 monoInc_size n = splitAt' n $ monoSize * inclSize / 2
282 multExc_size n = splitAt' n $ multSize * exclSize / 2
283
284
285 (mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
286 (mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl
287
288 (mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
289 (mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl
290
291
292 (canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
293 (canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail
294
295 (canMulScoredInclHead, _) = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
296 (canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail
297
298 ------------------------------------------------------------
299 -- Final Step building the Typed list
300 -- Candidates Terms need to be filtered
301 let
302 maps = setListType (Just MapTerm)
303 $ mapMonoScoredInclHead
304 <> mapMonoScoredExclHead
305 <> mapMultScoredInclHead
306 <> mapMultScoredExclHead
307
308 -- An original way to filter to start with
309 cands = setListType (Just CandidateTerm)
310 $ canMonoScoredIncHead
311 <> canMonoScoredExclHead
312 <> canMulScoredInclHead
313 <> canMultScoredExclHead
314
315 -- TODO count it too
316 cands' = setListType (Just CandidateTerm)
317 {-\$ groupedMonoTail
318 <>-} groupedMultTail
319
320 -- Quick FIX
321 candNgramsElement = List.take 5000
322 $ toNgramsElement cands <> toNgramsElement cands'
323
324 result = Map.unionsWith (<>)
325 [ Map.fromList [( nt, toNgramsElement maps
326 <> toNgramsElement stopTerms
327 <> candNgramsElement
328 )]
329 ]
330
331 pure result