{-|
Module      : Gargantext.Core.Text.List
Description : Tools to build lists
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}

{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}

module Gargantext.Core.Text.List
  where

import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
import Data.HashMap.Strict (HashMap)
import Data.HashSet (HashSet)
import Data.Map (Map)
import Data.Monoid (mempty)
import Data.Ord (Down(..))
import Data.Set (Set)
import Data.Tuple.Extra (both)
import Gargantext.API.Ngrams.Types (NgramsElement, RepoCmdM, NgramsTerm(..))
import Gargantext.Core.Text (size)
import Gargantext.Core.Text.List.Group
import Gargantext.Core.Text.List.Group.Prelude
import Gargantext.Core.Text.List.Group.WithStem
import Gargantext.Core.Text.List.Social
import Gargantext.Core.Text.List.Social.Prelude
import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
import Gargantext.Database.Action.Metrics.TFICF (getTficf)
import Gargantext.Database.Admin.Types.Node (NodeId)
import Gargantext.Database.Prelude (CmdM)
import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
import Gargantext.Database.Query.Table.Node (defaultList)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
import Gargantext.Database.Query.Tree.Error (HasTreeError)
import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
import Gargantext.Prelude
import qualified Data.HashMap.Strict as HashMap
import qualified Data.HashSet as HashSet
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap

{-
-- TODO maybe useful for later
isStopTerm :: StopSize -> Text -> Bool
isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
  where
    isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
-}


-- | TODO improve grouping functions of Authors, Sources, Institutes...
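--
-- Builds, for one corpus, the ngrams lists of every 'NgramsType': the
-- NgramsTerms list (a map list of up to 350 terms) plus the Authors, Sources
-- and Institutes lists (up to 9 map terms each), merged into a single 'Map'.
--
-- A minimal usage sketch (the corpus ids and the 'UserName' value are
-- illustrative assumptions, not taken from this module):
--
-- > lists <- buildNgramsLists GroupIdentity (UserName "alice") userCorpusId masterCorpusId
-- > -- lists :: Map NgramsType [NgramsElement]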
buildNgramsLists :: ( RepoCmdM env err m
                    , CmdM env err m
                    , HasTreeError err
                    , HasNodeError err
                    )
                 => GroupParams
                 -> User
                 -> UserCorpusId
                 -> MasterCorpusId
                 -> m (Map NgramsType [NgramsElement])
buildNgramsLists gp user uCid mCid = do
  ngTerms     <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
  othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
                      [ (Authors   , MapListSize 9)
                      , (Sources   , MapListSize 9)
                      , (Institutes, MapListSize 9)
                      ]

  pure $ Map.unions $ [ngTerms] <> othersTerms


data MapListSize = MapListSize { unMapListSize :: !Int }

buildNgramsOthersList :: ( HasNodeError err
                         , CmdM env err m
                         , RepoCmdM env err m
                         , HasTreeError err
                         )
                      => User
                      -> UserCorpusId
                      -> GroupParams
                      -> (NgramsType, MapListSize)
                      -> m (Map NgramsType [NgramsElement])
buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do
  allTerms :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt

  -- PrivateFirst as a first development step, since the Public NodeMode is not implemented yet
  socialLists :: FlowCont NgramsTerm FlowListScores
    <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
                                          $ HashMap.fromList
                                          $ List.zip (HashMap.keys allTerms)
                                                     (List.cycle [mempty])
                                          )
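  -- The FlowCont is seeded with every extracted term paired with 'mempty'
  -- scores; flowSocialList then fills in the FlowListScores found in the
  -- already existing lists, the user's own lists first (MySelfFirst).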
  {-
  if nt == Sources -- Authors
     then printDebug "flowSocialList" socialLists
     else printDebug "flowSocialList" ""
  -}
  let
    groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
    {-
    if nt == Sources -- Authors
       then printDebug "groupedWithList" groupedWithList
       else printDebug "groupedWithList" ""
    -}

  let
    (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
                           $ view flc_scores groupedWithList

    (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms

    listSize = mapListSize - List.length mapTerms
    (mapTerms', candiTerms) = both HashMap.fromList
                            $ List.splitAt listSize
                            $ List.sortOn (Down . viewScore . snd)
                            $ HashMap.toList tailTerms'
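    -- Terms already flagged as Map by the social lists keep that status; the
    -- remaining quota (mapListSize minus their number) is filled with the
    -- highest-scored tail terms, and whatever does not fit becomes a Candidate.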

  pure $ Map.fromList [( nt, toNgramsElement stopTerms
                          <> toNgramsElement mapTerms
                          <> toNgramsElement (setListType (Just MapTerm      ) mapTerms' )
                          <> toNgramsElement (setListType (Just CandidateTerm) candiTerms)
                       )]


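-- | For 'GroupWithPosTag' parameters, enrich the grouping map with the lemmas
-- already known in the database for the given ngrams (via 'selectLems');
-- any other 'GroupParams' value is returned unchanged.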
getGroupParams :: ( HasNodeError err
                  , CmdM env err m
                  , RepoCmdM env err m
                  , HasTreeError err
                  )
               => GroupParams -> HashSet Ngrams -> m GroupParams
getGroupParams gp@(GroupWithPosTag l a _m) ng = do
  hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
  -- printDebug "hashMap" hashMap
  pure $ over gwl_map (<> hashMap) gp
getGroupParams gp _ = pure gp


-- TODO use ListIds
buildNgramsTermsList :: ( HasNodeError err
                        , CmdM env err m
                        , RepoCmdM env err m
                        , HasTreeError err
                        )
                     => User
                     -> UserCorpusId
                     -> MasterCorpusId
                     -> GroupParams
                     -> (NgramsType, MapListSize)
                     -> m (Map NgramsType [NgramsElement])
buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize) = do

  -- Filter 0: with Double scores
  -- Computing the global speGen score
  allTerms :: HashMap NgramsTerm Double <- getTficf uCid mCid nt

  -- PrivateFirst as a first development step, since the Public NodeMode is not implemented yet
  socialLists :: FlowCont NgramsTerm FlowListScores
    <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
                                          $ HashMap.fromList
                                          $ List.zip (HashMap.keys allTerms)
                                                     (List.cycle [mempty])
                                          )
  let ngramsKeys = HashMap.keysSet allTerms

  groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
  let socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
  -- printDebug "socialLists_Stemmed" socialLists_Stemmed
  let groupedWithList = toGroupedTree socialLists_Stemmed allTerms
      (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
                                  $ view flc_scores groupedWithList

      (groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms

  -- printDebug "stopTerms" stopTerms

  -- Split monoterms and multiterms, to take a proportional number of candidates from each
  let
    -- use a % of the list if it is too big, an absolute Int if it is too small
    listSizeGlobal = 2000 :: Double
    monoSize = 0.4 :: Double
    multSize = 1 - monoSize

    splitAt n' ns = both (HashMap.fromListWith (<>))
                  $ List.splitAt (round $ n' * listSizeGlobal)
                  $ List.sortOn (viewScore . snd)
                  $ HashMap.toList ns

    (groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
    (groupedMultHead, groupedMultTail)  = splitAt multSize groupedMult
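    -- With listSizeGlobal = 2000, the monoterm head keeps round (0.4 * 2000) = 800
    -- entries and the multiterm head round (0.6 * 2000) = 1200 entries of the
    -- score-sorted candidates; only these heads go through the SpeGen filtering
    -- below, while the multiterm tail is kept as candidate terms at the end.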

    -------------------------
    -- Filter 1: with Set NodeId and SpeGen
    selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)

  -- TODO remove (and remove the HasNodeError instance)
  userListId   <- defaultList uCid
  masterListId <- defaultList mCid

  mapTextDocIds <- getNodesByNgramsOnlyUser uCid
                                            [userListId, masterListId]
                                            nt
                                            selectedTerms

  let
    groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
    groupedTreeScores_SetNodeId = setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)

  -- Cooccurrences computation
  -- TODO: add the guard t1 >= t2 to keep only half of the matrix, since it is symmetric
  let mapCooc = HashMap.filter (>2)
              $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
                                 | (t1, s1) <- mapStemNodeIds
                                 , (t2, s2) <- mapStemNodeIds
                                 ]
        where
          mapStemNodeIds = HashMap.toList
                         $ HashMap.map viewScores
                         $ groupedTreeScores_SetNodeId
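  -- For every pair of selected terms, the cooccurrence score is the number of
  -- documents (NodeIds) they share; pairs seen together in 2 documents or fewer
  -- are dropped. For instance, terms appearing in the doc sets {1,2,3,4} and
  -- {2,3,4,5} get a score of 3:
  --
  -- > Set.size (Set.intersection (Set.fromList [1,2,3,4]) (Set.fromList [2,3,4,5])) == 3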
  let
    -- computing scores
    mapScores f = HashMap.fromList
                $ map (\g -> (view scored_terms g, f g))
                $ normalizeGlobal
                $ map normalizeLocal
                $ scored'
                $ Map.fromList -- TODO remove this
                $ HashMap.toList mapCooc
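  -- mapScores feeds the cooccurrence matrix to scored', which yields for each
  -- term a 'Scored' value carrying a genericity/inclusion measure (genInc) and
  -- a specificity/exclusion measure (speExc); the scores are normalized locally
  -- and then globally before being keyed by term in a HashMap.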

  let
    groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
    groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity)
                                                ( groupedMonoHead
                                               <> groupedMultHead
                                                )

  let
    -- sort / partition / split
    -- filter mono/multi again
    (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen

    -- partition by which score dominates
    partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
                                                   > (view scored_speExc $ view gts'_score g)
                                              )

    (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
    (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
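    -- Terms whose genericity score (genInc) dominates go to the *Incl buckets,
    -- terms whose specificity score (speExc) dominates go to the *Excl buckets;
    -- mono- and multiterms are handled separately.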

  -- splitAt
  let
    -- use a % of the list if it is too big, an absolute Int if it is too small
    listSizeLocal = 1000 :: Double
    inclSize = 0.4 :: Double
    exclSize = 1 - inclSize

    splitAt' n' = both HashMap.fromList . List.splitAt (round $ n' * listSizeLocal)
    sortOn f    = List.sortOn (Down . view (gts'_score . f) . snd) . HashMap.toList

    monoInc_size = splitAt' $ monoSize * inclSize / 2
    (monoScoredInclHead, _monoScoredInclTail) = monoInc_size $ sortOn scored_genInc monoScoredIncl
    (monoScoredExclHead, _monoScoredExclTail) = monoInc_size $ sortOn scored_speExc monoScoredExcl

    multExc_size = splitAt' $ multSize * exclSize / 2
    (multScoredInclHead, multScoredInclTail) = multExc_size $ sortOn scored_genInc multScoredIncl
    (multScoredExclHead, multScoredExclTail) = multExc_size $ sortOn scored_speExc multScoredExcl
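    -- With listSizeLocal = 1000: monoSize * inclSize / 2 = 0.4 * 0.4 / 2 = 0.08,
    -- so round (0.08 * 1000) = 80 monoterms are kept per bucket; and
    -- multSize * exclSize / 2 = 0.6 * 0.6 / 2 = 0.18, so round (0.18 * 1000) = 180
    -- multiterms are kept per bucket.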

  ------------------------------------------------------------
  -- Final step: building the typed list
  -- Candidate terms still need to be filtered
  let
    maps = setListType (Just MapTerm)
         $  monoScoredInclHead
         <> monoScoredExclHead
         <> multScoredInclHead
         <> multScoredExclHead

    -- A simple way to filter the candidates, as a starting point
    cands = setListType (Just CandidateTerm)
          $ {- monoScoredInclTail
         <> monoScoredExclTail
         <> -} multScoredInclTail
         <> multScoredExclTail

    cands' = setListType (Just CandidateTerm)
               {-$ groupedMonoTail
               <>-} groupedMultTail

    result = Map.unionsWith (<>)
           [ Map.fromList [( nt, toNgramsElement maps
                              <> toNgramsElement cands
                              <> toNgramsElement cands'
                              <> toNgramsElement stopTerms
                           )]
           ]

  -- printDebug "result" result

  pure result