]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[FIX] candidate size
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
19 import Data.HashMap.Strict (HashMap)
20 import Data.HashSet (HashSet)
21 import Data.Map (Map)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
24 import Data.Set (Set)
25 import Data.Tuple.Extra (both)
26 import Gargantext.API.Ngrams.Types (NgramsElement, RepoCmdM, NgramsTerm(..))
27 import Gargantext.Core.Text (size)
28 import Gargantext.Core.Text.List.Group
29 import Gargantext.Core.Text.List.Group.Prelude
30 import Gargantext.Core.Text.List.Group.WithStem
31 import Gargantext.Core.Text.List.Social
32 import Gargantext.Core.Text.List.Social.Prelude
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
34 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
35 import Gargantext.Core.Types.Individu (User(..))
36 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
37 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
38 import Gargantext.Database.Admin.Types.Node (NodeId)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
41 import Gargantext.Database.Query.Table.Node (defaultList)
42 import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
43 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
44 import Gargantext.Database.Query.Tree.Error (HasTreeError)
45 import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
46 import Gargantext.Prelude
47 import qualified Data.HashMap.Strict as HashMap
48 import qualified Data.List as List
49 import qualified Data.Map as Map
50 import qualified Data.Set as Set
51 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
52 import qualified Data.HashSet as HashSet
53
54 {-
55 -- TODO maybe useful for later
56 isStopTerm :: StopSize -> Text -> Bool
57 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
58 where
59 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
60 -}
61
62
63 -- | TODO improve grouping functions of Authors, Sources, Institutes..
64 buildNgramsLists :: ( RepoCmdM env err m
65 , CmdM env err m
66 , HasTreeError err
67 , HasNodeError err
68 )
69 => GroupParams
70 -> User
71 -> UserCorpusId
72 -> MasterCorpusId
73 -> m (Map NgramsType [NgramsElement])
74 buildNgramsLists gp user uCid mCid = do
75 ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
76 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
77 [ (Authors , MapListSize 9)
78 , (Sources , MapListSize 9)
79 , (Institutes, MapListSize 9)
80 ]
81
82 pure $ Map.unions $ [ngTerms] <> othersTerms
83
84
85 data MapListSize = MapListSize { unMapListSize :: !Int }
86
87 buildNgramsOthersList ::( HasNodeError err
88 , CmdM env err m
89 , RepoCmdM env err m
90 , HasTreeError err
91 )
92 => User
93 -> UserCorpusId
94 -> GroupParams
95 -> (NgramsType, MapListSize)
96 -> m (Map NgramsType [NgramsElement])
97 buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do
98 allTerms :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt
99
100 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
101 socialLists :: FlowCont NgramsTerm FlowListScores
102 <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
103 $ HashMap.fromList
104 $ List.zip (HashMap.keys allTerms)
105 (List.cycle [mempty])
106 )
107 let
108 groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
109
110 (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
111 $ view flc_scores groupedWithList
112
113 (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms
114
115 listSize = mapListSize - (List.length mapTerms)
116 (mapTerms', candiTerms) = both HashMap.fromList
117 $ List.splitAt listSize
118 $ List.sortOn (Down . viewScore . snd)
119 $ HashMap.toList tailTerms'
120
121 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
122 <> (toNgramsElement mapTerms )
123 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
124 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
125 )]
126
127
128 getGroupParams :: ( HasNodeError err
129 , CmdM env err m
130 , RepoCmdM env err m
131 , HasTreeError err
132 )
133 => GroupParams -> HashSet Ngrams -> m GroupParams
134 getGroupParams gp@(GroupWithPosTag l a _m) ng = do
135 hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
136 -- printDebug "hashMap" hashMap
137 pure $ over gwl_map (\x -> x <> hashMap) gp
138 getGroupParams gp _ = pure gp
139
140
141 -- TODO use ListIds
142 buildNgramsTermsList :: ( HasNodeError err
143 , CmdM env err m
144 , RepoCmdM env err m
145 , HasTreeError err
146 )
147 => User
148 -> UserCorpusId
149 -> MasterCorpusId
150 -> GroupParams
151 -> (NgramsType, MapListSize)
152 -> m (Map NgramsType [NgramsElement])
153 buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
154
155 -- Filter 0 With Double
156 -- Computing global speGen score
157 allTerms :: HashMap NgramsTerm Double <- getTficf uCid mCid nt
158
159 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
160 socialLists :: FlowCont NgramsTerm FlowListScores
161 <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
162 $ HashMap.fromList
163 $ List.zip (HashMap.keys allTerms)
164 (List.cycle [mempty])
165 )
166 let ngramsKeys = HashMap.keysSet allTerms
167
168 groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
169
170 let
171 socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
172 --printDebug "socialLists_Stemmed" socialLists_Stemmed
173 groupedWithList = toGroupedTree socialLists_Stemmed allTerms
174 (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
175 $ HashMap.filter (\g -> (view gts'_score g) > 1)
176 $ view flc_scores groupedWithList
177
178 (groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
179
180 -- printDebug "stopTerms" stopTerms
181
182 -- splitting monterms and multiterms to take proportional candidates
183 -- use % of list if to big, or Int if too small
184 listSizeGlobal = 2000 :: Double
185 monoSize = 0.4 :: Double
186 multSize = 1 - monoSize
187
188 splitAt n' ns = both (HashMap.fromListWith (<>))
189 $ List.splitAt (round $ n' * listSizeGlobal)
190 $ List.sortOn (viewScore . snd)
191 $ HashMap.toList ns
192
193 (groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
194 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
195
196 -------------------------
197 -- Filter 1 With Set NodeId and SpeGen
198 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
199
200
201 -- TODO remove (and remove HasNodeError instance)
202 userListId <- defaultList uCid
203 masterListId <- defaultList mCid
204
205 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
206 [userListId, masterListId]
207 nt
208 selectedTerms
209
210 let
211 groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
212 groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
213 $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
214
215
216 printDebug "groupedTreeScores_SetNodeId" groupedTreeScores_SetNodeId
217
218 -- Coocurrences computation
219 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
220 let mapCooc = HashMap.filter (>1) -- removing cooc of 1
221 $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
222 | (t1, s1) <- mapStemNodeIds
223 , (t2, s2) <- mapStemNodeIds
224 ]
225 where
226 mapStemNodeIds = HashMap.toList
227 $ HashMap.map viewScores
228 $ groupedTreeScores_SetNodeId
229 let
230 -- computing scores
231 mapScores f = HashMap.fromList
232 $ map (\g -> (view scored_terms g, f g))
233 $ normalizeGlobal
234 $ map normalizeLocal
235 $ scored'
236 $ Map.fromList -- TODO remove this
237 $ HashMap.toList mapCooc
238
239 let
240 groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
241 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId
242
243 let
244 -- sort / partition / split
245 -- filter mono/multi again
246 (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen
247
248 -- filter with max score
249 partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
250 > (view scored_speExc $ view gts'_score g)
251 )
252
253 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
254 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
255
256 -- splitAt
257 let
258 -- use % of list if to big, or Int if to small
259 mapSize = 1000 :: Double
260 canSize = mapSize * 5 :: Double
261
262 inclSize = 0.4 :: Double
263 exclSize = 1 - inclSize
264
265 splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
266 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
267
268 monoInc_size n = splitAt' n $ monoSize * inclSize / 2
269 multExc_size n = splitAt' n $ multSize * exclSize / 2
270
271
272 (mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
273 (mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl
274
275 (mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
276 (mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl
277
278
279 (canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
280 (canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail
281
282 (canMulScoredInclHead, _) = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
283 (canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail
284
285 ------------------------------------------------------------
286 -- Final Step building the Typed list
287 -- Candidates Terms need to be filtered
288 let
289 maps = setListType (Just MapTerm)
290 $ mapMonoScoredInclHead
291 <> mapMonoScoredExclHead
292 <> mapMultScoredInclHead
293 <> mapMultScoredExclHead
294
295 -- An original way to filter to start with
296 cands = setListType (Just CandidateTerm)
297 $ canMonoScoredIncHead
298 <> canMonoScoredExclHead
299 <> canMulScoredInclHead
300 <> canMultScoredExclHead
301
302 -- TODO count it too
303 cands' = setListType (Just CandidateTerm)
304 {-\$ groupedMonoTail
305 <>-} groupedMultTail
306
307 result = Map.unionsWith (<>)
308 [ Map.fromList [( nt, toNgramsElement maps
309 <> toNgramsElement cands
310 <> toNgramsElement cands'
311 <> toNgramsElement stopTerms
312 )]
313 ]
314
315 pure result