]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
Merge branch 'dev' into 97-dev-istex-search
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
19 import Data.HashMap.Strict (HashMap)
20 import Data.HashSet (HashSet)
21 import Data.Map (Map)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
24 import Data.Set (Set)
25 import Data.Tuple.Extra (both)
26 import Gargantext.API.Ngrams.Types (NgramsElement, NgramsTerm(..))
27 import Gargantext.Core.NodeStory
28 import Gargantext.Core.Text (size)
29 import Gargantext.Core.Text.List.Group
30 import Gargantext.Core.Text.List.Group.Prelude
31 import Gargantext.Core.Text.List.Group.WithStem
32 import Gargantext.Core.Text.List.Social
33 import Gargantext.Core.Text.List.Social.Prelude
34 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf_withSample)
39 import Gargantext.Database.Admin.Types.Node (NodeId)
40 import Gargantext.Database.Prelude (CmdM)
41 import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
42 import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
43 import Gargantext.Database.Query.Table.Node (defaultList)
44 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
45 import Gargantext.Database.Query.Tree.Error (HasTreeError)
46 import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
47 import Gargantext.Prelude
48 import qualified Data.HashMap.Strict as HashMap
49 import qualified Data.HashSet as HashSet
50 import qualified Data.List as List
51 import qualified Data.Map as Map
52 import qualified Data.Set as Set
53 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
54
55 {-
56 -- TODO maybe useful for later
57 isStopTerm :: StopSize -> Text -> Bool
58 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
59 where
60 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
61 -}
62
63
64 -- | TODO improve grouping functions of Authors, Sources, Institutes..
65 buildNgramsLists :: ( HasNodeStory env err m
66 , CmdM env err m
67 , HasTreeError err
68 , HasNodeError err
69 )
70 => User
71 -> UserCorpusId
72 -> MasterCorpusId
73 -> Maybe FlowSocialListWith
74 -> GroupParams
75 -> m (Map NgramsType [NgramsElement])
76 buildNgramsLists user uCid mCid mfslw gp = do
77 ngTerms <- buildNgramsTermsList user uCid mCid mfslw gp (NgramsTerms, MapListSize 350)
78 othersTerms <- mapM (buildNgramsOthersList user uCid mfslw GroupIdentity)
79 [ (Authors , MapListSize 9)
80 , (Sources , MapListSize 9)
81 , (Institutes, MapListSize 9)
82 ]
83
84 pure $ Map.unions $ [ngTerms] <> othersTerms
85
86
87 data MapListSize = MapListSize { unMapListSize :: !Int }
88
89 buildNgramsOthersList :: ( HasNodeError err
90 , CmdM env err m
91 , HasNodeStory env err m
92 , HasTreeError err
93 )
94 => User
95 -> UserCorpusId
96 -> Maybe FlowSocialListWith
97 -> GroupParams
98 -> (NgramsType, MapListSize)
99 -> m (Map NgramsType [NgramsElement])
100 buildNgramsOthersList user uCid mfslw _groupParams (nt, MapListSize mapListSize) = do
101 allTerms :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt
102
103 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
104 socialLists :: FlowCont NgramsTerm FlowListScores
105 <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
106 $ HashMap.fromList
107 $ List.zip (HashMap.keys allTerms)
108 (List.cycle [mempty])
109 )
110 let
111 groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
112
113 (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
114 $ view flc_scores groupedWithList
115
116 (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms
117
118 listSize = mapListSize - (List.length mapTerms)
119 (mapTerms', candiTerms) = both HashMap.fromList
120 $ List.splitAt listSize
121 $ List.sortOn (Down . viewScore . snd)
122 $ HashMap.toList tailTerms'
123
124
125 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
126 <> (toNgramsElement mapTerms )
127 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
128 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
129 )]
130
131
132 getGroupParams :: ( HasNodeError err
133 , CmdM env err m
134 , HasNodeStory env err m
135 , HasTreeError err
136 )
137 => GroupParams -> HashSet Ngrams -> m GroupParams
138 getGroupParams gp@(GroupWithPosTag l a _m) ng = do
139 hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
140 -- printDebug "hashMap" hashMap
141 pure $ over gwl_map (\x -> x <> hashMap) gp
142 getGroupParams gp _ = pure gp
143
144
145 -- TODO use ListIds
146 buildNgramsTermsList :: ( HasNodeError err
147 , CmdM env err m
148 , HasNodeStory env err m
149 , HasTreeError err
150 )
151 => User
152 -> UserCorpusId
153 -> MasterCorpusId
154 -> Maybe FlowSocialListWith
155 -> GroupParams
156 -> (NgramsType, MapListSize)
157 -> m (Map NgramsType [NgramsElement])
158 buildNgramsTermsList user uCid mCid mfslw groupParams (nt, _mapListSize)= do
159
160 -- Filter 0 With Double
161 -- Computing global speGen score
162 printDebug "[buldNgramsTermsList: Sample List] / start" nt
163 allTerms :: HashMap NgramsTerm Double <- getTficf_withSample uCid mCid nt
164 printDebug "[buldNgramsTermsList: Sample List / end]" nt
165
166 printDebug "[buldNgramsTermsList: Flow Social List / start]" nt
167 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
168 socialLists :: FlowCont NgramsTerm FlowListScores
169 <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
170 $ HashMap.fromList
171 $ List.zip (HashMap.keys allTerms)
172 (List.cycle [mempty])
173 )
174 printDebug "[buldNgramsTermsList: Flow Social List / end]" nt
175
176 let ngramsKeys = HashMap.keysSet allTerms
177
178 groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
179
180 let
181 socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
182 --printDebug "socialLists_Stemmed" socialLists_Stemmed
183 groupedWithList = toGroupedTree socialLists_Stemmed allTerms
184 (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
185 $ HashMap.filter (\g -> (view gts'_score g) > 1)
186 $ view flc_scores groupedWithList
187
188 (groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
189
190 -- printDebug "stopTerms" stopTerms
191
192 -- splitting monterms and multiterms to take proportional candidates
193 -- use % of list if to big, or Int if too small
194 listSizeGlobal = 2000 :: Double
195 monoSize = 0.4 :: Double
196 multSize = 1 - monoSize
197
198 splitAt n' ns = both (HashMap.fromListWith (<>))
199 $ List.splitAt (round $ n' * listSizeGlobal)
200 $ List.sortOn (viewScore . snd)
201 $ HashMap.toList ns
202
203 (groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
204 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
205
206 -------------------------
207 -- Filter 1 With Set NodeId and SpeGen
208 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
209
210
211 -- TODO remove (and remove HasNodeError instance)
212 userListId <- defaultList uCid
213 masterListId <- defaultList mCid
214
215 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
216 [userListId, masterListId]
217 nt
218 selectedTerms
219
220 let
221 groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
222 groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
223 $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
224
225
226 --printDebug "groupedTreeScores_SetNodeId" groupedTreeScores_SetNodeId
227
228 -- Coocurrences computation
229 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
230 let mapCooc = HashMap.filter (>1) -- removing cooc of 1
231 $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
232 | (t1, s1) <- mapStemNodeIds
233 , (t2, s2) <- mapStemNodeIds
234 ]
235 where
236 mapStemNodeIds = HashMap.toList
237 $ HashMap.map viewScores
238 $ groupedTreeScores_SetNodeId
239 let
240 -- computing scores
241 mapScores f = HashMap.fromList
242 $ map (\g -> (view scored_terms g, f g))
243 $ normalizeGlobal
244 $ map normalizeLocal
245 $ scored'
246 $ Map.fromList -- TODO remove this
247 $ HashMap.toList mapCooc
248
249 let
250 groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
251 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId
252
253 let
254 -- sort / partition / split
255 -- filter mono/multi again
256 (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen
257
258 -- filter with max score
259 partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
260 > (view scored_speExc $ view gts'_score g)
261 )
262
263 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
264 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
265
266 -- splitAt
267 let
268 -- use % of list if to big, or Int if to small
269 mapSize = 1000 :: Double
270 canSize = mapSize * 5 :: Double
271
272 inclSize = 0.4 :: Double
273 exclSize = 1 - inclSize
274
275 splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
276 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
277
278 monoInc_size n = splitAt' n $ monoSize * inclSize / 2
279 multExc_size n = splitAt' n $ multSize * exclSize / 2
280
281
282 (mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
283 (mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl
284
285 (mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
286 (mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl
287
288
289 (canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
290 (canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail
291
292 (canMulScoredInclHead, _) = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
293 (canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail
294
295 ------------------------------------------------------------
296 -- Final Step building the Typed list
297 -- Candidates Terms need to be filtered
298 let
299 maps = setListType (Just MapTerm)
300 $ mapMonoScoredInclHead
301 <> mapMonoScoredExclHead
302 <> mapMultScoredInclHead
303 <> mapMultScoredExclHead
304
305 -- An original way to filter to start with
306 cands = setListType (Just CandidateTerm)
307 $ canMonoScoredIncHead
308 <> canMonoScoredExclHead
309 <> canMulScoredInclHead
310 <> canMultScoredExclHead
311
312 -- TODO count it too
313 cands' = setListType (Just CandidateTerm)
314 {-\$ groupedMonoTail
315 <>-} groupedMultTail
316
317 -- Quick FIX
318 candNgramsElement = List.take 5000
319 $ toNgramsElement cands <> toNgramsElement cands'
320
321 result = Map.unionsWith (<>)
322 [ Map.fromList [( nt, toNgramsElement maps
323 <> toNgramsElement stopTerms
324 <> candNgramsElement
325 )]
326 ]
327
328 pure result