]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
Merge branch '68-dev-garg-v3-csv-parser' of ssh://gitlab.iscpif.fr:20022/gargantext...
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
19 import Data.HashMap.Strict (HashMap)
20 import Data.HashSet (HashSet)
21 import Data.Map (Map)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
24 import Data.Set (Set)
25 import Data.Tuple.Extra (both)
26 import Gargantext.API.Ngrams.Types (NgramsElement, RepoCmdM, NgramsTerm(..))
27 import Gargantext.Core.Text (size)
28 import Gargantext.Core.Text.List.Group
29 import Gargantext.Core.Text.List.Group.Prelude
30 import Gargantext.Core.Text.List.Group.WithStem
31 import Gargantext.Core.Text.List.Social
32 import Gargantext.Core.Text.List.Social.Prelude
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
34 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
35 import Gargantext.Core.Types.Individu (User(..))
36 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
37 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
38 import Gargantext.Database.Admin.Types.Node (NodeId)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
41 import Gargantext.Database.Query.Table.Node (defaultList)
42 import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
43 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
44 import Gargantext.Database.Query.Tree.Error (HasTreeError)
45 import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
46 import Gargantext.Prelude
47 import qualified Data.HashMap.Strict as HashMap
48 import qualified Data.List as List
49 import qualified Data.Map as Map
50 import qualified Data.Set as Set
51 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
52 import qualified Data.HashSet as HashSet
53
54 {-
55 -- TODO maybe useful for later
56 isStopTerm :: StopSize -> Text -> Bool
57 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
58 where
59 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
60 -}
61
62
63 -- | TODO improve grouping functions of Authors, Sources, Institutes..
64 buildNgramsLists :: ( RepoCmdM env err m
65 , CmdM env err m
66 , HasTreeError err
67 , HasNodeError err
68 )
69 => GroupParams
70 -> User
71 -> UserCorpusId
72 -> MasterCorpusId
73 -> m (Map NgramsType [NgramsElement])
74 buildNgramsLists gp user uCid mCid = do
75 ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
76 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
77 [ (Authors , MapListSize 9)
78 , (Sources , MapListSize 9)
79 , (Institutes, MapListSize 9)
80 ]
81
82 pure $ Map.unions $ [ngTerms] <> othersTerms
83
84
85 data MapListSize = MapListSize { unMapListSize :: !Int }
86
87 buildNgramsOthersList ::( HasNodeError err
88 , CmdM env err m
89 , RepoCmdM env err m
90 , HasTreeError err
91 )
92 => User
93 -> UserCorpusId
94 -> GroupParams
95 -> (NgramsType, MapListSize)
96 -> m (Map NgramsType [NgramsElement])
97 buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do
98 allTerms :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt
99
100 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
101 socialLists :: FlowCont NgramsTerm FlowListScores
102 <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
103 $ HashMap.fromList
104 $ List.zip (HashMap.keys allTerms)
105 (List.cycle [mempty])
106 )
107 let
108 groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
109
110 (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
111 $ view flc_scores groupedWithList
112
113 (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms
114
115 listSize = mapListSize - (List.length mapTerms)
116 (mapTerms', candiTerms) = both HashMap.fromList
117 $ List.splitAt listSize
118 $ List.sortOn (Down . viewScore . snd)
119 $ HashMap.toList tailTerms'
120
121
122 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
123 <> (toNgramsElement mapTerms )
124 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
125 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
126 )]
127
128
129 getGroupParams :: ( HasNodeError err
130 , CmdM env err m
131 , RepoCmdM env err m
132 , HasTreeError err
133 )
134 => GroupParams -> HashSet Ngrams -> m GroupParams
135 getGroupParams gp@(GroupWithPosTag l a _m) ng = do
136 hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
137 -- printDebug "hashMap" hashMap
138 pure $ over gwl_map (\x -> x <> hashMap) gp
139 getGroupParams gp _ = pure gp
140
141
142 -- TODO use ListIds
143 buildNgramsTermsList :: ( HasNodeError err
144 , CmdM env err m
145 , RepoCmdM env err m
146 , HasTreeError err
147 )
148 => User
149 -> UserCorpusId
150 -> MasterCorpusId
151 -> GroupParams
152 -> (NgramsType, MapListSize)
153 -> m (Map NgramsType [NgramsElement])
154 buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
155
156 -- Filter 0 With Double
157 -- Computing global speGen score
158 allTerms :: HashMap NgramsTerm Double <- getTficf uCid mCid nt
159
160 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
161 socialLists :: FlowCont NgramsTerm FlowListScores
162 <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
163 $ HashMap.fromList
164 $ List.zip (HashMap.keys allTerms)
165 (List.cycle [mempty])
166 )
167 let ngramsKeys = HashMap.keysSet allTerms
168
169 groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
170
171 let
172 socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
173 --printDebug "socialLists_Stemmed" socialLists_Stemmed
174 groupedWithList = toGroupedTree socialLists_Stemmed allTerms
175 (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
176 $ HashMap.filter (\g -> (view gts'_score g) > 1)
177 $ view flc_scores groupedWithList
178
179 (groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
180
181 -- printDebug "stopTerms" stopTerms
182
183 -- splitting monterms and multiterms to take proportional candidates
184 -- use % of list if to big, or Int if too small
185 listSizeGlobal = 2000 :: Double
186 monoSize = 0.4 :: Double
187 multSize = 1 - monoSize
188
189 splitAt n' ns = both (HashMap.fromListWith (<>))
190 $ List.splitAt (round $ n' * listSizeGlobal)
191 $ List.sortOn (viewScore . snd)
192 $ HashMap.toList ns
193
194 (groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
195 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
196
197 -------------------------
198 -- Filter 1 With Set NodeId and SpeGen
199 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
200
201
202 -- TODO remove (and remove HasNodeError instance)
203 userListId <- defaultList uCid
204 masterListId <- defaultList mCid
205
206 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
207 [userListId, masterListId]
208 nt
209 selectedTerms
210
211 let
212 groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
213 groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
214 $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
215
216
217 printDebug "groupedTreeScores_SetNodeId" groupedTreeScores_SetNodeId
218
219 -- Coocurrences computation
220 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
221 let mapCooc = HashMap.filter (>1) -- removing cooc of 1
222 $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
223 | (t1, s1) <- mapStemNodeIds
224 , (t2, s2) <- mapStemNodeIds
225 ]
226 where
227 mapStemNodeIds = HashMap.toList
228 $ HashMap.map viewScores
229 $ groupedTreeScores_SetNodeId
230 let
231 -- computing scores
232 mapScores f = HashMap.fromList
233 $ map (\g -> (view scored_terms g, f g))
234 $ normalizeGlobal
235 $ map normalizeLocal
236 $ scored'
237 $ Map.fromList -- TODO remove this
238 $ HashMap.toList mapCooc
239
240 let
241 groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
242 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId
243
244 let
245 -- sort / partition / split
246 -- filter mono/multi again
247 (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen
248
249 -- filter with max score
250 partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
251 > (view scored_speExc $ view gts'_score g)
252 )
253
254 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
255 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
256
257 -- splitAt
258 let
259 -- use % of list if to big, or Int if to small
260 mapSize = 1000 :: Double
261 canSize = mapSize * 5 :: Double
262
263 inclSize = 0.4 :: Double
264 exclSize = 1 - inclSize
265
266 splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
267 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
268
269 monoInc_size n = splitAt' n $ monoSize * inclSize / 2
270 multExc_size n = splitAt' n $ multSize * exclSize / 2
271
272
273 (mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
274 (mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl
275
276 (mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
277 (mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl
278
279
280 (canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
281 (canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail
282
283 (canMulScoredInclHead, _) = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
284 (canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail
285
286 ------------------------------------------------------------
287 -- Final Step building the Typed list
288 -- Candidates Terms need to be filtered
289 let
290 maps = setListType (Just MapTerm)
291 $ mapMonoScoredInclHead
292 <> mapMonoScoredExclHead
293 <> mapMultScoredInclHead
294 <> mapMultScoredExclHead
295
296 -- An original way to filter to start with
297 cands = setListType (Just CandidateTerm)
298 $ canMonoScoredIncHead
299 <> canMonoScoredExclHead
300 <> canMulScoredInclHead
301 <> canMultScoredExclHead
302
303 -- TODO count it too
304 cands' = setListType (Just CandidateTerm)
305 {-\$ groupedMonoTail
306 <>-} groupedMultTail
307
308 -- Quick FIX
309 candNgramsElement = List.take 5000
310 $ toNgramsElement cands <> toNgramsElement cands'
311
312 result = Map.unionsWith (<>)
313 [ Map.fromList [( nt, toNgramsElement maps
314 <> toNgramsElement stopTerms
315 <> candNgramsElement
316 )]
317 ]
318
319 pure result