{-|
Module      : Gargantext.Core.Text.List
Description : Tools to build lists
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
19 import Data.HashMap.Strict (HashMap)
20 import Data.HashSet (HashSet)
21 import Data.Map (Map)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
24 import Data.Set (Set)
25 import Data.Tuple.Extra (both)
26 import Gargantext.API.Ngrams.Types (NgramsElement, NgramsTerm(..))
27 import Gargantext.Core.NodeStory
28 import Gargantext.Core.Text (size)
29 import Gargantext.Core.Text.List.Group
30 import Gargantext.Core.Text.List.Group.Prelude
31 import Gargantext.Core.Text.List.Group.WithStem
32 import Gargantext.Core.Text.List.Social
33 import Gargantext.Core.Text.List.Social.Prelude
34 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
39 import Gargantext.Database.Admin.Types.Node (NodeId)
40 import Gargantext.Database.Prelude (CmdM)
41 import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
42 import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
43 import Gargantext.Database.Query.Table.Node (defaultList)
44 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
45 import Gargantext.Database.Query.Tree.Error (HasTreeError)
46 import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
47 import Gargantext.Prelude
48 import qualified Data.HashMap.Strict as HashMap
49 import qualified Data.HashSet as HashSet
50 import qualified Data.List as List
51 import qualified Data.Map as Map
52 import qualified Data.Set as Set
53 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
54
55 {-
56 -- TODO maybe useful for later
57 isStopTerm :: StopSize -> Text -> Bool
58 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
59 where
60 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
61 -}
62
63
-- | TODO improve grouping functions of Authors, Sources, Institutes..
--
-- Build every ngrams list of the corpus: the Terms list first (with the
-- given grouping parameters, capped at 350 map terms), then one list per
-- "other" ngrams type (Authors, Sources, Institutes), each capped at 9 map
-- terms and grouped by identity (no stemming).
buildNgramsLists :: ( HasNodeStory env err m
                    , CmdM     env err m
                    , HasTreeError err
                    , HasNodeError err
                    )
                 => GroupParams
                 -> User
                 -> UserCorpusId
                 -> MasterCorpusId
                 -> m (Map NgramsType [NgramsElement])
buildNgramsLists gp user uCid mCid = do
  termsList  <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
  otherLists <- mapM (buildNgramsOthersList user uCid GroupIdentity)
                     [ (Authors   , MapListSize 9)
                     , (Sources   , MapListSize 9)
                     , (Institutes, MapListSize 9)
                     ]
  pure $ Map.unions (termsList : otherLists)
84
85
86 data MapListSize = MapListSize { unMapListSize :: !Int }
87
-- | Build the list for one of the "other" ngrams types (Authors, Sources,
-- Institutes): keep the user's social choices (Stop / Map) as-is, then
-- promote the best-scored remaining terms up to the requested map-list
-- size; everything past the cut becomes a Candidate.
buildNgramsOthersList ::( HasNodeError err
                        , CmdM     env err m
                        , HasNodeStory env err m
                        , HasTreeError err
                        )
                      => User
                      -> UserCorpusId
                      -> GroupParams
                      -> (NgramsType, MapListSize)
                      -> m (Map NgramsType [NgramsElement])
buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do
  allTerms :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt

  -- PrivateFirst for first developments since Public NodeMode is not implemented yet
  -- Every known term starts at 'mempty'; the user's social lists are folded in.
  socialLists :: FlowCont NgramsTerm FlowListScores
    <- flowSocialList MySelfFirst user nt
         ( FlowCont HashMap.empty
         $ HashMap.fromList
         $ List.map (\t -> (t, mempty))
         $ HashMap.keys allTerms
         )

  let
    scoredTree = view flc_scores (toGroupedTree socialLists allTerms)

    -- Terms already flagged Stop or Map keep their list type untouched.
    (stops, notStops)  = HashMap.partition ((== Just StopTerm) . viewListType) scoredTree
    (alreadyMap, rest) = HashMap.partition ((== Just MapTerm)  . viewListType) notStops

    -- Fill the remaining Map slots with the highest-scored leftovers.
    slotsLeft = mapListSize - List.length alreadyMap
    (promoted, leftOver) = both HashMap.fromList
                         $ List.splitAt slotsLeft
                         $ List.sortOn (Down . viewScore . snd)
                         $ HashMap.toList rest

  pure $ Map.fromList
    [ ( nt
      ,  toNgramsElement stops
      <> toNgramsElement alreadyMap
      <> toNgramsElement (setListType (Just MapTerm)       promoted)
      <> toNgramsElement (setListType (Just CandidateTerm) leftOver)
      )
    ]
128
129
-- | Enrich 'GroupParams' with lemmatised forms of the given ngrams.
--
-- For 'GroupWithPosTag', every ngram is looked up through 'selectLems'
-- and the resulting lemma map is merged into the 'gwl_map' field
-- (existing entries take precedence, since HashMap's '<>' is a
-- left-biased union).  Any other constructor is returned unchanged.
getGroupParams :: ( HasNodeError err
                  , CmdM     env err m
                  , HasNodeStory env err m
                  , HasTreeError err
                  )
              => GroupParams -> HashSet Ngrams -> m GroupParams
getGroupParams gp@(GroupWithPosTag l a _m) ng = do
  hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
  -- printDebug "hashMap" hashMap
  -- Section instead of `\x -> x <> hashMap` (same behavior, idiomatic form).
  pure $ over gwl_map (<> hashMap) gp
getGroupParams gp _ = pure gp
141
142
-- TODO use ListIds
-- | Build the NgramsTerms list of a corpus.
--
-- Pipeline (each stage feeds the next):
--   1. Score every term with TFICF against the master corpus.
--   2. Merge the user's social list choices (Stop / Map flags).
--   3. Group terms by stem and drop low-scoring groups; split off stops.
--   4. Keep a proportional head of mono- and multi-terms as candidates.
--   5. Re-score the kept groups by document co-occurrence (gen/spe).
--   6. Split each partition into Map-list head and Candidate-list head.
buildNgramsTermsList :: ( HasNodeError err
                        , CmdM     env err m
                        , HasNodeStory env err m
                        , HasTreeError err
                        )
                     => User
                     -> UserCorpusId
                     -> MasterCorpusId
                     -> GroupParams
                     -> (NgramsType, MapListSize)
                     -> m (Map NgramsType [NgramsElement])
buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do

  -- Filter 0 With Double
  -- Computing global speGen score
  -- TFICF score of every term of the user corpus w.r.t. the master corpus.
  allTerms :: HashMap NgramsTerm Double <- getTficf uCid mCid nt

  -- PrivateFirst for first developments since Public NodeMode is not implemented yet
  -- Every term starts at 'mempty'; social list flags are folded on top.
  socialLists :: FlowCont NgramsTerm FlowListScores
    <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
                                          $ HashMap.fromList
                                          $ List.zip (HashMap.keys allTerms)
                                                     (List.cycle [mempty])
                                          )
  let ngramsKeys = HashMap.keysSet allTerms

  -- Add lemmas to the grouping params (no-op unless GroupWithPosTag).
  groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)

  let
    socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
    --printDebug "socialLists_Stemmed" socialLists_Stemmed
    groupedWithList = toGroupedTree socialLists_Stemmed allTerms
    -- Keep only groups with score > 1, then split off user-flagged stop terms.
    (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
                                $ HashMap.filter (\g -> (view gts'_score g) > 1)
                                $ view flc_scores groupedWithList

    -- Mono-terms ('size' < 2, i.e. single-word) vs multi-terms, on the group key.
    (groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms

    -- printDebug "stopTerms" stopTerms

    -- splitting monterms and multiterms to take proportional candidates
    -- use % of list if to big, or Int if too small
    listSizeGlobal = 2000 :: Double
    monoSize = 0.4  :: Double
    multSize = 1 - monoSize

    -- NOTE(review): shadows the Prelude's 'splitAt'.  The list is sorted
    -- ASCENDING by score, so the "head" receives the LOWEST-scored items —
    -- TODO confirm this ordering is intended (cf. 'sortOn' below, which
    -- uses 'Down').
    splitAt n' ns = both (HashMap.fromListWith (<>))
                  $ List.splitAt (round $ n' * listSizeGlobal)
                  $ List.sortOn (viewScore . snd)
                  $ HashMap.toList ns

    (groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
    (groupedMultHead, groupedMultTail)  = splitAt multSize groupedMult

    -------------------------
    -- Filter 1 With Set NodeId and SpeGen
    selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)


  -- TODO remove (and remove HasNodeError instance)
  userListId   <- defaultList uCid
  masterListId <- defaultList mCid

  -- Document ids (NodeIds) containing each selected term, over both lists.
  mapTextDocIds <- getNodesByNgramsOnlyUser uCid
                                            [userListId, masterListId]
                                            nt
                                            selectedTerms

  let
    groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
    groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
                                $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)


  -- NOTE(review): debug output left on the production path; consider removing.
  printDebug "groupedTreeScores_SetNodeId" groupedTreeScores_SetNodeId

  -- Coocurrences computation
  --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
  -- Pairwise intersection of document sets (O(n^2) over kept groups);
  -- co-occurrence counts of 1 are dropped.
  let mapCooc = HashMap.filter (>1) -- removing cooc of 1
              $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
                                 | (t1, s1) <- mapStemNodeIds
                                 , (t2, s2) <- mapStemNodeIds
                                 ]
           where
             mapStemNodeIds = HashMap.toList
                            $ HashMap.map viewScores
                            $ groupedTreeScores_SetNodeId
  let
    -- computing scores
    -- Normalized gen/spe scores per term, keyed by term; 'f' projects the
    -- wanted part of each 'Scored' record.
    mapScores f = HashMap.fromList
                $ map (\g -> (view scored_terms g, f g))
                $ normalizeGlobal
                $ map normalizeLocal
                $ scored'
                $ Map.fromList -- TODO remove this
                $ HashMap.toList mapCooc

  let
    groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
    groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId

  let
    -- sort / partition / split
    -- filter mono/multi again
    (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen

    -- filter with max score
    -- Inclusion-dominant (genInc > speExc) vs exclusion-dominant groups.
    partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
                                                   > (view scored_speExc $ view gts'_score g)
                                              )

    (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
    (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored

  -- splitAt
  let
    -- use % of list if to big, or Int if to small
    mapSize = 1000 :: Double
    canSize = mapSize * 5 :: Double

    inclSize = 0.4  :: Double
    exclSize = 1 - inclSize

    -- Here the sort IS descending ('Down'), so heads get the best-scored items.
    splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
    sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList

    monoInc_size n = splitAt' n $ monoSize * inclSize / 2
    multExc_size n = splitAt' n $ multSize * exclSize / 2


    -- Map-list heads, one per (mono/mult) x (incl/excl) quadrant.
    (mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
    (mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl

    (mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
    (mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl


    -- Candidate-list heads, taken from each tail above.
    -- NOTE(review): the "Inc"/"Mul" spellings below are inconsistent with
    -- the "Incl"/"Mult" naming used above (cosmetic only).
    (canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
    (canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail

    (canMulScoredInclHead, _)  = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
    (canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail

  ------------------------------------------------------------
  -- Final Step building the Typed list
  -- Candidates Terms need to be filtered
  let
    maps = setListType (Just MapTerm)
         $  mapMonoScoredInclHead
         <> mapMonoScoredExclHead
         <> mapMultScoredInclHead
         <> mapMultScoredExclHead

    -- An original way to filter to start with
    cands = setListType (Just CandidateTerm)
          $  canMonoScoredIncHead
          <> canMonoScoredExclHead
          <> canMulScoredInclHead
          <> canMultScoredExclHead

    -- TODO count it too
    cands' = setListType (Just CandidateTerm)
           {-$ groupedMonoTail
           <>-} groupedMultTail

    -- Quick FIX
    -- Hard cap on the number of candidate elements emitted.
    candNgramsElement = List.take 5000
                      $ toNgramsElement cands <> toNgramsElement cands'

    result = Map.unionsWith (<>)
           [ Map.fromList [( nt, toNgramsElement maps
                                 <> toNgramsElement stopTerms
                                 <> candNgramsElement
                           )]
           ]

  pure result