]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[conduit] implement conduit for Hal, Pubmed
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
19 import Data.HashMap.Strict (HashMap)
20 import Data.HashSet (HashSet)
21 import Data.Map (Map)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
24 import Data.Set (Set)
25 import Data.Tuple.Extra (both)
26 import Gargantext.API.Ngrams.Types (NgramsElement, NgramsTerm(..))
27 import Gargantext.Core.NodeStory
28 import Gargantext.Core.Text (size)
29 import Gargantext.Core.Text.List.Group
30 import Gargantext.Core.Text.List.Group.Prelude
31 import Gargantext.Core.Text.List.Group.WithStem
32 import Gargantext.Core.Text.List.Social
33 import Gargantext.Core.Text.List.Social.Prelude
34 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByContext (getContextsByNgramsUser, getContextsByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf_withSample)
39 import Gargantext.Database.Admin.Types.Node (NodeId)
40 import Gargantext.Database.Prelude (CmdM)
41 import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
42 import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
43 import Gargantext.Database.Query.Table.Node (defaultList)
44 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
45 import Gargantext.Database.Query.Tree.Error (HasTreeError)
46 import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
47 import Gargantext.Prelude
48 import qualified Data.HashMap.Strict as HashMap
49 import qualified Data.HashSet as HashSet
50 import qualified Data.List as List
51 import qualified Data.Map as Map
52 import qualified Data.Set as Set
53 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
54
55 {-
56 -- TODO maybe useful for later
57 isStopTerm :: StopSize -> Text -> Bool
58 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
59 where
60 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
61 -}
62
63
64 -- | TODO improve grouping functions of Authors, Sources, Institutes..
65 buildNgramsLists :: ( HasNodeStory env err m
66 , CmdM env err m
67 , HasTreeError err
68 , HasNodeError err
69 )
70 => User
71 -> UserCorpusId
72 -> MasterCorpusId
73 -> Maybe FlowSocialListWith
74 -> GroupParams
75 -> m (Map NgramsType [NgramsElement])
76 buildNgramsLists user uCid mCid mfslw gp = do
77 ngTerms <- buildNgramsTermsList user uCid mCid mfslw gp (NgramsTerms, MapListSize 350)
78 othersTerms <- mapM (buildNgramsOthersList user uCid mfslw GroupIdentity)
79 [ (Authors , MapListSize 9, MaxListSize 1000)
80 , (Sources , MapListSize 9, MaxListSize 1000)
81 , (Institutes, MapListSize 9, MaxListSize 1000)
82 ]
83
84 pure $ Map.unions $ [ngTerms] <> othersTerms
85
86
87 data MapListSize = MapListSize { unMapListSize :: !Int }
88 data MaxListSize = MaxListSize { unMaxListSize :: !Int }
89
90 buildNgramsOthersList :: ( HasNodeError err
91 , CmdM env err m
92 , HasNodeStory env err m
93 , HasTreeError err
94 )
95 => User
96 -> UserCorpusId
97 -> Maybe FlowSocialListWith
98 -> GroupParams
99 -> (NgramsType, MapListSize, MaxListSize)
100 -> m (Map NgramsType [NgramsElement])
101 buildNgramsOthersList user uCid mfslw _groupParams (nt, MapListSize mapListSize, MaxListSize maxListSize) = do
102 allTerms :: HashMap NgramsTerm (Set NodeId) <- getContextsByNgramsUser uCid nt
103
104 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
105 socialLists :: FlowCont NgramsTerm FlowListScores
106 <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
107 $ HashMap.fromList
108 $ List.zip (HashMap.keys allTerms)
109 (List.cycle [mempty])
110 )
111 let
112 groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
113
114 (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
115 $ view flc_scores groupedWithList
116
117 (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms
118
119 listSize = mapListSize - (List.length mapTerms)
120 (mapTerms', candiTerms) = both HashMap.fromList
121 $ List.splitAt listSize
122 $ List.take maxListSize
123 $ List.sortOn (Down . viewScore . snd)
124 $ HashMap.toList tailTerms'
125
126
127 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
128 <> (toNgramsElement mapTerms )
129 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
130 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
131 )]
132
133
134 getGroupParams :: ( HasNodeError err
135 , CmdM env err m
136 , HasNodeStory env err m
137 , HasTreeError err
138 )
139 => GroupParams -> HashSet Ngrams -> m GroupParams
140 getGroupParams gp@(GroupWithPosTag l a _m) ng = do
141 hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
142 -- printDebug "hashMap" hashMap
143 pure $ over gwl_map (\x -> x <> hashMap) gp
144 getGroupParams gp _ = pure gp
145
146
147 -- TODO use ListIds
148 buildNgramsTermsList :: ( HasNodeError err
149 , CmdM env err m
150 , HasNodeStory env err m
151 , HasTreeError err
152 )
153 => User
154 -> UserCorpusId
155 -> MasterCorpusId
156 -> Maybe FlowSocialListWith
157 -> GroupParams
158 -> (NgramsType, MapListSize)
159 -> m (Map NgramsType [NgramsElement])
160 buildNgramsTermsList user uCid mCid mfslw groupParams (nt, _mapListSize)= do
161
162 -- Filter 0 With Double
163 -- Computing global speGen score
164 printDebug "[buildNgramsTermsList: Sample List] / start" nt
165 allTerms :: HashMap NgramsTerm Double <- getTficf_withSample uCid mCid nt
166 printDebug "[buildNgramsTermsList: Sample List / end]" (nt, HashMap.size allTerms)
167
168 printDebug "[buildNgramsTermsList: Flow Social List / start]" nt
169 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
170 socialLists :: FlowCont NgramsTerm FlowListScores
171 <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
172 $ HashMap.fromList
173 $ List.zip (HashMap.keys allTerms)
174 (List.cycle [mempty])
175 )
176 printDebug "[buildNgramsTermsList: Flow Social List / end]" nt
177
178 let ngramsKeys = HashMap.keysSet allTerms
179
180 groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
181
182 let
183 socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
184 --printDebug "socialLists_Stemmed" socialLists_Stemmed
185 groupedWithList = toGroupedTree socialLists_Stemmed allTerms
186 (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
187 $ HashMap.filter (\g -> (view gts'_score g) > 1)
188 $ view flc_scores groupedWithList
189
190 (groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
191
192 -- printDebug "stopTerms" stopTerms
193
194 -- splitting monterms and multiterms to take proportional candidates
195 -- use % of list if to big, or Int if too small
196 listSizeGlobal = 2000 :: Double
197 monoSize = 0.4 :: Double
198 multSize = 1 - monoSize
199
200 splitAt n' ns = both (HashMap.fromListWith (<>))
201 $ List.splitAt (round $ n' * listSizeGlobal)
202 $ List.sortOn (viewScore . snd)
203 $ HashMap.toList ns
204
205 (groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
206 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
207
208 -------------------------
209 -- Filter 1 With Set NodeId and SpeGen
210 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
211
212
213 -- TODO remove (and remove HasNodeError instance)
214 userListId <- defaultList uCid
215 masterListId <- defaultList mCid
216
217 mapTextDocIds <- getContextsByNgramsOnlyUser uCid
218 [userListId, masterListId]
219 nt
220 selectedTerms
221
222
223 -- printDebug "mapTextDocIds" mapTextDocIds
224
225 let
226 groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
227 groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
228 $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
229
230
231 --printDebug "groupedTreeScores_SetNodeId" groupedTreeScores_SetNodeId
232
233 -- Coocurrences computation
234 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
235 let mapCooc = HashMap.filter (>1) -- removing cooc of 1
236 $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
237 | (t1, s1) <- mapStemNodeIds
238 , (t2, s2) <- mapStemNodeIds
239 ]
240 where
241 mapStemNodeIds = HashMap.toList
242 $ HashMap.map viewScores
243 $ groupedTreeScores_SetNodeId
244 let
245 -- computing scores
246 mapScores f = HashMap.fromList
247 $ map (\g -> (view scored_terms g, f g))
248 $ normalizeGlobal
249 $ map normalizeLocal
250 $ scored'
251 $ Map.fromList -- TODO remove this
252 $ HashMap.toList mapCooc
253
254 let
255 groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
256 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId
257
258 let
259 -- sort / partition / split
260 -- filter mono/multi again
261 (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen
262
263 -- filter with max score
264 partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
265 > (view scored_speExc $ view gts'_score g)
266 )
267
268 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
269 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
270
271 -- splitAt
272 let
273 -- use % of list if to big, or Int if to small
274 mapSize = 1000 :: Double
275 canSize = mapSize * 5 :: Double
276
277 inclSize = 0.4 :: Double
278 exclSize = 1 - inclSize
279
280 splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
281 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
282
283 monoInc_size n = splitAt' n $ monoSize * inclSize / 2
284 multExc_size n = splitAt' n $ multSize * exclSize / 2
285
286
287 (mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
288 (mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl
289
290 (mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
291 (mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl
292
293
294 (canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
295 (canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail
296
297 (canMulScoredInclHead, _) = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
298 (canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail
299
300 ------------------------------------------------------------
301 -- Final Step building the Typed list
302 -- Candidates Terms need to be filtered
303 let
304 maps = setListType (Just MapTerm)
305 $ mapMonoScoredInclHead
306 <> mapMonoScoredExclHead
307 <> mapMultScoredInclHead
308 <> mapMultScoredExclHead
309
310 -- An original way to filter to start with
311 cands = setListType (Just CandidateTerm)
312 $ canMonoScoredIncHead
313 <> canMonoScoredExclHead
314 <> canMulScoredInclHead
315 <> canMultScoredExclHead
316
317 -- TODO count it too
318 cands' = setListType (Just CandidateTerm)
319 {-\$ groupedMonoTail
320 <>-} groupedMultTail
321
322 -- Quick FIX
323 candNgramsElement = List.take 5000
324 $ toNgramsElement cands <> toNgramsElement cands'
325
326 result = Map.unionsWith (<>)
327 [ Map.fromList [( nt, toNgramsElement maps
328 <> toNgramsElement stopTerms
329 <> candNgramsElement
330 )]
331 ]
332
333 pure result