]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[FIX] merge with dev-phylo
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
19 import Data.HashMap.Strict (HashMap)
20 import Data.HashSet (HashSet)
21 import Data.Map (Map)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
24 import Data.Set (Set)
25 import Data.Tuple.Extra (both)
26 import Gargantext.API.Ngrams.Types (NgramsElement, RepoCmdM, NgramsTerm(..))
27 import Gargantext.Core.Text (size)
28 import Gargantext.Core.Text.List.Group
29 import Gargantext.Core.Text.List.Group.Prelude
30 import Gargantext.Core.Text.List.Group.WithStem
31 import Gargantext.Core.Text.List.Social
32 import Gargantext.Core.Text.List.Social.Prelude
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
34 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
35 import Gargantext.Core.Types.Individu (User(..))
36 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
37 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
38 import Gargantext.Database.Admin.Types.Node (NodeId)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
41 import Gargantext.Database.Query.Table.Node (defaultList)
42 import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
43 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
44 import Gargantext.Database.Query.Tree.Error (HasTreeError)
45 import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
46 import Gargantext.Prelude
47 import qualified Data.HashMap.Strict as HashMap
48 import qualified Data.List as List
49 import qualified Data.Map as Map
50 import qualified Data.Set as Set
51 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
52 import qualified Data.HashSet as HashSet
53
54 {-
55 -- TODO maybe useful for later
56 isStopTerm :: StopSize -> Text -> Bool
57 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
58 where
59 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
60 -}
61
62
63 -- | TODO improve grouping functions of Authors, Sources, Institutes..
64 buildNgramsLists :: ( RepoCmdM env err m
65 , CmdM env err m
66 , HasTreeError err
67 , HasNodeError err
68 )
69 => GroupParams
70 -> User
71 -> UserCorpusId
72 -> MasterCorpusId
73 -> m (Map NgramsType [NgramsElement])
74 buildNgramsLists gp user uCid mCid = do
75 ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
76 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
77 [ (Authors , MapListSize 9)
78 , (Sources , MapListSize 9)
79 , (Institutes, MapListSize 9)
80 ]
81
82 pure $ Map.unions $ [ngTerms] <> othersTerms
83
84
85 data MapListSize = MapListSize { unMapListSize :: !Int }
86
87 buildNgramsOthersList ::( HasNodeError err
88 , CmdM env err m
89 , RepoCmdM env err m
90 , HasTreeError err
91 )
92 => User
93 -> UserCorpusId
94 -> GroupParams
95 -> (NgramsType, MapListSize)
96 -> m (Map NgramsType [NgramsElement])
97 buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do
98 allTerms :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt
99
100 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
101 socialLists :: FlowCont NgramsTerm FlowListScores
102 <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
103 $ HashMap.fromList
104 $ List.zip (HashMap.keys allTerms)
105 (List.cycle [mempty])
106 )
107 let
108 groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
109
110 (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
111 $ view flc_scores groupedWithList
112
113 (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms
114
115 listSize = mapListSize - (List.length mapTerms)
116 (mapTerms', candiTerms) = both HashMap.fromList
117 $ List.splitAt listSize
118 $ List.sortOn (Down . viewScore . snd)
119 $ HashMap.toList tailTerms'
120
121 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
122 <> (toNgramsElement mapTerms )
123 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
124 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
125 )]
126
127
128 getGroupParams :: ( HasNodeError err
129 , CmdM env err m
130 , RepoCmdM env err m
131 , HasTreeError err
132 )
133 => GroupParams -> HashSet Ngrams -> m GroupParams
134 getGroupParams gp@(GroupWithPosTag l a _m) ng = do
135 hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
136 -- printDebug "hashMap" hashMap
137 pure $ over gwl_map (\x -> x <> hashMap) gp
138 getGroupParams gp _ = pure gp
139
140
141 -- TODO use ListIds
142 buildNgramsTermsList :: ( HasNodeError err
143 , CmdM env err m
144 , RepoCmdM env err m
145 , HasTreeError err
146 )
147 => User
148 -> UserCorpusId
149 -> MasterCorpusId
150 -> GroupParams
151 -> (NgramsType, MapListSize)
152 -> m (Map NgramsType [NgramsElement])
153 buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
154
155 -- | Filter 0 With Double
156 -- Computing global speGen score
157 allTerms :: HashMap NgramsTerm Double <- getTficf uCid mCid nt
158
159 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
160 socialLists :: FlowCont NgramsTerm FlowListScores
161 <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
162 $ HashMap.fromList
163 $ List.zip (HashMap.keys allTerms)
164 (List.cycle [mempty])
165 )
166 let ngramsKeys = HashMap.keysSet allTerms
167
168 groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
169
170 let
171 socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
172 --printDebug "socialLists_Stemmed" socialLists_Stemmed
173 groupedWithList = toGroupedTree socialLists_Stemmed allTerms
174 (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
175 $ view flc_scores groupedWithList
176
177 (groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
178
179 -- printDebug "stopTerms" stopTerms
180
181 -- splitting monterms and multiterms to take proportional candidates
182 -- use % of list if to big, or Int if too small
183 listSizeGlobal = 2000 :: Double
184 monoSize = 0.4 :: Double
185 multSize = 1 - monoSize
186
187 splitAt n' ns = both (HashMap.fromListWith (<>))
188 $ List.splitAt (round $ n' * listSizeGlobal)
189 $ List.sortOn (viewScore . snd)
190 $ HashMap.toList ns
191
192 (groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
193 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
194
195 -------------------------
196 -- Filter 1 With Set NodeId and SpeGen
197 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
198
199
200 -- TODO remove (and remove HasNodeError instance)
201 userListId <- defaultList uCid
202 masterListId <- defaultList mCid
203
204 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
205 [userListId, masterListId]
206 nt
207 selectedTerms
208
209 let
210 groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
211 groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
212 $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
213
214
215 printDebug "groupedTreeScores_SetNodeId" groupedTreeScores_SetNodeId
216
217 -- | Coocurrences computation
218 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
219 let mapCooc = HashMap.filter (>1) -- removing cooc of 1
220 $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
221 | (t1, s1) <- mapStemNodeIds
222 , (t2, s2) <- mapStemNodeIds
223 ]
224 where
225 mapStemNodeIds = HashMap.toList
226 $ HashMap.map viewScores
227 $ groupedTreeScores_SetNodeId
228 let
229 -- computing scores
230 mapScores f = HashMap.fromList
231 $ map (\g -> (view scored_terms g, f g))
232 $ normalizeGlobal
233 $ map normalizeLocal
234 $ scored'
235 $ Map.fromList -- TODO remove this
236 $ HashMap.toList mapCooc
237
238 let
239 groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
240 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId
241
242 let
243 -- sort / partition / split
244 -- filter mono/multi again
245 (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen
246
247 -- filter with max score
248 partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
249 > (view scored_speExc $ view gts'_score g)
250 )
251
252 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
253 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
254
255 -- splitAt
256 let
257 -- use % of list if to big, or Int if to small
258 listSizeLocal = 1000 :: Double
259 inclSize = 0.4 :: Double
260 exclSize = 1 - inclSize
261
262 splitAt' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * listSizeLocal))
263 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
264
265
266 monoInc_size = splitAt' $ monoSize * inclSize / 2
267 (monoScoredInclHead, _monoScoredInclTail) = monoInc_size $ (sortOn scored_genInc) monoScoredIncl
268 (monoScoredExclHead, _monoScoredExclTail) = monoInc_size $ (sortOn scored_speExc) monoScoredExcl
269
270 multExc_size = splitAt' $ multSize * exclSize / 2
271 (multScoredInclHead, multScoredInclTail) = multExc_size $ (sortOn scored_genInc) multScoredIncl
272 (multScoredExclHead, multScoredExclTail) = multExc_size $ (sortOn scored_speExc) multScoredExcl
273
274 ------------------------------------------------------------
275 -- Final Step building the Typed list
276 -- Candidates Terms need to be filtered
277 let
278 maps = setListType (Just MapTerm)
279 $ monoScoredInclHead
280 <> monoScoredExclHead
281 <> multScoredInclHead
282 <> multScoredExclHead
283
284 -- An original way to filter to start with
285 cands = setListType (Just CandidateTerm)
286 $ {- monoScoredInclTail
287 <> monoScoredExclTail
288 <> -} multScoredInclTail
289 <> multScoredExclTail
290
291 cands' = setListType (Just CandidateTerm)
292 {-$ groupedMonoTail
293 <>-} groupedMultTail
294
295 result = Map.unionsWith (<>)
296 [ Map.fromList [( nt, toNgramsElement maps
297 <> toNgramsElement cands
298 <> toNgramsElement cands'
299 <> toNgramsElement stopTerms
300 )]
301 ]
302
303 -- printDebug "result" result
304
305 pure result