]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[FEAT/FIX] Stemming -> Parent/Children -> Patch ok
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
19 import Data.Map (Map)
20 import Data.Monoid (mempty)
21 import Data.Ord (Down(..))
22 import Data.Set (Set)
23 import Data.Text (Text)
24 import Data.Tuple.Extra (both)
25 import Gargantext.API.Ngrams.Types (NgramsElement, RepoCmdM, NgramsTerm(..))
26 import Gargantext.Core.Text (size)
27 import Gargantext.Core.Text.List.Group
28 import Gargantext.Core.Text.List.Group.Prelude
29 import Gargantext.Core.Text.List.Group.WithStem
30 import Gargantext.Core.Text.List.Social
31 import Gargantext.Core.Text.List.Social.Prelude
32 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
33 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
34 import Gargantext.Core.Types.Individu (User(..))
35 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
36 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
37 import Gargantext.Database.Admin.Types.Node (NodeId)
38 import Gargantext.Database.Prelude (CmdM)
39 import Gargantext.Database.Query.Table.Node (defaultList)
40 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
41 import Gargantext.Database.Query.Tree.Error (HasTreeError)
42 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
43 import Gargantext.Prelude
44 import qualified Data.List as List
45 import qualified Data.Map as Map
46 import qualified Data.Set as Set
47
48
49 {-
50 -- TODO maybe useful for later
51 isStopTerm :: StopSize -> Text -> Bool
52 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
53 where
54 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
55 -}
56
57
58 -- | TODO improve grouping functions of Authors, Sources, Institutes..
59 buildNgramsLists :: ( RepoCmdM env err m
60 , CmdM env err m
61 , HasTreeError err
62 , HasNodeError err
63 )
64 => User
65 -> GroupParams
66 -> UserCorpusId
67 -> MasterCorpusId
68 -> m (Map NgramsType [NgramsElement])
69 buildNgramsLists user gp uCid mCid = do
70 ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
71 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
72 [ (Authors , MapListSize 9)
73 , (Sources , MapListSize 9)
74 , (Institutes, MapListSize 9)
75 ]
76
77 pure $ Map.unions $ [ngTerms] <> othersTerms
78
79
80 data MapListSize = MapListSize { unMapListSize :: !Int }
81
82 buildNgramsOthersList ::( HasNodeError err
83 , CmdM env err m
84 , RepoCmdM env err m
85 , HasTreeError err
86 )
87 => User
88 -> UserCorpusId
89 -> GroupParams
90 -> (NgramsType, MapListSize)
91 -> m (Map NgramsType [NgramsElement])
92 buildNgramsOthersList user uCid groupParams (nt, MapListSize mapListSize) = do
93 allTerms :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
94
95 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
96 socialLists :: FlowCont Text FlowListScores
97 <- flowSocialList MySelfFirst user nt ( FlowCont Map.empty
98 $ Map.fromList
99 $ List.zip (Map.keys allTerms)
100 (List.cycle [mempty])
101 )
102 {-
103 if nt == Sources -- Authors
104 then printDebug "flowSocialList" socialLists
105 else printDebug "flowSocialList" ""
106 -}
107 let
108 groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
109 {-
110 if nt == Sources -- Authors
111 then printDebug "groupedWithList" groupedWithList
112 else printDebug "groupedWithList" ""
113 -}
114
115 let
116 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType)
117 $ view flc_scores groupedWithList
118
119 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
120
121 listSize = mapListSize - (List.length mapTerms)
122 (mapTerms', candiTerms) = both Map.fromList
123 $ List.splitAt listSize
124 $ List.sortOn (Down . viewScore . snd)
125 $ Map.toList tailTerms'
126
127 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
128 <> (toNgramsElement mapTerms )
129 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
130 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
131 )]
132
133
134 -- TODO use ListIds
135 buildNgramsTermsList :: ( HasNodeError err
136 , CmdM env err m
137 , RepoCmdM env err m
138 , HasTreeError err
139 )
140 => User
141 -> UserCorpusId
142 -> MasterCorpusId
143 -> GroupParams
144 -> (NgramsType, MapListSize)
145 -> m (Map NgramsType [NgramsElement])
146 buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
147
148 -- | Filter 0 With Double
149 -- Computing global speGen score
150 allTerms :: Map Text Double <- getTficf uCid mCid nt
151
152 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
153 socialLists :: FlowCont Text FlowListScores
154 <- flowSocialList MySelfFirst user nt ( FlowCont Map.empty
155 $ Map.fromList
156 $ List.zip (Map.keys allTerms)
157 (List.cycle [mempty])
158 )
159
160 let socialLists_Stemmed = addScoreStem groupParams (Set.map NgramsTerm $ Map.keysSet allTerms) socialLists
161 printDebug "socialLists_Stemmed" socialLists_Stemmed
162 let groupedWithList = toGroupedTree {- groupParams -} socialLists_Stemmed allTerms
163 (stopTerms, candidateTerms) = Map.partition ((== Just StopTerm) . viewListType)
164 $ view flc_scores groupedWithList
165
166 (groupedMono, groupedMult) = Map.partitionWithKey (\t _v -> size t < 2) candidateTerms
167
168 -- printDebug "stopTerms" stopTerms
169
170 -- splitting monterms and multiterms to take proportional candidates
171 let
172 -- use % of list if to big, or Int if too small
173 listSizeGlobal = 2000 :: Double
174 monoSize = 0.4 :: Double
175 multSize = 1 - monoSize
176
177 splitAt n' ns = both (Map.fromListWith (<>))
178 $ List.splitAt (round $ n' * listSizeGlobal)
179 $ List.sortOn (viewScore . snd)
180 $ Map.toList ns
181
182 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
183 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
184
185 -------------------------
186 -- Filter 1 With Set NodeId and SpeGen
187 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
188
189
190 -- TO remove (and remove HasNodeError instance)
191 userListId <- defaultList uCid
192 masterListId <- defaultList mCid
193
194
195 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
196 [userListId, masterListId]
197 nt
198 selectedTerms
199
200 let
201 groupedTreeScores_SetNodeId :: Map Text (GroupedTreeScores (Set NodeId))
202 groupedTreeScores_SetNodeId = setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
203
204 -- | Coocurrences computation
205 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
206 let mapCooc = Map.filter (>2)
207 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
208 | (t1, s1) <- mapStemNodeIds
209 , (t2, s2) <- mapStemNodeIds
210 ]
211 where
212 mapStemNodeIds = Map.toList
213 $ Map.map viewScores
214 $ groupedTreeScores_SetNodeId
215 let
216 -- computing scores
217 mapScores f = Map.fromList
218 $ map (\g -> (view scored_terms g, f g))
219 $ normalizeGlobal
220 $ map normalizeLocal
221 $ scored' mapCooc
222
223 let
224 groupedTreeScores_SpeGen :: Map Text (GroupedTreeScores (Scored Text))
225 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity)
226 ( groupedMonoHead
227 <> groupedMultHead
228 )
229
230 let
231 -- sort / partition / split
232 -- filter mono/multi again
233 (monoScored, multScored) = Map.partitionWithKey (\t _v -> size t < 2) groupedTreeScores_SpeGen
234
235 -- filter with max score
236 partitionWithMaxScore = Map.partition (\g -> (view scored_genInc $ view gts'_score g)
237 > (view scored_speExc $ view gts'_score g)
238 )
239
240 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
241 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
242
243 -- splitAt
244 let
245 -- use % of list if to big, or Int if to small
246 listSizeLocal = 1000 :: Double
247 inclSize = 0.4 :: Double
248 exclSize = 1 - inclSize
249
250 splitAt' n' = (both (Map.fromList)) . (List.splitAt (round $ n' * listSizeLocal))
251 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . Map.toList
252
253
254 monoInc_size = splitAt' $ monoSize * inclSize / 2
255 (monoScoredInclHead, monoScoredInclTail) = monoInc_size $ (sortOn scored_genInc) monoScoredIncl
256 (monoScoredExclHead, monoScoredExclTail) = monoInc_size $ (sortOn scored_speExc) monoScoredExcl
257
258 multExc_size = splitAt' $ multSize * exclSize / 2
259 (multScoredInclHead, multScoredInclTail) = multExc_size $ (sortOn scored_genInc) multScoredIncl
260 (multScoredExclHead, multScoredExclTail) = multExc_size $ (sortOn scored_speExc) multScoredExcl
261
262
263 ------------------------------------------------------------
264
265 -- Final Step building the Typed list
266 termListHead = maps <> cands
267 where
268 maps = setListType (Just MapTerm)
269 $ monoScoredInclHead
270 <> monoScoredExclHead
271 <> multScoredInclHead
272 <> multScoredExclHead
273
274 cands = setListType (Just CandidateTerm)
275 $ monoScoredInclTail
276 <> monoScoredExclTail
277 <> multScoredInclTail
278 <> multScoredExclTail
279
280 termListTail = (setListType (Just CandidateTerm)) (groupedMonoTail <> groupedMultTail)
281
282 let result = Map.unionsWith (<>)
283 [ Map.fromList [( nt, toNgramsElement termListHead
284 <> toNgramsElement termListTail
285 <> toNgramsElement stopTerms
286 )]
287 ]
288
289 -- printDebug "result" result
290
291 pure result