]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[FIX] ListType working now with history patch
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14
15 module Gargantext.Core.Text.List
16 where
17
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
19 import Data.Map (Map)
20 import Data.Monoid (mempty)
21 import Data.Ord (Down(..))
22 import Data.Set (Set)
23 import Data.Text (Text)
24 import Data.Tuple.Extra (both)
25 import Gargantext.API.Ngrams.Types (NgramsElement)
26 import Gargantext.API.Ngrams.Types (RepoCmdM)
27 import Gargantext.Core.Text (size)
28 import Gargantext.Core.Text.List.Group
29 import Gargantext.Core.Text.List.Group.Prelude
30 import Gargantext.Core.Text.List.Group.WithStem
31 import Gargantext.Core.Text.List.Social
32 import Gargantext.Core.Text.List.Social.Prelude
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
34 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
35 import Gargantext.Core.Types.Individu (User(..))
36 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
37 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
38 import Gargantext.Database.Admin.Types.Node (NodeId)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Node (defaultList)
41 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
42 import Gargantext.Database.Query.Tree.Error (HasTreeError)
43 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
44 import Gargantext.Prelude
45 import qualified Data.List as List
46 import qualified Data.Map as Map
47 import qualified Data.Set as Set
48
49
50 {-
51 -- TODO maybe useful for later
52 isStopTerm :: StopSize -> Text -> Bool
53 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
54 where
55 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
56 -}
57
58
59 -- | TODO improve grouping functions of Authors, Sources, Institutes..
60 buildNgramsLists :: ( RepoCmdM env err m
61 , CmdM env err m
62 , HasTreeError err
63 , HasNodeError err
64 )
65 => User
66 -> GroupParams
67 -> UserCorpusId
68 -> MasterCorpusId
69 -> m (Map NgramsType [NgramsElement])
70 buildNgramsLists user gp uCid mCid = do
71 ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
72 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
73 [ (Authors , MapListSize 9)
74 , (Sources , MapListSize 9)
75 , (Institutes, MapListSize 9)
76 ]
77
78 pure $ Map.unions $ [ngTerms] <> othersTerms
79
80
81 data MapListSize = MapListSize { unMapListSize :: !Int }
82
83 buildNgramsOthersList ::( HasNodeError err
84 , CmdM env err m
85 , RepoCmdM env err m
86 , HasTreeError err
87 )
88 => User
89 -> UserCorpusId
90 -> GroupParams
91 -> (NgramsType, MapListSize)
92 -> m (Map NgramsType [NgramsElement])
93 buildNgramsOthersList user uCid groupParams (nt, MapListSize mapListSize) = do
94 allTerms :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
95
96 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
97 socialLists :: FlowCont Text FlowListScores
98 <- flowSocialList MySelfFirst user nt ( FlowCont Map.empty
99 $ Map.fromList
100 $ List.zip (Map.keys allTerms)
101 (List.cycle [mempty])
102 )
103
104 if nt == Authors
105 then printDebug "flowSocialList" socialLists
106 else printDebug "flowSocialList" ""
107
108 let
109 groupedWithList = toGroupedTree groupParams socialLists allTerms
110
111 if nt == Authors
112 then printDebug "groupedWithList" groupedWithList
113 else printDebug "groupedWithList" ""
114
115
116 let
117 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType)
118 $ view flc_scores groupedWithList
119
120 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
121
122 listSize = mapListSize - (List.length mapTerms)
123 (mapTerms', candiTerms) = both Map.fromList
124 $ List.splitAt listSize
125 $ List.sortOn (Down . viewScore . snd)
126 $ Map.toList tailTerms'
127
128 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
129 <> (toNgramsElement mapTerms )
130 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
131 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
132 )]
133
134
135 -- TODO use ListIds
136 buildNgramsTermsList :: ( HasNodeError err
137 , CmdM env err m
138 , RepoCmdM env err m
139 , HasTreeError err
140 )
141 => User
142 -> UserCorpusId
143 -> MasterCorpusId
144 -> GroupParams
145 -> (NgramsType, MapListSize)
146 -> m (Map NgramsType [NgramsElement])
147 buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
148
149 -- | Filter 0 With Double
150 -- Computing global speGen score
151 allTerms :: Map Text Double <- getTficf uCid mCid nt
152
153 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
154 socialLists :: FlowCont Text FlowListScores
155 <- flowSocialList MySelfFirst user nt ( FlowCont Map.empty
156 $ Map.fromList
157 $ List.zip (Map.keys allTerms)
158 (List.cycle [mempty])
159 )
160
161 let groupedWithList = toGroupedTree groupParams socialLists allTerms
162 (stopTerms, candidateTerms) = Map.partition ((== Just StopTerm) . viewListType)
163 $ view flc_scores groupedWithList
164
165 (groupedMono, groupedMult) = Map.partitionWithKey (\t _v -> size t < 2) candidateTerms
166
167 -- printDebug "stopTerms" stopTerms
168
169 -- splitting monterms and multiterms to take proportional candidates
170 let
171 -- use % of list if to big, or Int if too small
172 listSizeGlobal = 2000 :: Double
173 monoSize = 0.4 :: Double
174 multSize = 1 - monoSize
175
176 splitAt n' ns = both (Map.fromListWith (<>))
177 $ List.splitAt (round $ n' * listSizeGlobal)
178 $ List.sortOn (viewScore . snd)
179 $ Map.toList ns
180
181 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
182 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
183
184 -------------------------
185 -- Filter 1 With Set NodeId and SpeGen
186 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
187
188
189 -- TO remove (and remove HasNodeError instance)
190 userListId <- defaultList uCid
191 masterListId <- defaultList mCid
192
193
194 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
195 [userListId, masterListId]
196 nt
197 selectedTerms
198
199 let
200 groupedTreeScores_SetNodeId :: Map Text (GroupedTreeScores (Set NodeId))
201 groupedTreeScores_SetNodeId = setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
202
203 -- | Coocurrences computation
204 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
205 let mapCooc = Map.filter (>2)
206 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
207 | (t1, s1) <- mapStemNodeIds
208 , (t2, s2) <- mapStemNodeIds
209 ]
210 where
211 mapStemNodeIds = Map.toList
212 $ Map.map viewScores
213 $ groupedTreeScores_SetNodeId
214 let
215 -- computing scores
216 mapScores f = Map.fromList
217 $ map (\g -> (view scored_terms g, f g))
218 $ normalizeGlobal
219 $ map normalizeLocal
220 $ scored' mapCooc
221
222 let
223 groupedTreeScores_SpeGen :: Map Text (GroupedTreeScores (Scored Text))
224 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity)
225 ( groupedMonoHead
226 <> groupedMultHead
227 )
228
229 let
230 -- sort / partition / split
231 -- filter mono/multi again
232 (monoScored, multScored) = Map.partitionWithKey (\t _v -> size t < 2) groupedTreeScores_SpeGen
233
234 -- filter with max score
235 partitionWithMaxScore = Map.partition (\g -> (view scored_genInc $ view gts'_score g)
236 > (view scored_speExc $ view gts'_score g)
237 )
238
239 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
240 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
241
242 -- splitAt
243 let
244 -- use % of list if to big, or Int if to small
245 listSizeLocal = 1000 :: Double
246 inclSize = 0.4 :: Double
247 exclSize = 1 - inclSize
248
249 splitAt' n' = (both (Map.fromList)) . (List.splitAt (round $ n' * listSizeLocal))
250 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . Map.toList
251
252
253 monoInc_size = splitAt' $ monoSize * inclSize / 2
254 (monoScoredInclHead, monoScoredInclTail) = monoInc_size $ (sortOn scored_genInc) monoScoredIncl
255 (monoScoredExclHead, monoScoredExclTail) = monoInc_size $ (sortOn scored_speExc) monoScoredExcl
256
257 multExc_size = splitAt' $ multSize * exclSize / 2
258 (multScoredInclHead, multScoredInclTail) = multExc_size $ (sortOn scored_genInc) multScoredIncl
259 (multScoredExclHead, multScoredExclTail) = multExc_size $ (sortOn scored_speExc) multScoredExcl
260
261
262 ------------------------------------------------------------
263
264 -- Final Step building the Typed list
265 termListHead = maps <> cands
266 where
267 maps = setListType (Just MapTerm)
268 $ monoScoredInclHead
269 <> monoScoredExclHead
270 <> multScoredInclHead
271 <> multScoredExclHead
272
273 cands = setListType (Just CandidateTerm)
274 $ monoScoredInclTail
275 <> monoScoredExclTail
276 <> multScoredInclTail
277 <> multScoredExclTail
278
279 termListTail = (setListType (Just CandidateTerm)) (groupedMonoTail <> groupedMultTail)
280
281 let result = Map.unionsWith (<>)
282 [ Map.fromList [( nt, toNgramsElement termListHead
283 <> toNgramsElement termListTail
284 <> toNgramsElement stopTerms
285 )]
286 ]
287
288 -- printDebug "result" result
289
290 pure result