]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[Social List] increments with listIds either Private or Shared, need group filtering...
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13
14 module Gargantext.Core.Text.List
15 where
16
17
18 import Control.Lens (makeLenses)
19 import Data.Maybe (fromMaybe, catMaybes)
20 import Data.Ord (Down(..))
21 import Data.Map (Map)
22 import Data.Set (Set)
23 import Data.Text (Text)
24 import qualified Data.Char as Char
25 import qualified Data.List as List
26 import qualified Data.Map as Map
27 import qualified Data.Set as Set
28 import qualified Data.Text as Text
29
30 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
31 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
32 import Gargantext.Core (Lang(..))
33 import Gargantext.Core.Text (size)
34 import Gargantext.Core.Text.List.Learn (Model(..))
35 import Gargantext.Core.Text.List.Social (flowSocialList)
36 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
37 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
38 import Gargantext.Core.Types.Individu (User(..))
39 import Gargantext.Database.Action.Metrics.NgramsByNode (ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
40 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
41 import Gargantext.API.Ngrams.Types (RepoCmdM)
42 import Gargantext.Database.Admin.Types.Node (NodeId)
43 import Gargantext.Database.Prelude (Cmd, CmdM)
44 import Gargantext.Database.Query.Table.Node (defaultList)
45 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
46 import Gargantext.Database.Query.Tree.Error (HasTreeError)
47 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
48 import Gargantext.Prelude
49
50
51 data NgramsListBuilder = BuilderStepO { stemSize :: !Int
52 , stemX :: !Int
53 , stopSize :: !Int
54 }
55 | BuilderStep1 { withModel :: !Model }
56 | BuilderStepN { withModel :: !Model }
57 | Tficf { nlb_lang :: !Lang
58 , nlb_group1 :: !Int
59 , nlb_group2 :: !Int
60 , nlb_stopSize :: !StopSize
61 , nlb_userCorpusId :: !UserCorpusId
62 , nlb_masterCorpusId :: !MasterCorpusId
63 }
64
65
66 data StopSize = StopSize {unStopSize :: !Int}
67
68 -- | TODO improve grouping functions of Authors, Sources, Institutes..
69 buildNgramsLists :: ( RepoCmdM env err m
70 , CmdM env err m
71 , HasTreeError err
72 , HasNodeError err
73 )
74 => User
75 -> Lang
76 -> Int
77 -> Int
78 -> StopSize
79 -> UserCorpusId
80 -> MasterCorpusId
81 -> m (Map NgramsType [NgramsElement])
82 buildNgramsLists user l n m s uCid mCid = do
83 ngTerms <- buildNgramsTermsList user l n m s uCid mCid
84 othersTerms <- mapM (buildNgramsOthersList user uCid identity)
85 [Authors, Sources, Institutes]
86 pure $ Map.unions $ othersTerms <> [ngTerms]
87
88
89 buildNgramsOthersList :: (-- RepoCmdM env err m
90 -- , CmdM env err m
91 HasNodeError err
92 -- , HasTreeError err
93 )
94 => User
95 -> UserCorpusId
96 -> (Text -> Text)
97 -> NgramsType
98 -> Cmd err (Map NgramsType [NgramsElement])
99 buildNgramsOthersList _user uCid groupIt nt = do
100 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
101
102 let
103 listSize = 9
104 all' = List.sortOn (Down . Set.size . snd . snd)
105 $ Map.toList ngs
106
107 (graphTerms, candiTerms) = List.splitAt listSize all'
108
109 pure $ Map.unionsWith (<>) [ toElements nt MapTerm graphTerms
110 , toElements nt CandidateTerm candiTerms
111 ]
112
113 toElements :: Ord k => k -> ListType -> [(Text, b)] -> Map k [NgramsElement]
114 toElements nType lType x =
115 Map.fromList [(nType, [ mkNgramsElement (NgramsTerm t) lType Nothing (mSetFromList [])
116 | (t, _ns) <- x
117 ]
118 )]
119
120 -- TODO use ListIds
121 buildNgramsTermsList :: ( HasNodeError err
122 , CmdM env err m
123 , RepoCmdM env err m
124 , HasTreeError err
125 )
126 => User
127 -> Lang
128 -> Int
129 -> Int
130 -> StopSize
131 -> UserCorpusId
132 -> MasterCorpusId
133 -> m (Map NgramsType [NgramsElement])
134 buildNgramsTermsList user l n m _s uCid mCid = do
135
136 -- Computing global speGen score
137 allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
138
139 -- printDebug "head candidates" (List.take 10 $ allTerms)
140 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
141
142 -- First remove stops terms
143 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst allTerms)
144
145 printDebug "\n * socialLists * \n" socialLists
146
147 let
148 _socialMap = fromMaybe Set.empty $ Map.lookup MapTerm socialLists
149 _socialCand = fromMaybe Set.empty $ Map.lookup CandidateTerm socialLists
150 socialStop = fromMaybe Set.empty $ Map.lookup StopTerm socialLists
151 -- stopTerms ignored for now (need to be tagged already)
152 (stopTerms, candidateTerms) = List.partition ((\t -> Set.member t socialStop) . fst) allTerms
153
154 printDebug "\n * stopTerms * \n" stopTerms
155
156 -- Grouping the ngrams and keeping the maximum score for label
157 let grouped = groupStems'
158 $ map (\(t,d) -> let stem = ngramsGroup l n m t
159 in ( stem
160 , GroupedText Nothing t d Set.empty (size t) stem Set.empty
161 )
162 ) candidateTerms
163
164 (groupedMono, groupedMult) = Map.partition (\gt -> _gt_size gt < 2) grouped
165
166 -- printDebug "groupedMult" groupedMult
167 -- splitting monterms and multiterms to take proportional candidates
168 let
169 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
170 monoSize = 0.4 :: Double
171 multSize = 1 - monoSize
172
173 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
174
175 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
176 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
177
178 printDebug "groupedMonoHead" (List.length groupedMonoHead)
179 printDebug "groupedMonoTail" (List.length groupedMonoHead)
180 printDebug "groupedMultHead" (List.length groupedMultHead)
181 printDebug "groupedMultTail" (List.length groupedMultTail)
182
183 let
184 -- Get Local Scores now for selected grouped ngrams
185 selectedTerms = Set.toList $ List.foldl'
186 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
187 $ Set.union g
188 $ Set.singleton l'
189 )
190 Set.empty
191 (groupedMonoHead <> groupedMultHead)
192
193 -- TO remove (and remove HasNodeError instance)
194 userListId <- defaultList uCid
195 masterListId <- defaultList mCid
196
197 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
198 let
199 mapGroups = Map.fromList
200 $ map (\g -> (_gt_stem g, g))
201 $ groupedMonoHead <> groupedMultHead
202
203 -- grouping with Set NodeId
204 contextsAdded = foldl' (\mapGroups' k -> let k' = ngramsGroup l n m k
205 in case Map.lookup k' mapGroups' of
206 Nothing -> mapGroups'
207 Just g -> case Map.lookup k mapTextDocIds of
208 Nothing -> mapGroups'
209 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
210 )
211 mapGroups
212 $ Map.keys mapTextDocIds
213
214 -- compute cooccurrences
215 mapCooc = Map.filter (>2) $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
216 | (t1, s1) <- mapStemNodeIds
217 , (t2, s2) <- mapStemNodeIds
218 ]
219 where
220 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
221 -- printDebug "mapCooc" mapCooc
222
223 let
224 -- computing scores
225 mapScores f = Map.fromList
226 $ map (\(Scored t g s') -> (t, f (g,s')))
227 $ normalizeGlobal
228 $ map normalizeLocal
229 $ scored' mapCooc
230
231 groupsWithScores = catMaybes
232 $ map (\(stem, g)
233 -> case Map.lookup stem mapScores' of
234 Nothing -> Nothing
235 Just s' -> Just $ g { _gt_score = s'}
236 ) $ Map.toList contextsAdded
237 where
238 mapScores' = mapScores identity
239 -- adapt2 TOCHECK with DC
240 -- printDebug "groupsWithScores" groupsWithScores
241 let
242 -- sort / partition / split
243 -- filter mono/multi again
244 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
245 -- filter with max score
246 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
247
248 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
249 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
250
251 -- splitAt
252 let
253 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
254 inclSize = 0.4 :: Double
255 exclSize = 1 - inclSize
256 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
257
258 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
259 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
260
261 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
262 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
263
264
265 -- Final Step building the Typed list
266 -- (map (toGargList $ Just StopTerm) stopTerms) -- Removing stops (needs social score)
267 termListHead =
268 (map (\g -> g { _gt_listType = Just MapTerm} ) ( monoScoredInclHead
269 <> monoScoredExclHead
270 <> multScoredInclHead
271 <> multScoredExclHead
272 )
273 )
274 <> (map (\g -> g { _gt_listType = Just CandidateTerm }) ( monoScoredInclTail
275 <> monoScoredExclTail
276 <> multScoredInclTail
277 <> multScoredExclTail
278 )
279 )
280
281 termListTail = map (\g -> g { _gt_listType = Just CandidateTerm }) ( groupedMonoTail <> groupedMultTail)
282
283 -- printDebug "monoScoredInclHead" monoScoredInclHead
284 -- printDebug "monoScoredExclHead" monoScoredExclTail
285 --
286 printDebug "multScoredInclHead" multScoredInclHead
287 printDebug "multScoredExclTail" multScoredExclTail
288
289 let result = Map.unionsWith (<>)
290 [ Map.fromList [(
291 NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
292 <> (List.concat $ map toNgramsElement $ termListTail)
293 )]
294 , toElements NgramsTerms StopTerm stopTerms
295 ]
296 -- printDebug "\n result \n" r
297 pure result
298
299 groupStems :: [(Stem, GroupedText Double)] -> [GroupedText Double]
300 groupStems = Map.elems . groupStems'
301
302 groupStems' :: [(Stem, GroupedText Double)] -> Map Stem (GroupedText Double)
303 groupStems' = Map.fromListWith grouping
304 where
305 grouping (GroupedText lt1 label1 score1 group1 s1 stem1 nodes1)
306 (GroupedText lt2 label2 score2 group2 s2 stem2 nodes2)
307 | score1 >= score2 = GroupedText lt label1 score1 (Set.insert label2 gr) s1 stem1 nodes
308 | otherwise = GroupedText lt label2 score2 (Set.insert label1 gr) s2 stem2 nodes
309 where
310 lt = lt1 <> lt2
311 gr = Set.union group1 group2
312 nodes = Set.union nodes1 nodes2
313
314
315
316
317 toNgramsElement :: GroupedText a -> [NgramsElement]
318 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
319 [parentElem] <> childrenElems
320 where
321 parent = label
322 children = Set.toList setNgrams
323 parentElem = mkNgramsElement (NgramsTerm parent)
324 (fromMaybe CandidateTerm listType)
325 Nothing
326 (mSetFromList (NgramsTerm <$> children))
327 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
328 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
329 (mSetFromList [])
330 ) (NgramsTerm <$> children)
331
332
333 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
334 toGargList l n = (l,n)
335
336
337 isStopTerm :: StopSize -> Text -> Bool
338 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
339 where
340 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
341
342
343 ------------------------------------------------------------------------------
344 type Group = Lang -> Int -> Int -> Text -> Text
345 type Stem = Text
346 type Label = Text
347 data GroupedText score =
348 GroupedText { _gt_listType :: !(Maybe ListType)
349 , _gt_label :: !Label
350 , _gt_score :: !score
351 , _gt_group :: !(Set Text)
352 , _gt_size :: !Int
353 , _gt_stem :: !Stem
354 , _gt_nodes :: !(Set NodeId)
355 }
356 instance Show score => Show (GroupedText score) where
357 show (GroupedText _ l s _ _ _ _) = show l <> ":" <> show s
358
359 instance (Eq a) => Eq (GroupedText a) where
360 (==) (GroupedText _ _ score1 _ _ _ _)
361 (GroupedText _ _ score2 _ _ _ _) = (==) score1 score2
362
363 instance (Eq a, Ord a) => Ord (GroupedText a) where
364 compare (GroupedText _ _ score1 _ _ _ _)
365 (GroupedText _ _ score2 _ _ _ _) = compare score1 score2
366
367
368
369 -- Lenses Instances
370 makeLenses 'GroupedText