]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[FIX] Monads dependencies, flowSocialList integration to flow (WIP)
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13
14 module Gargantext.Core.Text.List
15 where
16
17
18 import Control.Lens (makeLenses)
19 import Data.Maybe (fromMaybe, catMaybes)
20 import Data.Ord (Down(..))
21 import Data.Map (Map)
22 import Data.Set (Set)
23 import Data.Text (Text)
24 import qualified Data.Char as Char
25 import qualified Data.List as List
26 import qualified Data.Map as Map
27 import qualified Data.Set as Set
28 import qualified Data.Text as Text
29
30 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
31 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
32 import Gargantext.Core (Lang(..))
33 import Gargantext.Core.Text (size)
34 import Gargantext.Core.Text.List.Learn (Model(..))
35 import Gargantext.Core.Text.List.Social (flowSocialList)
36 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
37 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
38 import Gargantext.Core.Types.Individu (User(..))
39 import Gargantext.Database.Action.Metrics.NgramsByNode (ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
40 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
41 import Gargantext.API.Ngrams.Types (RepoCmdM)
42 import Gargantext.Database.Admin.Types.Node (NodeId)
43 import Gargantext.Database.Prelude (Cmd, CmdM)
44 import Gargantext.Database.Query.Table.Node (defaultList)
45 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
46 import Gargantext.Database.Query.Tree.Error (HasTreeError)
47 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
48 import Gargantext.Prelude
49
50
51 data NgramsListBuilder = BuilderStepO { stemSize :: !Int
52 , stemX :: !Int
53 , stopSize :: !Int
54 }
55 | BuilderStep1 { withModel :: !Model }
56 | BuilderStepN { withModel :: !Model }
57 | Tficf { nlb_lang :: !Lang
58 , nlb_group1 :: !Int
59 , nlb_group2 :: !Int
60 , nlb_stopSize :: !StopSize
61 , nlb_userCorpusId :: !UserCorpusId
62 , nlb_masterCorpusId :: !MasterCorpusId
63 }
64
65
66 data StopSize = StopSize {unStopSize :: !Int}
67
68 -- | TODO improve grouping functions of Authors, Sources, Institutes..
69 buildNgramsLists :: ( RepoCmdM env err m
70 , CmdM env err m
71 , HasTreeError err
72 , HasNodeError err
73 )
74 => User
75 -> Lang
76 -> Int
77 -> Int
78 -> StopSize
79 -> UserCorpusId
80 -> MasterCorpusId
81 -> m (Map NgramsType [NgramsElement])
82 buildNgramsLists user l n m s uCid mCid = do
83 ngTerms <- buildNgramsTermsList user l n m s uCid mCid
84 othersTerms <- mapM (buildNgramsOthersList user uCid identity)
85 [Authors, Sources, Institutes]
86 pure $ Map.unions $ othersTerms <> [ngTerms]
87
88
89 buildNgramsOthersList :: (-- RepoCmdM env err m
90 -- , CmdM env err m
91 HasNodeError err
92 -- , HasTreeError err
93 )
94 => User
95 -> UserCorpusId
96 -> (Text -> Text)
97 -> NgramsType
98 -> Cmd err (Map NgramsType [NgramsElement])
99 buildNgramsOthersList user uCid groupIt nt = do
100 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
101
102 let
103 listSize = 9
104 all' = List.reverse
105 $ List.sortOn (Set.size . snd . snd)
106 $ Map.toList ngs
107
108 graphTerms = List.take listSize all'
109 candiTerms = List.drop listSize all'
110
111 pure $ Map.unionsWith (<>) [ toElements MapTerm graphTerms
112 , toElements CandidateTerm candiTerms
113 ]
114 where
115 toElements nType x =
116 Map.fromList [(nt, [ mkNgramsElement (NgramsTerm t) nType Nothing (mSetFromList [])
117 | (t, _ns) <- x
118 ]
119 )]
120
121 -- TODO use ListIds
122 buildNgramsTermsList :: ( HasNodeError err
123 , CmdM env err m
124 , RepoCmdM env err m
125 , HasTreeError err
126 )
127 => User
128 -> Lang
129 -> Int
130 -> Int
131 -> StopSize
132 -> UserCorpusId
133 -> MasterCorpusId
134 -> m (Map NgramsType [NgramsElement])
135 buildNgramsTermsList user l n m s uCid mCid = do
136
137 -- Computing global speGen score
138 allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
139
140 -- printDebug "head candidates" (List.take 10 $ allTerms)
141 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
142
143 -- First remove stops terms
144 mapSocialList <- flowSocialList user NgramsTerms (Set.fromList $ map fst allTerms)
145
146 let
147 -- stopTerms ignored for now (need to be tagged already)
148 (_stopTerms, candidateTerms) = List.partition ((isStopTerm s) . fst) allTerms
149
150 -- Grouping the ngrams and keeping the maximum score for label
151 let grouped = groupStems'
152 $ map (\(t,d) -> let stem = ngramsGroup l n m t
153 in ( stem
154 , GroupedText Nothing t d Set.empty (size t) stem Set.empty
155 )
156 ) candidateTerms
157
158 (groupedMono, groupedMult) = Map.partition (\gt -> _gt_size gt < 2) grouped
159
160 -- printDebug "groupedMult" groupedMult
161 -- splitting monterms and multiterms to take proportional candidates
162 let
163 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
164 monoSize = 0.4 :: Double
165 multSize = 1 - monoSize
166
167 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
168
169 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
170 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
171
172 printDebug "groupedMonoHead" (List.length groupedMonoHead)
173 printDebug "groupedMonoTail" (List.length groupedMonoHead)
174 printDebug "groupedMultHead" (List.length groupedMultHead)
175 printDebug "groupedMultTail" (List.length groupedMultTail)
176
177 let
178 -- Get Local Scores now for selected grouped ngrams
179 selectedTerms = Set.toList $ List.foldl'
180 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
181 $ Set.union g
182 $ Set.singleton l'
183 )
184 Set.empty
185 (groupedMonoHead <> groupedMultHead)
186
187 -- TO remove (and remove HasNodeError instance)
188 userListId <- defaultList uCid
189 masterListId <- defaultList mCid
190
191 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
192 let
193 mapGroups = Map.fromList
194 $ map (\g -> (_gt_stem g, g))
195 $ groupedMonoHead <> groupedMultHead
196
197 -- grouping with Set NodeId
198 contextsAdded = foldl' (\mapGroups' k -> let k' = ngramsGroup l n m k
199 in case Map.lookup k' mapGroups' of
200 Nothing -> mapGroups'
201 Just g -> case Map.lookup k mapTextDocIds of
202 Nothing -> mapGroups'
203 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
204 )
205 mapGroups
206 $ Map.keys mapTextDocIds
207
208 -- compute cooccurrences
209 mapCooc = Map.filter (>2) $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
210 | (t1, s1) <- mapStemNodeIds
211 , (t2, s2) <- mapStemNodeIds
212 ]
213 where
214 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
215 -- printDebug "mapCooc" mapCooc
216
217 let
218 -- computing scores
219 mapScores f = Map.fromList
220 $ map (\(Scored t g s') -> (t, f (g,s')))
221 $ normalizeGlobal
222 $ map normalizeLocal
223 $ scored' mapCooc
224
225 groupsWithScores = catMaybes
226 $ map (\(stem, g)
227 -> case Map.lookup stem mapScores' of
228 Nothing -> Nothing
229 Just s' -> Just $ g { _gt_score = s'}
230 ) $ Map.toList contextsAdded
231 where
232 mapScores' = mapScores identity
233 -- adapt2 TOCHECK with DC
234 -- printDebug "groupsWithScores" groupsWithScores
235 let
236 -- sort / partition / split
237 -- filter mono/multi again
238 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
239 -- filter with max score
240 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
241
242 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
243 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
244
245 -- splitAt
246 let
247 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
248 inclSize = 0.4 :: Double
249 exclSize = 1 - inclSize
250 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
251
252 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
253 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
254
255 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
256 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
257
258
259 -- Final Step building the Typed list
260 -- (map (toGargList $ Just StopTerm) stopTerms) -- Removing stops (needs social score)
261 termListHead =
262 (map (\g -> g { _gt_listType = Just MapTerm} ) ( monoScoredInclHead
263 <> monoScoredExclHead
264 <> multScoredInclHead
265 <> multScoredExclHead
266 )
267 )
268 <> (map (\g -> g { _gt_listType = Just CandidateTerm }) ( monoScoredInclTail
269 <> monoScoredExclTail
270 <> multScoredInclTail
271 <> multScoredExclTail
272 )
273 )
274
275 termListTail = map (\g -> g { _gt_listType = Just CandidateTerm }) ( groupedMonoTail <> groupedMultTail)
276
277 -- printDebug "monoScoredInclHead" monoScoredInclHead
278 -- printDebug "monoScoredExclHead" monoScoredExclTail
279 --
280 printDebug "multScoredInclHead" multScoredInclHead
281 printDebug "multScoredExclTail" multScoredExclTail
282
283
284
285 pure $ Map.fromList [(NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
286 <> (List.concat $ map toNgramsElement $ termListTail)
287 )
288 ]
289
290 groupStems :: [(Stem, GroupedText Double)] -> [GroupedText Double]
291 groupStems = Map.elems . groupStems'
292
293 groupStems' :: [(Stem, GroupedText Double)] -> Map Stem (GroupedText Double)
294 groupStems' = Map.fromListWith grouping
295 where
296 grouping (GroupedText lt1 label1 score1 group1 s1 stem1 nodes1)
297 (GroupedText lt2 label2 score2 group2 s2 stem2 nodes2)
298 | score1 >= score2 = GroupedText lt label1 score1 (Set.insert label2 gr) s1 stem1 nodes
299 | otherwise = GroupedText lt label2 score2 (Set.insert label1 gr) s2 stem2 nodes
300 where
301 lt = lt1 <> lt2
302 gr = Set.union group1 group2
303 nodes = Set.union nodes1 nodes2
304
305
306
307
308 toNgramsElement :: GroupedText a -> [NgramsElement]
309 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
310 [parentElem] <> childrenElems
311 where
312 parent = label
313 children = Set.toList setNgrams
314 parentElem = mkNgramsElement (NgramsTerm parent)
315 (fromMaybe CandidateTerm listType)
316 Nothing
317 (mSetFromList (NgramsTerm <$> children))
318 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
319 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
320 (mSetFromList [])
321 ) (NgramsTerm <$> children)
322
323
324 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
325 toGargList l n = (l,n)
326
327
328 isStopTerm :: StopSize -> Text -> Bool
329 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
330 where
331 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
332
333
334 ------------------------------------------------------------------------------
335 type Group = Lang -> Int -> Int -> Text -> Text
336 type Stem = Text
337 type Label = Text
338 data GroupedText score =
339 GroupedText { _gt_listType :: !(Maybe ListType)
340 , _gt_label :: !Label
341 , _gt_score :: !score
342 , _gt_group :: !(Set Text)
343 , _gt_size :: !Int
344 , _gt_stem :: !Stem
345 , _gt_nodes :: !(Set NodeId)
346 }
347 instance Show score => Show (GroupedText score) where
348 show (GroupedText _ l s _ _ _ _) = show l <> ":" <> show s
349
350 instance (Eq a) => Eq (GroupedText a) where
351 (==) (GroupedText _ _ score1 _ _ _ _)
352 (GroupedText _ _ score2 _ _ _ _) = (==) score1 score2
353
354 instance (Eq a, Ord a) => Ord (GroupedText a) where
355 compare (GroupedText _ _ score1 _ _ _ _)
356 (GroupedText _ _ score2 _ _ _ _) = compare score1 score2
357
358
359
360 -- Lenses Instances
361 makeLenses 'GroupedText