]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[TextFlow] Score with normalization
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13
14 module Gargantext.Core.Text.List
15 where
16
17 import Control.Lens (makeLenses)
18 import Data.Maybe (fromMaybe, catMaybes)
19 import Data.Ord (Down(..))
20 import Data.Map (Map)
21 import Data.Set (Set)
22 import Data.Text (Text)
23 import qualified Data.Char as Char
24 import qualified Data.List as List
25 import qualified Data.Map as Map
26 import qualified Data.Set as Set
27 import qualified Data.Text as Text
28
29 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
30 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
31 import Gargantext.Core (Lang(..))
32 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
33 import Gargantext.Database.Admin.Types.Node (NodeId)
34 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
35 import Gargantext.Database.Action.Metrics.NgramsByNode (ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
36 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
37 import Gargantext.Database.Query.Table.Node (defaultList)
38 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
39 import Gargantext.Database.Prelude (Cmd)
40 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
41
42 import Gargantext.Prelude
43 import Gargantext.Core.Text (size)
44 import Gargantext.Core.Text.List.Learn (Model(..))
45 -- import Gargantext.Core.Text.Metrics (takeScored)
46
47
48 data NgramsListBuilder = BuilderStepO { stemSize :: !Int
49 , stemX :: !Int
50 , stopSize :: !Int
51 }
52 | BuilderStep1 { withModel :: !Model }
53 | BuilderStepN { withModel :: !Model }
54 | Tficf { nlb_lang :: !Lang
55 , nlb_group1 :: !Int
56 , nlb_group2 :: !Int
57 , nlb_stopSize :: !StopSize
58 , nlb_userCorpusId :: !UserCorpusId
59 , nlb_masterCorpusId :: !MasterCorpusId
60 }
61
62
63 data StopSize = StopSize {unStopSize :: !Int}
64
65 -- | TODO improve grouping functions of Authors, Sources, Institutes..
66 buildNgramsLists :: HasNodeError err
67 => Lang
68 -> Int
69 -> Int
70 -> StopSize
71 -> UserCorpusId
72 -> MasterCorpusId
73 -> Cmd err (Map NgramsType [NgramsElement])
74 buildNgramsLists l n m s uCid mCid = do
75 ngTerms <- buildNgramsTermsList l n m s uCid mCid
76 othersTerms <- mapM (buildNgramsOthersList uCid identity)
77 [Authors, Sources, Institutes]
78 pure $ Map.unions $ othersTerms <> [ngTerms]
79
80
81 buildNgramsOthersList :: UserCorpusId
82 -> (Text -> Text)
83 -> NgramsType
84 -> Cmd err (Map NgramsType [NgramsElement])
85 buildNgramsOthersList uCid groupIt nt = do
86 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
87
88 let
89 listSize = 9
90 all' = List.reverse
91 $ List.sortOn (Set.size . snd . snd)
92 $ Map.toList ngs
93
94 graphTerms = List.take listSize all'
95 candiTerms = List.drop listSize all'
96
97 pure $ Map.unionsWith (<>) [ toElements MapTerm graphTerms
98 , toElements CandidateTerm candiTerms
99 ]
100 where
101 toElements nType x =
102 Map.fromList [(nt, [ mkNgramsElement (NgramsTerm t) nType Nothing (mSetFromList [])
103 | (t, _ns) <- x
104 ]
105 )]
106
107 -- TODO use ListIds
108 buildNgramsTermsList :: HasNodeError err
109 => Lang
110 -> Int
111 -> Int
112 -> StopSize
113 -> UserCorpusId
114 -> MasterCorpusId
115 -> Cmd err (Map NgramsType [NgramsElement])
116 buildNgramsTermsList l n m s uCid mCid = do
117
118 -- Computing global speGen score
119 allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
120
121 -- printDebug "head candidates" (List.take 10 $ allTerms)
122 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
123
124 -- First remove stops terms
125 let
126 -- stopTerms ignored for now (need to be tagged already)
127 (_stopTerms, candidateTerms) = List.partition ((isStopTerm s) . fst) allTerms
128
129 -- Grouping the ngrams and keeping the maximum score for label
130 let grouped = groupStems'
131 $ map (\(t,d) -> let stem = ngramsGroup l n m t
132 in ( stem
133 , GroupedText Nothing t d Set.empty (size t) stem Set.empty
134 )
135 ) candidateTerms
136
137 (groupedMono, groupedMult) = Map.partition (\gt -> _gt_size gt < 2) grouped
138
139 -- printDebug "groupedMult" groupedMult
140 -- splitting monterms and multiterms to take proportional candidates
141 let
142 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
143 monoSize = 0.4 :: Double
144 multSize = 1 - monoSize
145
146 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
147
148 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
149 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
150
151 printDebug "groupedMonoHead" (List.length groupedMonoHead)
152 printDebug "groupedMonoTail" (List.length groupedMonoHead)
153 printDebug "groupedMultHead" (List.length groupedMultHead)
154 printDebug "groupedMultTail" (List.length groupedMultTail)
155
156 let
157 -- Get Local Scores now for selected grouped ngrams
158 selectedTerms = Set.toList $ List.foldl'
159 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
160 $ Set.union g
161 $ Set.singleton l'
162 )
163 Set.empty
164 (groupedMonoHead <> groupedMultHead)
165
166 -- TO remove (and remove HasNodeError instance)
167 userListId <- defaultList uCid
168 masterListId <- defaultList mCid
169
170 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
171 let
172 mapGroups = Map.fromList
173 $ map (\g -> (_gt_stem g, g))
174 $ groupedMonoHead <> groupedMultHead
175
176 -- grouping with Set NodeId
177 contextsAdded = foldl' (\mapGroups' k -> let k' = ngramsGroup l n m k
178 in case Map.lookup k' mapGroups' of
179 Nothing -> mapGroups'
180 Just g -> case Map.lookup k mapTextDocIds of
181 Nothing -> mapGroups'
182 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
183 )
184 mapGroups
185 $ Map.keys mapTextDocIds
186
187 -- compute cooccurrences
188 mapCooc = Map.filter (>2) $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
189 | (t1, s1) <- mapStemNodeIds
190 , (t2, s2) <- mapStemNodeIds
191 ]
192 where
193 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
194 -- printDebug "mapCooc" mapCooc
195
196 let
197 -- computing scores
198 mapScores f = Map.fromList
199 $ map (\(Scored t g s') -> (t, f (g,s')))
200 $ normalizeGlobal
201 $ map normalizeLocal
202 $ scored' mapCooc
203
204 groupsWithScores = catMaybes
205 $ map (\(stem, g)
206 -> case Map.lookup stem mapScores' of
207 Nothing -> Nothing
208 Just s' -> Just $ g { _gt_score = s'}
209 ) $ Map.toList contextsAdded
210 where
211 mapScores' = mapScores identity
212 -- adapt2 TOCHECK with DC
213 -- printDebug "groupsWithScores" groupsWithScores
214 let
215 -- sort / partition / split
216 -- filter mono/multi again
217 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
218 -- filter with max score
219 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
220
221 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
222 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
223
224 -- splitAt
225 let
226 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
227 inclSize = 0.4 :: Double
228 exclSize = 1 - inclSize
229 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
230
231 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
232 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
233
234 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
235 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
236
237
238 -- Final Step building the Typed list
239 -- (map (toGargList $ Just StopTerm) stopTerms) -- Removing stops (needs social score)
240 termListHead =
241 (map (\g -> g { _gt_listType = Just MapTerm} ) ( monoScoredInclHead
242 <> monoScoredExclHead
243 <> multScoredInclHead
244 <> multScoredExclHead
245 )
246 )
247 <> (map (\g -> g { _gt_listType = Just CandidateTerm }) ( monoScoredInclTail
248 <> monoScoredExclTail
249 <> multScoredInclTail
250 <> multScoredExclTail
251 )
252 )
253
254 termListTail = map (\g -> g { _gt_listType = Just CandidateTerm }) ( groupedMonoTail <> groupedMultTail)
255
256 -- printDebug "monoScoredInclHead" monoScoredInclHead
257 -- printDebug "monoScoredExclHead" monoScoredExclTail
258 --
259 printDebug "multScoredInclHead" multScoredInclHead
260 printDebug "multScoredExclTail" multScoredExclTail
261
262
263
264 pure $ Map.fromList [(NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
265 <> (List.concat $ map toNgramsElement $ termListTail)
266 )
267 ]
268
269 groupStems :: [(Stem, GroupedText Double)] -> [GroupedText Double]
270 groupStems = Map.elems . groupStems'
271
272 groupStems' :: [(Stem, GroupedText Double)] -> Map Stem (GroupedText Double)
273 groupStems' = Map.fromListWith grouping
274 where
275 grouping (GroupedText lt1 label1 score1 group1 s1 stem1 nodes1)
276 (GroupedText lt2 label2 score2 group2 s2 stem2 nodes2)
277 | score1 >= score2 = GroupedText lt label1 score1 (Set.insert label2 gr) s1 stem1 nodes
278 | otherwise = GroupedText lt label2 score2 (Set.insert label1 gr) s2 stem2 nodes
279 where
280 lt = lt1 <> lt2
281 gr = Set.union group1 group2
282 nodes = Set.union nodes1 nodes2
283
284
285
286
287 toNgramsElement :: GroupedText a -> [NgramsElement]
288 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
289 [parentElem] <> childrenElems
290 where
291 parent = label
292 children = Set.toList setNgrams
293 parentElem = mkNgramsElement (NgramsTerm parent)
294 (fromMaybe CandidateTerm listType)
295 Nothing
296 (mSetFromList (NgramsTerm <$> children))
297 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
298 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
299 (mSetFromList [])
300 ) (NgramsTerm <$> children)
301
302
303 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
304 toGargList l n = (l,n)
305
306
307 isStopTerm :: StopSize -> Text -> Bool
308 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
309 where
310 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
311
312
313 ------------------------------------------------------------------------------
314 type Group = Lang -> Int -> Int -> Text -> Text
315 type Stem = Text
316 type Label = Text
317 data GroupedText score =
318 GroupedText { _gt_listType :: !(Maybe ListType)
319 , _gt_label :: !Label
320 , _gt_score :: !score
321 , _gt_group :: !(Set Text)
322 , _gt_size :: !Int
323 , _gt_stem :: !Stem
324 , _gt_nodes :: !(Set NodeId)
325 }
326 instance Show score => Show (GroupedText score) where
327 show (GroupedText _ l s _ _ _ _) = show l <> ":" <> show s
328
329 instance (Eq a) => Eq (GroupedText a) where
330 (==) (GroupedText _ _ score1 _ _ _ _)
331 (GroupedText _ _ score2 _ _ _ _) = (==) score1 score2
332
333 instance (Eq a, Ord a) => Ord (GroupedText a) where
334 compare (GroupedText _ _ score1 _ _ _ _)
335 (GroupedText _ _ score2 _ _ _ _) = compare score1 score2
336
337
338
339 -- Lenses Instances
340 makeLenses 'GroupedText