]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[TextFlow] MapList building, score needs normalization
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13
14 module Gargantext.Core.Text.List
15 where
16
17 import Control.Lens (makeLenses)
18 import Data.Maybe (fromMaybe, catMaybes)
19 import Data.Ord (Down(..))
20 import Data.Map (Map)
21 import Data.Set (Set)
22 import Data.Text (Text)
23 import qualified Data.Char as Char
24 import qualified Data.List as List
25 import qualified Data.Map as Map
26 import qualified Data.Set as Set
27 import qualified Data.Text as Text
28
29 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
30 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
31 import Gargantext.Core (Lang(..))
32 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
33 import Gargantext.Database.Admin.Types.Node (NodeId)
34 import Gargantext.Core.Text.Metrics (scored', Scored(..))
35 import Gargantext.Database.Action.Metrics.NgramsByNode (ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
36 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
37 import Gargantext.Database.Query.Table.Node (defaultList)
38 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
39 import Gargantext.Database.Prelude (Cmd)
40 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
41
42 import Gargantext.Prelude
43 import Gargantext.Core.Text (size)
44 import Gargantext.Core.Text.List.Learn (Model(..))
45 -- import Gargantext.Core.Text.Metrics (takeScored)
46
47
48 data NgramsListBuilder = BuilderStepO { stemSize :: !Int
49 , stemX :: !Int
50 , stopSize :: !Int
51 }
52 | BuilderStep1 { withModel :: !Model }
53 | BuilderStepN { withModel :: !Model }
54 | Tficf { nlb_lang :: !Lang
55 , nlb_group1 :: !Int
56 , nlb_group2 :: !Int
57 , nlb_stopSize :: !StopSize
58 , nlb_userCorpusId :: !UserCorpusId
59 , nlb_masterCorpusId :: !MasterCorpusId
60 }
61
62
63 data StopSize = StopSize {unStopSize :: !Int}
64
65 -- | TODO improve grouping functions of Authors, Sources, Institutes..
66 buildNgramsLists :: HasNodeError err
67 => Lang
68 -> Int
69 -> Int
70 -> StopSize
71 -> UserCorpusId
72 -> MasterCorpusId
73 -> Cmd err (Map NgramsType [NgramsElement])
74 buildNgramsLists l n m s uCid mCid = do
75 ngTerms <- buildNgramsTermsList l n m s uCid mCid
76 othersTerms <- mapM (buildNgramsOthersList uCid identity)
77 [Authors, Sources, Institutes]
78 pure $ Map.unions $ othersTerms <> [ngTerms]
79
80
81 buildNgramsOthersList :: UserCorpusId
82 -> (Text -> Text)
83 -> NgramsType
84 -> Cmd err (Map NgramsType [NgramsElement])
85 buildNgramsOthersList uCid groupIt nt = do
86 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
87
88 let
89 listSize = 9
90 all' = List.reverse
91 $ List.sortOn (Set.size . snd . snd)
92 $ Map.toList ngs
93
94 graphTerms = List.take listSize all'
95 candiTerms = List.drop listSize all'
96
97 pure $ Map.unionsWith (<>) [ toElements MapTerm graphTerms
98 , toElements CandidateTerm candiTerms
99 ]
100 where
101 toElements nType x =
102 Map.fromList [(nt, [ mkNgramsElement (NgramsTerm t) nType Nothing (mSetFromList [])
103 | (t, _ns) <- x
104 ]
105 )]
106
107 -- TODO use ListIds
108 buildNgramsTermsList :: HasNodeError err
109 => Lang
110 -> Int
111 -> Int
112 -> StopSize
113 -> UserCorpusId
114 -> MasterCorpusId
115 -> Cmd err (Map NgramsType [NgramsElement])
116 buildNgramsTermsList l n m s uCid mCid = do
117
118 -- Computing global speGen score
119 allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
120
121 -- printDebug "head candidates" (List.take 10 $ allTerms)
122 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
123
124 -- First remove stops terms
125 let
126 -- stopTerms ignored for now (need to be tagged already)
127 (_stopTerms, candidateTerms) = List.partition ((isStopTerm s) . fst) allTerms
128
129 -- Grouping the ngrams and keeping the maximum score for label
130 let grouped = groupStems'
131 $ map (\(t,d) -> let stem = ngramsGroup l n m t
132 in ( stem
133 , GroupedText Nothing t d Set.empty (size t) stem Set.empty
134 )
135 ) candidateTerms
136
137 (groupedMono, groupedMult) = Map.partition (\gt -> _gt_size gt < 2) grouped
138
139 -- printDebug "groupedMult" groupedMult
140 -- splitting monterms and multiterms to take proportional candidates
141 let
142 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
143 monoSize = 0.4 :: Double
144 multSize = 1 - monoSize
145
146 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
147
148 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
149 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
150
151 printDebug "groupedMonoHead" (List.length groupedMonoHead)
152 printDebug "groupedMonoTail" (List.length groupedMonoHead)
153 printDebug "groupedMultHead" (List.length groupedMultHead)
154 printDebug "groupedMultTail" (List.length groupedMultTail)
155
156 let
157 -- Get Local Scores now for selected grouped ngrams
158 selectedTerms = Set.toList $ List.foldl'
159 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
160 $ Set.union g
161 $ Set.singleton l'
162 )
163 Set.empty
164 (groupedMonoHead <> groupedMultHead)
165
166 -- TO remove (and remove HasNodeError instance)
167 userListId <- defaultList uCid
168 masterListId <- defaultList mCid
169
170 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
171 let
172 mapGroups = Map.fromList
173 $ map (\g -> (_gt_stem g, g))
174 $ groupedMonoHead <> groupedMultHead
175
176 -- grouping with Set NodeId
177 contextsAdded = foldl' (\mapGroups' k -> let k' = ngramsGroup l n m k
178 in case Map.lookup k' mapGroups' of
179 Nothing -> mapGroups'
180 Just g -> case Map.lookup k mapTextDocIds of
181 Nothing -> mapGroups'
182 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
183 )
184 mapGroups
185 $ Map.keys mapTextDocIds
186
187 -- compute cooccurrences
188 mapCooc = Map.filter (>2) $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
189 | (t1, s1) <- mapStemNodeIds
190 , (t2, s2) <- mapStemNodeIds
191 ]
192 where
193 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
194 -- printDebug "mapCooc" mapCooc
195
196 let
197 -- computing scores
198 mapScores f = Map.fromList $ map (\(Scored t g s') -> (t, f (g,s'))) $ scored' mapCooc
199
200 groupsWithScores = catMaybes
201 $ map (\(stem, g)
202 -> case Map.lookup stem mapScores' of
203 Nothing -> Nothing
204 Just s' -> Just $ g { _gt_score = s'}
205 ) $ Map.toList contextsAdded
206 where
207 mapScores' = mapScores adapt1 -- identity
208 adapt1 (s1,s2) = (log' 5 s1, log' 2 s2)
209 log' n' x = 1 + (if x <= 0 then 0 else log $ (10^(n'::Int)) * x)
210 -- adapt2 TOCHECK with DC
211 -- printDebug "groupsWithScores" groupsWithScores
212 let
213 -- sort / partition / split
214 -- filter mono/multi again
215 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
216 -- filter with max score
217 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
218
219 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
220 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
221
222 -- splitAt
223 let
224 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
225 inclSize = 0.4 :: Double
226 exclSize = 1 - inclSize
227 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
228
229 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
230 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
231
232 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
233 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
234
235
236 -- Final Step building the Typed list
237 -- (map (toGargList $ Just StopTerm) stopTerms) -- Removing stops (needs social score)
238 termListHead =
239 (map (\g -> g { _gt_listType = Just MapTerm} ) ( monoScoredInclHead
240 <> monoScoredExclHead
241 <> multScoredInclHead
242 <> multScoredExclHead
243 )
244 )
245 <> (map (\g -> g { _gt_listType = Just CandidateTerm }) ( monoScoredInclTail
246 <> monoScoredExclTail
247 <> multScoredInclTail
248 <> multScoredExclTail
249 )
250 )
251
252 termListTail = map (\g -> g { _gt_listType = Just CandidateTerm }) ( groupedMonoTail <> groupedMultTail)
253
254 -- printDebug "monoScoredInclHead" monoScoredInclHead
255 -- printDebug "monoScoredExclHead" monoScoredExclTail
256 --
257 printDebug "multScoredInclHead" multScoredInclHead
258 printDebug "multScoredExclTail" multScoredExclTail
259
260
261
262 pure $ Map.fromList [(NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
263 <> (List.concat $ map toNgramsElement $ termListTail)
264 )
265 ]
266
267 groupStems :: [(Stem, GroupedText Double)] -> [GroupedText Double]
268 groupStems = Map.elems . groupStems'
269
270 groupStems' :: [(Stem, GroupedText Double)] -> Map Stem (GroupedText Double)
271 groupStems' = Map.fromListWith grouping
272 where
273 grouping (GroupedText lt1 label1 score1 group1 s1 stem1 nodes1)
274 (GroupedText lt2 label2 score2 group2 s2 stem2 nodes2)
275 | score1 >= score2 = GroupedText lt label1 score1 (Set.insert label2 gr) s1 stem1 nodes
276 | otherwise = GroupedText lt label2 score2 (Set.insert label1 gr) s2 stem2 nodes
277 where
278 lt = lt1 <> lt2
279 gr = Set.union group1 group2
280 nodes = Set.union nodes1 nodes2
281
282
283
284
285 toNgramsElement :: GroupedText a -> [NgramsElement]
286 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
287 [parentElem] <> childrenElems
288 where
289 parent = label
290 children = Set.toList setNgrams
291 parentElem = mkNgramsElement (NgramsTerm parent)
292 (fromMaybe CandidateTerm listType)
293 Nothing
294 (mSetFromList (NgramsTerm <$> children))
295 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
296 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
297 (mSetFromList [])
298 ) (NgramsTerm <$> children)
299
300
301 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
302 toGargList l n = (l,n)
303
304
305 isStopTerm :: StopSize -> Text -> Bool
306 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
307 where
308 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
309
310
311 ------------------------------------------------------------------------------
312 type Group = Lang -> Int -> Int -> Text -> Text
313 type Stem = Text
314 type Label = Text
315 data GroupedText score =
316 GroupedText { _gt_listType :: !(Maybe ListType)
317 , _gt_label :: !Label
318 , _gt_score :: !score
319 , _gt_group :: !(Set Text)
320 , _gt_size :: !Int
321 , _gt_stem :: !Stem
322 , _gt_nodes :: !(Set NodeId)
323 }
324 instance Show score => Show (GroupedText score) where
325 show (GroupedText _ l s _ _ _ _) = show l <> ":" <> show s
326
327 instance (Eq a) => Eq (GroupedText a) where
328 (==) (GroupedText _ _ score1 _ _ _ _)
329 (GroupedText _ _ score2 _ _ _ _) = (==) score1 score2
330
331 instance (Eq a, Ord a) => Ord (GroupedText a) where
332 compare (GroupedText _ _ score1 _ _ _ _)
333 (GroupedText _ _ score2 _ _ _ _) = compare score1 score2
334
335
336
337 -- Lenses Instances
338 makeLenses 'GroupedText