2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
14 module Gargantext.Core.Text.List
17 import Control.Lens (makeLenses)
18 import Data.Maybe (fromMaybe, catMaybes)
19 import Data.Ord (Down(..))
22 import Data.Text (Text)
23 import qualified Data.Char as Char
24 import qualified Data.List as List
25 import qualified Data.Map as Map
26 import qualified Data.Set as Set
27 import qualified Data.Text as Text
29 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
30 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
31 import Gargantext.Core (Lang(..))
32 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
33 import Gargantext.Database.Admin.Types.Node (NodeId)
34 import Gargantext.Core.Text.Metrics (scored', Scored(..))
35 import Gargantext.Database.Action.Metrics.NgramsByNode (ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
36 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
37 import Gargantext.Database.Query.Table.Node (defaultList)
38 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
39 import Gargantext.Database.Prelude (Cmd)
40 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
42 import Gargantext.Prelude
43 import Gargantext.Core.Text (size)
44 import Gargantext.Core.Text.List.Learn (Model(..))
45 -- import Gargantext.Core.Text.Metrics (takeScored)
48 data NgramsListBuilder = BuilderStepO { stemSize :: !Int
52 | BuilderStep1 { withModel :: !Model }
53 | BuilderStepN { withModel :: !Model }
54 | Tficf { nlb_lang :: !Lang
57 , nlb_stopSize :: !StopSize
58 , nlb_userCorpusId :: !UserCorpusId
59 , nlb_masterCorpusId :: !MasterCorpusId
63 data StopSize = StopSize {unStopSize :: !Int}
65 -- | TODO improve grouping functions of Authors, Sources, Institutes..
66 buildNgramsLists :: HasNodeError err
73 -> Cmd err (Map NgramsType [NgramsElement])
74 buildNgramsLists l n m s uCid mCid = do
75 ngTerms <- buildNgramsTermsList l n m s uCid mCid
76 othersTerms <- mapM (buildNgramsOthersList uCid identity)
77 [Authors, Sources, Institutes]
78 pure $ Map.unions $ othersTerms <> [ngTerms]
81 buildNgramsOthersList :: UserCorpusId
84 -> Cmd err (Map NgramsType [NgramsElement])
85 buildNgramsOthersList uCid groupIt nt = do
86 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
91 $ List.sortOn (Set.size . snd . snd)
94 graphTerms = List.take listSize all'
95 candiTerms = List.drop listSize all'
97 pure $ Map.unionsWith (<>) [ toElements MapTerm graphTerms
98 , toElements CandidateTerm candiTerms
102 Map.fromList [(nt, [ mkNgramsElement (NgramsTerm t) nType Nothing (mSetFromList [])
108 buildNgramsTermsList :: HasNodeError err
115 -> Cmd err (Map NgramsType [NgramsElement])
116 buildNgramsTermsList l n m s uCid mCid = do
118 -- Computing global speGen score
119 allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
121 -- printDebug "head candidates" (List.take 10 $ allTerms)
122 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
124 -- First remove stops terms
126 -- stopTerms ignored for now (need to be tagged already)
127 (_stopTerms, candidateTerms) = List.partition ((isStopTerm s) . fst) allTerms
129 -- Grouping the ngrams and keeping the maximum score for label
130 let grouped = groupStems'
131 $ map (\(t,d) -> let stem = ngramsGroup l n m t
133 , GroupedText Nothing t d Set.empty (size t) stem Set.empty
137 (groupedMono, groupedMult) = Map.partition (\gt -> _gt_size gt < 2) grouped
139 -- printDebug "groupedMult" groupedMult
140 -- splitting monterms and multiterms to take proportional candidates
142 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
143 monoSize = 0.4 :: Double
144 multSize = 1 - monoSize
146 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
148 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
149 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
151 printDebug "groupedMonoHead" (List.length groupedMonoHead)
152 printDebug "groupedMonoTail" (List.length groupedMonoHead)
153 printDebug "groupedMultHead" (List.length groupedMultHead)
154 printDebug "groupedMultTail" (List.length groupedMultTail)
157 -- Get Local Scores now for selected grouped ngrams
158 selectedTerms = Set.toList $ List.foldl'
159 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
164 (groupedMonoHead <> groupedMultHead)
166 -- TO remove (and remove HasNodeError instance)
167 userListId <- defaultList uCid
168 masterListId <- defaultList mCid
170 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
172 mapGroups = Map.fromList
173 $ map (\g -> (_gt_stem g, g))
174 $ groupedMonoHead <> groupedMultHead
176 -- grouping with Set NodeId
177 contextsAdded = foldl' (\mapGroups' k -> let k' = ngramsGroup l n m k
178 in case Map.lookup k' mapGroups' of
179 Nothing -> mapGroups'
180 Just g -> case Map.lookup k mapTextDocIds of
181 Nothing -> mapGroups'
182 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
185 $ Map.keys mapTextDocIds
187 -- compute cooccurrences
188 mapCooc = Map.filter (>2) $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
189 | (t1, s1) <- mapStemNodeIds
190 , (t2, s2) <- mapStemNodeIds
193 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
194 -- printDebug "mapCooc" mapCooc
198 mapScores f = Map.fromList $ map (\(Scored t g s') -> (t, f (g,s'))) $ scored' mapCooc
200 groupsWithScores = catMaybes
202 -> case Map.lookup stem mapScores' of
204 Just s' -> Just $ g { _gt_score = s'}
205 ) $ Map.toList contextsAdded
207 mapScores' = mapScores adapt1 -- identity
208 adapt1 (s1,s2) = (log' 5 s1, log' 2 s2)
209 log' n' x = 1 + (if x <= 0 then 0 else log $ (10^(n'::Int)) * x)
210 -- adapt2 TOCHECK with DC
211 -- printDebug "groupsWithScores" groupsWithScores
213 -- sort / partition / split
214 -- filter mono/multi again
215 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
216 -- filter with max score
217 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
219 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
220 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
224 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
225 inclSize = 0.4 :: Double
226 exclSize = 1 - inclSize
227 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
229 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
230 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
232 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
233 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
236 -- Final Step building the Typed list
237 -- (map (toGargList $ Just StopTerm) stopTerms) -- Removing stops (needs social score)
239 (map (\g -> g { _gt_listType = Just MapTerm} ) ( monoScoredInclHead
240 <> monoScoredExclHead
241 <> multScoredInclHead
242 <> multScoredExclHead
245 <> (map (\g -> g { _gt_listType = Just CandidateTerm }) ( monoScoredInclTail
246 <> monoScoredExclTail
247 <> multScoredInclTail
248 <> multScoredExclTail
252 termListTail = map (\g -> g { _gt_listType = Just CandidateTerm }) ( groupedMonoTail <> groupedMultTail)
254 -- printDebug "monoScoredInclHead" monoScoredInclHead
255 -- printDebug "monoScoredExclHead" monoScoredExclTail
257 printDebug "multScoredInclHead" multScoredInclHead
258 printDebug "multScoredExclTail" multScoredExclTail
262 pure $ Map.fromList [(NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
263 <> (List.concat $ map toNgramsElement $ termListTail)
267 groupStems :: [(Stem, GroupedText Double)] -> [GroupedText Double]
268 groupStems = Map.elems . groupStems'
270 groupStems' :: [(Stem, GroupedText Double)] -> Map Stem (GroupedText Double)
271 groupStems' = Map.fromListWith grouping
273 grouping (GroupedText lt1 label1 score1 group1 s1 stem1 nodes1)
274 (GroupedText lt2 label2 score2 group2 s2 stem2 nodes2)
275 | score1 >= score2 = GroupedText lt label1 score1 (Set.insert label2 gr) s1 stem1 nodes
276 | otherwise = GroupedText lt label2 score2 (Set.insert label1 gr) s2 stem2 nodes
279 gr = Set.union group1 group2
280 nodes = Set.union nodes1 nodes2
285 toNgramsElement :: GroupedText a -> [NgramsElement]
286 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
287 [parentElem] <> childrenElems
290 children = Set.toList setNgrams
291 parentElem = mkNgramsElement (NgramsTerm parent)
292 (fromMaybe CandidateTerm listType)
294 (mSetFromList (NgramsTerm <$> children))
295 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
296 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
298 ) (NgramsTerm <$> children)
301 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
302 toGargList l n = (l,n)
305 isStopTerm :: StopSize -> Text -> Bool
306 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
308 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
311 ------------------------------------------------------------------------------
312 type Group = Lang -> Int -> Int -> Text -> Text
315 data GroupedText score =
316 GroupedText { _gt_listType :: !(Maybe ListType)
317 , _gt_label :: !Label
318 , _gt_score :: !score
319 , _gt_group :: !(Set Text)
322 , _gt_nodes :: !(Set NodeId)
324 instance Show score => Show (GroupedText score) where
325 show (GroupedText _ l s _ _ _ _) = show l <> ":" <> show s
327 instance (Eq a) => Eq (GroupedText a) where
328 (==) (GroupedText _ _ score1 _ _ _ _)
329 (GroupedText _ _ score2 _ _ _ _) = (==) score1 score2
331 instance (Eq a, Ord a) => Ord (GroupedText a) where
332 compare (GroupedText _ _ score1 _ _ _ _)
333 (GroupedText _ _ score2 _ _ _ _) = compare score1 score2
338 makeLenses 'GroupedText