2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
14 module Gargantext.Core.Text.List
18 import Control.Lens (makeLenses)
19 import Data.Maybe (fromMaybe, catMaybes)
20 import Data.Ord (Down(..))
23 import Data.Text (Text)
24 import qualified Data.Char as Char
25 import qualified Data.List as List
26 import qualified Data.Map as Map
27 import qualified Data.Set as Set
28 import qualified Data.Text as Text
30 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
31 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
32 import Gargantext.Core (Lang(..))
33 import Gargantext.Core.Text (size)
34 import Gargantext.Core.Text.List.Learn (Model(..))
35 import Gargantext.Core.Text.List.Social (flowSocialList)
36 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
37 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
38 import Gargantext.Core.Types.Individu (User(..))
39 import Gargantext.Database.Action.Metrics.NgramsByNode (ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
40 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
41 import Gargantext.API.Ngrams.Types (RepoCmdM)
42 import Gargantext.Database.Admin.Types.Node (NodeId)
43 import Gargantext.Database.Prelude (Cmd, CmdM)
44 import Gargantext.Database.Query.Table.Node (defaultList)
45 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
46 import Gargantext.Database.Query.Tree.Error (HasTreeError)
47 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
48 import Gargantext.Prelude
51 data NgramsListBuilder = BuilderStepO { stemSize :: !Int
55 | BuilderStep1 { withModel :: !Model }
56 | BuilderStepN { withModel :: !Model }
57 | Tficf { nlb_lang :: !Lang
60 , nlb_stopSize :: !StopSize
61 , nlb_userCorpusId :: !UserCorpusId
62 , nlb_masterCorpusId :: !MasterCorpusId
66 data StopSize = StopSize {unStopSize :: !Int}
68 -- | TODO improve grouping functions of Authors, Sources, Institutes..
69 buildNgramsLists :: ( RepoCmdM env err m
81 -> m (Map NgramsType [NgramsElement])
82 buildNgramsLists user l n m s uCid mCid = do
83 ngTerms <- buildNgramsTermsList user l n m s uCid mCid
84 othersTerms <- mapM (buildNgramsOthersList user uCid identity)
85 [Authors, Sources, Institutes]
86 pure $ Map.unions $ othersTerms <> [ngTerms]
89 buildNgramsOthersList :: (-- RepoCmdM env err m
98 -> Cmd err (Map NgramsType [NgramsElement])
99 buildNgramsOthersList user uCid groupIt nt = do
100 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
105 $ List.sortOn (Set.size . snd . snd)
108 graphTerms = List.take listSize all'
109 candiTerms = List.drop listSize all'
111 pure $ Map.unionsWith (<>) [ toElements MapTerm graphTerms
112 , toElements CandidateTerm candiTerms
116 Map.fromList [(nt, [ mkNgramsElement (NgramsTerm t) nType Nothing (mSetFromList [])
122 buildNgramsTermsList :: ( HasNodeError err
134 -> m (Map NgramsType [NgramsElement])
135 buildNgramsTermsList user l n m s uCid mCid = do
137 -- Computing global speGen score
138 allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
140 -- printDebug "head candidates" (List.take 10 $ allTerms)
141 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
143 -- First remove stops terms
144 mapSocialList <- flowSocialList user NgramsTerms (Set.fromList $ map fst allTerms)
147 -- stopTerms ignored for now (need to be tagged already)
148 (_stopTerms, candidateTerms) = List.partition ((isStopTerm s) . fst) allTerms
150 -- Grouping the ngrams and keeping the maximum score for label
151 let grouped = groupStems'
152 $ map (\(t,d) -> let stem = ngramsGroup l n m t
154 , GroupedText Nothing t d Set.empty (size t) stem Set.empty
158 (groupedMono, groupedMult) = Map.partition (\gt -> _gt_size gt < 2) grouped
160 -- printDebug "groupedMult" groupedMult
161 -- splitting monterms and multiterms to take proportional candidates
163 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
164 monoSize = 0.4 :: Double
165 multSize = 1 - monoSize
167 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
169 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
170 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
172 printDebug "groupedMonoHead" (List.length groupedMonoHead)
173 printDebug "groupedMonoTail" (List.length groupedMonoHead)
174 printDebug "groupedMultHead" (List.length groupedMultHead)
175 printDebug "groupedMultTail" (List.length groupedMultTail)
178 -- Get Local Scores now for selected grouped ngrams
179 selectedTerms = Set.toList $ List.foldl'
180 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
185 (groupedMonoHead <> groupedMultHead)
187 -- TO remove (and remove HasNodeError instance)
188 userListId <- defaultList uCid
189 masterListId <- defaultList mCid
191 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
193 mapGroups = Map.fromList
194 $ map (\g -> (_gt_stem g, g))
195 $ groupedMonoHead <> groupedMultHead
197 -- grouping with Set NodeId
198 contextsAdded = foldl' (\mapGroups' k -> let k' = ngramsGroup l n m k
199 in case Map.lookup k' mapGroups' of
200 Nothing -> mapGroups'
201 Just g -> case Map.lookup k mapTextDocIds of
202 Nothing -> mapGroups'
203 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
206 $ Map.keys mapTextDocIds
208 -- compute cooccurrences
209 mapCooc = Map.filter (>2) $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
210 | (t1, s1) <- mapStemNodeIds
211 , (t2, s2) <- mapStemNodeIds
214 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
215 -- printDebug "mapCooc" mapCooc
219 mapScores f = Map.fromList
220 $ map (\(Scored t g s') -> (t, f (g,s')))
225 groupsWithScores = catMaybes
227 -> case Map.lookup stem mapScores' of
229 Just s' -> Just $ g { _gt_score = s'}
230 ) $ Map.toList contextsAdded
232 mapScores' = mapScores identity
233 -- adapt2 TOCHECK with DC
234 -- printDebug "groupsWithScores" groupsWithScores
236 -- sort / partition / split
237 -- filter mono/multi again
238 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
239 -- filter with max score
240 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
242 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
243 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
247 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
248 inclSize = 0.4 :: Double
249 exclSize = 1 - inclSize
250 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
252 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
253 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
255 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
256 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
259 -- Final Step building the Typed list
260 -- (map (toGargList $ Just StopTerm) stopTerms) -- Removing stops (needs social score)
262 (map (\g -> g { _gt_listType = Just MapTerm} ) ( monoScoredInclHead
263 <> monoScoredExclHead
264 <> multScoredInclHead
265 <> multScoredExclHead
268 <> (map (\g -> g { _gt_listType = Just CandidateTerm }) ( monoScoredInclTail
269 <> monoScoredExclTail
270 <> multScoredInclTail
271 <> multScoredExclTail
275 termListTail = map (\g -> g { _gt_listType = Just CandidateTerm }) ( groupedMonoTail <> groupedMultTail)
277 -- printDebug "monoScoredInclHead" monoScoredInclHead
278 -- printDebug "monoScoredExclHead" monoScoredExclTail
280 printDebug "multScoredInclHead" multScoredInclHead
281 printDebug "multScoredExclTail" multScoredExclTail
285 pure $ Map.fromList [(NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
286 <> (List.concat $ map toNgramsElement $ termListTail)
290 groupStems :: [(Stem, GroupedText Double)] -> [GroupedText Double]
291 groupStems = Map.elems . groupStems'
293 groupStems' :: [(Stem, GroupedText Double)] -> Map Stem (GroupedText Double)
294 groupStems' = Map.fromListWith grouping
296 grouping (GroupedText lt1 label1 score1 group1 s1 stem1 nodes1)
297 (GroupedText lt2 label2 score2 group2 s2 stem2 nodes2)
298 | score1 >= score2 = GroupedText lt label1 score1 (Set.insert label2 gr) s1 stem1 nodes
299 | otherwise = GroupedText lt label2 score2 (Set.insert label1 gr) s2 stem2 nodes
302 gr = Set.union group1 group2
303 nodes = Set.union nodes1 nodes2
308 toNgramsElement :: GroupedText a -> [NgramsElement]
309 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
310 [parentElem] <> childrenElems
313 children = Set.toList setNgrams
314 parentElem = mkNgramsElement (NgramsTerm parent)
315 (fromMaybe CandidateTerm listType)
317 (mSetFromList (NgramsTerm <$> children))
318 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
319 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
321 ) (NgramsTerm <$> children)
324 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
325 toGargList l n = (l,n)
328 isStopTerm :: StopSize -> Text -> Bool
329 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
331 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
334 ------------------------------------------------------------------------------
335 type Group = Lang -> Int -> Int -> Text -> Text
338 data GroupedText score =
339 GroupedText { _gt_listType :: !(Maybe ListType)
340 , _gt_label :: !Label
341 , _gt_score :: !score
342 , _gt_group :: !(Set Text)
345 , _gt_nodes :: !(Set NodeId)
347 instance Show score => Show (GroupedText score) where
348 show (GroupedText _ l s _ _ _ _) = show l <> ":" <> show s
350 instance (Eq a) => Eq (GroupedText a) where
351 (==) (GroupedText _ _ score1 _ _ _ _)
352 (GroupedText _ _ score2 _ _ _ _) = (==) score1 score2
354 instance (Eq a, Ord a) => Ord (GroupedText a) where
355 compare (GroupedText _ _ score1 _ _ _ _)
356 (GroupedText _ _ score2 _ _ _ _) = compare score1 score2
361 makeLenses 'GroupedText