2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
14 module Gargantext.Core.Text.List
18 import Control.Lens (makeLenses)
19 import Data.Maybe (fromMaybe, catMaybes)
20 import Data.Ord (Down(..))
23 import Data.Text (Text)
24 import qualified Data.Char as Char
25 import qualified Data.List as List
26 import qualified Data.Map as Map
27 import qualified Data.Set as Set
28 import qualified Data.Text as Text
30 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
31 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
32 import Gargantext.Core (Lang(..))
33 import Gargantext.Core.Text (size)
34 import Gargantext.Core.Text.List.Learn (Model(..))
35 import Gargantext.Core.Text.List.Social (flowSocialList)
36 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
37 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
38 import Gargantext.Core.Types.Individu (User(..))
39 import Gargantext.Database.Action.Metrics.NgramsByNode (ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
40 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
41 import Gargantext.API.Ngrams.Types (RepoCmdM)
42 import Gargantext.Database.Admin.Types.Node (NodeId)
43 import Gargantext.Database.Prelude (Cmd, CmdM)
44 import Gargantext.Database.Query.Table.Node (defaultList)
45 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
46 import Gargantext.Database.Query.Tree.Error (HasTreeError)
47 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
48 import Gargantext.Prelude
51 data NgramsListBuilder = BuilderStepO { stemSize :: !Int
55 | BuilderStep1 { withModel :: !Model }
56 | BuilderStepN { withModel :: !Model }
57 | Tficf { nlb_lang :: !Lang
60 , nlb_stopSize :: !StopSize
61 , nlb_userCorpusId :: !UserCorpusId
62 , nlb_masterCorpusId :: !MasterCorpusId
66 data StopSize = StopSize {unStopSize :: !Int}
68 -- | TODO improve grouping functions of Authors, Sources, Institutes..
69 buildNgramsLists :: ( RepoCmdM env err m
81 -> m (Map NgramsType [NgramsElement])
82 buildNgramsLists user l n m s uCid mCid = do
83 ngTerms <- buildNgramsTermsList user l n m s uCid mCid
84 othersTerms <- mapM (buildNgramsOthersList user uCid identity)
85 [Authors, Sources, Institutes]
86 pure $ Map.unions $ othersTerms <> [ngTerms]
89 buildNgramsOthersList :: (-- RepoCmdM env err m
98 -> Cmd err (Map NgramsType [NgramsElement])
99 buildNgramsOthersList _user uCid groupIt nt = do
100 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
104 all' = List.sortOn (Down . Set.size . snd . snd)
107 (graphTerms, candiTerms) = List.splitAt listSize all'
109 pure $ Map.unionsWith (<>) [ toElements nt MapTerm graphTerms
110 , toElements nt CandidateTerm candiTerms
113 toElements :: Ord k => k -> ListType -> [(Text, b)] -> Map k [NgramsElement]
114 toElements nType lType x =
115 Map.fromList [(nType, [ mkNgramsElement (NgramsTerm t) lType Nothing (mSetFromList [])
121 buildNgramsTermsList :: ( HasNodeError err
133 -> m (Map NgramsType [NgramsElement])
134 buildNgramsTermsList user l n m _s uCid mCid = do
136 -- Computing global speGen score
137 allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
139 -- printDebug "head candidates" (List.take 10 $ allTerms)
140 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
142 -- First remove stops terms
143 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst allTerms)
145 printDebug "\n * socialLists * \n" socialLists
148 _socialMap = fromMaybe Set.empty $ Map.lookup MapTerm socialLists
149 _socialCand = fromMaybe Set.empty $ Map.lookup CandidateTerm socialLists
150 socialStop = fromMaybe Set.empty $ Map.lookup StopTerm socialLists
151 -- stopTerms ignored for now (need to be tagged already)
152 (stopTerms, candidateTerms) = List.partition ((\t -> Set.member t socialStop) . fst) allTerms
154 printDebug "stopTerms" stopTerms
156 -- Grouping the ngrams and keeping the maximum score for label
157 let grouped = groupStems'
158 $ map (\(t,d) -> let stem = ngramsGroup l n m t
160 , GroupedText Nothing t d Set.empty (size t) stem Set.empty
164 (groupedMono, groupedMult) = Map.partition (\gt -> _gt_size gt < 2) grouped
166 -- printDebug "groupedMult" groupedMult
167 -- splitting monterms and multiterms to take proportional candidates
169 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
170 monoSize = 0.4 :: Double
171 multSize = 1 - monoSize
173 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
175 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
176 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
178 printDebug "groupedMonoHead" (List.length groupedMonoHead)
179 printDebug "groupedMonoTail" (List.length groupedMonoHead)
180 printDebug "groupedMultHead" (List.length groupedMultHead)
181 printDebug "groupedMultTail" (List.length groupedMultTail)
184 -- Get Local Scores now for selected grouped ngrams
185 selectedTerms = Set.toList $ List.foldl'
186 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
191 (groupedMonoHead <> groupedMultHead)
193 -- TO remove (and remove HasNodeError instance)
194 userListId <- defaultList uCid
195 masterListId <- defaultList mCid
197 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
199 mapGroups = Map.fromList
200 $ map (\g -> (_gt_stem g, g))
201 $ groupedMonoHead <> groupedMultHead
203 -- grouping with Set NodeId
204 contextsAdded = foldl' (\mapGroups' k -> let k' = ngramsGroup l n m k
205 in case Map.lookup k' mapGroups' of
206 Nothing -> mapGroups'
207 Just g -> case Map.lookup k mapTextDocIds of
208 Nothing -> mapGroups'
209 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
212 $ Map.keys mapTextDocIds
214 -- compute cooccurrences
215 mapCooc = Map.filter (>2) $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
216 | (t1, s1) <- mapStemNodeIds
217 , (t2, s2) <- mapStemNodeIds
220 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
221 -- printDebug "mapCooc" mapCooc
225 mapScores f = Map.fromList
226 $ map (\(Scored t g s') -> (t, f (g,s')))
231 groupsWithScores = catMaybes
233 -> case Map.lookup stem mapScores' of
235 Just s' -> Just $ g { _gt_score = s'}
236 ) $ Map.toList contextsAdded
238 mapScores' = mapScores identity
239 -- adapt2 TOCHECK with DC
240 -- printDebug "groupsWithScores" groupsWithScores
242 -- sort / partition / split
243 -- filter mono/multi again
244 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
245 -- filter with max score
246 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
248 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
249 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
253 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
254 inclSize = 0.4 :: Double
255 exclSize = 1 - inclSize
256 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
258 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
259 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
261 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
262 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
265 -- Final Step building the Typed list
266 -- (map (toGargList $ Just StopTerm) stopTerms) -- Removing stops (needs social score)
268 (map (\g -> g { _gt_listType = Just MapTerm} ) ( monoScoredInclHead
269 <> monoScoredExclHead
270 <> multScoredInclHead
271 <> multScoredExclHead
274 <> (map (\g -> g { _gt_listType = Just CandidateTerm }) ( monoScoredInclTail
275 <> monoScoredExclTail
276 <> multScoredInclTail
277 <> multScoredExclTail
281 termListTail = map (\g -> g { _gt_listType = Just CandidateTerm }) ( groupedMonoTail <> groupedMultTail)
283 -- printDebug "monoScoredInclHead" monoScoredInclHead
284 -- printDebug "monoScoredExclHead" monoScoredExclTail
286 printDebug "multScoredInclHead" multScoredInclHead
287 printDebug "multScoredExclTail" multScoredExclTail
289 pure $ Map.unionsWith (<>)
291 NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
292 <> (List.concat $ map toNgramsElement $ termListTail)
294 , toElements NgramsTerms StopTerm stopTerms
297 groupStems :: [(Stem, GroupedText Double)] -> [GroupedText Double]
298 groupStems = Map.elems . groupStems'
300 groupStems' :: [(Stem, GroupedText Double)] -> Map Stem (GroupedText Double)
301 groupStems' = Map.fromListWith grouping
303 grouping (GroupedText lt1 label1 score1 group1 s1 stem1 nodes1)
304 (GroupedText lt2 label2 score2 group2 s2 stem2 nodes2)
305 | score1 >= score2 = GroupedText lt label1 score1 (Set.insert label2 gr) s1 stem1 nodes
306 | otherwise = GroupedText lt label2 score2 (Set.insert label1 gr) s2 stem2 nodes
309 gr = Set.union group1 group2
310 nodes = Set.union nodes1 nodes2
315 toNgramsElement :: GroupedText a -> [NgramsElement]
316 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
317 [parentElem] <> childrenElems
320 children = Set.toList setNgrams
321 parentElem = mkNgramsElement (NgramsTerm parent)
322 (fromMaybe CandidateTerm listType)
324 (mSetFromList (NgramsTerm <$> children))
325 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
326 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
328 ) (NgramsTerm <$> children)
331 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
332 toGargList l n = (l,n)
335 isStopTerm :: StopSize -> Text -> Bool
336 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
338 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
341 ------------------------------------------------------------------------------
342 type Group = Lang -> Int -> Int -> Text -> Text
345 data GroupedText score =
346 GroupedText { _gt_listType :: !(Maybe ListType)
347 , _gt_label :: !Label
348 , _gt_score :: !score
349 , _gt_group :: !(Set Text)
352 , _gt_nodes :: !(Set NodeId)
354 instance Show score => Show (GroupedText score) where
355 show (GroupedText _ l s _ _ _ _) = show l <> ":" <> show s
357 instance (Eq a) => Eq (GroupedText a) where
358 (==) (GroupedText _ _ score1 _ _ _ _)
359 (GroupedText _ _ score2 _ _ _ _) = (==) score1 score2
361 instance (Eq a, Ord a) => Ord (GroupedText a) where
362 compare (GroupedText _ _ score1 _ _ _ _)
363 (GroupedText _ _ score2 _ _ _ _) = compare score1 score2
368 makeLenses 'GroupedText