2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
14 module Gargantext.Core.Text.List
17 import Control.Lens (makeLenses)
18 import Data.Maybe (fromMaybe, catMaybes)
19 import Data.Ord (Down(..))
22 import Data.Text (Text)
23 import qualified Data.Char as Char
24 import qualified Data.List as List
25 import qualified Data.Map as Map
26 import qualified Data.Set as Set
27 import qualified Data.Text as Text
29 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
30 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
31 import Gargantext.Core (Lang(..))
32 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
33 import Gargantext.Database.Admin.Types.Node (NodeId)
34 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
35 import Gargantext.Database.Action.Metrics.NgramsByNode (ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
36 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
37 import Gargantext.Database.Query.Table.Node (defaultList)
38 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
39 import Gargantext.Database.Prelude (Cmd)
40 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
42 import Gargantext.Prelude
43 import Gargantext.Core.Text (size)
44 import Gargantext.Core.Text.List.Learn (Model(..))
45 -- import Gargantext.Core.Text.Metrics (takeScored)
48 data NgramsListBuilder = BuilderStepO { stemSize :: !Int
52 | BuilderStep1 { withModel :: !Model }
53 | BuilderStepN { withModel :: !Model }
54 | Tficf { nlb_lang :: !Lang
57 , nlb_stopSize :: !StopSize
58 , nlb_userCorpusId :: !UserCorpusId
59 , nlb_masterCorpusId :: !MasterCorpusId
63 data StopSize = StopSize {unStopSize :: !Int}
65 -- | TODO improve grouping functions of Authors, Sources, Institutes..
66 buildNgramsLists :: HasNodeError err
73 -> Cmd err (Map NgramsType [NgramsElement])
74 buildNgramsLists l n m s uCid mCid = do
75 ngTerms <- buildNgramsTermsList l n m s uCid mCid
76 othersTerms <- mapM (buildNgramsOthersList uCid identity)
77 [Authors, Sources, Institutes]
78 pure $ Map.unions $ othersTerms <> [ngTerms]
81 buildNgramsOthersList :: UserCorpusId
84 -> Cmd err (Map NgramsType [NgramsElement])
85 buildNgramsOthersList uCid groupIt nt = do
86 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
91 $ List.sortOn (Set.size . snd . snd)
94 graphTerms = List.take listSize all'
95 candiTerms = List.drop listSize all'
97 pure $ Map.unionsWith (<>) [ toElements MapTerm graphTerms
98 , toElements CandidateTerm candiTerms
102 Map.fromList [(nt, [ mkNgramsElement (NgramsTerm t) nType Nothing (mSetFromList [])
108 buildNgramsTermsList :: HasNodeError err
115 -> Cmd err (Map NgramsType [NgramsElement])
116 buildNgramsTermsList l n m s uCid mCid = do
118 -- Computing global speGen score
119 allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
121 -- printDebug "head candidates" (List.take 10 $ allTerms)
122 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
124 -- First remove stops terms
126 -- stopTerms ignored for now (need to be tagged already)
127 (_stopTerms, candidateTerms) = List.partition ((isStopTerm s) . fst) allTerms
129 -- Grouping the ngrams and keeping the maximum score for label
130 let grouped = groupStems'
131 $ map (\(t,d) -> let stem = ngramsGroup l n m t
133 , GroupedText Nothing t d Set.empty (size t) stem Set.empty
137 (groupedMono, groupedMult) = Map.partition (\gt -> _gt_size gt < 2) grouped
139 -- printDebug "groupedMult" groupedMult
140 -- splitting monterms and multiterms to take proportional candidates
142 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
143 monoSize = 0.4 :: Double
144 multSize = 1 - monoSize
146 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
148 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
149 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
151 printDebug "groupedMonoHead" (List.length groupedMonoHead)
152 printDebug "groupedMonoTail" (List.length groupedMonoHead)
153 printDebug "groupedMultHead" (List.length groupedMultHead)
154 printDebug "groupedMultTail" (List.length groupedMultTail)
157 -- Get Local Scores now for selected grouped ngrams
158 selectedTerms = Set.toList $ List.foldl'
159 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
164 (groupedMonoHead <> groupedMultHead)
166 -- TO remove (and remove HasNodeError instance)
167 userListId <- defaultList uCid
168 masterListId <- defaultList mCid
170 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
172 mapGroups = Map.fromList
173 $ map (\g -> (_gt_stem g, g))
174 $ groupedMonoHead <> groupedMultHead
176 -- grouping with Set NodeId
177 contextsAdded = foldl' (\mapGroups' k -> let k' = ngramsGroup l n m k
178 in case Map.lookup k' mapGroups' of
179 Nothing -> mapGroups'
180 Just g -> case Map.lookup k mapTextDocIds of
181 Nothing -> mapGroups'
182 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
185 $ Map.keys mapTextDocIds
187 -- compute cooccurrences
188 mapCooc = Map.filter (>2) $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
189 | (t1, s1) <- mapStemNodeIds
190 , (t2, s2) <- mapStemNodeIds
193 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
194 -- printDebug "mapCooc" mapCooc
198 mapScores f = Map.fromList
199 $ map (\(Scored t g s') -> (t, f (g,s')))
204 groupsWithScores = catMaybes
206 -> case Map.lookup stem mapScores' of
208 Just s' -> Just $ g { _gt_score = s'}
209 ) $ Map.toList contextsAdded
211 mapScores' = mapScores identity
212 -- adapt2 TOCHECK with DC
213 -- printDebug "groupsWithScores" groupsWithScores
215 -- sort / partition / split
216 -- filter mono/multi again
217 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
218 -- filter with max score
219 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
221 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
222 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
226 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
227 inclSize = 0.4 :: Double
228 exclSize = 1 - inclSize
229 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
231 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
232 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
234 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
235 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
238 -- Final Step building the Typed list
239 -- (map (toGargList $ Just StopTerm) stopTerms) -- Removing stops (needs social score)
241 (map (\g -> g { _gt_listType = Just MapTerm} ) ( monoScoredInclHead
242 <> monoScoredExclHead
243 <> multScoredInclHead
244 <> multScoredExclHead
247 <> (map (\g -> g { _gt_listType = Just CandidateTerm }) ( monoScoredInclTail
248 <> monoScoredExclTail
249 <> multScoredInclTail
250 <> multScoredExclTail
254 termListTail = map (\g -> g { _gt_listType = Just CandidateTerm }) ( groupedMonoTail <> groupedMultTail)
256 -- printDebug "monoScoredInclHead" monoScoredInclHead
257 -- printDebug "monoScoredExclHead" monoScoredExclTail
259 printDebug "multScoredInclHead" multScoredInclHead
260 printDebug "multScoredExclTail" multScoredExclTail
264 pure $ Map.fromList [(NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
265 <> (List.concat $ map toNgramsElement $ termListTail)
269 groupStems :: [(Stem, GroupedText Double)] -> [GroupedText Double]
270 groupStems = Map.elems . groupStems'
272 groupStems' :: [(Stem, GroupedText Double)] -> Map Stem (GroupedText Double)
273 groupStems' = Map.fromListWith grouping
275 grouping (GroupedText lt1 label1 score1 group1 s1 stem1 nodes1)
276 (GroupedText lt2 label2 score2 group2 s2 stem2 nodes2)
277 | score1 >= score2 = GroupedText lt label1 score1 (Set.insert label2 gr) s1 stem1 nodes
278 | otherwise = GroupedText lt label2 score2 (Set.insert label1 gr) s2 stem2 nodes
281 gr = Set.union group1 group2
282 nodes = Set.union nodes1 nodes2
287 toNgramsElement :: GroupedText a -> [NgramsElement]
288 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
289 [parentElem] <> childrenElems
292 children = Set.toList setNgrams
293 parentElem = mkNgramsElement (NgramsTerm parent)
294 (fromMaybe CandidateTerm listType)
296 (mSetFromList (NgramsTerm <$> children))
297 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
298 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
300 ) (NgramsTerm <$> children)
303 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
304 toGargList l n = (l,n)
307 isStopTerm :: StopSize -> Text -> Bool
308 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
310 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
313 ------------------------------------------------------------------------------
314 type Group = Lang -> Int -> Int -> Text -> Text
317 data GroupedText score =
318 GroupedText { _gt_listType :: !(Maybe ListType)
319 , _gt_label :: !Label
320 , _gt_score :: !score
321 , _gt_group :: !(Set Text)
324 , _gt_nodes :: !(Set NodeId)
326 instance Show score => Show (GroupedText score) where
327 show (GroupedText _ l s _ _ _ _) = show l <> ":" <> show s
329 instance (Eq a) => Eq (GroupedText a) where
330 (==) (GroupedText _ _ score1 _ _ _ _)
331 (GroupedText _ _ score2 _ _ _ _) = (==) score1 score2
333 instance (Eq a, Ord a) => Ord (GroupedText a) where
334 compare (GroupedText _ _ score1 _ _ _ _)
335 (GroupedText _ _ score2 _ _ _ _) = compare score1 score2
340 makeLenses 'GroupedText