2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
14 module Gargantext.Core.Text.List
18 import Control.Lens ((^.))
19 import Data.Maybe (fromMaybe, catMaybes)
20 import Data.Ord (Down(..))
22 import Data.Text (Text)
23 import qualified Data.Char as Char
24 import qualified Data.List as List
25 import qualified Data.Map as Map
26 import qualified Data.Set as Set
27 import qualified Data.Text as Text
29 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
30 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
31 import Gargantext.API.Ngrams.Types (RepoCmdM)
32 import Gargantext.Core.Text (size)
33 import Gargantext.Core.Text.List.Social (flowSocialList, invertForw)
34 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
35 import Gargantext.Core.Text.Group
36 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
37 import Gargantext.Core.Types.Individu (User(..))
38 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
39 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
40 import Gargantext.Database.Prelude (Cmd, CmdM)
41 import Gargantext.Database.Query.Table.Node (defaultList)
42 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
43 import Gargantext.Database.Query.Tree.Error (HasTreeError)
44 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
45 import Gargantext.Prelude
48 -- | TODO improve grouping functions of Authors, Sources, Institutes..
49 buildNgramsLists :: ( RepoCmdM env err m
58 -> m (Map NgramsType [NgramsElement])
59 buildNgramsLists user gp uCid mCid = do
60 ngTerms <- buildNgramsTermsList user gp uCid mCid
61 othersTerms <- mapM (buildNgramsOthersList user uCid identity)
62 [Authors, Sources, Institutes]
63 pure $ Map.unions $ othersTerms <> [ngTerms]
66 buildNgramsOthersList :: (-- RepoCmdM env err m
75 -> Cmd err (Map NgramsType [NgramsElement])
76 buildNgramsOthersList _user uCid groupIt nt = do
77 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
81 all' = List.sortOn (Down . Set.size . snd . snd)
84 (graphTerms, candiTerms) = List.splitAt listSize all'
86 pure $ Map.unionsWith (<>) [ toElements nt MapTerm graphTerms
87 , toElements nt CandidateTerm candiTerms
90 toElements :: Ord k => k -> ListType -> [(Text, b)] -> Map k [NgramsElement]
91 toElements nType lType x =
92 Map.fromList [(nType, [ mkNgramsElement (NgramsTerm t) lType Nothing (mSetFromList [])
98 buildNgramsTermsList :: ( HasNodeError err
107 -> m (Map NgramsType [NgramsElement])
108 buildNgramsTermsList user groupParams uCid mCid = do
110 -- Computing global speGen score
111 allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
113 -- printDebug "head candidates" (List.take 10 $ allTerms)
114 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
116 -- First remove stops terms
117 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst allTerms)
119 printDebug "\n * socialLists * \n" socialLists
123 _socialMap = fromMaybe Set.empty $ Map.lookup MapTerm socialLists
124 _socialCand = fromMaybe Set.empty $ Map.lookup CandidateTerm socialLists
125 socialStop = fromMaybe Set.empty $ Map.lookup StopTerm socialLists
126 -- stopTerms ignored for now (need to be tagged already)
127 -- (stopTerms, candidateTerms) = List.partition ((\t -> Set.member t socialStop) . fst) allTerms
131 -- Grouping the ngrams and keeping the maximum score for label
132 let grouped = groupStems'
133 $ map (\(t,d) -> let stem = ngramsGroup groupParams t
135 , GroupedText Nothing t d Set.empty (size t) stem Set.empty
139 groupedWithList = map (addListType (invertForw socialLists)) grouped
140 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
141 (groupedMono, groupedMult) = Map.partition (\gt -> _gt_size gt < 2) candidateTerms
143 -- printDebug "\n * stopTerms * \n" stopTerms
144 -- printDebug "groupedMult" groupedMult
145 -- splitting monterms and multiterms to take proportional candidates
147 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
148 monoSize = 0.4 :: Double
149 multSize = 1 - monoSize
151 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
153 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
154 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
156 printDebug "groupedMonoHead" (List.length groupedMonoHead)
157 printDebug "groupedMonoTail" (List.length groupedMonoHead)
158 printDebug "groupedMultHead" (List.length groupedMultHead)
159 printDebug "groupedMultTail" (List.length groupedMultTail)
162 -- Get Local Scores now for selected grouped ngrams
163 selectedTerms = Set.toList $ List.foldl'
164 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
169 (groupedMonoHead <> groupedMultHead)
171 -- TO remove (and remove HasNodeError instance)
172 userListId <- defaultList uCid
173 masterListId <- defaultList mCid
175 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
177 mapGroups = Map.fromList
178 $ map (\g -> (_gt_stem g, g))
179 $ groupedMonoHead <> groupedMultHead
181 -- grouping with Set NodeId
182 contextsAdded = foldl' (\mapGroups' k -> let k' = ngramsGroup groupParams k
183 in case Map.lookup k' mapGroups' of
184 Nothing -> mapGroups'
185 Just g -> case Map.lookup k mapTextDocIds of
186 Nothing -> mapGroups'
187 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
190 $ Map.keys mapTextDocIds
192 -- compute cooccurrences
193 mapCooc = Map.filter (>2) $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
194 | (t1, s1) <- mapStemNodeIds
195 , (t2, s2) <- mapStemNodeIds
198 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
199 -- printDebug "mapCooc" mapCooc
203 mapScores f = Map.fromList
204 $ map (\(Scored t g s') -> (t, f (g,s')))
209 groupsWithScores = catMaybes
211 -> case Map.lookup stem mapScores' of
213 Just s' -> Just $ g { _gt_score = s'}
214 ) $ Map.toList contextsAdded
216 mapScores' = mapScores identity
217 -- adapt2 TOCHECK with DC
218 -- printDebug "groupsWithScores" groupsWithScores
220 -- sort / partition / split
221 -- filter mono/multi again
222 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
223 -- filter with max score
224 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
226 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
227 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
231 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
232 inclSize = 0.4 :: Double
233 exclSize = 1 - inclSize
234 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
236 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
237 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
239 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
240 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
243 -- Final Step building the Typed list
244 -- (map (toGargList $ Just StopTerm) stopTerms) -- Removing stops (needs social score)
246 (map (\g -> g { _gt_listType = Just MapTerm} ) ( monoScoredInclHead
247 <> monoScoredExclHead
248 <> multScoredInclHead
249 <> multScoredExclHead
252 <> (map (\g -> g { _gt_listType = Just CandidateTerm }) ( monoScoredInclTail
253 <> monoScoredExclTail
254 <> multScoredInclTail
255 <> multScoredExclTail
259 termListTail = map (\g -> g { _gt_listType = Just CandidateTerm }) ( groupedMonoTail <> groupedMultTail)
261 -- printDebug "monoScoredInclHead" monoScoredInclHead
262 -- printDebug "monoScoredExclHead" monoScoredExclTail
264 printDebug "multScoredInclHead" multScoredInclHead
265 printDebug "multScoredExclTail" multScoredExclTail
267 let result = Map.unionsWith (<>)
269 NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
270 <> (List.concat $ map toNgramsElement $ termListTail)
271 <> (List.concat $ map toNgramsElement $ stopTerms)
274 -- printDebug "\n result \n" r
277 groupStems :: [(Stem, GroupedText Double)] -> [GroupedText Double]
278 groupStems = Map.elems . groupStems'
280 groupStems' :: [(Stem, GroupedText Double)] -> Map Stem (GroupedText Double)
281 groupStems' = Map.fromListWith grouping
283 grouping (GroupedText lt1 label1 score1 group1 s1 stem1 nodes1)
284 (GroupedText lt2 label2 score2 group2 s2 stem2 nodes2)
285 | score1 >= score2 = GroupedText lt label1 score1 (Set.insert label2 gr) s1 stem1 nodes
286 | otherwise = GroupedText lt label2 score2 (Set.insert label1 gr) s2 stem2 nodes
289 gr = Set.union group1 group2
290 nodes = Set.union nodes1 nodes2
295 toNgramsElement :: GroupedText a -> [NgramsElement]
296 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
297 [parentElem] <> childrenElems
300 children = Set.toList setNgrams
301 parentElem = mkNgramsElement (NgramsTerm parent)
302 (fromMaybe CandidateTerm listType)
304 (mSetFromList (NgramsTerm <$> children))
305 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
306 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
308 ) (NgramsTerm <$> children)
311 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
312 toGargList l n = (l,n)
315 isStopTerm :: StopSize -> Text -> Bool
316 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
318 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
320 ------------------------------------------------------------------------------