2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
14 module Gargantext.Core.Text.List
18 import Control.Lens ((^.), set)
19 import Data.Maybe (fromMaybe, catMaybes)
20 import Data.Ord (Down(..))
22 import Data.Text (Text)
23 import qualified Data.Char as Char
24 import qualified Data.List as List
25 import qualified Data.Map as Map
26 import qualified Data.Set as Set
27 import qualified Data.Text as Text
29 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
30 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
31 import Gargantext.API.Ngrams.Types (RepoCmdM)
32 import Gargantext.Core.Text.List.Social (flowSocialList, invertForw)
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
34 import Gargantext.Core.Text.Group
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Node (defaultList)
41 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
42 import Gargantext.Database.Query.Tree.Error (HasTreeError)
43 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
44 import Gargantext.Prelude
47 -- | TODO improve grouping functions of Authors, Sources, Institutes..
48 buildNgramsLists :: ( RepoCmdM env err m
57 -> m (Map NgramsType [NgramsElement])
58 buildNgramsLists user gp uCid mCid = do
59 ngTerms <- buildNgramsTermsList user uCid mCid gp
60 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
61 [ (Authors , MapListSize 9)
62 , (Sources , MapListSize 9)
63 , (Institutes, MapListSize 9)
66 pure $ Map.unions $ [ngTerms] <> othersTerms
69 data MapListSize = MapListSize Int
71 buildNgramsOthersList ::( HasNodeError err
79 -> (NgramsType, MapListSize)
80 -> m (Map NgramsType [NgramsElement])
81 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
82 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
85 grouped = toGroupedText groupIt (Set.size . snd) fst snd
86 (Map.toList $ Map.mapWithKey (\k (a,b) -> (Set.delete k a, b)) $ ngs)
88 socialLists <- flowSocialList user nt (Set.fromList $ Map.keys ngs)
91 groupedWithList = map (addListType (invertForw socialLists)) grouped
92 (stopTerms, tailTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
93 (mapTerms, tailTerms') = Map.partition (\t -> t ^. gt_listType == Just MapTerm) tailTerms
95 listSize = mapListSize - (List.length mapTerms)
96 (mapTerms', candiTerms) = List.splitAt listSize $ List.sortOn (Down . _gt_score) $ Map.elems tailTerms'
98 pure $ Map.fromList [( nt, (List.concat $ map toNgramsElement stopTerms)
99 <> (List.concat $ map toNgramsElement mapTerms)
100 <> (List.concat $ map toNgramsElement $ map (set gt_listType (Just MapTerm)) mapTerms')
101 <> (List.concat $ map toNgramsElement $ map (set gt_listType (Just CandidateTerm)) candiTerms)
105 buildNgramsTermsList :: ( HasNodeError err
114 -> m (Map NgramsType [NgramsElement])
115 buildNgramsTermsList user uCid mCid groupParams = do
117 -- Computing global speGen score
118 allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
120 -- printDebug "head candidates" (List.take 10 $ allTerms)
121 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
123 -- First remove stops terms
124 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst allTerms)
125 -- printDebug "\n * socialLists * \n" socialLists
127 printDebug "\n * socialLists * \n" socialLists
130 _socialStop = fromMaybe Set.empty $ Map.lookup StopTerm socialLists
131 _socialMap = fromMaybe Set.empty $ Map.lookup MapTerm socialLists
132 _socialCand = fromMaybe Set.empty $ Map.lookup CandidateTerm socialLists
133 -- stopTerms ignored for now (need to be tagged already)
134 -- (stopTerms, candidateTerms) = List.partition ((\t -> Set.member t socialStop) . fst) allTerms
135 -- (mapTerms, candidateTerms) = List.partition ((\t -> Set.member t socialMap ) . fst) allTerms
137 -- printDebug "stopTerms" stopTerms
139 -- Grouping the ngrams and keeping the maximum score for label
140 let grouped = toGroupedText (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) allTerms
142 groupedWithList = map (addListType (invertForw socialLists)) grouped
144 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
145 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
147 -- printDebug "\n * stopTerms * \n" stopTerms
148 -- splitting monterms and multiterms to take proportional candidates
150 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
151 monoSize = 0.4 :: Double
152 multSize = 1 - monoSize
154 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
156 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
157 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
159 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
160 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
161 -- printDebug "groupedMultHead" (List.length groupedMultHead)
162 -- printDebug "groupedMultTail" (List.length groupedMultTail)
165 -- Get Local Scores now for selected grouped ngrams
166 selectedTerms = Set.toList $ List.foldl'
167 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
171 (groupedMonoHead <> groupedMultHead)
173 -- TO remove (and remove HasNodeError instance)
174 userListId <- defaultList uCid
175 masterListId <- defaultList mCid
177 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
180 mapGroups = Map.fromList
181 $ map (\g -> (g ^. gt_stem, g))
182 $ groupedMonoHead <> groupedMultHead
184 -- grouping with Set NodeId
185 contextsAdded = foldl' (\mapGroups' k -> let k' = ngramsGroup groupParams k
186 in case Map.lookup k' mapGroups' of
187 Nothing -> mapGroups'
188 Just g -> case Map.lookup k mapTextDocIds of
189 Nothing -> mapGroups'
190 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
193 $ Map.keys mapTextDocIds
195 -- compute cooccurrences
196 mapCooc = Map.filter (>2)
197 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
198 | (t1, s1) <- mapStemNodeIds
199 , (t2, s2) <- mapStemNodeIds
202 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
203 -- printDebug "mapCooc" mapCooc
207 mapScores f = Map.fromList
208 $ map (\(Scored t g s') -> (t, f (g,s')))
213 groupsWithScores = catMaybes
215 -> case Map.lookup stem mapScores' of
217 Just s' -> Just $ g { _gt_score = s'}
218 ) $ Map.toList contextsAdded
220 mapScores' = mapScores identity
221 -- adapt2 TOCHECK with DC
222 -- printDebug "groupsWithScores" groupsWithScores
224 -- sort / partition / split
225 -- filter mono/multi again
226 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
227 -- filter with max score
228 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
230 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
231 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
235 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
236 inclSize = 0.4 :: Double
237 exclSize = 1 - inclSize
238 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
240 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
241 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
243 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
244 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
247 -- Final Step building the Typed list
248 termListHead = maps <> cands
250 maps = set gt_listType (Just MapTerm)
251 <$> monoScoredInclHead
252 <> monoScoredExclHead
253 <> multScoredInclHead
254 <> multScoredExclHead
256 cands = set gt_listType (Just CandidateTerm)
257 <$> monoScoredInclTail
258 <> monoScoredExclTail
259 <> multScoredInclTail
260 <> multScoredExclTail
262 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
264 -- printDebug "monoScoredInclHead" monoScoredInclHead
265 -- printDebug "monoScoredExclHead" monoScoredExclTail
266 -- printDebug "multScoredInclHead" multScoredInclHead
267 -- printDebug "multScoredExclTail" multScoredExclTail
269 let result = Map.unionsWith (<>)
270 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
271 <> (List.concat $ map toNgramsElement $ termListTail)
272 <> (List.concat $ map toNgramsElement $ stopTerms)
275 -- printDebug "\n result \n" r
280 toNgramsElement :: GroupedText a -> [NgramsElement]
281 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
282 [parentElem] <> childrenElems
285 children = Set.toList setNgrams
286 parentElem = mkNgramsElement (NgramsTerm parent)
287 (fromMaybe CandidateTerm listType)
289 (mSetFromList (NgramsTerm <$> children))
290 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
291 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
293 ) (NgramsTerm <$> children)
296 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
297 toGargList l n = (l,n)
300 isStopTerm :: StopSize -> Text -> Bool
301 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
303 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
305 ------------------------------------------------------------------------------