2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
14 module Gargantext.Core.Text.List
18 import Control.Lens ((^.), set)
19 import Data.Maybe (fromMaybe, catMaybes)
20 import Data.Ord (Down(..))
23 import Data.Text (Text)
24 import qualified Data.Char as Char
25 import qualified Data.List as List
26 import qualified Data.Map as Map
27 import qualified Data.Set as Set
28 import qualified Data.Text as Text
30 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
31 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
32 import Gargantext.API.Ngrams.Types (RepoCmdM)
33 import Gargantext.Core.Text.List.Social (flowSocialList, flowSocialList', FlowSocialListPriority(..), invertForw)
34 import Gargantext.Core.Text.List.Social.Group (FlowListScores)
35 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
36 import Gargantext.Core.Text.Group
37 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
38 import Gargantext.Core.Types.Individu (User(..))
39 import Gargantext.Database.Admin.Types.Node (NodeId)
40 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
41 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
42 import Gargantext.Database.Prelude (CmdM)
43 import Gargantext.Database.Query.Table.Node (defaultList)
44 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
45 import Gargantext.Database.Query.Tree.Error (HasTreeError)
46 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
47 import Gargantext.Prelude
50 -- | TODO improve grouping functions of Authors, Sources, Institutes..
51 buildNgramsLists :: ( RepoCmdM env err m
60 -> m (Map NgramsType [NgramsElement])
61 buildNgramsLists user gp uCid mCid = do
62 ngTerms <- buildNgramsTermsList user uCid mCid gp
63 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
64 [ (Authors , MapListSize 9)
65 , (Sources , MapListSize 9)
66 , (Institutes, MapListSize 9)
69 pure $ Map.unions $ [ngTerms] <> othersTerms
72 data MapListSize = MapListSize { unMapListSize :: !Int }
74 buildNgramsOthersList ::( HasNodeError err
82 -> (NgramsType, MapListSize)
83 -> m (Map NgramsType [NgramsElement])
84 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
85 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
87 socialLists' :: Map Text FlowListScores
88 <- flowSocialList' MySelfFirst user nt (Set.fromList $ Map.keys ngs')
89 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
91 -- 8< 8< 8< 8< 8< 8< 8<
93 ngs :: Map Text (Set Text, Set NodeId) = groupNodesByNgramsWith groupIt ngs'
94 socialLists <- flowSocialList user nt (Set.fromList $ Map.keys ngs)
95 -- >8 >8 >8 >8 >8 >8 >8
98 grouped = toGroupedText (GroupedTextParams groupIt (Set.size . snd) fst snd)
100 $ Map.mapWithKey (\k (a,b) -> (Set.delete k a, b))
105 groupedWithList = map (addListType (invertForw socialLists)) grouped
106 (stopTerms, tailTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm)
108 (mapTerms, tailTerms') = Map.partition (\t -> t ^. gt_listType == Just MapTerm)
111 listSize = mapListSize - (List.length mapTerms)
112 (mapTerms', candiTerms) = List.splitAt listSize
113 $ List.sortOn (Down . _gt_score)
114 $ Map.elems tailTerms'
116 pure $ Map.fromList [( nt, (List.concat $ map toNgramsElement stopTerms)
117 <> (List.concat $ map toNgramsElement mapTerms )
118 <> (List.concat $ map toNgramsElement
119 $ map (set gt_listType (Just MapTerm )) mapTerms' )
120 <> (List.concat $ map toNgramsElement
121 $ map (set gt_listType (Just CandidateTerm)) candiTerms)
125 buildNgramsTermsList :: ( HasNodeError err
134 -> m (Map NgramsType [NgramsElement])
135 buildNgramsTermsList user uCid mCid groupParams = do
137 -- Computing global speGen score
138 allTerms :: [(Text, Double)] <- Map.toList <$> getTficf uCid mCid NgramsTerms
140 -- printDebug "head candidates" (List.take 10 $ allTerms)
141 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
143 -- First remove stops terms
144 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst allTerms)
145 -- printDebug "\n * socialLists * \n" socialLists
147 printDebug "\n * socialLists * \n" socialLists
150 _socialStop = fromMaybe Set.empty $ Map.lookup StopTerm socialLists
151 _socialMap = fromMaybe Set.empty $ Map.lookup MapTerm socialLists
152 _socialCand = fromMaybe Set.empty $ Map.lookup CandidateTerm socialLists
153 -- stopTerms ignored for now (need to be tagged already)
154 -- (stopTerms, candidateTerms) = List.partition ((\t -> Set.member t socialStop) . fst) allTerms
155 -- (mapTerms, candidateTerms) = List.partition ((\t -> Set.member t socialMap ) . fst) allTerms
157 -- printDebug "stopTerms" stopTerms
159 -- Grouping the ngrams and keeping the maximum score for label
160 let grouped = toGroupedText (GroupedTextParams (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty)) allTerms
162 groupedWithList = map (addListType (invertForw socialLists)) grouped
164 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
165 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
167 -- printDebug "\n * stopTerms * \n" stopTerms
168 -- splitting monterms and multiterms to take proportional candidates
170 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
171 monoSize = 0.4 :: Double
172 multSize = 1 - monoSize
174 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
176 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
177 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
179 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
180 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
181 -- printDebug "groupedMultHead" (List.length groupedMultHead)
182 -- printDebug "groupedMultTail" (List.length groupedMultTail)
185 -- Get Local Scores now for selected grouped ngrams
186 selectedTerms = Set.toList $ List.foldl'
187 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
191 (groupedMonoHead <> groupedMultHead)
193 -- TO remove (and remove HasNodeError instance)
194 userListId <- defaultList uCid
195 masterListId <- defaultList mCid
197 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
200 mapGroups = Map.fromList
201 $ map (\g -> (g ^. gt_stem, g))
202 $ groupedMonoHead <> groupedMultHead
204 -- grouping with Set NodeId
205 contextsAdded = foldl' (\mapGroups' k -> let k' = ngramsGroup groupParams k
206 in case Map.lookup k' mapGroups' of
207 Nothing -> mapGroups'
208 Just g -> case Map.lookup k mapTextDocIds of
209 Nothing -> mapGroups'
210 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
213 $ Map.keys mapTextDocIds
215 -- compute cooccurrences
216 mapCooc = Map.filter (>2)
217 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
218 | (t1, s1) <- mapStemNodeIds
219 , (t2, s2) <- mapStemNodeIds
222 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
223 -- printDebug "mapCooc" mapCooc
227 mapScores f = Map.fromList
228 $ map (\(Scored t g s') -> (t, f (g,s')))
233 groupsWithScores = catMaybes
235 -> case Map.lookup stem mapScores' of
237 Just s' -> Just $ g { _gt_score = s'}
238 ) $ Map.toList contextsAdded
240 mapScores' = mapScores identity
241 -- adapt2 TOCHECK with DC
242 -- printDebug "groupsWithScores" groupsWithScores
244 -- sort / partition / split
245 -- filter mono/multi again
246 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
247 -- filter with max score
248 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
250 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
251 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
255 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
256 inclSize = 0.4 :: Double
257 exclSize = 1 - inclSize
258 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
260 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
261 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
263 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
264 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
267 -- Final Step building the Typed list
268 termListHead = maps <> cands
270 maps = set gt_listType (Just MapTerm)
271 <$> monoScoredInclHead
272 <> monoScoredExclHead
273 <> multScoredInclHead
274 <> multScoredExclHead
276 cands = set gt_listType (Just CandidateTerm)
277 <$> monoScoredInclTail
278 <> monoScoredExclTail
279 <> multScoredInclTail
280 <> multScoredExclTail
282 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
284 -- printDebug "monoScoredInclHead" monoScoredInclHead
285 -- printDebug "monoScoredExclHead" monoScoredExclTail
286 -- printDebug "multScoredInclHead" multScoredInclHead
287 -- printDebug "multScoredExclTail" multScoredExclTail
289 let result = Map.unionsWith (<>)
290 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
291 <> (List.concat $ map toNgramsElement $ termListTail)
292 <> (List.concat $ map toNgramsElement $ stopTerms)
295 -- printDebug "\n result \n" r
300 toNgramsElement :: GroupedText a -> [NgramsElement]
301 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
302 [parentElem] <> childrenElems
305 children = Set.toList setNgrams
306 parentElem = mkNgramsElement (NgramsTerm parent)
307 (fromMaybe CandidateTerm listType)
309 (mSetFromList (NgramsTerm <$> children))
310 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
311 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
313 ) (NgramsTerm <$> children)
316 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
317 toGargList l n = (l,n)
320 isStopTerm :: StopSize -> Text -> Bool
321 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
323 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
325 ------------------------------------------------------------------------------