2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
19 import Control.Lens ((^.), set, view, over)
20 import Data.Maybe (fromMaybe, catMaybes)
21 import Data.Ord (Down(..))
24 import Data.Text (Text)
25 import qualified Data.Char as Char
26 import qualified Data.List as List
27 import qualified Data.Map as Map
28 import qualified Data.Set as Set
29 import qualified Data.Text as Text
31 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
32 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
33 import Gargantext.API.Ngrams.Types (RepoCmdM)
34 import Gargantext.Core.Text.List.Social
35 import Gargantext.Core.Text.List.Social.Prelude
36 import Gargantext.Core.Text.List.Group
37 import Gargantext.Core.Text.List.Group.WithStem
38 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
39 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
40 import Gargantext.Core.Types.Individu (User(..))
41 import Gargantext.Database.Admin.Types.Node (NodeId)
42 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
43 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
44 import Gargantext.Database.Prelude (CmdM)
45 import Gargantext.Database.Query.Table.Node (defaultList)
46 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
47 import Gargantext.Database.Query.Tree.Error (HasTreeError)
48 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
49 import Gargantext.Prelude
52 -- | TODO improve grouping functions of Authors, Sources, Institutes..
53 buildNgramsLists :: ( RepoCmdM env err m
62 -> m (Map NgramsType [NgramsElement])
63 buildNgramsLists user gp uCid mCid = do
64 ngTerms <- buildNgramsTermsList user uCid mCid gp
65 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
66 [ (Authors , MapListSize 9)
67 , (Sources , MapListSize 9)
68 , (Institutes, MapListSize 9)
71 pure $ Map.unions $ [ngTerms] <> othersTerms
74 data MapListSize = MapListSize { unMapListSize :: !Int }
76 buildNgramsOthersList ::( HasNodeError err
84 -> (NgramsType, MapListSize)
85 -> m (Map NgramsType [NgramsElement])
86 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
87 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
89 socialLists' :: FlowListCont Text
90 <- flowSocialList' MySelfFirst user nt (FlowListCont Map.empty $ Set.fromList $ Map.keys ngs')
91 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
93 printDebug "flowSocialList'"
94 $ Map.filter (not . ((==) Map.empty) . view fls_parents)
95 $ view flc_scores socialLists'
98 groupParams = GroupedTextParams groupIt (Set.size . snd) fst snd {-(size . fst)-}
99 groupedWithList = toGroupedText groupParams (view flc_scores socialLists') ngs'
101 printDebug "groupedWithList"
102 $ Map.map (\v -> (view gt_label v, view gt_children v))
103 $ Map.filter (\v -> (Set.size $ view gt_children v) > 0)
107 (stopTerms, tailTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
108 (mapTerms, tailTerms') = Map.partition (\t -> t ^. gt_listType == Just MapTerm) tailTerms
110 listSize = mapListSize - (List.length mapTerms)
111 (mapTerms', candiTerms) = List.splitAt listSize
112 $ List.sortOn (Down . _gt_score)
113 $ Map.elems tailTerms'
115 pure $ Map.fromList [( nt, (List.concat $ map toNgramsElement stopTerms)
116 <> (List.concat $ map toNgramsElement mapTerms )
117 <> (List.concat $ map toNgramsElement
118 $ map (set gt_listType (Just MapTerm )) mapTerms' )
119 <> (List.concat $ map toNgramsElement
120 $ map (set gt_listType (Just CandidateTerm)) candiTerms)
125 buildNgramsTermsList :: ( HasNodeError err
134 -> m (Map NgramsType [NgramsElement])
135 buildNgramsTermsList user uCid mCid groupParams = do
137 -- Computing global speGen score
138 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
140 -- printDebug "head candidates" (List.take 10 $ allTerms)
141 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
143 -- First remove stops terms
144 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
145 -- printDebug "\n * socialLists * \n" socialLists
147 -- Grouping the ngrams and keeping the maximum score for label
148 let grouped = groupedTextWithStem ( GroupedTextParams (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
150 groupedWithList = map (addListType (invertForw socialLists)) grouped
152 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
153 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
155 -- printDebug "\n * stopTerms * \n" stopTerms
156 -- splitting monterms and multiterms to take proportional candidates
158 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
159 monoSize = 0.4 :: Double
160 multSize = 1 - monoSize
162 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
164 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
165 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
167 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
168 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
169 -- printDebug "groupedMultHead" (List.length groupedMultHead)
170 -- printDebug "groupedMultTail" (List.length groupedMultTail)
173 -- Get Local Scores now for selected grouped ngrams
174 selectedTerms = Set.toList $ List.foldl'
175 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
179 (groupedMonoHead <> groupedMultHead)
181 -- TO remove (and remove HasNodeError instance)
182 userListId <- defaultList uCid
183 masterListId <- defaultList mCid
185 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
188 mapGroups = Map.fromList
189 $ map (\g -> (g ^. gt_stem, g))
190 $ groupedMonoHead <> groupedMultHead
192 -- grouping with Set NodeId
193 contextsAdded = foldl' (\mapGroups' k ->
194 let k' = ngramsGroup groupParams k in
195 case Map.lookup k' mapGroups' of
196 Nothing -> mapGroups'
197 Just g -> case Map.lookup k mapTextDocIds of
198 Nothing -> mapGroups'
199 Just ns -> Map.insert k' (over gt_nodes (Set.union ns) g) mapGroups'
202 $ Map.keys mapTextDocIds
204 -- compute cooccurrences
205 mapCooc = Map.filter (>2)
206 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
207 | (t1, s1) <- mapStemNodeIds
208 , (t2, s2) <- mapStemNodeIds
209 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
212 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
213 -- printDebug "mapCooc" mapCooc
217 mapScores f = Map.fromList
218 $ map (\(Scored t g s') -> (t, f (g,s')))
223 groupsWithScores = catMaybes
225 -> case Map.lookup stem mapScores' of
227 Just s' -> Just $ g { _gt_score = s'}
228 ) $ Map.toList contextsAdded
230 mapScores' = mapScores identity
231 -- adapt2 TOCHECK with DC
232 -- printDebug "groupsWithScores" groupsWithScores
234 -- sort / partition / split
235 -- filter mono/multi again
236 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
237 -- filter with max score
238 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
240 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
241 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
245 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
246 inclSize = 0.4 :: Double
247 exclSize = 1 - inclSize
248 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
250 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
251 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
253 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
254 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
257 -- Final Step building the Typed list
258 termListHead = maps <> cands
260 maps = set gt_listType (Just MapTerm)
261 <$> monoScoredInclHead
262 <> monoScoredExclHead
263 <> multScoredInclHead
264 <> multScoredExclHead
266 cands = set gt_listType (Just CandidateTerm)
267 <$> monoScoredInclTail
268 <> monoScoredExclTail
269 <> multScoredInclTail
270 <> multScoredExclTail
272 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
274 -- printDebug "monoScoredInclHead" monoScoredInclHead
275 -- printDebug "monoScoredExclHead" monoScoredExclTail
276 -- printDebug "multScoredInclHead" multScoredInclHead
277 -- printDebug "multScoredExclTail" multScoredExclTail
279 let result = Map.unionsWith (<>)
280 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
281 <> (List.concat $ map toNgramsElement $ termListTail)
282 <> (List.concat $ map toNgramsElement $ stopTerms)
285 -- printDebug "\n result \n" r
290 toNgramsElement :: GroupedText a -> [NgramsElement]
291 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
292 [parentElem] <> childrenElems
295 children = Set.toList setNgrams
296 parentElem = mkNgramsElement (NgramsTerm parent)
297 (fromMaybe CandidateTerm listType)
299 (mSetFromList (NgramsTerm <$> children))
300 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
301 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
303 ) (NgramsTerm <$> children)
306 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
307 toGargList l n = (l,n)
310 isStopTerm :: StopSize -> Text -> Bool
311 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
313 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
315 ------------------------------------------------------------------------------