2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
14 module Gargantext.Core.Text.List
18 import Control.Lens ((^.), set)
19 import Data.Maybe (fromMaybe, catMaybes)
20 import Data.Ord (Down(..))
23 import Data.Text (Text)
24 import qualified Data.Char as Char
25 import qualified Data.List as List
26 import qualified Data.Map as Map
27 import qualified Data.Set as Set
28 import qualified Data.Text as Text
30 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
31 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
32 import Gargantext.API.Ngrams.Types (RepoCmdM)
33 import Gargantext.Core.Text (size)
34 import Gargantext.Core.Text.List.Social (flowSocialList, flowSocialList', FlowSocialListPriority(..), invertForw)
35 import Gargantext.Core.Text.List.Social.Scores (FlowListScores)
36 import Gargantext.Core.Text.List.Group
37 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
38 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
39 import Gargantext.Core.Types.Individu (User(..))
40 import Gargantext.Database.Admin.Types.Node (NodeId)
41 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
42 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
43 import Gargantext.Database.Prelude (CmdM)
44 import Gargantext.Database.Query.Table.Node (defaultList)
45 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
46 import Gargantext.Database.Query.Tree.Error (HasTreeError)
47 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
48 import Gargantext.Prelude
51 -- | TODO improve grouping functions of Authors, Sources, Institutes..
52 buildNgramsLists :: ( RepoCmdM env err m
61 -> m (Map NgramsType [NgramsElement])
62 buildNgramsLists user gp uCid mCid = do
63 ngTerms <- buildNgramsTermsList user uCid mCid gp
64 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
65 [ (Authors , MapListSize 9)
66 , (Sources , MapListSize 9)
67 , (Institutes, MapListSize 9)
70 pure $ Map.unions $ [ngTerms] <> othersTerms
73 data MapListSize = MapListSize { unMapListSize :: !Int }
75 buildNgramsOthersList ::( HasNodeError err
83 -> (NgramsType, MapListSize)
84 -> m (Map NgramsType [NgramsElement])
85 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
86 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
88 socialLists' :: Map Text FlowListScores
89 <- flowSocialList' MySelfFirst user nt (Set.fromList $ Map.keys ngs')
90 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
93 groupParams = GroupedTextParams groupIt (Set.size . snd) fst snd {-(size . fst)-}
94 grouped' = toGroupedText groupParams socialLists' ngs'
96 -- 8< 8< 8< 8< 8< 8< 8<
98 ngs :: Map Text (Set Text, Set NodeId) = groupNodesByNgramsWith groupIt ngs'
99 socialLists <- flowSocialList user nt (Set.fromList $ Map.keys ngs)
100 -- >8 >8 >8 >8 >8 >8 >8
103 grouped = groupedTextWithStem (GroupedTextParams groupIt (Set.size . snd) fst snd {-(size . fst)-} ) -- socialLists'
104 $ Map.mapWithKey (\k (a,b) -> (Set.delete k a, b))
109 groupedWithList = map (addListType (invertForw socialLists)) grouped
110 (stopTerms, tailTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm)
112 (mapTerms, tailTerms') = Map.partition (\t -> t ^. gt_listType == Just MapTerm)
115 listSize = mapListSize - (List.length mapTerms)
116 (mapTerms', candiTerms) = List.splitAt listSize
117 $ List.sortOn (Down . _gt_score)
118 $ Map.elems tailTerms'
120 pure $ Map.fromList [( nt, (List.concat $ map toNgramsElement stopTerms)
121 <> (List.concat $ map toNgramsElement mapTerms )
122 <> (List.concat $ map toNgramsElement
123 $ map (set gt_listType (Just MapTerm )) mapTerms' )
124 <> (List.concat $ map toNgramsElement
125 $ map (set gt_listType (Just CandidateTerm)) candiTerms)
129 buildNgramsTermsList :: ( HasNodeError err
138 -> m (Map NgramsType [NgramsElement])
139 buildNgramsTermsList user uCid mCid groupParams = do
141 -- Computing global speGen score
142 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
144 -- printDebug "head candidates" (List.take 10 $ allTerms)
145 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
147 -- First remove stops terms
148 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
149 -- printDebug "\n * socialLists * \n" socialLists
151 -- Grouping the ngrams and keeping the maximum score for label
152 let grouped = groupedTextWithStem ( GroupedTextParams (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
154 groupedWithList = map (addListType (invertForw socialLists)) grouped
156 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
157 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
159 -- printDebug "\n * stopTerms * \n" stopTerms
160 -- splitting monterms and multiterms to take proportional candidates
162 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
163 monoSize = 0.4 :: Double
164 multSize = 1 - monoSize
166 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
168 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
169 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
171 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
172 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
173 -- printDebug "groupedMultHead" (List.length groupedMultHead)
174 -- printDebug "groupedMultTail" (List.length groupedMultTail)
177 -- Get Local Scores now for selected grouped ngrams
178 selectedTerms = Set.toList $ List.foldl'
179 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
183 (groupedMonoHead <> groupedMultHead)
185 -- TO remove (and remove HasNodeError instance)
186 userListId <- defaultList uCid
187 masterListId <- defaultList mCid
189 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
192 mapGroups = Map.fromList
193 $ map (\g -> (g ^. gt_stem, g))
194 $ groupedMonoHead <> groupedMultHead
196 -- grouping with Set NodeId
197 contextsAdded = foldl' (\mapGroups' k ->
198 let k' = ngramsGroup groupParams k in
199 case Map.lookup k' mapGroups' of
200 Nothing -> mapGroups'
201 Just g -> case Map.lookup k mapTextDocIds of
202 Nothing -> mapGroups'
203 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
206 $ Map.keys mapTextDocIds
208 -- compute cooccurrences
209 mapCooc = Map.filter (>2)
210 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
211 | (t1, s1) <- mapStemNodeIds
212 , (t2, s2) <- mapStemNodeIds
213 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
216 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
217 -- printDebug "mapCooc" mapCooc
221 mapScores f = Map.fromList
222 $ map (\(Scored t g s') -> (t, f (g,s')))
227 groupsWithScores = catMaybes
229 -> case Map.lookup stem mapScores' of
231 Just s' -> Just $ g { _gt_score = s'}
232 ) $ Map.toList contextsAdded
234 mapScores' = mapScores identity
235 -- adapt2 TOCHECK with DC
236 -- printDebug "groupsWithScores" groupsWithScores
238 -- sort / partition / split
239 -- filter mono/multi again
240 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
241 -- filter with max score
242 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
244 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
245 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
249 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
250 inclSize = 0.4 :: Double
251 exclSize = 1 - inclSize
252 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
254 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
255 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
257 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
258 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
261 -- Final Step building the Typed list
262 termListHead = maps <> cands
264 maps = set gt_listType (Just MapTerm)
265 <$> monoScoredInclHead
266 <> monoScoredExclHead
267 <> multScoredInclHead
268 <> multScoredExclHead
270 cands = set gt_listType (Just CandidateTerm)
271 <$> monoScoredInclTail
272 <> monoScoredExclTail
273 <> multScoredInclTail
274 <> multScoredExclTail
276 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
278 -- printDebug "monoScoredInclHead" monoScoredInclHead
279 -- printDebug "monoScoredExclHead" monoScoredExclTail
280 -- printDebug "multScoredInclHead" multScoredInclHead
281 -- printDebug "multScoredExclTail" multScoredExclTail
283 let result = Map.unionsWith (<>)
284 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
285 <> (List.concat $ map toNgramsElement $ termListTail)
286 <> (List.concat $ map toNgramsElement $ stopTerms)
289 -- printDebug "\n result \n" r
294 toNgramsElement :: GroupedText a -> [NgramsElement]
295 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
296 [parentElem] <> childrenElems
299 children = Set.toList setNgrams
300 parentElem = mkNgramsElement (NgramsTerm parent)
301 (fromMaybe CandidateTerm listType)
303 (mSetFromList (NgramsTerm <$> children))
304 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
305 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
307 ) (NgramsTerm <$> children)
310 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
311 toGargList l n = (l,n)
314 isStopTerm :: StopSize -> Text -> Bool
315 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
317 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
319 ------------------------------------------------------------------------------