2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE TemplateHaskell #-}
14 module Gargantext.Core.Text.List
18 import Control.Lens ((^.), set)
19 import Data.Maybe (fromMaybe, catMaybes)
20 import Data.Ord (Down(..))
23 import Data.Text (Text)
24 import qualified Data.Char as Char
25 import qualified Data.List as List
26 import qualified Data.Map as Map
27 import qualified Data.Set as Set
28 import qualified Data.Text as Text
30 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
31 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
32 import Gargantext.API.Ngrams.Types (RepoCmdM)
33 import Gargantext.Core.Text (size)
34 import Gargantext.Core.Text.List.Social (flowSocialList, flowSocialList', FlowSocialListPriority(..), invertForw)
35 import Gargantext.Core.Text.List.Social.Scores (FlowListScores)
36 import Gargantext.Core.Text.List.Group
37 import Gargantext.Core.Text.List.Group.WithStem
38 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
39 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
40 import Gargantext.Core.Types.Individu (User(..))
41 import Gargantext.Database.Admin.Types.Node (NodeId)
42 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
43 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
44 import Gargantext.Database.Prelude (CmdM)
45 import Gargantext.Database.Query.Table.Node (defaultList)
46 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
47 import Gargantext.Database.Query.Tree.Error (HasTreeError)
48 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
49 import Gargantext.Prelude
52 -- | TODO improve grouping functions of Authors, Sources, Institutes..
53 buildNgramsLists :: ( RepoCmdM env err m
62 -> m (Map NgramsType [NgramsElement])
63 buildNgramsLists user gp uCid mCid = do
64 ngTerms <- buildNgramsTermsList user uCid mCid gp
65 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
66 [ (Authors , MapListSize 9)
67 , (Sources , MapListSize 9)
68 , (Institutes, MapListSize 9)
71 pure $ Map.unions $ [ngTerms] <> othersTerms
74 data MapListSize = MapListSize { unMapListSize :: !Int }
76 buildNgramsOthersList ::( HasNodeError err
84 -> (NgramsType, MapListSize)
85 -> m (Map NgramsType [NgramsElement])
86 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
87 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
89 socialLists' :: Map Text FlowListScores
90 <- flowSocialList' MySelfFirst user nt (Set.fromList $ Map.keys ngs')
91 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
94 groupParams = GroupedTextParams groupIt (Set.size . snd) fst snd {-(size . fst)-}
95 grouped' = toGroupedText groupParams socialLists' ngs'
97 -- 8< 8< 8< 8< 8< 8< 8<
99 ngs :: Map Text (Set Text, Set NodeId) = groupNodesByNgramsWith groupIt ngs'
100 socialLists <- flowSocialList user nt (Set.fromList $ Map.keys ngs)
101 -- >8 >8 >8 >8 >8 >8 >8
104 grouped = groupedTextWithStem (GroupedTextParams groupIt (Set.size . snd) fst snd {-(size . fst)-} ) -- socialLists'
105 $ Map.mapWithKey (\k (a,b) -> (Set.delete k a, b))
110 groupedWithList = map (addListType (invertForw socialLists)) grouped
111 (stopTerms, tailTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm)
113 (mapTerms, tailTerms') = Map.partition (\t -> t ^. gt_listType == Just MapTerm)
116 listSize = mapListSize - (List.length mapTerms)
117 (mapTerms', candiTerms) = List.splitAt listSize
118 $ List.sortOn (Down . _gt_score)
119 $ Map.elems tailTerms'
121 pure $ Map.fromList [( nt, (List.concat $ map toNgramsElement stopTerms)
122 <> (List.concat $ map toNgramsElement mapTerms )
123 <> (List.concat $ map toNgramsElement
124 $ map (set gt_listType (Just MapTerm )) mapTerms' )
125 <> (List.concat $ map toNgramsElement
126 $ map (set gt_listType (Just CandidateTerm)) candiTerms)
130 buildNgramsTermsList :: ( HasNodeError err
139 -> m (Map NgramsType [NgramsElement])
140 buildNgramsTermsList user uCid mCid groupParams = do
142 -- Computing global speGen score
143 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
145 -- printDebug "head candidates" (List.take 10 $ allTerms)
146 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
148 -- First remove stops terms
149 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
150 -- printDebug "\n * socialLists * \n" socialLists
152 -- Grouping the ngrams and keeping the maximum score for label
153 let grouped = groupedTextWithStem ( GroupedTextParams (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
155 groupedWithList = map (addListType (invertForw socialLists)) grouped
157 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
158 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
160 -- printDebug "\n * stopTerms * \n" stopTerms
161 -- splitting monterms and multiterms to take proportional candidates
163 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
164 monoSize = 0.4 :: Double
165 multSize = 1 - monoSize
167 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
169 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
170 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
172 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
173 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
174 -- printDebug "groupedMultHead" (List.length groupedMultHead)
175 -- printDebug "groupedMultTail" (List.length groupedMultTail)
178 -- Get Local Scores now for selected grouped ngrams
179 selectedTerms = Set.toList $ List.foldl'
180 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
184 (groupedMonoHead <> groupedMultHead)
186 -- TO remove (and remove HasNodeError instance)
187 userListId <- defaultList uCid
188 masterListId <- defaultList mCid
190 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
193 mapGroups = Map.fromList
194 $ map (\g -> (g ^. gt_stem, g))
195 $ groupedMonoHead <> groupedMultHead
197 -- grouping with Set NodeId
198 contextsAdded = foldl' (\mapGroups' k ->
199 let k' = ngramsGroup groupParams k in
200 case Map.lookup k' mapGroups' of
201 Nothing -> mapGroups'
202 Just g -> case Map.lookup k mapTextDocIds of
203 Nothing -> mapGroups'
204 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
207 $ Map.keys mapTextDocIds
209 -- compute cooccurrences
210 mapCooc = Map.filter (>2)
211 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
212 | (t1, s1) <- mapStemNodeIds
213 , (t2, s2) <- mapStemNodeIds
214 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
217 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
218 -- printDebug "mapCooc" mapCooc
222 mapScores f = Map.fromList
223 $ map (\(Scored t g s') -> (t, f (g,s')))
228 groupsWithScores = catMaybes
230 -> case Map.lookup stem mapScores' of
232 Just s' -> Just $ g { _gt_score = s'}
233 ) $ Map.toList contextsAdded
235 mapScores' = mapScores identity
236 -- adapt2 TOCHECK with DC
237 -- printDebug "groupsWithScores" groupsWithScores
239 -- sort / partition / split
240 -- filter mono/multi again
241 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
242 -- filter with max score
243 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
245 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
246 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
250 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
251 inclSize = 0.4 :: Double
252 exclSize = 1 - inclSize
253 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
255 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
256 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
258 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
259 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
262 -- Final Step building the Typed list
263 termListHead = maps <> cands
265 maps = set gt_listType (Just MapTerm)
266 <$> monoScoredInclHead
267 <> monoScoredExclHead
268 <> multScoredInclHead
269 <> multScoredExclHead
271 cands = set gt_listType (Just CandidateTerm)
272 <$> monoScoredInclTail
273 <> monoScoredExclTail
274 <> multScoredInclTail
275 <> multScoredExclTail
277 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
279 -- printDebug "monoScoredInclHead" monoScoredInclHead
280 -- printDebug "monoScoredExclHead" monoScoredExclTail
281 -- printDebug "multScoredInclHead" multScoredInclHead
282 -- printDebug "multScoredExclTail" multScoredExclTail
284 let result = Map.unionsWith (<>)
285 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
286 <> (List.concat $ map toNgramsElement $ termListTail)
287 <> (List.concat $ map toNgramsElement $ stopTerms)
290 -- printDebug "\n result \n" r
295 toNgramsElement :: GroupedText a -> [NgramsElement]
296 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
297 [parentElem] <> childrenElems
300 children = Set.toList setNgrams
301 parentElem = mkNgramsElement (NgramsTerm parent)
302 (fromMaybe CandidateTerm listType)
304 (mSetFromList (NgramsTerm <$> children))
305 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
306 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
308 ) (NgramsTerm <$> children)
311 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
312 toGargList l n = (l,n)
315 isStopTerm :: StopSize -> Text -> Bool
316 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
318 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
320 ------------------------------------------------------------------------------