2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
19 import Control.Lens ((^.), set, view, over)
21 import Data.Maybe (fromMaybe, catMaybes)
22 import Data.Ord (Down(..))
24 import Data.Text (Text)
25 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
26 import Gargantext.API.Ngrams.Types (RepoCmdM)
27 import Gargantext.Core.Text.List.Group
28 import Gargantext.Core.Text.List.Group.Prelude
29 import Gargantext.Core.Text.List.Group.WithStem
30 import Gargantext.Core.Text.List.Social
31 import Gargantext.Core.Text.List.Social.Prelude
32 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
33 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
34 import Gargantext.Core.Types.Individu (User(..))
35 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
36 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
37 import Gargantext.Database.Admin.Types.Node (NodeId)
38 import Gargantext.Database.Prelude (CmdM)
39 import Gargantext.Database.Query.Table.Node (defaultList)
40 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
41 import Gargantext.Database.Query.Tree.Error (HasTreeError)
42 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
43 import Gargantext.Prelude
44 import qualified Data.Char as Char
45 import qualified Data.List as List
46 import qualified Data.Map as Map
47 import qualified Data.Set as Set
48 import qualified Data.Text as Text
51 -- | TODO improve grouping functions of Authors, Sources, Institutes..
52 buildNgramsLists :: ( RepoCmdM env err m
61 -> m (Map NgramsType [NgramsElement])
62 buildNgramsLists user gp uCid mCid = do
63 ngTerms <- buildNgramsTermsList user uCid mCid gp
64 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
65 [ (Authors , MapListSize 9)
66 , (Sources , MapListSize 9)
67 , (Institutes, MapListSize 9)
70 pure $ Map.unions $ [ngTerms] <> othersTerms
73 data MapListSize = MapListSize { unMapListSize :: !Int }
75 buildNgramsOthersList ::( HasNodeError err
83 -> (NgramsType, MapListSize)
84 -> m (Map NgramsType [NgramsElement])
85 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
86 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
88 socialLists' :: FlowCont Text FlowListScores
89 <- flowSocialList' MySelfFirst user nt (FlowCont Map.empty $ Set.fromList $ Map.keys ngs')
90 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
92 printDebug "flowSocialList'"
93 $ Map.filter (not . ((==) Map.empty) . view fls_parents)
94 $ view flc_scores socialLists'
97 groupParams = GroupedTextParams groupIt (Set.size . snd) fst snd {-(size . fst)-}
98 groupedWithList = toGroupedText groupParams (view flc_scores socialLists') ngs'
100 printDebug "groupedWithList"
101 $ Map.map (\v -> (view gt_label v, view gt_children v))
102 $ Map.filter (\v -> (Set.size $ view gt_children v) > 0)
106 (stopTerms, tailTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
107 (mapTerms, tailTerms') = Map.partition (\t -> t ^. gt_listType == Just MapTerm) tailTerms
109 listSize = mapListSize - (List.length mapTerms)
110 (mapTerms', candiTerms) = List.splitAt listSize
111 $ List.sortOn (Down . _gt_score)
112 $ Map.elems tailTerms'
114 pure $ Map.fromList [( nt, (List.concat $ map toNgramsElement stopTerms)
115 <> (List.concat $ map toNgramsElement mapTerms )
116 <> (List.concat $ map toNgramsElement
117 $ map (set gt_listType (Just MapTerm )) mapTerms' )
118 <> (List.concat $ map toNgramsElement
119 $ map (set gt_listType (Just CandidateTerm)) candiTerms)
124 buildNgramsTermsList :: ( HasNodeError err
133 -> m (Map NgramsType [NgramsElement])
134 buildNgramsTermsList user uCid mCid groupParams = do
136 -- Computing global speGen score
137 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
139 -- printDebug "head candidates" (List.take 10 $ allTerms)
140 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
142 -- First remove stops terms
143 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
144 -- printDebug "\n * socialLists * \n" socialLists
146 -- Grouping the ngrams and keeping the maximum score for label
147 let grouped = groupedTextWithStem ( GroupedTextParams (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
149 groupedWithList = map (addListType (invertForw socialLists)) grouped
151 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
152 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
154 -- printDebug "\n * stopTerms * \n" stopTerms
155 -- splitting monterms and multiterms to take proportional candidates
157 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
158 monoSize = 0.4 :: Double
159 multSize = 1 - monoSize
161 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
163 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
164 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
166 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
167 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
168 -- printDebug "groupedMultHead" (List.length groupedMultHead)
169 -- printDebug "groupedMultTail" (List.length groupedMultTail)
172 -- Get Local Scores now for selected grouped ngrams
173 selectedTerms = Set.toList $ List.foldl'
174 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
178 (groupedMonoHead <> groupedMultHead)
180 -- TO remove (and remove HasNodeError instance)
181 userListId <- defaultList uCid
182 masterListId <- defaultList mCid
184 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
187 mapGroups = Map.fromList
188 $ map (\g -> (g ^. gt_stem, g))
189 $ groupedMonoHead <> groupedMultHead
191 -- grouping with Set NodeId
192 contextsAdded = foldl' (\mapGroups' k ->
193 let k' = ngramsGroup groupParams k in
194 case Map.lookup k' mapGroups' of
195 Nothing -> mapGroups'
196 Just g -> case Map.lookup k mapTextDocIds of
197 Nothing -> mapGroups'
198 Just ns -> Map.insert k' (over gt_nodes (Set.union ns) g) mapGroups'
201 $ Map.keys mapTextDocIds
203 -- compute cooccurrences
204 mapCooc = Map.filter (>2)
205 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
206 | (t1, s1) <- mapStemNodeIds
207 , (t2, s2) <- mapStemNodeIds
208 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
211 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
212 -- printDebug "mapCooc" mapCooc
216 mapScores f = Map.fromList
217 $ map (\(Scored t g s') -> (t, f (g,s')))
222 groupsWithScores = catMaybes
224 -> case Map.lookup stem mapScores' of
226 Just s' -> Just $ g { _gt_score = s'}
227 ) $ Map.toList contextsAdded
229 mapScores' = mapScores identity
230 -- adapt2 TOCHECK with DC
231 -- printDebug "groupsWithScores" groupsWithScores
233 -- sort / partition / split
234 -- filter mono/multi again
235 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
236 -- filter with max score
237 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
239 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
240 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
244 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
245 inclSize = 0.4 :: Double
246 exclSize = 1 - inclSize
247 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
249 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
250 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
252 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
253 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
256 -- Final Step building the Typed list
257 termListHead = maps <> cands
259 maps = set gt_listType (Just MapTerm)
260 <$> monoScoredInclHead
261 <> monoScoredExclHead
262 <> multScoredInclHead
263 <> multScoredExclHead
265 cands = set gt_listType (Just CandidateTerm)
266 <$> monoScoredInclTail
267 <> monoScoredExclTail
268 <> multScoredInclTail
269 <> multScoredExclTail
271 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
273 -- printDebug "monoScoredInclHead" monoScoredInclHead
274 -- printDebug "monoScoredExclHead" monoScoredExclTail
275 -- printDebug "multScoredInclHead" multScoredInclHead
276 -- printDebug "multScoredExclTail" multScoredExclTail
278 let result = Map.unionsWith (<>)
279 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
280 <> (List.concat $ map toNgramsElement $ termListTail)
281 <> (List.concat $ map toNgramsElement $ stopTerms)
284 -- printDebug "\n result \n" r
289 toNgramsElement :: GroupedText a -> [NgramsElement]
290 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
291 [parentElem] <> childrenElems
294 children = Set.toList setNgrams
295 parentElem = mkNgramsElement (NgramsTerm parent)
296 (fromMaybe CandidateTerm listType)
298 (mSetFromList (NgramsTerm <$> children))
299 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
300 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
302 ) (NgramsTerm <$> children)
305 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
306 toGargList l n = (l,n)
309 isStopTerm :: StopSize -> Text -> Bool
310 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
312 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
314 ------------------------------------------------------------------------------