2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
19 import Control.Lens ((^.), set, view, over)
21 import Data.Maybe (fromMaybe, catMaybes)
22 import Data.Ord (Down(..))
24 import Data.Text (Text)
25 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
26 import Gargantext.API.Ngrams.Types (RepoCmdM)
27 import Gargantext.Core.Text.List.Group
28 import Gargantext.Core.Text.List.Group.WithStem
29 import Gargantext.Core.Text.List.Social
30 import Gargantext.Core.Text.List.Social.Prelude
31 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
32 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
33 import Gargantext.Core.Types.Individu (User(..))
34 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
35 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
36 import Gargantext.Database.Admin.Types.Node (NodeId)
37 import Gargantext.Database.Prelude (CmdM)
38 import Gargantext.Database.Query.Table.Node (defaultList)
39 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
40 import Gargantext.Database.Query.Tree.Error (HasTreeError)
41 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
42 import Gargantext.Prelude
43 import qualified Data.Char as Char
44 import qualified Data.List as List
45 import qualified Data.Map as Map
46 import qualified Data.Set as Set
47 import qualified Data.Text as Text
50 -- | TODO improve grouping functions of Authors, Sources, Institutes..
51 buildNgramsLists :: ( RepoCmdM env err m
60 -> m (Map NgramsType [NgramsElement])
61 buildNgramsLists user gp uCid mCid = do
62 ngTerms <- buildNgramsTermsList user uCid mCid gp
63 othersTerms <- mapM (buildNgramsOthersList user uCid (ngramsGroup GroupIdentity))
64 [ (Authors , MapListSize 9)
65 , (Sources , MapListSize 9)
66 , (Institutes, MapListSize 9)
69 pure $ Map.unions $ [ngTerms] <> othersTerms
72 data MapListSize = MapListSize { unMapListSize :: !Int }
74 buildNgramsOthersList ::( HasNodeError err
82 -> (NgramsType, MapListSize)
83 -> m (Map NgramsType [NgramsElement])
84 buildNgramsOthersList user uCid groupIt (nt, MapListSize mapListSize) = do
85 ngs' :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
87 socialLists' :: FlowCont Text FlowListScores
88 <- flowSocialList' MySelfFirst user nt (FlowCont Map.empty $ Set.fromList $ Map.keys ngs')
89 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
91 printDebug "flowSocialList'"
92 $ Map.filter (not . ((==) Map.empty) . view fls_parents)
93 $ view flc_scores socialLists'
96 groupParams = GroupedTextParams groupIt (Set.size . snd) fst snd {-(size . fst)-}
97 groupedWithList = toGroupedText groupParams (view flc_scores socialLists') ngs'
99 printDebug "groupedWithList"
100 $ Map.map (\v -> (view gt_label v, view gt_children v))
101 $ Map.filter (\v -> (Set.size $ view gt_children v) > 0)
105 (stopTerms, tailTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
106 (mapTerms, tailTerms') = Map.partition (\t -> t ^. gt_listType == Just MapTerm) tailTerms
108 listSize = mapListSize - (List.length mapTerms)
109 (mapTerms', candiTerms) = List.splitAt listSize
110 $ List.sortOn (Down . _gt_score)
111 $ Map.elems tailTerms'
113 pure $ Map.fromList [( nt, (List.concat $ map toNgramsElement stopTerms)
114 <> (List.concat $ map toNgramsElement mapTerms )
115 <> (List.concat $ map toNgramsElement
116 $ map (set gt_listType (Just MapTerm )) mapTerms' )
117 <> (List.concat $ map toNgramsElement
118 $ map (set gt_listType (Just CandidateTerm)) candiTerms)
123 buildNgramsTermsList :: ( HasNodeError err
132 -> m (Map NgramsType [NgramsElement])
133 buildNgramsTermsList user uCid mCid groupParams = do
135 -- Computing global speGen score
136 allTerms :: Map Text Double <- getTficf uCid mCid NgramsTerms
138 -- printDebug "head candidates" (List.take 10 $ allTerms)
139 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
141 -- First remove stops terms
142 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst $ Map.toList allTerms)
143 -- printDebug "\n * socialLists * \n" socialLists
145 -- Grouping the ngrams and keeping the maximum score for label
146 let grouped = groupedTextWithStem ( GroupedTextParams (ngramsGroup groupParams) identity (const Set.empty) (const Set.empty) {-(size . _gt_label)-} ) allTerms
148 groupedWithList = map (addListType (invertForw socialLists)) grouped
150 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
151 (groupedMono, groupedMult) = Map.partition (\t -> t ^. gt_size < 2) candidateTerms
153 -- printDebug "\n * stopTerms * \n" stopTerms
154 -- splitting monterms and multiterms to take proportional candidates
156 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if too small
157 monoSize = 0.4 :: Double
158 multSize = 1 - monoSize
160 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
162 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
163 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
165 -- printDebug "groupedMonoHead" (List.length groupedMonoHead)
166 -- printDebug "groupedMonoTail" (List.length groupedMonoHead)
167 -- printDebug "groupedMultHead" (List.length groupedMultHead)
168 -- printDebug "groupedMultTail" (List.length groupedMultTail)
171 -- Get Local Scores now for selected grouped ngrams
172 selectedTerms = Set.toList $ List.foldl'
173 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
177 (groupedMonoHead <> groupedMultHead)
179 -- TO remove (and remove HasNodeError instance)
180 userListId <- defaultList uCid
181 masterListId <- defaultList mCid
183 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
186 mapGroups = Map.fromList
187 $ map (\g -> (g ^. gt_stem, g))
188 $ groupedMonoHead <> groupedMultHead
190 -- grouping with Set NodeId
191 contextsAdded = foldl' (\mapGroups' k ->
192 let k' = ngramsGroup groupParams k in
193 case Map.lookup k' mapGroups' of
194 Nothing -> mapGroups'
195 Just g -> case Map.lookup k mapTextDocIds of
196 Nothing -> mapGroups'
197 Just ns -> Map.insert k' (over gt_nodes (Set.union ns) g) mapGroups'
200 $ Map.keys mapTextDocIds
202 -- compute cooccurrences
203 mapCooc = Map.filter (>2)
204 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
205 | (t1, s1) <- mapStemNodeIds
206 , (t2, s2) <- mapStemNodeIds
207 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
210 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
211 -- printDebug "mapCooc" mapCooc
215 mapScores f = Map.fromList
216 $ map (\(Scored t g s') -> (t, f (g,s')))
221 groupsWithScores = catMaybes
223 -> case Map.lookup stem mapScores' of
225 Just s' -> Just $ g { _gt_score = s'}
226 ) $ Map.toList contextsAdded
228 mapScores' = mapScores identity
229 -- adapt2 TOCHECK with DC
230 -- printDebug "groupsWithScores" groupsWithScores
232 -- sort / partition / split
233 -- filter mono/multi again
234 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
235 -- filter with max score
236 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
238 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
239 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
243 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
244 inclSize = 0.4 :: Double
245 exclSize = 1 - inclSize
246 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
248 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
249 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
251 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
252 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
255 -- Final Step building the Typed list
256 termListHead = maps <> cands
258 maps = set gt_listType (Just MapTerm)
259 <$> monoScoredInclHead
260 <> monoScoredExclHead
261 <> multScoredInclHead
262 <> multScoredExclHead
264 cands = set gt_listType (Just CandidateTerm)
265 <$> monoScoredInclTail
266 <> monoScoredExclTail
267 <> multScoredInclTail
268 <> multScoredExclTail
270 termListTail = map (set gt_listType (Just CandidateTerm)) ( groupedMonoTail <> groupedMultTail)
272 -- printDebug "monoScoredInclHead" monoScoredInclHead
273 -- printDebug "monoScoredExclHead" monoScoredExclTail
274 -- printDebug "multScoredInclHead" multScoredInclHead
275 -- printDebug "multScoredExclTail" multScoredExclTail
277 let result = Map.unionsWith (<>)
278 [ Map.fromList [( NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
279 <> (List.concat $ map toNgramsElement $ termListTail)
280 <> (List.concat $ map toNgramsElement $ stopTerms)
283 -- printDebug "\n result \n" r
288 toNgramsElement :: GroupedText a -> [NgramsElement]
289 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
290 [parentElem] <> childrenElems
293 children = Set.toList setNgrams
294 parentElem = mkNgramsElement (NgramsTerm parent)
295 (fromMaybe CandidateTerm listType)
297 (mSetFromList (NgramsTerm <$> children))
298 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
299 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
301 ) (NgramsTerm <$> children)
304 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
305 toGargList l n = (l,n)
308 isStopTerm :: StopSize -> Text -> Bool
309 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
311 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
313 ------------------------------------------------------------------------------