{-|
Module      : Gargantext.Core.Text.Ngrams.Lists
Description : Tools to build lists
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
-}
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14 {-# LANGUAGE BangPatterns #-}
16 module Gargantext.Core.Text.List
19 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
20 import Data.HashMap.Strict (HashMap)
21 import Data.HashSet (HashSet)
22 import Data.Map.Strict (Map)
23 import Data.Monoid (mempty)
24 import Data.Ord (Down(..))
26 import Data.Text (Text)
27 import Data.Tuple.Extra (both)
28 import Gargantext.API.Ngrams.Types (NgramsElement, NgramsTerm(..))
29 import Gargantext.Core.NodeStory
30 import Gargantext.Core.Text (size)
31 import Gargantext.Core.Text.List.Group
32 import Gargantext.Core.Text.List.Group.Prelude
33 import Gargantext.Core.Text.List.Group.WithStem
34 import Gargantext.Core.Text.List.Social
35 import Gargantext.Core.Text.List.Social.Prelude
36 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
37 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
38 import Gargantext.Core.Types.Individu (User(..))
39 import Gargantext.Database.Action.Metrics.NgramsByContext (getContextsByNgramsUser, getContextsByNgramsOnlyUser)
40 import Gargantext.Database.Action.Metrics.TFICF (getTficf_withSample)
41 import Gargantext.Database.Admin.Types.Node (NodeId)
42 import Gargantext.Database.Prelude (CmdM)
43 import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
44 import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
45 import Gargantext.Database.Query.Table.Node (defaultList)
46 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
47 import Gargantext.Database.Query.Tree.Error (HasTreeError)
48 import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
49 import Gargantext.Prelude
50 import qualified Data.HashMap.Strict as HashMap
51 import qualified Data.HashSet as HashSet
52 import qualified Data.List as List
53 import qualified Data.Map.Strict as Map
54 import qualified Data.Set as Set
55 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
-- TODO maybe useful for later
-- | A term is considered a stop term when it is shorter than the
-- given 'StopSize', or when it contains a character that is neither
-- alphabetic nor one of the allowed separators @"- /()%"@.
isStopTerm :: StopSize -> Text -> Bool
isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
  where
    -- A character is "stop" unless it is a separator or a letter.
    isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
-- | TODO improve grouping functions of Authors, Sources, Institutes..
--
-- Build every ngrams list of a corpus: a large 'NgramsTerms' list
-- (with term grouping, map size 350) plus small ungrouped lists for
-- the other ngrams types (Authors, Sources, Institutes), merged into
-- a single per-type 'Map'.
buildNgramsLists :: ( HasNodeStory env err m
                    , CmdM     env err m
                    , HasNodeError err
                    , HasTreeError err
                    )
                 => User
                 -> UserCorpusId
                 -> MasterCorpusId
                 -> Maybe FlowSocialListWith
                 -> GroupParams
                 -> m (Map NgramsType [NgramsElement])
buildNgramsLists user uCid mCid mfslw gp = do
  ngTerms     <- buildNgramsTermsList user uCid mCid mfslw gp (NgramsTerms, MapListSize 350)
  othersTerms <- mapM (buildNgramsOthersList user uCid mfslw GroupIdentity)
                      [ (Authors   , MapListSize 9, MaxListSize 1000)
                      , (Sources   , MapListSize 9, MaxListSize 1000)
                      , (Institutes, MapListSize 9, MaxListSize 1000)
                      ]
  pure $ Map.unions $ [ngTerms] <> othersTerms
-- | Maximum number of terms promoted to the Map list.
-- A single-field, single-constructor type: 'newtype' is the idiomatic,
-- zero-cost form (the strictness bang of a @data@ declaration is
-- unnecessary here, since a newtype adds no extra laziness).
newtype MapListSize = MapListSize { unMapListSize :: Int }

-- | Hard cap on the total number of terms kept in a list.
newtype MaxListSize = MaxListSize { unMaxListSize :: Int }
-- | Build the ngrams list for a non-terms ngrams type (Authors,
-- Sources, Institutes). No stemming/grouping is applied
-- (the 'GroupParams' argument is ignored): terms are scored, ranked,
-- and split into Map \/ Candidate lists bounded by the given sizes.
buildNgramsOthersList :: ( HasNodeError err
                         , CmdM     env err m
                         , HasNodeStory env err m
                         , HasTreeError err
                         )
                      => User
                      -> UserCorpusId
                      -> Maybe FlowSocialListWith
                      -> GroupParams
                      -> (NgramsType, MapListSize, MaxListSize)
                      -> m (Map NgramsType [NgramsElement])
buildNgramsOthersList user uCid mfslw _groupParams (nt, MapListSize mapListSize, MaxListSize maxListSize) = do
  allTerms :: HashMap NgramsTerm (Set NodeId) <- getContextsByNgramsUser uCid nt

  -- PrivateFirst for first developments since Public NodeMode is not implemented yet
  socialLists :: FlowCont NgramsTerm FlowListScores
    <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
                                    $ HashMap.fromList
                                    $ List.zip (HashMap.keys allTerms)
                                               (List.cycle [mempty])
                                    )
  let
    groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms

    -- Stop terms are kept aside untouched; the rest is ranked below.
    (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
                           $ view flc_scores groupedWithList

    (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms

    -- Terms already flagged MapTerm count against the map-list budget.
    listSize = mapListSize - (List.length mapTerms)
    (mapTerms', candiTerms) = both HashMap.fromList
                            $ List.splitAt listSize
                            $ List.take maxListSize
                            $ List.sortOn (Down . viewScore . snd)
                            $ HashMap.toList tailTerms'

  pure $ Map.fromList [( nt, List.take maxListSize $ (toNgramsElement stopTerms)
                                                  <> (toNgramsElement mapTerms )
                                                  <> (toNgramsElement $ setListType (Just MapTerm      ) mapTerms' )
                                                  <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
                       )]
-- | Enrich 'GroupParams' with lemmatised forms of the given ngrams.
--
-- For 'GroupWithPosTag' the lemma table is fetched from the database
-- ('selectLems') and merged into the existing 'gwl_map'; any other
-- grouping strategy is returned unchanged.
getGroupParams :: ( HasNodeError err
                  , CmdM     env err m
                  , HasNodeStory env err m
                  , HasTreeError err
                  )
               => GroupParams -> HashSet Ngrams -> m GroupParams
getGroupParams gp@(GroupWithPosTag l a _m) ng = do
  !hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
  -- printDebug "hashMap" hashMap
  pure $ over gwl_map (\x -> x <> hashMap) gp
getGroupParams gp _ = pure gp
150 buildNgramsTermsList :: ( HasNodeError err
152 , HasNodeStory env err m
158 -> Maybe FlowSocialListWith
160 -> (NgramsType, MapListSize)
161 -> m (Map NgramsType [NgramsElement])
162 buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSize) = do
164 -- Filter 0 With Double
165 -- Computing global speGen score
166 printDebug "[buildNgramsTermsList: Sample List] / start" nt
167 !(allTerms :: HashMap NgramsTerm Double) <- getTficf_withSample uCid mCid nt
169 printDebug "[buildNgramsTermsList: Sample List / end]" (nt, HashMap.size allTerms)
171 printDebug "[buildNgramsTermsList: Flow Social List / start]" nt
173 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
174 !(socialLists :: FlowCont NgramsTerm FlowListScores)
175 <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
177 $ List.zip (HashMap.keys allTerms)
178 (List.cycle [mempty])
180 printDebug "[buildNgramsTermsList: Flow Social List / end]" nt
182 let !ngramsKeys = HashSet.fromList
183 $ List.take mapListSize
185 $ HashMap.keysSet allTerms
187 printDebug "[buildNgramsTermsList: ngramsKeys]" (HashSet.size ngramsKeys)
189 !groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
191 printDebug "[buildNgramsTermsList: groupParams']" ("" :: Text)
194 !socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
195 !groupedWithList = toGroupedTree socialLists_Stemmed allTerms
196 !(stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
197 $ HashMap.filter (\g -> (view gts'_score g) > 1)
198 $ view flc_scores groupedWithList
200 !(groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
202 printDebug "[buildNgramsTermsList] socialLists" socialLists
203 printDebug "[buildNgramsTermsList] socialLists with scores" socialLists_Stemmed
204 printDebug "[buildNgramsTermsList] groupedWithList" groupedWithList
205 printDebug "[buildNgramsTermsList] stopTerms" stopTerms
207 -- splitting monterms and multiterms to take proportional candidates
208 -- use % of list if to big, or Int if too small
210 !listSizeGlobal = 2000 :: Double
211 !monoSize = 0.4 :: Double
212 !multSize = 1 - monoSize
214 splitAt n' ns = both (HashMap.fromListWith (<>))
215 $ List.splitAt (round $ n' * listSizeGlobal)
216 $ List.sortOn (viewScore . snd)
219 !(groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
220 !(groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
222 -------------------------
223 -- Filter 1 With Set NodeId and SpeGen
224 !selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
226 printDebug "[buildNgramsTermsList: selectedTerms]" selectedTerms
228 -- TODO remove (and remove HasNodeError instance)
229 !userListId <- defaultList uCid
230 !masterListId <- defaultList mCid
232 !mapTextDocIds <- getContextsByNgramsOnlyUser uCid
233 [userListId, masterListId]
238 printDebug "[buildNgramsTermsList: mapTextDocIds]" mapTextDocIds
241 groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
242 !groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
243 $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
246 printDebug "[buildNgramsTermsList: groupedTreeScores_SetNodeId]" groupedTreeScores_SetNodeId
248 -- Coocurrences computation
249 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
250 let !mapCooc = HashMap.filter (>1) -- removing cooc of 1
251 $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
252 | (t1, s1) <- mapStemNodeIds
253 , (t2, s2) <- mapStemNodeIds
256 mapStemNodeIds = HashMap.toList
257 $ HashMap.map viewScores
258 $ groupedTreeScores_SetNodeId
261 mapScores f = HashMap.fromList
262 $ map (\g -> (view scored_terms g, f g))
266 $ Map.fromList -- TODO remove this
267 $ HashMap.toList mapCooc
270 groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
271 !groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId
274 -- sort / partition / split
275 -- filter mono/multi again
276 !(monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen
278 -- filter with max score
279 partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
280 > (view scored_speExc $ view gts'_score g)
283 !(monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
284 !(multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
288 -- use % of list if to big, or Int if to small
289 !mapSize = 1000 :: Double
290 !canSize = mapSize * 2 :: Double
292 !inclSize = 0.4 :: Double
293 !exclSize = 1 - inclSize
295 splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
296 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
298 monoInc_size n = splitAt' n $ monoSize * inclSize / 2
299 multExc_size n = splitAt' n $ multSize * exclSize / 2
302 !(mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
303 !(mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl
305 !(mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
306 !(mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl
309 !(canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
310 !(canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail
312 !(canMulScoredInclHead, _) = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
313 !(canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail
315 ------------------------------------------------------------
316 -- Final Step building the Typed list
317 -- Candidates Terms need to be filtered
319 !maps = setListType (Just MapTerm)
320 $ mapMonoScoredInclHead
321 <> mapMonoScoredExclHead
322 <> mapMultScoredInclHead
323 <> mapMultScoredExclHead
325 -- An original way to filter to start with
326 !cands = setListType (Just CandidateTerm)
327 $ canMonoScoredIncHead
328 <> canMonoScoredExclHead
329 <> canMulScoredInclHead
330 <> canMultScoredExclHead
333 !cands' = setListType (Just CandidateTerm)
338 !candNgramsElement = List.take 1000
339 $ toNgramsElement cands <> toNgramsElement cands'
341 !result = Map.unionsWith (<>)
342 [ Map.fromList [( nt, toNgramsElement maps
343 <> toNgramsElement stopTerms