2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
14 {-# LANGUAGE BangPatterns #-}
16 module Gargantext.Core.Text.List
19 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
20 import Data.HashMap.Strict (HashMap)
21 import Data.HashSet (HashSet)
23 import Data.Monoid (mempty)
24 import Data.Ord (Down(..))
26 import Data.Text (Text)
27 import Data.Tuple.Extra (both)
28 import Gargantext.API.Ngrams.Types (NgramsElement, NgramsTerm(..))
29 import Gargantext.Core.NodeStory
30 import Gargantext.Core.Text (size)
31 import Gargantext.Core.Text.List.Group
32 import Gargantext.Core.Text.List.Group.Prelude
33 import Gargantext.Core.Text.List.Group.WithStem
34 import Gargantext.Core.Text.List.Social
35 import Gargantext.Core.Text.List.Social.Prelude
36 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
37 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
38 import Gargantext.Core.Types.Individu (User(..))
39 import Gargantext.Database.Action.Metrics.NgramsByContext (getContextsByNgramsUser, getContextsByNgramsOnlyUser)
40 import Gargantext.Database.Action.Metrics.TFICF (getTficf_withSample)
41 import Gargantext.Database.Admin.Types.Node (NodeId)
42 import Gargantext.Database.Prelude (CmdM)
43 import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
44 import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
45 import Gargantext.Database.Query.Table.Node (defaultList)
46 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
47 import Gargantext.Database.Query.Tree.Error (HasTreeError)
48 import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
49 import Gargantext.Prelude
50 import qualified Data.HashMap.Strict as HashMap
51 import qualified Data.HashSet as HashSet
52 import qualified Data.List as List
53 import qualified Data.Map as Map
54 import qualified Data.Set as Set
55 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
58 -- TODO maybe useful for later
59 isStopTerm :: StopSize -> Text -> Bool
60 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
62 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
66 -- | TODO improve grouping functions of Authors, Sources, Institutes..
67 buildNgramsLists :: ( HasNodeStory env err m
75 -> Maybe FlowSocialListWith
77 -> m (Map NgramsType [NgramsElement])
78 buildNgramsLists user uCid mCid mfslw gp = do
79 ngTerms <- buildNgramsTermsList user uCid mCid mfslw gp (NgramsTerms, MapListSize 350)
80 othersTerms <- mapM (buildNgramsOthersList user uCid mfslw GroupIdentity)
81 [ (Authors , MapListSize 9, MaxListSize 1000)
82 , (Sources , MapListSize 9, MaxListSize 1000)
83 , (Institutes, MapListSize 9, MaxListSize 1000)
86 pure $ Map.unions $ [ngTerms] <> othersTerms
89 data MapListSize = MapListSize { unMapListSize :: !Int }
90 data MaxListSize = MaxListSize { unMaxListSize :: !Int }
92 buildNgramsOthersList :: ( HasNodeError err
94 , HasNodeStory env err m
99 -> Maybe FlowSocialListWith
101 -> (NgramsType, MapListSize, MaxListSize)
102 -> m (Map NgramsType [NgramsElement])
103 buildNgramsOthersList user uCid mfslw _groupParams (nt, MapListSize mapListSize, MaxListSize maxListSize) = do
104 allTerms :: HashMap NgramsTerm (Set NodeId) <- getContextsByNgramsUser uCid nt
106 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
107 socialLists :: FlowCont NgramsTerm FlowListScores
108 <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
110 $ List.zip (HashMap.keys allTerms)
111 (List.cycle [mempty])
114 groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
116 (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
117 $ view flc_scores groupedWithList
119 (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms
121 listSize = mapListSize - (List.length mapTerms)
122 (mapTerms', candiTerms) = both HashMap.fromList
123 $ List.splitAt listSize
124 $ List.take maxListSize
125 $ List.sortOn (Down . viewScore . snd)
126 $ HashMap.toList tailTerms'
129 pure $ Map.fromList [( nt, List.take maxListSize $ (toNgramsElement stopTerms)
130 <> (toNgramsElement mapTerms )
131 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
132 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
136 getGroupParams :: ( HasNodeError err
138 , HasNodeStory env err m
141 => GroupParams -> HashSet Ngrams -> m GroupParams
142 getGroupParams gp@(GroupWithPosTag l a _m) ng = do
143 !hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
144 -- printDebug "hashMap" hashMap
145 pure $ over gwl_map (\x -> x <> hashMap) gp
146 getGroupParams gp _ = pure gp
150 buildNgramsTermsList :: ( HasNodeError err
152 , HasNodeStory env err m
158 -> Maybe FlowSocialListWith
160 -> (NgramsType, MapListSize)
161 -> m (Map NgramsType [NgramsElement])
162 buildNgramsTermsList user uCid mCid mfslw groupParams (nt, MapListSize mapListSize) = do
164 -- Filter 0 With Double
165 -- Computing global speGen score
166 printDebug "[buildNgramsTermsList: Sample List] / start" nt
167 !(allTerms :: HashMap NgramsTerm Double) <- getTficf_withSample uCid mCid nt
169 printDebug "[buildNgramsTermsList: Sample List / end]" (nt, HashMap.size allTerms)
171 printDebug "[buildNgramsTermsList: Flow Social List / start]" nt
173 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
174 !(socialLists :: FlowCont NgramsTerm FlowListScores)
175 <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
177 $ List.zip (HashMap.keys allTerms)
178 (List.cycle [mempty])
180 printDebug "[buildNgramsTermsList: Flow Social List / end]" nt
182 let !ngramsKeys = HashSet.fromList $ List.take mapListSize $ HashSet.toList $ HashMap.keysSet allTerms
184 printDebug "[buildNgramsTermsList: ngramsKeys]" (HashSet.size ngramsKeys)
186 !groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
188 printDebug "[buildNgramsTermsList: groupParams']" ("" :: Text)
191 !socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
192 --printDebug "socialLists_Stemmed" socialLists_Stemmed
193 !groupedWithList = toGroupedTree socialLists_Stemmed allTerms
194 !(stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
195 $ HashMap.filter (\g -> (view gts'_score g) > 1)
196 $ view flc_scores groupedWithList
198 !(groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
200 printDebug "[buildNgramsTermsList] stopTerms" stopTerms
202 -- splitting monterms and multiterms to take proportional candidates
203 -- use % of list if to big, or Int if too small
205 !listSizeGlobal = 2000 :: Double
206 !monoSize = 0.4 :: Double
207 !multSize = 1 - monoSize
209 splitAt n' ns = both (HashMap.fromListWith (<>))
210 $ List.splitAt (round $ n' * listSizeGlobal)
211 $ List.sortOn (viewScore . snd)
214 !(groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
215 !(groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
217 -------------------------
218 -- Filter 1 With Set NodeId and SpeGen
219 !selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
221 printDebug "[buildNgramsTermsList: selectedTerms]" selectedTerms
223 -- TODO remove (and remove HasNodeError instance)
224 !userListId <- defaultList uCid
225 !masterListId <- defaultList mCid
227 !mapTextDocIds <- getContextsByNgramsOnlyUser uCid
228 [userListId, masterListId]
233 printDebug "[buildNgramsTermsList: mapTextDocIds]" mapTextDocIds
236 groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
237 !groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
238 $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
241 printDebug "[buildNgramsTermsList: groupedTreeScores_SetNodeId]" groupedTreeScores_SetNodeId
243 -- Coocurrences computation
244 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
245 let !mapCooc = HashMap.filter (>1) -- removing cooc of 1
246 $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
247 | (t1, s1) <- mapStemNodeIds
248 , (t2, s2) <- mapStemNodeIds
251 mapStemNodeIds = HashMap.toList
252 $ HashMap.map viewScores
253 $ groupedTreeScores_SetNodeId
256 mapScores f = HashMap.fromList
257 $ map (\g -> (view scored_terms g, f g))
261 $ Map.fromList -- TODO remove this
262 $ HashMap.toList mapCooc
265 groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
266 !groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId
269 -- sort / partition / split
270 -- filter mono/multi again
271 !(monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen
273 -- filter with max score
274 partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
275 > (view scored_speExc $ view gts'_score g)
278 !(monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
279 !(multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
283 -- use % of list if to big, or Int if to small
284 !mapSize = 1000 :: Double
285 !canSize = mapSize * 2 :: Double
287 !inclSize = 0.4 :: Double
288 !exclSize = 1 - inclSize
290 splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
291 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
293 monoInc_size n = splitAt' n $ monoSize * inclSize / 2
294 multExc_size n = splitAt' n $ multSize * exclSize / 2
297 !(mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
298 !(mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl
300 !(mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
301 !(mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl
304 !(canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
305 !(canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail
307 !(canMulScoredInclHead, _) = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
308 !(canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail
310 ------------------------------------------------------------
311 -- Final Step building the Typed list
312 -- Candidates Terms need to be filtered
314 !maps = setListType (Just MapTerm)
315 $ mapMonoScoredInclHead
316 <> mapMonoScoredExclHead
317 <> mapMultScoredInclHead
318 <> mapMultScoredExclHead
320 -- An original way to filter to start with
321 !cands = setListType (Just CandidateTerm)
322 $ canMonoScoredIncHead
323 <> canMonoScoredExclHead
324 <> canMulScoredInclHead
325 <> canMultScoredExclHead
328 !cands' = setListType (Just CandidateTerm)
333 !candNgramsElement = List.take 1000
334 $ toNgramsElement cands <> toNgramsElement cands'
336 !result = Map.unionsWith (<>)
337 [ Map.fromList [( nt, toNgramsElement maps
338 <> toNgramsElement stopTerms