{-|
Module      : Gargantext.Core.Text.List
Description : Tools to build lists
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
-}

{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell     #-}

module Gargantext.Core.Text.List
  where
import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
import Data.HashMap.Strict (HashMap)
import Data.HashSet (HashSet)
import Data.Map (Map)
import Data.Monoid (mempty)
import Data.Ord (Down(..))
import Data.Set (Set)
import Data.Text (Text)
import Data.Tuple.Extra (both)
import Gargantext.API.Ngrams.Types (NgramsElement, NgramsTerm(..))
import Gargantext.Core.NodeStory
import Gargantext.Core.Text (size)
import Gargantext.Core.Text.List.Group
import Gargantext.Core.Text.List.Group.Prelude
import Gargantext.Core.Text.List.Group.WithStem
import Gargantext.Core.Text.List.Social
import Gargantext.Core.Text.List.Social.Prelude
import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.Metrics.NgramsByContext (getContextsByNgramsUser, getContextsByNgramsOnlyUser)
import Gargantext.Database.Action.Metrics.TFICF (getTficf_withSample)
import Gargantext.Database.Admin.Types.Node (NodeId)
import Gargantext.Database.Prelude (CmdM)
import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
import Gargantext.Database.Query.Table.Node (defaultList)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
import Gargantext.Database.Query.Tree.Error (HasTreeError)
import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
import Gargantext.Prelude
import qualified Data.Char as Char
import qualified Data.HashMap.Strict as HashMap
import qualified Data.HashSet as HashSet
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Data.Text as Text
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap

-- TODO maybe useful for later
isStopTerm :: StopSize -> Text -> Bool
isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
  where
    isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
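-- Illustrative behaviour (a sketch, using a hypothetical 'StopSize 3'):
--   isStopTerm (StopSize 3) "ab"      == True   -- shorter than the threshold
--   isStopTerm (StopSize 3) "abc$def" == True   -- '$' is neither allowed punctuation nor a letter
--   isStopTerm (StopSize 3) "hello"   == False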

-- | TODO improve grouping functions of Authors, Sources, Institutes..
buildNgramsLists :: ( HasNodeStory env err m
                    , CmdM     env err m
                    , HasNodeError err
                    , HasTreeError err
                    )
                 => User
                 -> UserCorpusId
                 -> MasterCorpusId
                 -> Maybe FlowSocialListWith
                 -> GroupParams
                 -> m (Map NgramsType [NgramsElement])
buildNgramsLists user uCid mCid mfslw gp = do
  ngTerms     <- buildNgramsTermsList user uCid mCid mfslw gp (NgramsTerms, MapListSize 350)
  othersTerms <- mapM (buildNgramsOthersList user uCid mfslw GroupIdentity)
                      [ (Authors   , MapListSize 9)
                      , (Sources   , MapListSize 9)
                      , (Institutes, MapListSize 9)
                      ]
  pure $ Map.unions $ [ngTerms] <> othersTerms
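
-- Illustrative use (every identifier below is a hypothetical placeholder, not
-- part of this module):
--
-- > lists <- buildNgramsLists someUser userCorpusId masterCorpusId Nothing someGroupParams
-- > -- lists :: Map NgramsType [NgramsElement], one entry per ngrams type built above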

data MapListSize = MapListSize { unMapListSize :: !Int }

buildNgramsOthersList :: ( HasNodeError err
                         , CmdM     env err m
                         , HasNodeStory env err m
                         , HasTreeError err
                         )
                      => User
                      -> UserCorpusId
                      -> Maybe FlowSocialListWith
                      -> GroupParams
                      -> (NgramsType, MapListSize)
                      -> m (Map NgramsType [NgramsElement])
buildNgramsOthersList user uCid mfslw _groupParams (nt, MapListSize mapListSize) = do
  allTerms :: HashMap NgramsTerm (Set NodeId) <- getContextsByNgramsUser uCid nt

  -- PrivateFirst for early development, since the Public NodeMode is not implemented yet
  socialLists :: FlowCont NgramsTerm FlowListScores
    <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
                                    $ HashMap.fromList
                                    $ List.zip (HashMap.keys allTerms)
                                               (List.cycle [mempty])
                                    )
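  -- Each term of the corpus is seeded with 'mempty' scores; 'flowSocialList'
  -- then fills in the scores found in the (private first) social lists.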

  let
    groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms

    (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
                           $ view flc_scores groupedWithList

    (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms

    listSize = mapListSize - List.length mapTerms

    (mapTerms', candiTerms) = both HashMap.fromList
                            $ List.splitAt listSize
                            $ List.sortOn (Down . viewScore . snd)
                            $ HashMap.toList tailTerms'
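    -- Terms already flagged MapTerm by the social lists keep their flag; the
    -- remaining budget (mapListSize minus those) is filled with the highest
    -- scored tail terms, and everything after the cut becomes a candidate.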

  pure $ Map.fromList [( nt, toNgramsElement stopTerms
                          <> toNgramsElement mapTerms
                          <> (toNgramsElement $ setListType (Just MapTerm      ) mapTerms'  )
                          <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
                       )]
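
-- | For 'GroupWithPosTag', extend the grouping map with the lemmas already
--   known in the database (via 'selectLems'); any other 'GroupParams' is
--   returned unchanged.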
getGroupParams :: ( HasNodeError err
                  , CmdM     env err m
                  , HasNodeStory env err m
                  , HasTreeError err
                  )
               => GroupParams -> HashSet Ngrams -> m GroupParams
getGroupParams gp@(GroupWithPosTag l a _m) ng = do
  hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
  -- printDebug "hashMap" hashMap
  pure $ over gwl_map (<> hashMap) gp
getGroupParams gp _ = pure gp
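
-- | Build the NgramsTerms list: compute TFICF scores on a sample of the
--   corpus, overlay the social lists, group terms by stem/lemma, then split
--   the groups into Stop, Map and Candidate terms using their
--   genericity/specificity scores.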
buildNgramsTermsList :: ( HasNodeError err
                        , CmdM     env err m
                        , HasNodeStory env err m
                        , HasTreeError err
                        )
                     => User
                     -> UserCorpusId
                     -> MasterCorpusId
                     -> Maybe FlowSocialListWith
                     -> GroupParams
                     -> (NgramsType, MapListSize)
                     -> m (Map NgramsType [NgramsElement])
buildNgramsTermsList user uCid mCid mfslw groupParams (nt, _mapListSize) = do

  -- Filter 0 With Double
  -- Computing global speGen score
  printDebug "[buildNgramsTermsList: Sample List / start]" nt
  allTerms :: HashMap NgramsTerm Double <- getTficf_withSample uCid mCid nt
  printDebug "[buildNgramsTermsList: Sample List / end]" (nt, HashMap.size allTerms)

  printDebug "[buildNgramsTermsList: Flow Social List / start]" nt
  -- PrivateFirst for early development, since the Public NodeMode is not implemented yet
  socialLists :: FlowCont NgramsTerm FlowListScores
    <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
                                    $ HashMap.fromList
                                    $ List.zip (HashMap.keys allTerms)
                                               (List.cycle [mempty])
                                    )
  printDebug "[buildNgramsTermsList: Flow Social List / end]" nt

  let ngramsKeys = HashMap.keysSet allTerms

  groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)

  let
    socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
    -- printDebug "socialLists_Stemmed" socialLists_Stemmed

    groupedWithList = toGroupedTree socialLists_Stemmed allTerms

    (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
                                $ HashMap.filter (\g -> view gts'_score g > 1)
                                $ view flc_scores groupedWithList
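    -- Groups whose aggregated score is <= 1 (hapax at the group level) are
    -- dropped above; the rest is split into stop terms (as flagged by the
    -- social lists) and the candidates scored below.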

    (groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms

    -- printDebug "stopTerms" stopTerms

    -- splitting monoterms and multiterms to take proportional candidates
    -- use % of list if too big, or Int if too small
    listSizeGlobal = 2000 :: Double
    monoSize       = 0.4  :: Double
    multSize       = 1 - monoSize

    splitAt n' ns = both (HashMap.fromListWith (<>))
                  $ List.splitAt (round $ n' * listSizeGlobal)
                  $ List.sortOn (viewScore . snd)
                  $ HashMap.toList ns
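    -- With the defaults above, round (0.4 * 2000) = 800 monoterm groups and
    -- round (0.6 * 2000) = 1200 multiterm groups are kept as global candidates.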

    (groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
    (groupedMultHead, groupedMultTail)  = splitAt multSize groupedMult

    -------------------------
    -- Filter 1 With Set NodeId and SpeGen
    selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)

  -- TODO remove (and remove HasNodeError instance)
  userListId   <- defaultList uCid
  masterListId <- defaultList mCid

  mapTextDocIds <- getContextsByNgramsOnlyUser uCid
                                               [userListId, masterListId]
                                               nt
                                               selectedTerms

  -- printDebug "mapTextDocIds" mapTextDocIds

  let
    groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
    groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
                                $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)

  -- printDebug "groupedTreeScores_SetNodeId" groupedTreeScores_SetNodeId

  -- Cooccurrences computation
  --, t1 >= t2 -- permute byAxis diag -- since the matrix is symmetric
  let mapCooc = HashMap.filter (>1) -- removing cooc of 1
              $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
                                 | (t1, s1) <- mapStemNodeIds
                                 , (t2, s2) <- mapStemNodeIds
                                 ]
        where
          mapStemNodeIds = HashMap.toList
                         $ HashMap.map viewScores
                         $ groupedTreeScores_SetNodeId

  let
    mapScores f = HashMap.fromList
                $ map (\g -> (view scored_terms g, f g))
                $ normalizeGlobal
                $ map normalizeLocal
                $ scored'
                $ Map.fromList -- TODO remove this
                $ HashMap.toList mapCooc
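    -- 'scored'' turns the cooccurrence counts into per-term genericity
    -- (genInc) and specificity (speExc) scores, normalized locally then
    -- globally; mapScores re-indexes them by term so they can be pushed back
    -- into the grouped tree below.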

  let
    groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
    groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId

    -- sort / partition / split
    -- filter mono/multi again
    (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen

    -- filter with max score
    partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
                                                   > (view scored_speExc $ view gts'_score g)
                                              )

    (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
    (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
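    -- A group lands in the "Incl" bucket when its genericity (genInc) dominates
    -- its specificity (speExc), and in the "Excl" bucket otherwise; mono- and
    -- multi-terms are kept separate so each gets its own quota below.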

    -- use % of list if too big, or Int if too small
    mapSize = 1000 :: Double
    canSize = mapSize * 5 :: Double

    inclSize = 0.4 :: Double
    exclSize = 1 - inclSize

    splitAt' max' n' = both HashMap.fromList . List.splitAt (round $ n' * max')
    sortOn f         = List.sortOn (Down . view (gts'_score . f) . snd) . HashMap.toList

    monoInc_size n = splitAt' n $ monoSize * inclSize / 2
    multExc_size n = splitAt' n $ multSize * exclSize / 2

    (mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ sortOn scored_genInc monoScoredIncl
    (mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ sortOn scored_speExc monoScoredExcl

    (mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ sortOn scored_genInc multScoredIncl
    (mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ sortOn scored_speExc multScoredExcl

    (canMonoScoredIncHead , _) = monoInc_size canSize $ sortOn scored_genInc monoScoredInclTail
    (canMonoScoredExclHead, _) = monoInc_size canSize $ sortOn scored_speExc monoScoredExclTail

    (canMulScoredInclHead , _) = multExc_size canSize $ sortOn scored_genInc multScoredInclTail
    (canMultScoredExclHead, _) = multExc_size canSize $ sortOn scored_speExc multScoredExclTail
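    -- With the sizes above, each monoterm quadrant contributes
    -- round (0.4 * 0.4 / 2 * 1000) = 80 map terms and each multiterm quadrant
    -- round (0.6 * 0.6 / 2 * 1000) = 180; the candidate heads use the same
    -- ratios against canSize (5000), i.e. 400 and 900 terms per quadrant.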

    ------------------------------------------------------------
    -- Final step: building the typed list
    -- Candidate Terms need to be filtered
    maps = setListType (Just MapTerm)
        $  mapMonoScoredInclHead
        <> mapMonoScoredExclHead
        <> mapMultScoredInclHead
        <> mapMultScoredExclHead

    -- A first, rough way to filter the candidates
    cands = setListType (Just CandidateTerm)
         $  canMonoScoredIncHead
         <> canMonoScoredExclHead
         <> canMulScoredInclHead
         <> canMultScoredExclHead

    cands' = setListType (Just CandidateTerm)
           $ groupedMultTail

    candNgramsElement = List.take 5000
                      $ toNgramsElement cands <> toNgramsElement cands'

    result = Map.unionsWith (<>)
           [ Map.fromList [( nt, toNgramsElement maps
                              <> toNgramsElement stopTerms