2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
19 import Data.HashMap.Strict (HashMap)
20 import Data.HashSet (HashSet)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
25 import Data.Tuple.Extra (both)
26 import Gargantext.API.Ngrams.Types (NgramsElement, NgramsTerm(..))
27 import Gargantext.Core.NodeStory
28 import Gargantext.Core.Text (size)
29 import Gargantext.Core.Text.List.Group
30 import Gargantext.Core.Text.List.Group.Prelude
31 import Gargantext.Core.Text.List.Group.WithStem
32 import Gargantext.Core.Text.List.Social
33 import Gargantext.Core.Text.List.Social.Prelude
34 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
39 import Gargantext.Database.Admin.Types.Node (NodeId)
40 import Gargantext.Database.Prelude (CmdM)
41 import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
42 import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
43 import Gargantext.Database.Query.Table.Node (defaultList)
44 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
45 import Gargantext.Database.Query.Tree.Error (HasTreeError)
46 import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
47 import Gargantext.Prelude
48 import qualified Data.HashMap.Strict as HashMap
49 import qualified Data.HashSet as HashSet
50 import qualified Data.List as List
51 import qualified Data.Map as Map
52 import qualified Data.Set as Set
53 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
56 -- TODO maybe useful for later
57 isStopTerm :: StopSize -> Text -> Bool
58 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
60 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
64 -- | TODO improve grouping functions of Authors, Sources, Institutes..
65 buildNgramsLists :: ( HasNodeStory env err m
74 -> m (Map NgramsType [NgramsElement])
75 buildNgramsLists gp user uCid mCid = do
76 ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
77 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
78 [ (Authors , MapListSize 9)
79 , (Sources , MapListSize 9)
80 , (Institutes, MapListSize 9)
83 pure $ Map.unions $ [ngTerms] <> othersTerms
86 data MapListSize = MapListSize { unMapListSize :: !Int }
88 buildNgramsOthersList ::( HasNodeError err
90 , HasNodeStory env err m
96 -> (NgramsType, MapListSize)
97 -> m (Map NgramsType [NgramsElement])
98 buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do
99 allTerms :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt
101 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
102 socialLists :: FlowCont NgramsTerm FlowListScores
103 <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
105 $ List.zip (HashMap.keys allTerms)
106 (List.cycle [mempty])
109 groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
111 (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
112 $ view flc_scores groupedWithList
114 (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms
116 listSize = mapListSize - (List.length mapTerms)
117 (mapTerms', candiTerms) = both HashMap.fromList
118 $ List.splitAt listSize
119 $ List.sortOn (Down . viewScore . snd)
120 $ HashMap.toList tailTerms'
123 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
124 <> (toNgramsElement mapTerms )
125 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
126 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
130 getGroupParams :: ( HasNodeError err
132 , HasNodeStory env err m
135 => GroupParams -> HashSet Ngrams -> m GroupParams
136 getGroupParams gp@(GroupWithPosTag l a _m) ng = do
137 hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
138 -- printDebug "hashMap" hashMap
139 pure $ over gwl_map (\x -> x <> hashMap) gp
140 getGroupParams gp _ = pure gp
144 buildNgramsTermsList :: ( HasNodeError err
146 , HasNodeStory env err m
153 -> (NgramsType, MapListSize)
154 -> m (Map NgramsType [NgramsElement])
155 buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
157 -- Filter 0 With Double
158 -- Computing global speGen score
159 allTerms :: HashMap NgramsTerm Double <- getTficf uCid mCid nt
161 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
162 socialLists :: FlowCont NgramsTerm FlowListScores
163 <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
165 $ List.zip (HashMap.keys allTerms)
166 (List.cycle [mempty])
168 let ngramsKeys = HashMap.keysSet allTerms
170 groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
173 socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
174 --printDebug "socialLists_Stemmed" socialLists_Stemmed
175 groupedWithList = toGroupedTree socialLists_Stemmed allTerms
176 (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
177 $ HashMap.filter (\g -> (view gts'_score g) > 1)
178 $ view flc_scores groupedWithList
180 (groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
182 -- printDebug "stopTerms" stopTerms
184 -- splitting monterms and multiterms to take proportional candidates
185 -- use % of list if to big, or Int if too small
186 listSizeGlobal = 2000 :: Double
187 monoSize = 0.4 :: Double
188 multSize = 1 - monoSize
190 splitAt n' ns = both (HashMap.fromListWith (<>))
191 $ List.splitAt (round $ n' * listSizeGlobal)
192 $ List.sortOn (viewScore . snd)
195 (groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
196 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
198 -------------------------
199 -- Filter 1 With Set NodeId and SpeGen
200 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
203 -- TODO remove (and remove HasNodeError instance)
204 userListId <- defaultList uCid
205 masterListId <- defaultList mCid
207 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
208 [userListId, masterListId]
213 groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
214 groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
215 $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
218 printDebug "groupedTreeScores_SetNodeId" groupedTreeScores_SetNodeId
220 -- Coocurrences computation
221 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
222 let mapCooc = HashMap.filter (>1) -- removing cooc of 1
223 $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
224 | (t1, s1) <- mapStemNodeIds
225 , (t2, s2) <- mapStemNodeIds
228 mapStemNodeIds = HashMap.toList
229 $ HashMap.map viewScores
230 $ groupedTreeScores_SetNodeId
233 mapScores f = HashMap.fromList
234 $ map (\g -> (view scored_terms g, f g))
238 $ Map.fromList -- TODO remove this
239 $ HashMap.toList mapCooc
242 groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
243 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId
246 -- sort / partition / split
247 -- filter mono/multi again
248 (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen
250 -- filter with max score
251 partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
252 > (view scored_speExc $ view gts'_score g)
255 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
256 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
260 -- use % of list if to big, or Int if to small
261 mapSize = 1000 :: Double
262 canSize = mapSize * 5 :: Double
264 inclSize = 0.4 :: Double
265 exclSize = 1 - inclSize
267 splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
268 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
270 monoInc_size n = splitAt' n $ monoSize * inclSize / 2
271 multExc_size n = splitAt' n $ multSize * exclSize / 2
274 (mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
275 (mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl
277 (mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
278 (mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl
281 (canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
282 (canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail
284 (canMulScoredInclHead, _) = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
285 (canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail
287 ------------------------------------------------------------
288 -- Final Step building the Typed list
289 -- Candidates Terms need to be filtered
291 maps = setListType (Just MapTerm)
292 $ mapMonoScoredInclHead
293 <> mapMonoScoredExclHead
294 <> mapMultScoredInclHead
295 <> mapMultScoredExclHead
297 -- An original way to filter to start with
298 cands = setListType (Just CandidateTerm)
299 $ canMonoScoredIncHead
300 <> canMonoScoredExclHead
301 <> canMulScoredInclHead
302 <> canMultScoredExclHead
305 cands' = setListType (Just CandidateTerm)
310 candNgramsElement = List.take 5000
311 $ toNgramsElement cands <> toNgramsElement cands'
313 result = Map.unionsWith (<>)
314 [ Map.fromList [( nt, toNgramsElement maps
315 <> toNgramsElement stopTerms