2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
19 import Data.HashMap.Strict (HashMap)
20 import Data.HashSet (HashSet)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
25 import Data.Tuple.Extra (both)
26 import Gargantext.API.Ngrams.Types (NgramsElement, RepoCmdM, NgramsTerm(..))
27 import Gargantext.Core.Text (size)
28 import Gargantext.Core.Text.List.Group
29 import Gargantext.Core.Text.List.Group.Prelude
30 import Gargantext.Core.Text.List.Group.WithStem
31 import Gargantext.Core.Text.List.Social
32 import Gargantext.Core.Text.List.Social.Prelude
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
34 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
35 import Gargantext.Core.Types.Individu (User(..))
36 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
37 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
38 import Gargantext.Database.Admin.Types.Node (NodeId)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
41 import Gargantext.Database.Query.Table.Node (defaultList)
42 import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
43 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
44 import Gargantext.Database.Query.Tree.Error (HasTreeError)
45 import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
46 import Gargantext.Prelude
47 import qualified Data.HashMap.Strict as HashMap
48 import qualified Data.List as List
49 import qualified Data.Map as Map
50 import qualified Data.Set as Set
51 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
52 import qualified Data.HashSet as HashSet
55 -- TODO maybe useful for later
56 isStopTerm :: StopSize -> Text -> Bool
57 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
59 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
63 -- | TODO improve grouping functions of Authors, Sources, Institutes..
64 buildNgramsLists :: ( RepoCmdM env err m
73 -> m (Map NgramsType [NgramsElement])
74 buildNgramsLists gp user uCid mCid = do
75 ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
76 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
77 [ (Authors , MapListSize 9)
78 , (Sources , MapListSize 9)
79 , (Institutes, MapListSize 9)
82 pure $ Map.unions $ [ngTerms] <> othersTerms
85 data MapListSize = MapListSize { unMapListSize :: !Int }
87 buildNgramsOthersList ::( HasNodeError err
95 -> (NgramsType, MapListSize)
96 -> m (Map NgramsType [NgramsElement])
97 buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do
98 allTerms :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt
100 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
101 socialLists :: FlowCont NgramsTerm FlowListScores
102 <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
104 $ List.zip (HashMap.keys allTerms)
105 (List.cycle [mempty])
108 groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
110 (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
111 $ view flc_scores groupedWithList
113 (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms
115 listSize = mapListSize - (List.length mapTerms)
116 (mapTerms', candiTerms) = both HashMap.fromList
117 $ List.splitAt listSize
118 $ List.sortOn (Down . viewScore . snd)
119 $ HashMap.toList tailTerms'
122 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
123 <> (toNgramsElement mapTerms )
124 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
125 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
129 getGroupParams :: ( HasNodeError err
134 => GroupParams -> HashSet Ngrams -> m GroupParams
135 getGroupParams gp@(GroupWithPosTag l a _m) ng = do
136 hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
137 -- printDebug "hashMap" hashMap
138 pure $ over gwl_map (\x -> x <> hashMap) gp
139 getGroupParams gp _ = pure gp
143 buildNgramsTermsList :: ( HasNodeError err
152 -> (NgramsType, MapListSize)
153 -> m (Map NgramsType [NgramsElement])
154 buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
156 -- Filter 0 With Double
157 -- Computing global speGen score
158 allTerms :: HashMap NgramsTerm Double <- getTficf uCid mCid nt
160 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
161 socialLists :: FlowCont NgramsTerm FlowListScores
162 <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
164 $ List.zip (HashMap.keys allTerms)
165 (List.cycle [mempty])
167 let ngramsKeys = HashMap.keysSet allTerms
169 groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
172 socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
173 --printDebug "socialLists_Stemmed" socialLists_Stemmed
174 groupedWithList = toGroupedTree socialLists_Stemmed allTerms
175 (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
176 $ HashMap.filter (\g -> (view gts'_score g) > 1)
177 $ view flc_scores groupedWithList
179 (groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
181 -- printDebug "stopTerms" stopTerms
183 -- splitting monterms and multiterms to take proportional candidates
184 -- use % of list if to big, or Int if too small
185 listSizeGlobal = 2000 :: Double
186 monoSize = 0.4 :: Double
187 multSize = 1 - monoSize
189 splitAt n' ns = both (HashMap.fromListWith (<>))
190 $ List.splitAt (round $ n' * listSizeGlobal)
191 $ List.sortOn (viewScore . snd)
194 (groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
195 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
197 -------------------------
198 -- Filter 1 With Set NodeId and SpeGen
199 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
202 -- TODO remove (and remove HasNodeError instance)
203 userListId <- defaultList uCid
204 masterListId <- defaultList mCid
206 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
207 [userListId, masterListId]
212 groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
213 groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
214 $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
217 printDebug "groupedTreeScores_SetNodeId" groupedTreeScores_SetNodeId
219 -- Coocurrences computation
220 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
221 let mapCooc = HashMap.filter (>1) -- removing cooc of 1
222 $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
223 | (t1, s1) <- mapStemNodeIds
224 , (t2, s2) <- mapStemNodeIds
227 mapStemNodeIds = HashMap.toList
228 $ HashMap.map viewScores
229 $ groupedTreeScores_SetNodeId
232 mapScores f = HashMap.fromList
233 $ map (\g -> (view scored_terms g, f g))
237 $ Map.fromList -- TODO remove this
238 $ HashMap.toList mapCooc
241 groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
242 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId
245 -- sort / partition / split
246 -- filter mono/multi again
247 (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen
249 -- filter with max score
250 partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
251 > (view scored_speExc $ view gts'_score g)
254 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
255 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
259 -- use % of list if to big, or Int if to small
260 mapSize = 1000 :: Double
261 canSize = mapSize * 5 :: Double
263 inclSize = 0.4 :: Double
264 exclSize = 1 - inclSize
266 splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
267 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
269 monoInc_size n = splitAt' n $ monoSize * inclSize / 2
270 multExc_size n = splitAt' n $ multSize * exclSize / 2
273 (mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
274 (mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl
276 (mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
277 (mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl
280 (canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
281 (canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail
283 (canMulScoredInclHead, _) = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
284 (canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail
286 ------------------------------------------------------------
287 -- Final Step building the Typed list
288 -- Candidates Terms need to be filtered
290 maps = setListType (Just MapTerm)
291 $ mapMonoScoredInclHead
292 <> mapMonoScoredExclHead
293 <> mapMultScoredInclHead
294 <> mapMultScoredExclHead
296 -- An original way to filter to start with
297 cands = setListType (Just CandidateTerm)
298 $ canMonoScoredIncHead
299 <> canMonoScoredExclHead
300 <> canMulScoredInclHead
301 <> canMultScoredExclHead
304 cands' = setListType (Just CandidateTerm)
309 candNgramsElement = List.take 5000
310 $ toNgramsElement cands <> toNgramsElement cands'
312 result = Map.unionsWith (<>)
313 [ Map.fromList [( nt, toNgramsElement maps
314 <> toNgramsElement stopTerms