2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
19 import Data.HashMap.Strict (HashMap)
20 import Data.HashSet (HashSet)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
25 import Data.Tuple.Extra (both)
26 import Gargantext.API.Ngrams.Types (NgramsElement, NgramsTerm(..))
27 import Gargantext.Core.NodeStory
28 import Gargantext.Core.Text (size)
29 import Gargantext.Core.Text.List.Group
30 import Gargantext.Core.Text.List.Group.Prelude
31 import Gargantext.Core.Text.List.Group.WithStem
32 import Gargantext.Core.Text.List.Social
33 import Gargantext.Core.Text.List.Social.Prelude
34 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByContext (getContextsByNgramsUser, getContextsByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf_withSample)
39 import Gargantext.Database.Admin.Types.Node (NodeId)
40 import Gargantext.Database.Prelude (CmdM)
41 import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
42 import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
43 import Gargantext.Database.Query.Table.Node (defaultList)
44 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
45 import Gargantext.Database.Query.Tree.Error (HasTreeError)
46 import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
47 import Gargantext.Prelude
48 import qualified Data.HashMap.Strict as HashMap
49 import qualified Data.HashSet as HashSet
50 import qualified Data.List as List
51 import qualified Data.Map as Map
52 import qualified Data.Set as Set
53 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
56 -- TODO maybe useful for later
57 isStopTerm :: StopSize -> Text -> Bool
58 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
60 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
64 -- | TODO improve grouping functions of Authors, Sources, Institutes..
65 buildNgramsLists :: ( HasNodeStory env err m
73 -> Maybe FlowSocialListWith
75 -> m (Map NgramsType [NgramsElement])
76 buildNgramsLists user uCid mCid mfslw gp = do
77 ngTerms <- buildNgramsTermsList user uCid mCid mfslw gp (NgramsTerms, MapListSize 350)
78 othersTerms <- mapM (buildNgramsOthersList user uCid mfslw GroupIdentity)
79 [ (Authors , MapListSize 9, MaxListSize 1000)
80 , (Sources , MapListSize 9, MaxListSize 1000)
81 , (Institutes, MapListSize 9, MaxListSize 1000)
84 pure $ Map.unions $ [ngTerms] <> othersTerms
87 data MapListSize = MapListSize { unMapListSize :: !Int }
88 data MaxListSize = MaxListSize { unMaxListSize :: !Int }
90 buildNgramsOthersList :: ( HasNodeError err
92 , HasNodeStory env err m
97 -> Maybe FlowSocialListWith
99 -> (NgramsType, MapListSize, MaxListSize)
100 -> m (Map NgramsType [NgramsElement])
101 buildNgramsOthersList user uCid mfslw _groupParams (nt, MapListSize mapListSize, MaxListSize maxListSize) = do
102 allTerms :: HashMap NgramsTerm (Set NodeId) <- getContextsByNgramsUser uCid nt
104 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
105 socialLists :: FlowCont NgramsTerm FlowListScores
106 <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
108 $ List.zip (HashMap.keys allTerms)
109 (List.cycle [mempty])
112 groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
114 (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
115 $ view flc_scores groupedWithList
117 (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms
119 listSize = mapListSize - (List.length mapTerms)
120 (mapTerms', candiTerms) = both HashMap.fromList
121 $ List.splitAt listSize
122 $ List.take maxListSize
123 $ List.sortOn (Down . viewScore . snd)
124 $ HashMap.toList tailTerms'
127 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
128 <> (toNgramsElement mapTerms )
129 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
130 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
134 getGroupParams :: ( HasNodeError err
136 , HasNodeStory env err m
139 => GroupParams -> HashSet Ngrams -> m GroupParams
140 getGroupParams gp@(GroupWithPosTag l a _m) ng = do
141 hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
142 -- printDebug "hashMap" hashMap
143 pure $ over gwl_map (\x -> x <> hashMap) gp
144 getGroupParams gp _ = pure gp
148 buildNgramsTermsList :: ( HasNodeError err
150 , HasNodeStory env err m
156 -> Maybe FlowSocialListWith
158 -> (NgramsType, MapListSize)
159 -> m (Map NgramsType [NgramsElement])
160 buildNgramsTermsList user uCid mCid mfslw groupParams (nt, _mapListSize)= do
162 -- Filter 0 With Double
163 -- Computing global speGen score
164 printDebug "[buildNgramsTermsList: Sample List] / start" nt
165 allTerms :: HashMap NgramsTerm Double <- getTficf_withSample uCid mCid nt
166 printDebug "[buildNgramsTermsList: Sample List / end]" (nt, HashMap.size allTerms)
168 printDebug "[buildNgramsTermsList: Flow Social List / start]" nt
169 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
170 socialLists :: FlowCont NgramsTerm FlowListScores
171 <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
173 $ List.zip (HashMap.keys allTerms)
174 (List.cycle [mempty])
176 printDebug "[buildNgramsTermsList: Flow Social List / end]" nt
178 let ngramsKeys = HashMap.keysSet allTerms
180 groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
183 socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
184 --printDebug "socialLists_Stemmed" socialLists_Stemmed
185 groupedWithList = toGroupedTree socialLists_Stemmed allTerms
186 (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
187 $ HashMap.filter (\g -> (view gts'_score g) > 1)
188 $ view flc_scores groupedWithList
190 (groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
192 -- printDebug "stopTerms" stopTerms
194 -- splitting monterms and multiterms to take proportional candidates
195 -- use % of list if to big, or Int if too small
196 listSizeGlobal = 2000 :: Double
197 monoSize = 0.4 :: Double
198 multSize = 1 - monoSize
200 splitAt n' ns = both (HashMap.fromListWith (<>))
201 $ List.splitAt (round $ n' * listSizeGlobal)
202 $ List.sortOn (viewScore . snd)
205 (groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
206 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
208 -------------------------
209 -- Filter 1 With Set NodeId and SpeGen
210 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
213 -- TODO remove (and remove HasNodeError instance)
214 userListId <- defaultList uCid
215 masterListId <- defaultList mCid
217 mapTextDocIds <- getContextsByNgramsOnlyUser uCid
218 [userListId, masterListId]
223 -- printDebug "mapTextDocIds" mapTextDocIds
226 groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
227 groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
228 $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
231 --printDebug "groupedTreeScores_SetNodeId" groupedTreeScores_SetNodeId
233 -- Coocurrences computation
234 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
235 let mapCooc = HashMap.filter (>1) -- removing cooc of 1
236 $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
237 | (t1, s1) <- mapStemNodeIds
238 , (t2, s2) <- mapStemNodeIds
241 mapStemNodeIds = HashMap.toList
242 $ HashMap.map viewScores
243 $ groupedTreeScores_SetNodeId
246 mapScores f = HashMap.fromList
247 $ map (\g -> (view scored_terms g, f g))
251 $ Map.fromList -- TODO remove this
252 $ HashMap.toList mapCooc
255 groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
256 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId
259 -- sort / partition / split
260 -- filter mono/multi again
261 (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen
263 -- filter with max score
264 partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
265 > (view scored_speExc $ view gts'_score g)
268 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
269 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
273 -- use % of list if to big, or Int if to small
274 mapSize = 1000 :: Double
275 canSize = mapSize * 5 :: Double
277 inclSize = 0.4 :: Double
278 exclSize = 1 - inclSize
280 splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
281 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
283 monoInc_size n = splitAt' n $ monoSize * inclSize / 2
284 multExc_size n = splitAt' n $ multSize * exclSize / 2
287 (mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
288 (mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl
290 (mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
291 (mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl
294 (canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
295 (canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail
297 (canMulScoredInclHead, _) = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
298 (canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail
300 ------------------------------------------------------------
301 -- Final Step building the Typed list
302 -- Candidates Terms need to be filtered
304 maps = setListType (Just MapTerm)
305 $ mapMonoScoredInclHead
306 <> mapMonoScoredExclHead
307 <> mapMultScoredInclHead
308 <> mapMultScoredExclHead
310 -- An original way to filter to start with
311 cands = setListType (Just CandidateTerm)
312 $ canMonoScoredIncHead
313 <> canMonoScoredExclHead
314 <> canMulScoredInclHead
315 <> canMultScoredExclHead
318 cands' = setListType (Just CandidateTerm)
323 candNgramsElement = List.take 5000
324 $ toNgramsElement cands <> toNgramsElement cands'
326 result = Map.unionsWith (<>)
327 [ Map.fromList [( nt, toNgramsElement maps
328 <> toNgramsElement stopTerms