2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
19 import Data.HashMap.Strict (HashMap)
20 import Data.HashSet (HashSet)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
25 import Data.Tuple.Extra (both)
26 import Gargantext.API.Ngrams.Types (NgramsElement, NgramsTerm(..))
27 import Gargantext.Core.NodeStory
28 import Gargantext.Core.Text (size)
29 import Gargantext.Core.Text.List.Group
30 import Gargantext.Core.Text.List.Group.Prelude
31 import Gargantext.Core.Text.List.Group.WithStem
32 import Gargantext.Core.Text.List.Social
33 import Gargantext.Core.Text.List.Social.Prelude
34 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf_withSample)
39 import Gargantext.Database.Admin.Types.Node (NodeId)
40 import Gargantext.Database.Prelude (CmdM)
41 import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
42 import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
43 import Gargantext.Database.Query.Table.Node (defaultList)
44 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
45 import Gargantext.Database.Query.Tree.Error (HasTreeError)
46 import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
47 import Gargantext.Prelude
48 import qualified Data.HashMap.Strict as HashMap
49 import qualified Data.HashSet as HashSet
50 import qualified Data.List as List
51 import qualified Data.Map as Map
52 import qualified Data.Set as Set
53 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
56 -- TODO maybe useful for later
57 isStopTerm :: StopSize -> Text -> Bool
58 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
60 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
64 -- | TODO improve grouping functions of Authors, Sources, Institutes..
65 buildNgramsLists :: ( HasNodeStory env err m
73 -> Maybe FlowSocialListWith
75 -> m (Map NgramsType [NgramsElement])
76 buildNgramsLists user uCid mCid mfslw gp = do
77 ngTerms <- buildNgramsTermsList user uCid mCid mfslw gp (NgramsTerms, MapListSize 350)
78 othersTerms <- mapM (buildNgramsOthersList user uCid mfslw GroupIdentity)
79 [ (Authors , MapListSize 9)
80 , (Sources , MapListSize 9)
81 , (Institutes, MapListSize 9)
84 pure $ Map.unions $ [ngTerms] <> othersTerms
87 data MapListSize = MapListSize { unMapListSize :: !Int }
89 buildNgramsOthersList :: ( HasNodeError err
91 , HasNodeStory env err m
96 -> Maybe FlowSocialListWith
98 -> (NgramsType, MapListSize)
99 -> m (Map NgramsType [NgramsElement])
100 buildNgramsOthersList user uCid mfslw _groupParams (nt, MapListSize mapListSize) = do
101 allTerms :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt
103 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
104 socialLists :: FlowCont NgramsTerm FlowListScores
105 <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
107 $ List.zip (HashMap.keys allTerms)
108 (List.cycle [mempty])
111 groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
113 (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
114 $ view flc_scores groupedWithList
116 (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms
118 listSize = mapListSize - (List.length mapTerms)
119 (mapTerms', candiTerms) = both HashMap.fromList
120 $ List.splitAt listSize
121 $ List.sortOn (Down . viewScore . snd)
122 $ HashMap.toList tailTerms'
125 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
126 <> (toNgramsElement mapTerms )
127 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
128 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
132 getGroupParams :: ( HasNodeError err
134 , HasNodeStory env err m
137 => GroupParams -> HashSet Ngrams -> m GroupParams
138 getGroupParams gp@(GroupWithPosTag l a _m) ng = do
139 hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
140 -- printDebug "hashMap" hashMap
141 pure $ over gwl_map (\x -> x <> hashMap) gp
142 getGroupParams gp _ = pure gp
146 buildNgramsTermsList :: ( HasNodeError err
148 , HasNodeStory env err m
154 -> Maybe FlowSocialListWith
156 -> (NgramsType, MapListSize)
157 -> m (Map NgramsType [NgramsElement])
158 buildNgramsTermsList user uCid mCid mfslw groupParams (nt, _mapListSize)= do
160 -- Filter 0 With Double
161 -- Computing global speGen score
162 printDebug "[buldNgramsTermsList: Sample List] / start" nt
163 allTerms :: HashMap NgramsTerm Double <- getTficf_withSample uCid mCid nt
164 printDebug "[buldNgramsTermsList: Sample List / end]" nt
166 printDebug "[buldNgramsTermsList: Flow Social List / start]" nt
167 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
168 socialLists :: FlowCont NgramsTerm FlowListScores
169 <- flowSocialList mfslw user nt ( FlowCont HashMap.empty
171 $ List.zip (HashMap.keys allTerms)
172 (List.cycle [mempty])
174 printDebug "[buldNgramsTermsList: Flow Social List / end]" nt
176 let ngramsKeys = HashMap.keysSet allTerms
178 groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
181 socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
182 --printDebug "socialLists_Stemmed" socialLists_Stemmed
183 groupedWithList = toGroupedTree socialLists_Stemmed allTerms
184 (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
185 $ HashMap.filter (\g -> (view gts'_score g) > 1)
186 $ view flc_scores groupedWithList
188 (groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
190 -- printDebug "stopTerms" stopTerms
192 -- splitting monterms and multiterms to take proportional candidates
193 -- use % of list if to big, or Int if too small
194 listSizeGlobal = 2000 :: Double
195 monoSize = 0.4 :: Double
196 multSize = 1 - monoSize
198 splitAt n' ns = both (HashMap.fromListWith (<>))
199 $ List.splitAt (round $ n' * listSizeGlobal)
200 $ List.sortOn (viewScore . snd)
203 (groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
204 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
206 -------------------------
207 -- Filter 1 With Set NodeId and SpeGen
208 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
211 -- TODO remove (and remove HasNodeError instance)
212 userListId <- defaultList uCid
213 masterListId <- defaultList mCid
215 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
216 [userListId, masterListId]
221 groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
222 groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
223 $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
226 --printDebug "groupedTreeScores_SetNodeId" groupedTreeScores_SetNodeId
228 -- Coocurrences computation
229 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
230 let mapCooc = HashMap.filter (>1) -- removing cooc of 1
231 $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
232 | (t1, s1) <- mapStemNodeIds
233 , (t2, s2) <- mapStemNodeIds
236 mapStemNodeIds = HashMap.toList
237 $ HashMap.map viewScores
238 $ groupedTreeScores_SetNodeId
241 mapScores f = HashMap.fromList
242 $ map (\g -> (view scored_terms g, f g))
246 $ Map.fromList -- TODO remove this
247 $ HashMap.toList mapCooc
250 groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
251 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId
254 -- sort / partition / split
255 -- filter mono/multi again
256 (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen
258 -- filter with max score
259 partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
260 > (view scored_speExc $ view gts'_score g)
263 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
264 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
268 -- use % of list if to big, or Int if to small
269 mapSize = 1000 :: Double
270 canSize = mapSize * 5 :: Double
272 inclSize = 0.4 :: Double
273 exclSize = 1 - inclSize
275 splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
276 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
278 monoInc_size n = splitAt' n $ monoSize * inclSize / 2
279 multExc_size n = splitAt' n $ multSize * exclSize / 2
282 (mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
283 (mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl
285 (mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
286 (mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl
289 (canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
290 (canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail
292 (canMulScoredInclHead, _) = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
293 (canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail
295 ------------------------------------------------------------
296 -- Final Step building the Typed list
297 -- Candidates Terms need to be filtered
299 maps = setListType (Just MapTerm)
300 $ mapMonoScoredInclHead
301 <> mapMonoScoredExclHead
302 <> mapMultScoredInclHead
303 <> mapMultScoredExclHead
305 -- An original way to filter to start with
306 cands = setListType (Just CandidateTerm)
307 $ canMonoScoredIncHead
308 <> canMonoScoredExclHead
309 <> canMulScoredInclHead
310 <> canMultScoredExclHead
313 cands' = setListType (Just CandidateTerm)
318 candNgramsElement = List.take 5000
319 $ toNgramsElement cands <> toNgramsElement cands'
321 result = Map.unionsWith (<>)
322 [ Map.fromList [( nt, toNgramsElement maps
323 <> toNgramsElement stopTerms