{-|
Module      : Gargantext.Core.Text.List
Description : Tools to build lists
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
-}

{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell     #-}
module Gargantext.Core.Text.List
  where
import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
import Data.HashMap.Strict (HashMap)
import Data.HashSet (HashSet)
import Data.Map (Map)
import Data.Monoid (mempty)
import Data.Ord (Down(..))
import Data.Set (Set)
import Data.Text (Text)
import Data.Tuple.Extra (both)
import Gargantext.API.Ngrams.Types (NgramsElement, NgramsTerm(..))
import Gargantext.Core.NodeStory
import Gargantext.Core.Text (size)
import Gargantext.Core.Text.List.Group
import Gargantext.Core.Text.List.Group.Prelude
import Gargantext.Core.Text.List.Group.WithStem
import Gargantext.Core.Text.List.Social
import Gargantext.Core.Text.List.Social.Prelude
import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
import Gargantext.Database.Action.Metrics.TFICF (getTficf_withSample)
import Gargantext.Database.Admin.Types.Node (NodeId)
import Gargantext.Database.Prelude (CmdM)
import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
import Gargantext.Database.Query.Table.Node (defaultList)
import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
import Gargantext.Database.Query.Tree.Error (HasTreeError)
import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
import Gargantext.Prelude
import qualified Data.Char as Char
import qualified Data.HashMap.Strict as HashMap
import qualified Data.HashSet as HashSet
import qualified Data.List as List
import qualified Data.Map as Map
import qualified Data.Set as Set
import qualified Data.Text as Text
import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
-- TODO maybe useful for later
isStopTerm :: StopSize -> Text -> Bool
isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
  where
    isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
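-- Illustrative behaviour of the definition above:
--
-- > isStopTerm (StopSize 3) "of"   == True   -- shorter than the threshold
-- > isStopTerm (StopSize 3) "42%"  == True   -- digits are neither letters nor allowed punctuation
-- > isStopTerm (StopSize 3) "gene" == False  -- long enough and purely alphabetic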
-- | TODO improve grouping functions of Authors, Sources, Institutes..
buildNgramsLists :: ( HasNodeStory env err m
                    , CmdM     env err m
                    , HasTreeError err
                    , HasNodeError err
                    )
                 => GroupParams
                 -> User
                 -> UserCorpusId
                 -> MasterCorpusId
                 -> m (Map NgramsType [NgramsElement])
buildNgramsLists gp user uCid mCid = do
  ngTerms     <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
  othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
                      [ (Authors   , MapListSize 9)
                      , (Sources   , MapListSize 9)
                      , (Institutes, MapListSize 9)
                      ]

  pure $ Map.unions $ [ngTerms] <> othersTerms
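-- A minimal usage sketch (illustrative only; `user`, `userCorpusId` and
-- `masterCorpusId` stand for values obtained elsewhere, and GroupIdentity is
-- just one possible GroupParams value, as used for the "others" lists below):
--
-- > lists <- buildNgramsLists GroupIdentity user userCorpusId masterCorpusId
-- > -- lists :: Map NgramsType [NgramsElement]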
data MapListSize = MapListSize { unMapListSize :: !Int }
buildNgramsOthersList :: ( HasNodeError err
                         , CmdM     env err m
                         , HasNodeStory env err m
                         , HasTreeError err
                         )
                      => User
                      -> UserCorpusId
                      -> GroupParams
                      -> (NgramsType, MapListSize)
                      -> m (Map NgramsType [NgramsElement])
buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do
  allTerms :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt

  -- PrivateFirst for first developments since Public NodeMode is not implemented yet
  socialLists :: FlowCont NgramsTerm FlowListScores
    <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
                                          $ HashMap.fromList
                                          $ List.zip (HashMap.keys allTerms)
                                                     (List.cycle [mempty])
                                          )
  let
    groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms

    (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
                           $ view flc_scores groupedWithList

    (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms

    listSize = mapListSize - (List.length mapTerms)
    (mapTerms', candiTerms) = both HashMap.fromList
                            $ List.splitAt listSize
                            $ List.sortOn (Down . viewScore . snd)
                            $ HashMap.toList tailTerms'

  pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
                             <> (toNgramsElement mapTerms )
                             <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
                             <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
                       )]
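-- Worked example of the sizing above (illustrative numbers): with MapListSize 9
-- and 3 terms already tagged MapTerm by the social lists, listSize = 9 - 3 = 6,
-- so the 6 best-scored remaining terms are promoted to MapTerm and every other
-- remaining term becomes a CandidateTerm (stop terms are kept as such).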
getGroupParams :: ( HasNodeError err
                  , CmdM     env err m
                  , HasNodeStory env err m
                  , HasTreeError err
                  )
               => GroupParams -> HashSet Ngrams -> m GroupParams
getGroupParams gp@(GroupWithPosTag l a _m) ng = do
  hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
  -- printDebug "hashMap" hashMap
  pure $ over gwl_map (\x -> x <> hashMap) gp
getGroupParams gp _ = pure gp
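-- Only GroupWithPosTag is enriched here: its gwl_map gains the pairs returned
-- by selectLems. Any other GroupParams value, e.g. the GroupIdentity used for
-- Authors/Sources/Institutes above, is returned unchanged.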
buildNgramsTermsList :: ( HasNodeError err
                        , CmdM     env err m
                        , HasNodeStory env err m
                        , HasTreeError err
                        )
                     => User
                     -> UserCorpusId
                     -> MasterCorpusId
                     -> GroupParams
                     -> (NgramsType, MapListSize)
                     -> m (Map NgramsType [NgramsElement])
buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize) = do

  -- Filter 0 With Double
  -- Computing global speGen score
  printDebug "[buildNgramsTermsList: Sample List / start]" nt
  allTerms :: HashMap NgramsTerm Double <- getTficf_withSample uCid mCid nt
  printDebug "[buildNgramsTermsList: Sample List / end]" nt
163 printDebug "[buldNgramsTermsList: Flow Social List / start]" nt
164 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
165 socialLists :: FlowCont NgramsTerm FlowListScores
166 <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
168 $ List.zip (HashMap.keys allTerms)
169 (List.cycle [mempty])
171 printDebug "[buldNgramsTermsList: Flow Social List / end]" nt
  let ngramsKeys = HashMap.keysSet allTerms

  groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
  let
    socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
    --printDebug "socialLists_Stemmed" socialLists_Stemmed
    groupedWithList = toGroupedTree socialLists_Stemmed allTerms
    (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
                                $ HashMap.filter (\g -> (view gts'_score g) > 1)
                                $ view flc_scores groupedWithList

    (groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms

    -- printDebug "stopTerms" stopTerms

    -- splitting mono-terms and multi-terms to take a proportional number of candidates
    -- use a % of the list if too big, or an Int if too small
    listSizeGlobal = 2000 :: Double
    monoSize = 0.4  :: Double
    multSize = 1 - monoSize

    splitAt n' ns = both (HashMap.fromListWith (<>))
                  $ List.splitAt (round $ n' * listSizeGlobal)
                  $ List.sortOn (viewScore . snd)
                  $ HashMap.toList ns

    (groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
    (groupedMultHead, groupedMultTail)  = splitAt multSize groupedMult
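    -- Illustrative arithmetic for the split above: with listSizeGlobal = 2000,
    -- the mono-term head keeps round (0.4 * 2000) = 800 groups and the
    -- multi-term head round (0.6 * 2000) = 1200; only the multi-term tail is
    -- kept (no leading underscore) for reuse further below.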

    -------------------------
    -- Filter 1 With Set NodeId and SpeGen
    selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)

  -- TODO remove (and remove HasNodeError instance)
  userListId   <- defaultList uCid
  masterListId <- defaultList mCid

  mapTextDocIds <- getNodesByNgramsOnlyUser uCid
                                            [userListId, masterListId]
                                            nt
                                            selectedTerms
  let
    groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
    groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
                                $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
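    -- "Hapax" here means a term group whose document set (its gts'_score)
    -- contains at most one NodeId; such groups are dropped before scoring.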
223 printDebug "groupedTreeScores_SetNodeId" groupedTreeScores_SetNodeId
  -- Co-occurrences computation
  --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
  let mapCooc = HashMap.filter (>1) -- removing cooc of 1
              $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
                                 | (t1, s1) <- mapStemNodeIds
                                 , (t2, s2) <- mapStemNodeIds
                                 ]

      mapStemNodeIds = HashMap.toList
                     $ HashMap.map viewScores
                     $ groupedTreeScores_SetNodeId
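      -- Example for the counting above (illustrative): if term A occurs in
      -- documents {1,2,3} and term B in {2,3,5}, the pair (A, B) scores
      -- Set.size (Set.intersection {1,2,3} {2,3,5}) = 2 and is kept, while
      -- pairs sharing only one document are removed by the filter (>1).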
      mapScores f = HashMap.fromList
                  $ map (\g -> (view scored_terms g, f g))
                  $ normalizeGlobal
                  $ map normalizeLocal
                  $ scored'
                  $ Map.fromList -- TODO remove this
                  $ HashMap.toList mapCooc
      groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
      groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId
      -- sort / partition / split
      -- filter mono/multi again
      (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen

      -- filter with max score
      partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
                                                     > (view scored_speExc $ view gts'_score g)
                                                )

      (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
      (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
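      -- A group whose genericity/inclusion score (scored_genInc) exceeds its
      -- specificity/exclusion score (scored_speExc) lands in the *Incl bucket,
      -- otherwise in the *Excl bucket.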
      -- use a % of the list if too big, or an Int if too small
      mapSize = 1000 :: Double
      canSize = mapSize * 5 :: Double

      inclSize = 0.4 :: Double
      exclSize = 1 - inclSize

      splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
      sortOn f         = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList

      monoInc_size n = splitAt' n $ monoSize * inclSize / 2
      multExc_size n = splitAt' n $ multSize * exclSize / 2
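      -- Illustrative sizes given the constants above: monoInc_size mapSize keeps
      -- round (0.4 * 0.4 / 2 * 1000) = 80 head terms per call, multExc_size mapSize
      -- keeps round (0.6 * 0.6 / 2 * 1000) = 180, and the canSize variants keep
      -- five times as many.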
      (mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
      (mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl

      (mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
      (mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl

      (canMonoScoredInclHead, _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
      (canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail

      (canMultScoredInclHead, _) = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
      (canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail
      ------------------------------------------------------------
      -- Final Step building the Typed list
      -- Candidate Terms need to be filtered
      maps = setListType (Just MapTerm)
               $  mapMonoScoredInclHead
               <> mapMonoScoredExclHead
               <> mapMultScoredInclHead
               <> mapMultScoredExclHead

      -- An original way to filter to start with
      cands = setListType (Just CandidateTerm)
                $  canMonoScoredInclHead
                <> canMonoScoredExclHead
                <> canMultScoredInclHead
                <> canMultScoredExclHead
      cands' = setListType (Just CandidateTerm)
                 $ groupedMultTail

      candNgramsElement = List.take 5000
                        $ toNgramsElement cands <> toNgramsElement cands'
      result = Map.unionsWith (<>)
             [ Map.fromList [( nt, toNgramsElement maps
                                   <> toNgramsElement stopTerms