2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
19 import Data.HashMap.Strict (HashMap)
20 import Data.HashSet (HashSet)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
25 import Data.Tuple.Extra (both)
26 import Gargantext.API.Ngrams.Types (NgramsElement, RepoCmdM, NgramsTerm(..))
27 import Gargantext.Core.Text (size)
28 import Gargantext.Core.Text.List.Group
29 import Gargantext.Core.Text.List.Group.Prelude
30 import Gargantext.Core.Text.List.Group.WithStem
31 import Gargantext.Core.Text.List.Social
32 import Gargantext.Core.Text.List.Social.Prelude
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
34 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
35 import Gargantext.Core.Types.Individu (User(..))
36 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
37 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
38 import Gargantext.Database.Admin.Types.Node (NodeId)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
41 import Gargantext.Database.Query.Table.Node (defaultList)
42 import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
43 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
44 import Gargantext.Database.Query.Tree.Error (HasTreeError)
45 import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
46 import Gargantext.Prelude
47 import qualified Data.HashMap.Strict as HashMap
48 import qualified Data.List as List
49 import qualified Data.Map as Map
50 import qualified Data.Set as Set
51 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
52 import qualified Data.HashSet as HashSet
55 -- TODO maybe useful for later
56 isStopTerm :: StopSize -> Text -> Bool
57 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
59 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
63 -- | TODO improve grouping functions of Authors, Sources, Institutes..
64 buildNgramsLists :: ( RepoCmdM env err m
73 -> m (Map NgramsType [NgramsElement])
74 buildNgramsLists gp user uCid mCid = do
75 ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
76 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
77 [ (Authors , MapListSize 9)
78 , (Sources , MapListSize 9)
79 , (Institutes, MapListSize 9)
82 pure $ Map.unions $ [ngTerms] <> othersTerms
85 data MapListSize = MapListSize { unMapListSize :: !Int }
87 buildNgramsOthersList ::( HasNodeError err
95 -> (NgramsType, MapListSize)
96 -> m (Map NgramsType [NgramsElement])
97 buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do
98 allTerms :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt
100 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
101 socialLists :: FlowCont NgramsTerm FlowListScores
102 <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
104 $ List.zip (HashMap.keys allTerms)
105 (List.cycle [mempty])
108 groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
110 (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
111 $ view flc_scores groupedWithList
113 (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms
115 listSize = mapListSize - (List.length mapTerms)
116 (mapTerms', candiTerms) = both HashMap.fromList
117 $ List.splitAt listSize
118 $ List.sortOn (Down . viewScore . snd)
119 $ HashMap.toList tailTerms'
121 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
122 <> (toNgramsElement mapTerms )
123 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
124 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
128 getGroupParams :: ( HasNodeError err
133 => GroupParams -> HashSet Ngrams -> m GroupParams
134 getGroupParams gp@(GroupWithPosTag l a _m) ng = do
135 hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
136 -- printDebug "hashMap" hashMap
137 pure $ over gwl_map (\x -> x <> hashMap) gp
138 getGroupParams gp _ = pure gp
142 buildNgramsTermsList :: ( HasNodeError err
151 -> (NgramsType, MapListSize)
152 -> m (Map NgramsType [NgramsElement])
153 buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
155 -- | Filter 0 With Double
156 -- Computing global speGen score
157 allTerms :: HashMap NgramsTerm Double <- getTficf uCid mCid nt
159 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
160 socialLists :: FlowCont NgramsTerm FlowListScores
161 <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
163 $ List.zip (HashMap.keys allTerms)
164 (List.cycle [mempty])
166 let ngramsKeys = HashMap.keysSet allTerms
168 groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
171 socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
172 --printDebug "socialLists_Stemmed" socialLists_Stemmed
173 groupedWithList = toGroupedTree socialLists_Stemmed allTerms
174 (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
175 $ view flc_scores groupedWithList
177 (groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
179 -- printDebug "stopTerms" stopTerms
181 -- splitting monterms and multiterms to take proportional candidates
182 -- use % of list if to big, or Int if too small
183 listSizeGlobal = 2000 :: Double
184 monoSize = 0.4 :: Double
185 multSize = 1 - monoSize
187 splitAt n' ns = both (HashMap.fromListWith (<>))
188 $ List.splitAt (round $ n' * listSizeGlobal)
189 $ List.sortOn (viewScore . snd)
192 (groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
193 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
195 -------------------------
196 -- Filter 1 With Set NodeId and SpeGen
197 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
200 -- TODO remove (and remove HasNodeError instance)
201 userListId <- defaultList uCid
202 masterListId <- defaultList mCid
204 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
205 [userListId, masterListId]
210 groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
211 groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
212 $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
215 printDebug "groupedTreeScores_SetNodeId" groupedTreeScores_SetNodeId
217 -- | Coocurrences computation
218 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
219 let mapCooc = HashMap.filter (>1) -- removing cooc of 1
220 $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
221 | (t1, s1) <- mapStemNodeIds
222 , (t2, s2) <- mapStemNodeIds
225 mapStemNodeIds = HashMap.toList
226 $ HashMap.map viewScores
227 $ groupedTreeScores_SetNodeId
230 mapScores f = HashMap.fromList
231 $ map (\g -> (view scored_terms g, f g))
235 $ Map.fromList -- TODO remove this
236 $ HashMap.toList mapCooc
239 groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
240 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId
243 -- sort / partition / split
244 -- filter mono/multi again
245 (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen
247 -- filter with max score
248 partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
249 > (view scored_speExc $ view gts'_score g)
252 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
253 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
257 -- use % of list if to big, or Int if to small
258 listSizeLocal = 1000 :: Double
259 inclSize = 0.4 :: Double
260 exclSize = 1 - inclSize
262 splitAt' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * listSizeLocal))
263 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
266 monoInc_size = splitAt' $ monoSize * inclSize / 2
267 (monoScoredInclHead, _monoScoredInclTail) = monoInc_size $ (sortOn scored_genInc) monoScoredIncl
268 (monoScoredExclHead, _monoScoredExclTail) = monoInc_size $ (sortOn scored_speExc) monoScoredExcl
270 multExc_size = splitAt' $ multSize * exclSize / 2
271 (multScoredInclHead, multScoredInclTail) = multExc_size $ (sortOn scored_genInc) multScoredIncl
272 (multScoredExclHead, multScoredExclTail) = multExc_size $ (sortOn scored_speExc) multScoredExcl
274 ------------------------------------------------------------
275 -- Final Step building the Typed list
276 -- Candidates Terms need to be filtered
278 maps = setListType (Just MapTerm)
280 <> monoScoredExclHead
281 <> multScoredInclHead
282 <> multScoredExclHead
284 -- An original way to filter to start with
285 cands = setListType (Just CandidateTerm)
286 $ {- monoScoredInclTail
287 <> monoScoredExclTail
288 <> -} multScoredInclTail
289 <> multScoredExclTail
291 cands' = setListType (Just CandidateTerm)
295 result = Map.unionsWith (<>)
296 [ Map.fromList [( nt, toNgramsElement maps
297 <> toNgramsElement cands
298 <> toNgramsElement cands'
299 <> toNgramsElement stopTerms
303 -- printDebug "result" result