2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
19 import Data.HashMap.Strict (HashMap)
21 import Data.Monoid (mempty)
22 import Data.Ord (Down(..))
24 import Data.Tuple.Extra (both)
25 import Gargantext.API.Ngrams.Types (NgramsElement, RepoCmdM, NgramsTerm(..))
26 import Gargantext.Core.Text (size)
27 import Gargantext.Core.Text.List.Group
28 import Gargantext.Core.Text.List.Group.Prelude
29 import Gargantext.Core.Text.List.Group.WithStem
30 import Gargantext.Core.Text.List.Social
31 import Gargantext.Core.Text.List.Social.Prelude
32 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
33 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
34 import Gargantext.Core.Types.Individu (User(..))
35 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
36 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
37 import Gargantext.Database.Admin.Types.Node (NodeId)
38 import Gargantext.Database.Prelude (CmdM)
39 import Gargantext.Database.Query.Table.Node (defaultList)
40 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
41 import Gargantext.Database.Query.Tree.Error (HasTreeError)
42 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
43 import Gargantext.Prelude
44 import qualified Data.HashMap.Strict as HashMap
45 import qualified Data.List as List
46 import qualified Data.Map as Map
47 import qualified Data.Set as Set
48 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
51 -- TODO maybe useful for later
52 isStopTerm :: StopSize -> Text -> Bool
53 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
55 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
59 -- | TODO improve grouping functions of Authors, Sources, Institutes..
60 buildNgramsLists :: ( RepoCmdM env err m
69 -> m (Map NgramsType [NgramsElement])
70 buildNgramsLists user gp uCid mCid = do
71 ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
72 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
73 [ (Authors , MapListSize 9)
74 , (Sources , MapListSize 9)
75 , (Institutes, MapListSize 9)
78 pure $ Map.unions $ [ngTerms] <> othersTerms
81 data MapListSize = MapListSize { unMapListSize :: !Int }
83 buildNgramsOthersList ::( HasNodeError err
91 -> (NgramsType, MapListSize)
92 -> m (Map NgramsType [NgramsElement])
93 buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do
94 allTerms :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt
96 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
97 socialLists :: FlowCont NgramsTerm FlowListScores
98 <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
100 $ List.zip (HashMap.keys allTerms)
101 (List.cycle [mempty])
104 if nt == Sources -- Authors
105 then printDebug "flowSocialList" socialLists
106 else printDebug "flowSocialList" ""
109 groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
111 if nt == Sources -- Authors
112 then printDebug "groupedWithList" groupedWithList
113 else printDebug "groupedWithList" ""
117 (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
118 $ view flc_scores groupedWithList
120 (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms
122 listSize = mapListSize - (List.length mapTerms)
123 (mapTerms', candiTerms) = both HashMap.fromList
124 $ List.splitAt listSize
125 $ List.sortOn (Down . viewScore . snd)
126 $ HashMap.toList tailTerms'
128 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
129 <> (toNgramsElement mapTerms )
130 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
131 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
136 buildNgramsTermsList :: ( HasNodeError err
145 -> (NgramsType, MapListSize)
146 -> m (Map NgramsType [NgramsElement])
147 buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
149 -- | Filter 0 With Double
150 -- Computing global speGen score
151 allTerms :: HashMap NgramsTerm Double <- getTficf uCid mCid nt
153 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
154 socialLists :: FlowCont NgramsTerm FlowListScores
155 <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
157 $ List.zip (HashMap.keys allTerms)
158 (List.cycle [mempty])
161 let socialLists_Stemmed = addScoreStem groupParams (HashMap.keysSet allTerms) socialLists
162 printDebug "socialLists_Stemmed" socialLists_Stemmed
163 let groupedWithList = toGroupedTree {- groupParams -} socialLists_Stemmed allTerms
164 (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
165 $ view flc_scores groupedWithList
167 (groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
169 -- printDebug "stopTerms" stopTerms
171 -- splitting monterms and multiterms to take proportional candidates
173 -- use % of list if to big, or Int if too small
174 listSizeGlobal = 2000 :: Double
175 monoSize = 0.4 :: Double
176 multSize = 1 - monoSize
178 splitAt n' ns = both (HashMap.fromListWith (<>))
179 $ List.splitAt (round $ n' * listSizeGlobal)
180 $ List.sortOn (viewScore . snd)
183 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
184 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
186 -------------------------
187 -- Filter 1 With Set NodeId and SpeGen
188 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
191 -- TO remove (and remove HasNodeError instance)
192 userListId <- defaultList uCid
193 masterListId <- defaultList mCid
196 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
197 [userListId, masterListId]
202 groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
203 groupedTreeScores_SetNodeId = setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
205 -- | Coocurrences computation
206 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
207 let mapCooc = HashMap.filter (>2)
208 $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
209 | (t1, s1) <- mapStemNodeIds
210 , (t2, s2) <- mapStemNodeIds
213 mapStemNodeIds = HashMap.toList
214 $ HashMap.map viewScores
215 $ groupedTreeScores_SetNodeId
218 mapScores f = HashMap.fromList
219 $ map (\g -> (view scored_terms g, f g))
223 $ Map.fromList -- TODO remove this
224 $ HashMap.toList mapCooc
227 groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
228 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity)
234 -- sort / partition / split
235 -- filter mono/multi again
236 (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen
238 -- filter with max score
239 partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
240 > (view scored_speExc $ view gts'_score g)
243 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
244 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
248 -- use % of list if to big, or Int if to small
249 listSizeLocal = 1000 :: Double
250 inclSize = 0.4 :: Double
251 exclSize = 1 - inclSize
253 splitAt' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * listSizeLocal))
254 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
257 monoInc_size = splitAt' $ monoSize * inclSize / 2
258 (monoScoredInclHead, monoScoredInclTail) = monoInc_size $ (sortOn scored_genInc) monoScoredIncl
259 (monoScoredExclHead, monoScoredExclTail) = monoInc_size $ (sortOn scored_speExc) monoScoredExcl
261 multExc_size = splitAt' $ multSize * exclSize / 2
262 (multScoredInclHead, multScoredInclTail) = multExc_size $ (sortOn scored_genInc) multScoredIncl
263 (multScoredExclHead, multScoredExclTail) = multExc_size $ (sortOn scored_speExc) multScoredExcl
265 ------------------------------------------------------------
266 -- Final Step building the Typed list
267 termListHead = maps <> cands
269 maps = setListType (Just MapTerm)
271 <> monoScoredExclHead
272 <> multScoredInclHead
273 <> multScoredExclHead
275 cands = setListType (Just CandidateTerm)
277 <> monoScoredExclTail
278 <> multScoredInclTail
279 <> multScoredExclTail
281 termListTail = (setListType (Just CandidateTerm)) (groupedMonoTail <> groupedMultTail)
283 let result = Map.unionsWith (<>)
284 [ Map.fromList [( nt, toNgramsElement termListHead
285 <> toNgramsElement termListTail
286 <> toNgramsElement stopTerms
290 -- printDebug "result" result