2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
20 import Data.Monoid (mempty)
21 import Data.Ord (Down(..))
23 import Data.Text (Text)
24 import Data.Tuple.Extra (both)
25 import Gargantext.API.Ngrams.Types (NgramsElement, RepoCmdM, NgramsTerm(..))
26 import Gargantext.Core.Text (size)
27 import Gargantext.Core.Text.List.Group
28 import Gargantext.Core.Text.List.Group.Prelude
29 import Gargantext.Core.Text.List.Group.WithStem
30 import Gargantext.Core.Text.List.Social
31 import Gargantext.Core.Text.List.Social.Prelude
32 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
33 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
34 import Gargantext.Core.Types.Individu (User(..))
35 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
36 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
37 import Gargantext.Database.Admin.Types.Node (NodeId)
38 import Gargantext.Database.Prelude (CmdM)
39 import Gargantext.Database.Query.Table.Node (defaultList)
40 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
41 import Gargantext.Database.Query.Tree.Error (HasTreeError)
42 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
43 import Gargantext.Prelude
44 import qualified Data.List as List
45 import qualified Data.Map as Map
46 import qualified Data.Set as Set
50 -- TODO maybe useful for later
51 isStopTerm :: StopSize -> Text -> Bool
52 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
54 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
58 -- | TODO improve grouping functions of Authors, Sources, Institutes..
59 buildNgramsLists :: ( RepoCmdM env err m
68 -> m (Map NgramsType [NgramsElement])
69 buildNgramsLists user gp uCid mCid = do
70 ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
71 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
72 [ (Authors , MapListSize 9)
73 , (Sources , MapListSize 9)
74 , (Institutes, MapListSize 9)
77 pure $ Map.unions $ [ngTerms] <> othersTerms
80 data MapListSize = MapListSize { unMapListSize :: !Int }
82 buildNgramsOthersList ::( HasNodeError err
90 -> (NgramsType, MapListSize)
91 -> m (Map NgramsType [NgramsElement])
92 buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do
93 allTerms :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
95 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
96 socialLists :: FlowCont Text FlowListScores
97 <- flowSocialList MySelfFirst user nt ( FlowCont Map.empty
99 $ List.zip (Map.keys allTerms)
100 (List.cycle [mempty])
103 if nt == Sources -- Authors
104 then printDebug "flowSocialList" socialLists
105 else printDebug "flowSocialList" ""
108 groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
110 if nt == Sources -- Authors
111 then printDebug "groupedWithList" groupedWithList
112 else printDebug "groupedWithList" ""
116 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType)
117 $ view flc_scores groupedWithList
119 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
121 listSize = mapListSize - (List.length mapTerms)
122 (mapTerms', candiTerms) = both Map.fromList
123 $ List.splitAt listSize
124 $ List.sortOn (Down . viewScore . snd)
125 $ Map.toList tailTerms'
127 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
128 <> (toNgramsElement mapTerms )
129 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
130 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
135 buildNgramsTermsList :: ( HasNodeError err
144 -> (NgramsType, MapListSize)
145 -> m (Map NgramsType [NgramsElement])
146 buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
148 -- | Filter 0 With Double
149 -- Computing global speGen score
150 allTerms :: Map Text Double <- getTficf uCid mCid nt
152 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
153 socialLists :: FlowCont Text FlowListScores
154 <- flowSocialList MySelfFirst user nt ( FlowCont Map.empty
156 $ List.zip (Map.keys allTerms)
157 (List.cycle [mempty])
160 let socialLists_Stemmed = addScoreStem groupParams (Set.map NgramsTerm $ Map.keysSet allTerms) socialLists
161 printDebug "socialLists_Stemmed" socialLists_Stemmed
162 let groupedWithList = toGroupedTree {- groupParams -} socialLists_Stemmed allTerms
163 (stopTerms, candidateTerms) = Map.partition ((== Just StopTerm) . viewListType)
164 $ view flc_scores groupedWithList
166 (groupedMono, groupedMult) = Map.partitionWithKey (\t _v -> size t < 2) candidateTerms
168 -- printDebug "stopTerms" stopTerms
170 -- splitting monterms and multiterms to take proportional candidates
172 -- use % of list if to big, or Int if too small
173 listSizeGlobal = 2000 :: Double
174 monoSize = 0.4 :: Double
175 multSize = 1 - monoSize
177 splitAt n' ns = both (Map.fromListWith (<>))
178 $ List.splitAt (round $ n' * listSizeGlobal)
179 $ List.sortOn (viewScore . snd)
182 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
183 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
185 -------------------------
186 -- Filter 1 With Set NodeId and SpeGen
187 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
190 -- TO remove (and remove HasNodeError instance)
191 userListId <- defaultList uCid
192 masterListId <- defaultList mCid
195 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
196 [userListId, masterListId]
201 groupedTreeScores_SetNodeId :: Map Text (GroupedTreeScores (Set NodeId))
202 groupedTreeScores_SetNodeId = setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
204 -- | Coocurrences computation
205 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
206 let mapCooc = Map.filter (>2)
207 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
208 | (t1, s1) <- mapStemNodeIds
209 , (t2, s2) <- mapStemNodeIds
212 mapStemNodeIds = Map.toList
214 $ groupedTreeScores_SetNodeId
217 mapScores f = Map.fromList
218 $ map (\g -> (view scored_terms g, f g))
224 groupedTreeScores_SpeGen :: Map Text (GroupedTreeScores (Scored Text))
225 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity)
231 -- sort / partition / split
232 -- filter mono/multi again
233 (monoScored, multScored) = Map.partitionWithKey (\t _v -> size t < 2) groupedTreeScores_SpeGen
235 -- filter with max score
236 partitionWithMaxScore = Map.partition (\g -> (view scored_genInc $ view gts'_score g)
237 > (view scored_speExc $ view gts'_score g)
240 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
241 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
245 -- use % of list if to big, or Int if to small
246 listSizeLocal = 1000 :: Double
247 inclSize = 0.4 :: Double
248 exclSize = 1 - inclSize
250 splitAt' n' = (both (Map.fromList)) . (List.splitAt (round $ n' * listSizeLocal))
251 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . Map.toList
254 monoInc_size = splitAt' $ monoSize * inclSize / 2
255 (monoScoredInclHead, monoScoredInclTail) = monoInc_size $ (sortOn scored_genInc) monoScoredIncl
256 (monoScoredExclHead, monoScoredExclTail) = monoInc_size $ (sortOn scored_speExc) monoScoredExcl
258 multExc_size = splitAt' $ multSize * exclSize / 2
259 (multScoredInclHead, multScoredInclTail) = multExc_size $ (sortOn scored_genInc) multScoredIncl
260 (multScoredExclHead, multScoredExclTail) = multExc_size $ (sortOn scored_speExc) multScoredExcl
263 ------------------------------------------------------------
265 -- Final Step building the Typed list
266 termListHead = maps <> cands
268 maps = setListType (Just MapTerm)
270 <> monoScoredExclHead
271 <> multScoredInclHead
272 <> multScoredExclHead
274 cands = setListType (Just CandidateTerm)
276 <> monoScoredExclTail
277 <> multScoredInclTail
278 <> multScoredExclTail
280 termListTail = (setListType (Just CandidateTerm)) (groupedMonoTail <> groupedMultTail)
282 let result = Map.unionsWith (<>)
283 [ Map.fromList [( nt, toNgramsElement termListHead
284 <> toNgramsElement termListTail
285 <> toNgramsElement stopTerms
289 -- printDebug "result" result