2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
20 import Data.Monoid (mempty)
21 import Data.Ord (Down(..))
23 import Data.Text (Text)
24 import Data.Tuple.Extra (both)
25 import Gargantext.API.Ngrams.Types (NgramsElement)
26 import Gargantext.API.Ngrams.Types (RepoCmdM)
27 import Gargantext.Core.Text (size)
28 import Gargantext.Core.Text.List.Group
29 import Gargantext.Core.Text.List.Group.Prelude
30 import Gargantext.Core.Text.List.Group.WithStem
31 import Gargantext.Core.Text.List.Social
32 import Gargantext.Core.Text.List.Social.Prelude
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
34 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
35 import Gargantext.Core.Types.Individu (User(..))
36 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
37 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
38 import Gargantext.Database.Admin.Types.Node (NodeId)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Node (defaultList)
41 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
42 import Gargantext.Database.Query.Tree.Error (HasTreeError)
43 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
44 import Gargantext.Prelude
45 import qualified Data.List as List
46 import qualified Data.Map as Map
47 import qualified Data.Set as Set
51 -- TODO maybe useful for later
52 isStopTerm :: StopSize -> Text -> Bool
53 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
55 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
59 -- | TODO improve grouping functions of Authors, Sources, Institutes..
60 buildNgramsLists :: ( RepoCmdM env err m
69 -> m (Map NgramsType [NgramsElement])
70 buildNgramsLists user gp uCid mCid = do
71 ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
72 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
73 [ (Authors , MapListSize 9)
74 , (Sources , MapListSize 9)
75 , (Institutes, MapListSize 9)
78 pure $ Map.unions $ [ngTerms] <> othersTerms
81 data MapListSize = MapListSize { unMapListSize :: !Int }
83 buildNgramsOthersList ::( HasNodeError err
91 -> (NgramsType, MapListSize)
92 -> m (Map NgramsType [NgramsElement])
93 buildNgramsOthersList user uCid groupParams (nt, MapListSize mapListSize) = do
94 allTerms :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
96 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
97 socialLists :: FlowCont Text FlowListScores
98 <- flowSocialList MySelfFirst user nt ( FlowCont Map.empty
100 $ List.zip (Map.keys allTerms)
101 (List.cycle [mempty])
105 then printDebug "flowSocialList" socialLists
106 else printDebug "flowSocialList" ""
109 groupedWithList = toGroupedTree groupParams socialLists allTerms
112 then printDebug "groupedWithList" groupedWithList
113 else printDebug "groupedWithList" ""
117 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType)
118 $ view flc_scores groupedWithList
120 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
122 listSize = mapListSize - (List.length mapTerms)
123 (mapTerms', candiTerms) = both Map.fromList
124 $ List.splitAt listSize
125 $ List.sortOn (Down . viewScore . snd)
126 $ Map.toList tailTerms'
128 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
129 <> (toNgramsElement mapTerms )
130 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
131 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
136 buildNgramsTermsList :: ( HasNodeError err
145 -> (NgramsType, MapListSize)
146 -> m (Map NgramsType [NgramsElement])
147 buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
149 -- | Filter 0 With Double
150 -- Computing global speGen score
151 allTerms :: Map Text Double <- getTficf uCid mCid nt
153 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
154 socialLists :: FlowCont Text FlowListScores
155 <- flowSocialList MySelfFirst user nt ( FlowCont Map.empty
157 $ List.zip (Map.keys allTerms)
158 (List.cycle [mempty])
161 let groupedWithList = toGroupedTree groupParams socialLists allTerms
162 (stopTerms, candidateTerms) = Map.partition ((== Just StopTerm) . viewListType)
163 $ view flc_scores groupedWithList
165 (groupedMono, groupedMult) = Map.partitionWithKey (\t _v -> size t < 2) candidateTerms
167 -- printDebug "stopTerms" stopTerms
169 -- splitting monterms and multiterms to take proportional candidates
171 -- use % of list if to big, or Int if too small
172 listSizeGlobal = 2000 :: Double
173 monoSize = 0.4 :: Double
174 multSize = 1 - monoSize
176 splitAt n' ns = both (Map.fromListWith (<>))
177 $ List.splitAt (round $ n' * listSizeGlobal)
178 $ List.sortOn (viewScore . snd)
181 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
182 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
184 -------------------------
185 -- Filter 1 With Set NodeId and SpeGen
186 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
189 -- TO remove (and remove HasNodeError instance)
190 userListId <- defaultList uCid
191 masterListId <- defaultList mCid
194 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
195 [userListId, masterListId]
200 groupedTreeScores_SetNodeId :: Map Text (GroupedTreeScores (Set NodeId))
201 groupedTreeScores_SetNodeId = setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
203 -- | Coocurrences computation
204 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
205 let mapCooc = Map.filter (>2)
206 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
207 | (t1, s1) <- mapStemNodeIds
208 , (t2, s2) <- mapStemNodeIds
211 mapStemNodeIds = Map.toList
213 $ groupedTreeScores_SetNodeId
216 mapScores f = Map.fromList
217 $ map (\g -> (view scored_terms g, f g))
223 groupedTreeScores_SpeGen :: Map Text (GroupedTreeScores (Scored Text))
224 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity)
230 -- sort / partition / split
231 -- filter mono/multi again
232 (monoScored, multScored) = Map.partitionWithKey (\t _v -> size t < 2) groupedTreeScores_SpeGen
234 -- filter with max score
235 partitionWithMaxScore = Map.partition (\g -> (view scored_genInc $ view gts'_score g)
236 > (view scored_speExc $ view gts'_score g)
239 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
240 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
244 -- use % of list if to big, or Int if to small
245 listSizeLocal = 1000 :: Double
246 inclSize = 0.4 :: Double
247 exclSize = 1 - inclSize
249 splitAt' n' = (both (Map.fromList)) . (List.splitAt (round $ n' * listSizeLocal))
250 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . Map.toList
253 monoInc_size = splitAt' $ monoSize * inclSize / 2
254 (monoScoredInclHead, monoScoredInclTail) = monoInc_size $ (sortOn scored_genInc) monoScoredIncl
255 (monoScoredExclHead, monoScoredExclTail) = monoInc_size $ (sortOn scored_speExc) monoScoredExcl
257 multExc_size = splitAt' $ multSize * exclSize / 2
258 (multScoredInclHead, multScoredInclTail) = multExc_size $ (sortOn scored_genInc) multScoredIncl
259 (multScoredExclHead, multScoredExclTail) = multExc_size $ (sortOn scored_speExc) multScoredExcl
262 ------------------------------------------------------------
264 -- Final Step building the Typed list
265 termListHead = maps <> cands
267 maps = setListType (Just MapTerm)
269 <> monoScoredExclHead
270 <> multScoredInclHead
271 <> multScoredExclHead
273 cands = setListType (Just CandidateTerm)
275 <> monoScoredExclTail
276 <> multScoredInclTail
277 <> multScoredExclTail
279 termListTail = (setListType (Just CandidateTerm)) (groupedMonoTail <> groupedMultTail)
281 let result = Map.unionsWith (<>)
282 [ Map.fromList [( nt, toNgramsElement termListHead
283 <> toNgramsElement termListTail
284 <> toNgramsElement stopTerms
288 -- printDebug "result" result