2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
20 import Data.Monoid (mempty)
21 import Data.Ord (Down(..))
23 import Data.Text (Text)
24 import Data.Tuple.Extra (both)
25 import Gargantext.API.Ngrams.Types (NgramsElement)
26 import Gargantext.API.Ngrams.Types (RepoCmdM)
27 import Gargantext.Core.Text (size)
28 import Gargantext.Core.Text.List.Group
29 import Gargantext.Core.Text.List.Group.Prelude
30 import Gargantext.Core.Text.List.Group.WithStem
31 import Gargantext.Core.Text.List.Social
32 import Gargantext.Core.Text.List.Social.Prelude
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
34 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
35 import Gargantext.Core.Types.Individu (User(..))
36 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
37 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
38 import Gargantext.Database.Admin.Types.Node (NodeId)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Node (defaultList)
41 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
42 import Gargantext.Database.Query.Tree.Error (HasTreeError)
43 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
44 import Gargantext.Prelude
45 import qualified Data.List as List
46 import qualified Data.Map as Map
47 import qualified Data.Set as Set
51 -- TODO maybe useful for later
52 isStopTerm :: StopSize -> Text -> Bool
53 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
55 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
59 -- | TODO improve grouping functions of Authors, Sources, Institutes..
60 buildNgramsLists :: ( RepoCmdM env err m
69 -> m (Map NgramsType [NgramsElement])
70 buildNgramsLists user gp uCid mCid = do
71 ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
72 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
73 [ (Authors , MapListSize 9)
74 , (Sources , MapListSize 9)
75 , (Institutes, MapListSize 9)
78 pure $ Map.unions $ [ngTerms] <> othersTerms
81 data MapListSize = MapListSize { unMapListSize :: !Int }
83 buildNgramsOthersList ::( HasNodeError err
91 -> (NgramsType, MapListSize)
92 -> m (Map NgramsType [NgramsElement])
93 buildNgramsOthersList user uCid groupParams (nt, MapListSize mapListSize) = do
94 allTerms :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
96 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
97 socialLists :: FlowCont Text FlowListScores
98 <- flowSocialList MySelfFirst user nt ( FlowCont Map.empty
100 $ List.zip (Map.keys allTerms)
101 (List.cycle [mempty])
105 groupedWithList = toGroupedTree groupParams socialLists allTerms
108 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType)
109 $ view flc_scores groupedWithList
111 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
113 listSize = mapListSize - (List.length mapTerms)
114 (mapTerms', candiTerms) = both Map.fromList
115 $ List.splitAt listSize
116 $ List.sortOn (Down . viewScore . snd)
117 $ Map.toList tailTerms'
119 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
120 <> (toNgramsElement mapTerms )
121 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
122 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
127 buildNgramsTermsList :: ( HasNodeError err
136 -> (NgramsType, MapListSize)
137 -> m (Map NgramsType [NgramsElement])
138 buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
140 -- | Filter 0 With Double
141 -- Computing global speGen score
142 allTerms :: Map Text Double <- getTficf uCid mCid nt
144 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
145 socialLists :: FlowCont Text FlowListScores
146 <- flowSocialList MySelfFirst user nt ( FlowCont Map.empty
148 $ List.zip (Map.keys allTerms)
149 (List.cycle [mempty])
152 let groupedWithList = toGroupedTree groupParams socialLists allTerms
153 (stopTerms, candidateTerms) = Map.partition ((== Just StopTerm) . viewListType)
154 $ view flc_scores groupedWithList
156 (groupedMono, groupedMult) = Map.partitionWithKey (\t _v -> size t < 2) candidateTerms
158 -- printDebug "stopTerms" stopTerms
160 -- splitting monterms and multiterms to take proportional candidates
162 -- use % of list if to big, or Int if too small
163 listSizeGlobal = 2000 :: Double
164 monoSize = 0.4 :: Double
165 multSize = 1 - monoSize
167 splitAt n' ns = both (Map.fromListWith (<>))
168 $ List.splitAt (round $ n' * listSizeGlobal)
169 $ List.sortOn (viewScore . snd)
172 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
173 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
175 -------------------------
176 -- Filter 1 With Set NodeId and SpeGen
177 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
180 -- TO remove (and remove HasNodeError instance)
181 userListId <- defaultList uCid
182 masterListId <- defaultList mCid
185 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
186 [userListId, masterListId]
191 groupedTreeScores_SetNodeId :: Map Text (GroupedTreeScores (Set NodeId))
192 groupedTreeScores_SetNodeId = setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
194 -- | Coocurrences computation
195 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
196 let mapCooc = Map.filter (>2)
197 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
198 | (t1, s1) <- mapStemNodeIds
199 , (t2, s2) <- mapStemNodeIds
202 mapStemNodeIds = Map.toList
204 $ groupedTreeScores_SetNodeId
207 mapScores f = Map.fromList
208 $ map (\g -> (view scored_terms g, f g))
214 groupedTreeScores_SpeGen :: Map Text (GroupedTreeScores (Scored Text))
215 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity)
221 -- sort / partition / split
222 -- filter mono/multi again
223 (monoScored, multScored) = Map.partitionWithKey (\t _v -> size t < 2) groupedTreeScores_SpeGen
225 -- filter with max score
226 partitionWithMaxScore = Map.partition (\g -> (view scored_genInc $ view gts'_score g)
227 > (view scored_speExc $ view gts'_score g)
230 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
231 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
235 -- use % of list if to big, or Int if to small
236 listSizeLocal = 1000 :: Double
237 inclSize = 0.4 :: Double
238 exclSize = 1 - inclSize
240 splitAt' n' = (both (Map.fromList)) . (List.splitAt (round $ n' * listSizeLocal))
241 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . Map.toList
244 monoInc_size = splitAt' $ monoSize * inclSize / 2
245 (monoScoredInclHead, monoScoredInclTail) = monoInc_size $ (sortOn scored_genInc) monoScoredIncl
246 (monoScoredExclHead, monoScoredExclTail) = monoInc_size $ (sortOn scored_speExc) monoScoredExcl
248 multExc_size = splitAt' $ multSize * exclSize / 2
249 (multScoredInclHead, multScoredInclTail) = multExc_size $ (sortOn scored_genInc) multScoredIncl
250 (multScoredExclHead, multScoredExclTail) = multExc_size $ (sortOn scored_speExc) multScoredExcl
253 ------------------------------------------------------------
255 -- Final Step building the Typed list
256 termListHead = maps <> cands
258 maps = setListType (Just MapTerm)
260 <> monoScoredExclHead
261 <> multScoredInclHead
262 <> multScoredExclHead
264 cands = setListType (Just CandidateTerm)
266 <> monoScoredExclTail
267 <> multScoredInclTail
268 <> multScoredExclTail
270 termListTail = (setListType (Just CandidateTerm)) (groupedMonoTail <> groupedMultTail)
272 let result = Map.unionsWith (<>)
273 [ Map.fromList [( nt, toNgramsElement termListHead
274 <> toNgramsElement termListTail
275 <> toNgramsElement stopTerms
279 -- printDebug "result" result