2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
20 import Data.Monoid (mempty)
21 import Data.Ord (Down(..))
23 import Data.Text (Text)
24 import Data.Tuple.Extra (both)
25 import Gargantext.API.Ngrams.Types (NgramsElement)
26 import Gargantext.API.Ngrams.Types (RepoCmdM)
27 import Gargantext.Core.Text (size)
28 import Gargantext.Core.Text.List.Group
29 import Gargantext.Core.Text.List.Group.Prelude
30 import Gargantext.Core.Text.List.Group.WithStem
31 import Gargantext.Core.Text.List.Social
32 import Gargantext.Core.Text.List.Social.Prelude
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
34 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
35 import Gargantext.Core.Types.Individu (User(..))
36 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
37 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
38 import Gargantext.Database.Admin.Types.Node (NodeId)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Node (defaultList)
41 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
42 import Gargantext.Database.Query.Tree.Error (HasTreeError)
43 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
44 import Gargantext.Prelude
45 import qualified Data.List as List
46 import qualified Data.Map as Map
47 import qualified Data.Set as Set
51 -- TODO maybe useful for later
52 isStopTerm :: StopSize -> Text -> Bool
53 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
55 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
60 -- | TODO improve grouping functions of Authors, Sources, Institutes..
61 buildNgramsLists :: ( RepoCmdM env err m
70 -> m (Map NgramsType [NgramsElement])
71 buildNgramsLists user gp uCid mCid = do
72 ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
73 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
74 [ (Authors , MapListSize 9)
75 , (Sources , MapListSize 9)
76 , (Institutes, MapListSize 9)
79 pure $ Map.unions $ [ngTerms] <> othersTerms
82 data MapListSize = MapListSize { unMapListSize :: !Int }
84 buildNgramsOthersList ::( HasNodeError err
92 -> (NgramsType, MapListSize)
93 -> m (Map NgramsType [NgramsElement])
94 buildNgramsOthersList user uCid groupParams (nt, MapListSize mapListSize) = do
95 allTerms :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
97 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
98 socialLists' :: FlowCont Text FlowListScores
99 <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty
101 $ List.zip (Map.keys allTerms)
102 (List.cycle [mempty])
105 groupedWithList = toGroupedTree groupParams socialLists' allTerms
108 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType) $ view flc_scores groupedWithList
109 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
111 listSize = mapListSize - (List.length mapTerms)
112 (mapTerms', candiTerms) = both Map.fromList
113 $ List.splitAt listSize
114 $ List.sortOn (Down . viewScore . snd)
115 $ Map.toList tailTerms'
117 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
118 <> (toNgramsElement mapTerms )
119 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
120 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
125 buildNgramsTermsList :: ( HasNodeError err
134 -> (NgramsType, MapListSize)
135 -> m (Map NgramsType [NgramsElement])
136 buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
138 -- | Filter 0 With Double
140 -- Computing global speGen score
141 allTerms :: Map Text Double <- getTficf uCid mCid nt
143 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
144 socialLists' :: FlowCont Text FlowListScores
145 <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty
147 $ List.zip (Map.keys allTerms)
148 (List.cycle [mempty])
150 let groupedWithList = toGroupedTree groupParams socialLists' allTerms
151 (stopTerms, candidateTerms) = Map.partition ((== Just StopTerm) . viewListType)
152 $ view flc_scores groupedWithList
153 (groupedMono, groupedMult) = Map.partitionWithKey (\t _v -> size t < 2) candidateTerms
155 -- splitting monterms and multiterms to take proportional candidates
157 -- use % of list if to big, or Int if too small
158 listSizeGlobal = 2000 :: Double
159 monoSize = 0.4 :: Double
160 multSize = 1 - monoSize
162 splitAt n' ns = both (Map.fromListWith (<>))
163 $ List.splitAt (round $ n' * listSizeGlobal)
164 $ List.sortOn (viewScore . snd)
167 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
168 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
170 -------------------------
171 -- Filter 1 With Set NodeId and SpeGen
172 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
174 -- TO remove (and remove HasNodeError instance)
175 userListId <- defaultList uCid
176 masterListId <- defaultList mCid
179 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
180 [userListId, masterListId]
185 groupedTreeScores_SetNodeId :: Map Text (GroupedTreeScores (Set NodeId))
186 groupedTreeScores_SetNodeId = setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
188 -- | Coocurrences computation
189 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
190 let mapCooc = Map.filter (>2)
191 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
192 | (t1, s1) <- mapStemNodeIds
193 , (t2, s2) <- mapStemNodeIds
196 mapStemNodeIds = Map.toList
198 $ groupedTreeScores_SetNodeId
201 mapScores f = Map.fromList
202 $ map (\g -> (view scored_terms g, f g))
208 groupedTreeScores_SpeGen :: Map Text (GroupedTreeScores (Scored Text))
209 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) (groupedMonoHead <> groupedMultHead)
212 -- sort / partition / split
213 -- filter mono/multi again
214 (monoScored, multScored) = Map.partitionWithKey (\t _v -> size t < 2) groupedTreeScores_SpeGen
216 -- filter with max score
217 partitionWithMaxScore = Map.partition (\g -> (view scored_genInc $ view gts'_score g)
218 > (view scored_speExc $ view gts'_score g)
221 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
222 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
226 -- use % of list if to big, or Int if to small
227 listSizeLocal = 1000 :: Double
228 inclSize = 0.4 :: Double
229 exclSize = 1 - inclSize
231 splitAt' n' = (both (Map.fromList)) . (List.splitAt (round $ n' * listSizeLocal))
232 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . Map.toList
235 monoInc_size = splitAt' $ monoSize * inclSize / 2
236 (monoScoredInclHead, monoScoredInclTail) = monoInc_size $ (sortOn scored_genInc) monoScoredIncl
237 (monoScoredExclHead, monoScoredExclTail) = monoInc_size $ (sortOn scored_speExc) monoScoredExcl
239 multExc_size = splitAt' $ multSize * exclSize / 2
240 (multScoredInclHead, multScoredInclTail) = multExc_size $ (sortOn scored_genInc) multScoredIncl
241 (multScoredExclHead, multScoredExclTail) = multExc_size $ (sortOn scored_speExc) multScoredExcl
244 ------------------------------------------------------------
246 -- Final Step building the Typed list
247 termListHead = maps <> cands
249 maps = setListType (Just MapTerm)
251 <> monoScoredExclHead
252 <> multScoredInclHead
253 <> multScoredExclHead
255 cands = setListType (Just CandidateTerm)
257 <> monoScoredExclTail
258 <> multScoredInclTail
259 <> multScoredExclTail
261 termListTail = (setListType (Just CandidateTerm)) (groupedMonoTail <> groupedMultTail)
263 let result = Map.unionsWith (<>)
264 [ Map.fromList [( nt, toNgramsElement termListHead
265 <> toNgramsElement termListTail
266 <> toNgramsElement stopTerms