2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
20 import Data.Maybe (catMaybes, fromMaybe)
21 import Data.Monoid (mempty)
22 import Data.Ord (Down(..))
24 import Data.Text (Text)
25 import Data.Tuple.Extra (both)
26 import Gargantext.API.Ngrams.Types (NgramsElement)
27 import Gargantext.API.Ngrams.Types (RepoCmdM)
28 import Gargantext.Core.Text (size)
29 import Gargantext.Core.Text.List.Group
30 import Gargantext.Core.Text.List.Group.Prelude
31 import Gargantext.Core.Text.List.Group.WithStem
32 import Gargantext.Core.Text.List.Social
33 import Gargantext.Core.Text.List.Social.Prelude
34 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
35 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
36 import Gargantext.Core.Types.Individu (User(..))
37 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
38 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
39 import Gargantext.Database.Admin.Types.Node (NodeId)
40 import Gargantext.Database.Prelude (CmdM)
41 import Gargantext.Database.Query.Table.Node (defaultList)
42 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
43 import Gargantext.Database.Query.Tree.Error (HasTreeError)
44 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
45 import Gargantext.Prelude
46 import qualified Data.Char as Char
47 import qualified Data.List as List
48 import qualified Data.Map as Map
49 import qualified Data.Set as Set
50 import qualified Data.Text as Text
54 -- TODO maybe useful for later
55 isStopTerm :: StopSize -> Text -> Bool
56 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
58 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
63 -- | TODO improve grouping functions of Authors, Sources, Institutes..
64 buildNgramsLists :: ( RepoCmdM env err m
73 -> m (Map NgramsType [NgramsElement])
74 buildNgramsLists user gp uCid mCid = do
75 ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
76 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
77 [ (Authors , MapListSize 9)
78 , (Sources , MapListSize 9)
79 , (Institutes, MapListSize 9)
82 pure $ Map.unions $ [ngTerms] <> othersTerms
85 data MapListSize = MapListSize { unMapListSize :: !Int }
87 buildNgramsOthersList ::( HasNodeError err
95 -> (NgramsType, MapListSize)
96 -> m (Map NgramsType [NgramsElement])
97 buildNgramsOthersList user uCid groupParams (nt, MapListSize mapListSize) = do
98 allTerms :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
100 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
101 socialLists' :: FlowCont Text FlowListScores
102 <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty
104 $ List.zip (Map.keys allTerms)
105 (List.cycle [mempty])
108 groupedWithList = toGroupedTree groupParams socialLists' allTerms
111 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType) $ view flc_scores groupedWithList
112 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
114 listSize = mapListSize - (List.length mapTerms)
115 (mapTerms', candiTerms) = both Map.fromList
116 $ List.splitAt listSize
117 $ List.sortOn (Down . viewScore . snd)
118 $ Map.toList tailTerms'
120 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
121 <> (toNgramsElement mapTerms )
122 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
123 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
128 buildNgramsTermsList :: ( HasNodeError err
137 -> (NgramsType, MapListSize)
138 -> m (Map NgramsType [NgramsElement])
139 buildNgramsTermsList user uCid mCid groupParams (nt, mapListSize)= do
141 -- Computing global speGen score
142 allTerms :: Map Text Double <- getTficf uCid mCid nt
144 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
145 socialLists' :: FlowCont Text FlowListScores
146 <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty
148 $ List.zip (Map.keys allTerms)
149 (List.cycle [mempty])
151 let groupedWithList = toGroupedTree groupParams socialLists' allTerms
152 (stopTerms, candidateTerms) = Map.partition ((== Just StopTerm) . viewListType)
153 $ view flc_scores groupedWithList
154 (groupedMono, groupedMult) = Map.partitionWithKey (\t _v -> size t < 2) candidateTerms
156 -- splitting monterms and multiterms to take proportional candidates
158 -- use % of list if to big, or Int if too small
159 listSizeGlobal = 2000 :: Double
160 monoSize = 0.4 :: Double
161 multSize = 1 - monoSize
163 splitAt n' ns = both (Map.fromListWith (<>))
164 $ List.splitAt (round $ n' * listSizeGlobal)
165 $ List.sortOn (viewScore . snd)
168 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
169 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
171 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
173 -- TO remove (and remove HasNodeError instance)
174 userListId <- defaultList uCid
175 masterListId <- defaultList mCid
177 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
178 [userListId, masterListId]
183 groupedTreeScores_SetNodeId :: Map Text (GroupedTreeScores (Set NodeId))
184 groupedTreeScores_SetNodeId = undefined
185 -- setScoresWith (\_ _ -> mempty) (groupedMonoHead <> groupedMultHead)
186 -- groupedTreeScores_SetNodeId = setScoresWith ((fromMaybe mempty) . ((flip Map.lookup) mapTextDocIds)) (groupedMonoHead <> groupedMultHead)
188 -- | Coocurrences computation
189 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
190 let mapCooc = Map.filter (>2)
191 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
192 | (t1, s1) <- mapStemNodeIds
193 , (t2, s2) <- mapStemNodeIds
196 mapStemNodeIds = Map.toList
198 $ groupedTreeScores_SetNodeId
201 mapScores f = Map.fromList
202 $ map (\g -> (view scored_terms g, f g))
208 groupedTreeScores_SpeGen :: Map Text (GroupedTreeScores (Scored Double))
209 groupedTreeScores_SpeGen = undefined
210 -- setScoresWith (\k v -> set gts'_score (Scored "" 0 0) v) (groupedMonoHead <> groupedMultHead)
211 -- groupedTreeScores_SpeGen = setScoresWith (\k v -> set gts'_score (fromMaybe (Scored "" 0 0) $ Map.lookup k (mapScores identity)) v) (groupedMonoHead <> groupedMultHead)
214 -- sort / partition / split
215 -- filter mono/multi again
216 (monoScored, multScored) = Map.partitionWithKey (\t _v -> size t < 2) groupedTreeScores_SpeGen
217 -- (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
219 -- filter with max score
220 partitionWithMaxScore = Map.partition (\g -> (view scored_genInc $ view gts'_score g)
221 > (view scored_speExc $ view gts'_score g)
224 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
225 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
229 -- use % of list if to big, or Int if to small
230 listSizeLocal = 1000 :: Double
231 inclSize = 0.4 :: Double
232 exclSize = 1 - inclSize
234 splitAt' n' = (both (Map.fromList)) . (List.splitAt (round $ n' * listSizeLocal))
235 sortOn f = (List.sortOn (Down . f . _gts'_score . snd)) . Map.toList
237 monoInc_size = monoSize * inclSize / 2
238 (monoScoredInclHead, monoScoredInclTail) = splitAt' monoInc_size $ (sortOn _scored_genInc) monoScoredIncl
239 (monoScoredExclHead, monoScoredExclTail) = splitAt' monoInc_size $ (sortOn _scored_speExc) monoScoredExcl
241 multExc_size = multSize * exclSize / 2
242 (multScoredInclHead, multScoredInclTail) = splitAt' multExc_size $ (sortOn _scored_genInc) multScoredIncl
243 (multScoredExclHead, multScoredExclTail) = splitAt' multExc_size $ (sortOn _scored_speExc) multScoredExcl
245 -- Final Step building the Typed list
246 termListHead = maps <> cands
248 maps = setListType (Just MapTerm)
250 <> monoScoredExclHead
251 <> multScoredInclHead
252 <> multScoredExclHead
254 cands = setListType (Just CandidateTerm)
256 <> monoScoredExclTail
257 <> multScoredInclTail
258 <> multScoredExclTail
260 termListTail = (setListType (Just CandidateTerm)) (groupedMonoTail <> groupedMultTail)
262 let result = Map.unionsWith (<>)
263 [ Map.fromList [( nt, toNgramsElement termListHead
264 <> toNgramsElement termListTail
265 <> toNgramsElement stopTerms
271 ------------------------------------------------------------------------------