2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
20 import Data.Monoid (mempty)
21 import Data.Ord (Down(..))
23 import Data.Text (Text)
24 import Data.Tuple.Extra (both)
25 import Gargantext.API.Ngrams.Types (NgramsElement)
26 import Gargantext.API.Ngrams.Types (RepoCmdM)
27 import Gargantext.Core.Text (size)
28 import Gargantext.Core.Text.List.Group
29 import Gargantext.Core.Text.List.Group.Prelude
30 import Gargantext.Core.Text.List.Group.WithStem
31 import Gargantext.Core.Text.List.Social
32 import Gargantext.Core.Text.List.Social.Prelude
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
34 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
35 import Gargantext.Core.Types.Individu (User(..))
36 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
37 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
38 import Gargantext.Database.Admin.Types.Node (NodeId)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Node (defaultList)
41 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
42 import Gargantext.Database.Query.Tree.Error (HasTreeError)
43 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
44 import Gargantext.Prelude
45 import qualified Data.List as List
46 import qualified Data.Map as Map
47 import qualified Data.Set as Set
51 -- TODO maybe useful for later
52 isStopTerm :: StopSize -> Text -> Bool
53 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
55 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
60 -- | TODO improve grouping functions of Authors, Sources, Institutes..
61 buildNgramsLists :: ( RepoCmdM env err m
70 -> m (Map NgramsType [NgramsElement])
71 buildNgramsLists user gp uCid mCid = do
72 ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
73 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
74 [ (Authors , MapListSize 9)
75 , (Sources , MapListSize 9)
76 , (Institutes, MapListSize 9)
79 pure $ Map.unions $ [ngTerms] <> othersTerms
82 data MapListSize = MapListSize { unMapListSize :: !Int }
84 buildNgramsOthersList ::( HasNodeError err
92 -> (NgramsType, MapListSize)
93 -> m (Map NgramsType [NgramsElement])
94 buildNgramsOthersList user uCid groupParams (nt, MapListSize mapListSize) = do
95 allTerms :: Map Text (Set NodeId) <- getNodesByNgramsUser uCid nt
97 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
98 socialLists' :: FlowCont Text FlowListScores
99 <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty
101 $ List.zip (Map.keys allTerms)
102 (List.cycle [mempty])
105 groupedWithList = toGroupedTree groupParams socialLists' allTerms
108 (stopTerms, tailTerms) = Map.partition ((== Just StopTerm) . viewListType) $ view flc_scores groupedWithList
109 (mapTerms, tailTerms') = Map.partition ((== Just MapTerm) . viewListType) tailTerms
111 listSize = mapListSize - (List.length mapTerms)
112 (mapTerms', candiTerms) = both Map.fromList
113 $ List.splitAt listSize
114 $ List.sortOn (Down . viewScore . snd)
115 $ Map.toList tailTerms'
117 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
118 <> (toNgramsElement mapTerms )
119 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
120 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
125 buildNgramsTermsList :: ( HasNodeError err
134 -> (NgramsType, MapListSize)
135 -> m (Map NgramsType [NgramsElement])
136 buildNgramsTermsList user uCid mCid groupParams (nt, mapListSize)= do
138 -- | Filter 0 With Double
140 -- Computing global speGen score
141 allTerms :: Map Text Double <- getTficf uCid mCid nt
143 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
144 socialLists' :: FlowCont Text FlowListScores
145 <- flowSocialList' MySelfFirst user nt ( FlowCont Map.empty
147 $ List.zip (Map.keys allTerms)
148 (List.cycle [mempty])
150 let groupedWithList = toGroupedTree groupParams socialLists' allTerms
151 (stopTerms, candidateTerms) = Map.partition ((== Just StopTerm) . viewListType)
152 $ view flc_scores groupedWithList
153 (groupedMono, groupedMult) = Map.partitionWithKey (\t _v -> size t < 2) candidateTerms
155 -- splitting monterms and multiterms to take proportional candidates
157 -- use % of list if to big, or Int if too small
158 listSizeGlobal = 2000 :: Double
159 monoSize = 0.4 :: Double
160 multSize = 1 - monoSize
162 splitAt n' ns = both (Map.fromListWith (<>))
163 $ List.splitAt (round $ n' * listSizeGlobal)
164 $ List.sortOn (viewScore . snd)
167 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
168 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
170 -------------------------
171 -- Filter 1 With Set NodeId and SpeGen
172 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
174 -- TO remove (and remove HasNodeError instance)
175 userListId <- defaultList uCid
176 masterListId <- defaultList mCid
179 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
180 [userListId, masterListId]
185 groupedTreeScores_SetNodeId :: Map Text (GroupedTreeScores (Set NodeId))
186 groupedTreeScores_SetNodeId = undefined
187 -- setScoresWith (\_ _ -> mempty) (groupedMonoHead <> groupedMultHead)
188 -- groupedTreeScores_SetNodeId = setScoresWith ((fromMaybe mempty) . ((flip Map.lookup) mapTextDocIds)) (groupedMonoHead <> groupedMultHead)
190 -- | Coocurrences computation
191 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
192 let mapCooc = Map.filter (>2)
193 $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
194 | (t1, s1) <- mapStemNodeIds
195 , (t2, s2) <- mapStemNodeIds
198 mapStemNodeIds = Map.toList
200 $ groupedTreeScores_SetNodeId
203 mapScores f = Map.fromList
204 $ map (\g -> (view scored_terms g, f g))
210 groupedTreeScores_SpeGen :: Map Text (GroupedTreeScores (Scored Double))
211 groupedTreeScores_SpeGen = undefined
212 -- setScoresWith (\k v -> set gts'_score (Scored "" 0 0) v) (groupedMonoHead <> groupedMultHead)
213 -- groupedTreeScores_SpeGen = setScoresWith (\k v -> set gts'_score (fromMaybe (Scored "" 0 0) $ Map.lookup k (mapScores identity)) v) (groupedMonoHead <> groupedMultHead)
216 -- sort / partition / split
217 -- filter mono/multi again
218 (monoScored, multScored) = Map.partitionWithKey (\t _v -> size t < 2) groupedTreeScores_SpeGen
219 -- (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
221 -- filter with max score
222 partitionWithMaxScore = Map.partition (\g -> (view scored_genInc $ view gts'_score g)
223 > (view scored_speExc $ view gts'_score g)
226 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
227 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
231 -- use % of list if to big, or Int if to small
232 listSizeLocal = 1000 :: Double
233 inclSize = 0.4 :: Double
234 exclSize = 1 - inclSize
236 splitAt' n' = (both (Map.fromList)) . (List.splitAt (round $ n' * listSizeLocal))
237 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . Map.toList
239 monoInc_size = splitAt' $ monoSize * inclSize / 2
240 (monoScoredInclHead, monoScoredInclTail) = monoInc_size $ (sortOn scored_genInc) monoScoredIncl
241 (monoScoredExclHead, monoScoredExclTail) = monoInc_size $ (sortOn scored_speExc) monoScoredExcl
243 multExc_size = splitAt' $ multSize * exclSize / 2
244 (multScoredInclHead, multScoredInclTail) = multExc_size $ (sortOn scored_genInc) multScoredIncl
245 (multScoredExclHead, multScoredExclTail) = multExc_size $ (sortOn scored_speExc) multScoredExcl
247 -- Final Step building the Typed list
248 termListHead = maps <> cands
250 maps = setListType (Just MapTerm)
252 <> monoScoredExclHead
253 <> multScoredInclHead
254 <> multScoredExclHead
256 cands = setListType (Just CandidateTerm)
258 <> monoScoredExclTail
259 <> multScoredInclTail
260 <> multScoredExclTail
262 termListTail = (setListType (Just CandidateTerm)) (groupedMonoTail <> groupedMultTail)
264 let result = Map.unionsWith (<>)
265 [ Map.fromList [( nt, toNgramsElement termListHead
266 <> toNgramsElement termListTail
267 <> toNgramsElement stopTerms