2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
19 import Data.HashMap.Strict (HashMap)
20 import Data.HashSet (HashSet)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
25 import Data.Tuple.Extra (both)
26 import Gargantext.API.Ngrams.Types (NgramsElement, RepoCmdM, NgramsTerm(..))
27 import Gargantext.Core.Text (size)
28 import Gargantext.Core.Text.List.Group
29 import Gargantext.Core.Text.List.Group.Prelude
30 import Gargantext.Core.Text.List.Group.WithStem
31 import Gargantext.Core.Text.List.Social
32 import Gargantext.Core.Text.List.Social.Prelude
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
34 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
35 import Gargantext.Core.Types.Individu (User(..))
36 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
37 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
38 import Gargantext.Database.Admin.Types.Node (NodeId)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
41 import Gargantext.Database.Query.Table.Node (defaultList)
42 import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
43 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
44 import Gargantext.Database.Query.Tree.Error (HasTreeError)
45 import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
46 import Gargantext.Prelude
47 import qualified Data.HashMap.Strict as HashMap
48 import qualified Data.List as List
49 import qualified Data.Map as Map
50 import qualified Data.Set as Set
51 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
52 import qualified Data.HashSet as HashSet
55 -- TODO maybe useful for later
56 isStopTerm :: StopSize -> Text -> Bool
57 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
59 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
63 -- | TODO improve grouping functions of Authors, Sources, Institutes..
64 buildNgramsLists :: ( RepoCmdM env err m
73 -> m (Map NgramsType [NgramsElement])
74 buildNgramsLists gp user uCid mCid = do
75 ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
76 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
77 [ (Authors , MapListSize 9)
78 , (Sources , MapListSize 9)
79 , (Institutes, MapListSize 9)
82 pure $ Map.unions $ [ngTerms] <> othersTerms
85 data MapListSize = MapListSize { unMapListSize :: !Int }
87 buildNgramsOthersList ::( HasNodeError err
95 -> (NgramsType, MapListSize)
96 -> m (Map NgramsType [NgramsElement])
97 buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do
98 allTerms :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt
100 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
101 socialLists :: FlowCont NgramsTerm FlowListScores
102 <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
104 $ List.zip (HashMap.keys allTerms)
105 (List.cycle [mempty])
108 if nt == Sources -- Authors
109 then printDebug "flowSocialList" socialLists
110 else printDebug "flowSocialList" ""
113 groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
115 if nt == Sources -- Authors
116 then printDebug "groupedWithList" groupedWithList
117 else printDebug "groupedWithList" ""
121 (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
122 $ view flc_scores groupedWithList
124 (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms
126 listSize = mapListSize - (List.length mapTerms)
127 (mapTerms', candiTerms) = both HashMap.fromList
128 $ List.splitAt listSize
129 $ List.sortOn (Down . viewScore . snd)
130 $ HashMap.toList tailTerms'
132 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
133 <> (toNgramsElement mapTerms )
134 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
135 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
139 getGroupParams :: ( HasNodeError err
144 => GroupParams -> HashSet Ngrams -> m GroupParams
145 getGroupParams gp@(GroupWithPosTag l a _m) ng = do
146 hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
147 printDebug "hashMap" hashMap
148 pure $ over gwl_map (\x -> x <> hashMap) gp
149 getGroupParams gp _ = pure gp
155 buildNgramsTermsList :: ( HasNodeError err
164 -> (NgramsType, MapListSize)
165 -> m (Map NgramsType [NgramsElement])
166 buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
168 -- | Filter 0 With Double
169 -- Computing global speGen score
170 allTerms :: HashMap NgramsTerm Double <- getTficf uCid mCid nt
172 -- | PrivateFirst for first developments since Public NodeMode is not implemented yet
173 socialLists :: FlowCont NgramsTerm FlowListScores
174 <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
176 $ List.zip (HashMap.keys allTerms)
177 (List.cycle [mempty])
179 let ngramsKeys = HashMap.keysSet allTerms
181 groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
182 let socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
183 --printDebug "socialLists_Stemmed" socialLists_Stemmed
184 let groupedWithList = toGroupedTree socialLists_Stemmed allTerms
185 (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
186 $ view flc_scores groupedWithList
188 (groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
190 -- printDebug "stopTerms" stopTerms
192 -- splitting monterms and multiterms to take proportional candidates
194 -- use % of list if to big, or Int if too small
195 listSizeGlobal = 2000 :: Double
196 monoSize = 0.4 :: Double
197 multSize = 1 - monoSize
199 splitAt n' ns = both (HashMap.fromListWith (<>))
200 $ List.splitAt (round $ n' * listSizeGlobal)
201 $ List.sortOn (viewScore . snd)
204 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
205 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
207 -------------------------
208 -- Filter 1 With Set NodeId and SpeGen
209 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
212 -- TO remove (and remove HasNodeError instance)
213 userListId <- defaultList uCid
214 masterListId <- defaultList mCid
217 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
218 [userListId, masterListId]
223 groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
224 groupedTreeScores_SetNodeId = setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
226 -- | Coocurrences computation
227 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
228 let mapCooc = HashMap.filter (>2)
229 $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
230 | (t1, s1) <- mapStemNodeIds
231 , (t2, s2) <- mapStemNodeIds
234 mapStemNodeIds = HashMap.toList
235 $ HashMap.map viewScores
236 $ groupedTreeScores_SetNodeId
239 mapScores f = HashMap.fromList
240 $ map (\g -> (view scored_terms g, f g))
244 $ Map.fromList -- TODO remove this
245 $ HashMap.toList mapCooc
248 groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
249 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity)
255 -- sort / partition / split
256 -- filter mono/multi again
257 (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen
259 -- filter with max score
260 partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
261 > (view scored_speExc $ view gts'_score g)
264 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
265 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
269 -- use % of list if to big, or Int if to small
270 listSizeLocal = 1000 :: Double
271 inclSize = 0.4 :: Double
272 exclSize = 1 - inclSize
274 splitAt' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * listSizeLocal))
275 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
278 monoInc_size = splitAt' $ monoSize * inclSize / 2
279 (monoScoredInclHead, monoScoredInclTail) = monoInc_size $ (sortOn scored_genInc) monoScoredIncl
280 (monoScoredExclHead, monoScoredExclTail) = monoInc_size $ (sortOn scored_speExc) monoScoredExcl
282 multExc_size = splitAt' $ multSize * exclSize / 2
283 (multScoredInclHead, multScoredInclTail) = multExc_size $ (sortOn scored_genInc) multScoredIncl
284 (multScoredExclHead, multScoredExclTail) = multExc_size $ (sortOn scored_speExc) multScoredExcl
286 ------------------------------------------------------------
287 -- Final Step building the Typed list
288 termListHead = maps <> cands
290 maps = setListType (Just MapTerm)
292 <> monoScoredExclHead
293 <> multScoredInclHead
294 <> multScoredExclHead
296 cands = setListType (Just CandidateTerm)
298 <> monoScoredExclTail
299 <> multScoredInclTail
300 <> multScoredExclTail
302 termListTail = (setListType (Just CandidateTerm)) (groupedMonoTail <> groupedMultTail)
304 let result = Map.unionsWith (<>)
305 [ Map.fromList [( nt, toNgramsElement termListHead
306 <> toNgramsElement termListTail
307 <> toNgramsElement stopTerms
311 -- printDebug "result" result