2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 {-# LANGUAGE ScopedTypeVariables #-}
13 {-# LANGUAGE TemplateHaskell #-}
15 module Gargantext.Core.Text.List
18 import Control.Lens hiding (both) -- ((^.), view, over, set, (_1), (_2))
19 import Data.HashMap.Strict (HashMap)
20 import Data.HashSet (HashSet)
22 import Data.Monoid (mempty)
23 import Data.Ord (Down(..))
25 import Data.Tuple.Extra (both)
26 import Gargantext.API.Ngrams.Types (NgramsElement, RepoCmdM, NgramsTerm(..))
27 import Gargantext.Core.Text (size)
28 import Gargantext.Core.Text.List.Group
29 import Gargantext.Core.Text.List.Group.Prelude
30 import Gargantext.Core.Text.List.Group.WithStem
31 import Gargantext.Core.Text.List.Social
32 import Gargantext.Core.Text.List.Social.Prelude
33 import Gargantext.Core.Text.Metrics (scored', Scored(..), scored_speExc, scored_genInc, normalizeGlobal, normalizeLocal, scored_terms)
34 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
35 import Gargantext.Core.Types.Individu (User(..))
36 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, getNodesByNgramsOnlyUser)
37 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
38 import Gargantext.Database.Admin.Types.Node (NodeId)
39 import Gargantext.Database.Prelude (CmdM)
40 import Gargantext.Database.Query.Table.Ngrams (text2ngrams)
41 import Gargantext.Database.Query.Table.Node (defaultList)
42 import Gargantext.Database.Query.Table.NgramsPostag (selectLems)
43 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
44 import Gargantext.Database.Query.Tree.Error (HasTreeError)
45 import Gargantext.Database.Schema.Ngrams (NgramsType(..), Ngrams(..))
46 import Gargantext.Prelude
47 import qualified Data.HashMap.Strict as HashMap
48 import qualified Data.List as List
49 import qualified Data.Map as Map
50 import qualified Data.Set as Set
51 import qualified Gargantext.Data.HashMap.Strict.Utils as HashMap
52 import qualified Data.HashSet as HashSet
55 -- TODO maybe useful for later
56 isStopTerm :: StopSize -> Text -> Bool
57 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
59 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
63 -- | TODO improve grouping functions of Authors, Sources, Institutes..
64 buildNgramsLists :: ( RepoCmdM env err m
73 -> m (Map NgramsType [NgramsElement])
74 buildNgramsLists gp user uCid mCid = do
75 ngTerms <- buildNgramsTermsList user uCid mCid gp (NgramsTerms, MapListSize 350)
76 othersTerms <- mapM (buildNgramsOthersList user uCid GroupIdentity)
77 [ (Authors , MapListSize 9)
78 , (Sources , MapListSize 9)
79 , (Institutes, MapListSize 9)
82 pure $ Map.unions $ [ngTerms] <> othersTerms
85 data MapListSize = MapListSize { unMapListSize :: !Int }
87 buildNgramsOthersList ::( HasNodeError err
95 -> (NgramsType, MapListSize)
96 -> m (Map NgramsType [NgramsElement])
97 buildNgramsOthersList user uCid _groupParams (nt, MapListSize mapListSize) = do
98 allTerms :: HashMap NgramsTerm (Set NodeId) <- getNodesByNgramsUser uCid nt
100 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
101 socialLists :: FlowCont NgramsTerm FlowListScores
102 <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
104 $ List.zip (HashMap.keys allTerms)
105 (List.cycle [mempty])
108 groupedWithList = toGroupedTree {- groupParams -} socialLists allTerms
110 (stopTerms, tailTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
111 $ view flc_scores groupedWithList
113 (mapTerms, tailTerms') = HashMap.partition ((== Just MapTerm) . viewListType) tailTerms
115 listSize = mapListSize - (List.length mapTerms)
116 (mapTerms', candiTerms) = both HashMap.fromList
117 $ List.splitAt listSize
118 $ List.sortOn (Down . viewScore . snd)
119 $ HashMap.toList tailTerms'
121 pure $ Map.fromList [( nt, (toNgramsElement stopTerms)
122 <> (toNgramsElement mapTerms )
123 <> (toNgramsElement $ setListType (Just MapTerm ) mapTerms' )
124 <> (toNgramsElement $ setListType (Just CandidateTerm) candiTerms)
128 getGroupParams :: ( HasNodeError err
133 => GroupParams -> HashSet Ngrams -> m GroupParams
134 getGroupParams gp@(GroupWithPosTag l a _m) ng = do
135 hashMap <- HashMap.fromList <$> selectLems l a (HashSet.toList ng)
136 -- printDebug "hashMap" hashMap
137 pure $ over gwl_map (\x -> x <> hashMap) gp
138 getGroupParams gp _ = pure gp
142 buildNgramsTermsList :: ( HasNodeError err
151 -> (NgramsType, MapListSize)
152 -> m (Map NgramsType [NgramsElement])
153 buildNgramsTermsList user uCid mCid groupParams (nt, _mapListSize)= do
155 -- Filter 0 With Double
156 -- Computing global speGen score
157 allTerms :: HashMap NgramsTerm Double <- getTficf uCid mCid nt
159 -- PrivateFirst for first developments since Public NodeMode is not implemented yet
160 socialLists :: FlowCont NgramsTerm FlowListScores
161 <- flowSocialList MySelfFirst user nt ( FlowCont HashMap.empty
163 $ List.zip (HashMap.keys allTerms)
164 (List.cycle [mempty])
166 let ngramsKeys = HashMap.keysSet allTerms
168 groupParams' <- getGroupParams groupParams (HashSet.map (text2ngrams . unNgramsTerm) ngramsKeys)
171 socialLists_Stemmed = addScoreStem groupParams' ngramsKeys socialLists
172 --printDebug "socialLists_Stemmed" socialLists_Stemmed
173 groupedWithList = toGroupedTree socialLists_Stemmed allTerms
174 (stopTerms, candidateTerms) = HashMap.partition ((== Just StopTerm) . viewListType)
175 $ HashMap.filter (\g -> (view gts'_score g) > 1)
176 $ view flc_scores groupedWithList
178 (groupedMono, groupedMult) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) candidateTerms
180 -- printDebug "stopTerms" stopTerms
182 -- splitting monterms and multiterms to take proportional candidates
183 -- use % of list if to big, or Int if too small
184 listSizeGlobal = 2000 :: Double
185 monoSize = 0.4 :: Double
186 multSize = 1 - monoSize
188 splitAt n' ns = both (HashMap.fromListWith (<>))
189 $ List.splitAt (round $ n' * listSizeGlobal)
190 $ List.sortOn (viewScore . snd)
193 (groupedMonoHead, _groupedMonoTail) = splitAt monoSize groupedMono
194 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
196 -------------------------
197 -- Filter 1 With Set NodeId and SpeGen
198 selectedTerms = Set.toList $ hasTerms (groupedMonoHead <> groupedMultHead)
201 -- TODO remove (and remove HasNodeError instance)
202 userListId <- defaultList uCid
203 masterListId <- defaultList mCid
205 mapTextDocIds <- getNodesByNgramsOnlyUser uCid
206 [userListId, masterListId]
211 groupedTreeScores_SetNodeId :: HashMap NgramsTerm (GroupedTreeScores (Set NodeId))
212 groupedTreeScores_SetNodeId = HashMap.filter (\g -> Set.size (view gts'_score g) > 1) -- removing hapax
213 $ setScoresWithMap mapTextDocIds (groupedMonoHead <> groupedMultHead)
216 printDebug "groupedTreeScores_SetNodeId" groupedTreeScores_SetNodeId
218 -- Coocurrences computation
219 --, t1 >= t2 -- permute byAxis diag -- since matrix symmetric
220 let mapCooc = HashMap.filter (>1) -- removing cooc of 1
221 $ HashMap.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
222 | (t1, s1) <- mapStemNodeIds
223 , (t2, s2) <- mapStemNodeIds
226 mapStemNodeIds = HashMap.toList
227 $ HashMap.map viewScores
228 $ groupedTreeScores_SetNodeId
231 mapScores f = HashMap.fromList
232 $ map (\g -> (view scored_terms g, f g))
236 $ Map.fromList -- TODO remove this
237 $ HashMap.toList mapCooc
240 groupedTreeScores_SpeGen :: HashMap NgramsTerm (GroupedTreeScores (Scored NgramsTerm))
241 groupedTreeScores_SpeGen = setScoresWithMap (mapScores identity) groupedTreeScores_SetNodeId
244 -- sort / partition / split
245 -- filter mono/multi again
246 (monoScored, multScored) = HashMap.partitionWithKey (\(NgramsTerm t) _v -> size t < 2) groupedTreeScores_SpeGen
248 -- filter with max score
249 partitionWithMaxScore = HashMap.partition (\g -> (view scored_genInc $ view gts'_score g)
250 > (view scored_speExc $ view gts'_score g)
253 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
254 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
258 -- use % of list if to big, or Int if to small
259 mapSize = 1000 :: Double
260 canSize = mapSize * 5 :: Double
262 inclSize = 0.4 :: Double
263 exclSize = 1 - inclSize
265 splitAt' max' n' = (both (HashMap.fromList)) . (List.splitAt (round $ n' * max'))
266 sortOn f = (List.sortOn (Down . (view (gts'_score . f)) . snd)) . HashMap.toList
268 monoInc_size n = splitAt' n $ monoSize * inclSize / 2
269 multExc_size n = splitAt' n $ multSize * exclSize / 2
272 (mapMonoScoredInclHead, monoScoredInclTail) = monoInc_size mapSize $ (sortOn scored_genInc) monoScoredIncl
273 (mapMonoScoredExclHead, monoScoredExclTail) = monoInc_size mapSize $ (sortOn scored_speExc) monoScoredExcl
275 (mapMultScoredInclHead, multScoredInclTail) = multExc_size mapSize $ (sortOn scored_genInc) multScoredIncl
276 (mapMultScoredExclHead, multScoredExclTail) = multExc_size mapSize $ (sortOn scored_speExc) multScoredExcl
279 (canMonoScoredIncHead , _) = monoInc_size canSize $ (sortOn scored_genInc) monoScoredInclTail
280 (canMonoScoredExclHead, _) = monoInc_size canSize $ (sortOn scored_speExc) monoScoredExclTail
282 (canMulScoredInclHead, _) = multExc_size canSize $ (sortOn scored_genInc) multScoredInclTail
283 (canMultScoredExclHead, _) = multExc_size canSize $ (sortOn scored_speExc) multScoredExclTail
285 ------------------------------------------------------------
286 -- Final Step building the Typed list
287 -- Candidates Terms need to be filtered
289 maps = setListType (Just MapTerm)
290 $ mapMonoScoredInclHead
291 <> mapMonoScoredExclHead
292 <> mapMultScoredInclHead
293 <> mapMultScoredExclHead
295 -- An original way to filter to start with
296 cands = setListType (Just CandidateTerm)
297 $ canMonoScoredIncHead
298 <> canMonoScoredExclHead
299 <> canMulScoredInclHead
300 <> canMultScoredExclHead
303 cands' = setListType (Just CandidateTerm)
307 result = Map.unionsWith (<>)
308 [ Map.fromList [( nt, toNgramsElement maps
309 <> toNgramsElement cands
310 <> toNgramsElement cands'
311 <> toNgramsElement stopTerms