]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/List.hs
[NgramsGroup] refact parameters
[gargantext.git] / src / Gargantext / Core / Text / List.hs
1 {-|
2 Module : Gargantext.Core.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13
14 module Gargantext.Core.Text.List
15 where
16
17
18 import Control.Lens ((^.))
19 import Data.Maybe (fromMaybe, catMaybes)
20 import Data.Ord (Down(..))
21 import Data.Map (Map)
22 import Data.Text (Text)
23 import qualified Data.Char as Char
24 import qualified Data.List as List
25 import qualified Data.Map as Map
26 import qualified Data.Set as Set
27 import qualified Data.Text as Text
28
29 -- import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
30 import Gargantext.API.Ngrams.Types (NgramsElement, mkNgramsElement, NgramsTerm(..), RootParent(..), mSetFromList)
31 import Gargantext.API.Ngrams.Types (RepoCmdM)
32 import Gargantext.Core.Text (size)
33 import Gargantext.Core.Text.List.Social (flowSocialList, invertForw)
34 import Gargantext.Core.Text.Metrics (scored', Scored(..), normalizeGlobal, normalizeLocal)
35 import Gargantext.Core.Text.Group
36 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
37 import Gargantext.Core.Types.Individu (User(..))
38 import Gargantext.Database.Action.Metrics.NgramsByNode (getNodesByNgramsUser, groupNodesByNgramsWith, getNodesByNgramsOnlyUser)
39 import Gargantext.Database.Action.Metrics.TFICF (getTficf)
40 import Gargantext.Database.Prelude (Cmd, CmdM)
41 import Gargantext.Database.Query.Table.Node (defaultList)
42 import Gargantext.Database.Query.Table.Node.Error (HasNodeError())
43 import Gargantext.Database.Query.Tree.Error (HasTreeError)
44 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
45 import Gargantext.Prelude
46
47
48 -- | TODO improve grouping functions of Authors, Sources, Institutes..
49 buildNgramsLists :: ( RepoCmdM env err m
50 , CmdM env err m
51 , HasTreeError err
52 , HasNodeError err
53 )
54 => User
55 -> GroupParams
56 -> UserCorpusId
57 -> MasterCorpusId
58 -> m (Map NgramsType [NgramsElement])
59 buildNgramsLists user gp uCid mCid = do
60 ngTerms <- buildNgramsTermsList user gp uCid mCid
61 othersTerms <- mapM (buildNgramsOthersList user uCid identity)
62 [Authors, Sources, Institutes]
63 pure $ Map.unions $ othersTerms <> [ngTerms]
64
65
66 buildNgramsOthersList :: (-- RepoCmdM env err m
67 -- , CmdM env err m
68 HasNodeError err
69 -- , HasTreeError err
70 )
71 => User
72 -> UserCorpusId
73 -> (Text -> Text)
74 -> NgramsType
75 -> Cmd err (Map NgramsType [NgramsElement])
76 buildNgramsOthersList _user uCid groupIt nt = do
77 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
78
79 let
80 listSize = 9
81 all' = List.sortOn (Down . Set.size . snd . snd)
82 $ Map.toList ngs
83
84 (graphTerms, candiTerms) = List.splitAt listSize all'
85
86 pure $ Map.unionsWith (<>) [ toElements nt MapTerm graphTerms
87 , toElements nt CandidateTerm candiTerms
88 ]
89
90 toElements :: Ord k => k -> ListType -> [(Text, b)] -> Map k [NgramsElement]
91 toElements nType lType x =
92 Map.fromList [(nType, [ mkNgramsElement (NgramsTerm t) lType Nothing (mSetFromList [])
93 | (t, _ns) <- x
94 ]
95 )]
96
97 -- TODO use ListIds
98 buildNgramsTermsList :: ( HasNodeError err
99 , CmdM env err m
100 , RepoCmdM env err m
101 , HasTreeError err
102 )
103 => User
104 -> GroupParams
105 -> UserCorpusId
106 -> MasterCorpusId
107 -> m (Map NgramsType [NgramsElement])
108 buildNgramsTermsList user groupParams uCid mCid = do
109
110 -- Computing global speGen score
111 allTerms <- Map.toList <$> getTficf uCid mCid NgramsTerms
112
113 -- printDebug "head candidates" (List.take 10 $ allTerms)
114 -- printDebug "tail candidates" (List.take 10 $ List.reverse $ allTerms)
115
116 -- First remove stops terms
117 socialLists <- flowSocialList user NgramsTerms (Set.fromList $ map fst allTerms)
118
119 printDebug "\n * socialLists * \n" socialLists
120
121 {-
122 let
123 _socialMap = fromMaybe Set.empty $ Map.lookup MapTerm socialLists
124 _socialCand = fromMaybe Set.empty $ Map.lookup CandidateTerm socialLists
125 socialStop = fromMaybe Set.empty $ Map.lookup StopTerm socialLists
126 -- stopTerms ignored for now (need to be tagged already)
127 -- (stopTerms, candidateTerms) = List.partition ((\t -> Set.member t socialStop) . fst) allTerms
128 -}
129
130
131 -- Grouping the ngrams and keeping the maximum score for label
132 let grouped = groupStems'
133 $ map (\(t,d) -> let stem = ngramsGroup groupParams t
134 in ( stem
135 , GroupedText Nothing t d Set.empty (size t) stem Set.empty
136 )
137 ) allTerms
138
139 groupedWithList = map (addListType (invertForw socialLists)) grouped
140 (stopTerms, candidateTerms) = Map.partition (\t -> t ^. gt_listType == Just StopTerm) groupedWithList
141 (groupedMono, groupedMult) = Map.partition (\gt -> _gt_size gt < 2) candidateTerms
142
143 -- printDebug "\n * stopTerms * \n" stopTerms
144 -- printDebug "groupedMult" groupedMult
145 -- splitting monterms and multiterms to take proportional candidates
146 let
147 listSizeGlobal = 2000 :: Double -- use % of list if to big, or Int if to small
148 monoSize = 0.4 :: Double
149 multSize = 1 - monoSize
150
151 splitAt n' ns = List.splitAt (round $ n' * listSizeGlobal) $ List.sort $ Map.elems ns
152
153 (groupedMonoHead, groupedMonoTail) = splitAt monoSize groupedMono
154 (groupedMultHead, groupedMultTail) = splitAt multSize groupedMult
155
156 printDebug "groupedMonoHead" (List.length groupedMonoHead)
157 printDebug "groupedMonoTail" (List.length groupedMonoHead)
158 printDebug "groupedMultHead" (List.length groupedMultHead)
159 printDebug "groupedMultTail" (List.length groupedMultTail)
160
161 let
162 -- Get Local Scores now for selected grouped ngrams
163 selectedTerms = Set.toList $ List.foldl'
164 (\set' (GroupedText _ l' _ g _ _ _ ) -> Set.union set'
165 $ Set.union g
166 $ Set.singleton l'
167 )
168 Set.empty
169 (groupedMonoHead <> groupedMultHead)
170
171 -- TO remove (and remove HasNodeError instance)
172 userListId <- defaultList uCid
173 masterListId <- defaultList mCid
174
175 mapTextDocIds <- getNodesByNgramsOnlyUser uCid [userListId, masterListId] NgramsTerms selectedTerms
176 let
177 mapGroups = Map.fromList
178 $ map (\g -> (_gt_stem g, g))
179 $ groupedMonoHead <> groupedMultHead
180
181 -- grouping with Set NodeId
182 contextsAdded = foldl' (\mapGroups' k -> let k' = ngramsGroup groupParams k
183 in case Map.lookup k' mapGroups' of
184 Nothing -> mapGroups'
185 Just g -> case Map.lookup k mapTextDocIds of
186 Nothing -> mapGroups'
187 Just ns -> Map.insert k' ( g { _gt_nodes = Set.union ns (_gt_nodes g)}) mapGroups'
188 )
189 mapGroups
190 $ Map.keys mapTextDocIds
191
192 -- compute cooccurrences
193 mapCooc = Map.filter (>2) $ Map.fromList [ ((t1, t2), Set.size $ Set.intersection s1 s2)
194 | (t1, s1) <- mapStemNodeIds
195 , (t2, s2) <- mapStemNodeIds
196 ]
197 where
198 mapStemNodeIds = Map.toList $ Map.map (_gt_nodes) contextsAdded
199 -- printDebug "mapCooc" mapCooc
200
201 let
202 -- computing scores
203 mapScores f = Map.fromList
204 $ map (\(Scored t g s') -> (t, f (g,s')))
205 $ normalizeGlobal
206 $ map normalizeLocal
207 $ scored' mapCooc
208
209 groupsWithScores = catMaybes
210 $ map (\(stem, g)
211 -> case Map.lookup stem mapScores' of
212 Nothing -> Nothing
213 Just s' -> Just $ g { _gt_score = s'}
214 ) $ Map.toList contextsAdded
215 where
216 mapScores' = mapScores identity
217 -- adapt2 TOCHECK with DC
218 -- printDebug "groupsWithScores" groupsWithScores
219 let
220 -- sort / partition / split
221 -- filter mono/multi again
222 (monoScored, multScored) = List.partition (\g -> _gt_size g < 2) groupsWithScores
223 -- filter with max score
224 partitionWithMaxScore = List.partition (\g -> let (s1,s2) = _gt_score g in s1 > s2 )
225
226 (monoScoredIncl, monoScoredExcl) = partitionWithMaxScore monoScored
227 (multScoredIncl, multScoredExcl) = partitionWithMaxScore multScored
228
229 -- splitAt
230 let
231 listSizeLocal = 1000 :: Double -- use % of list if to big, or Int if to small
232 inclSize = 0.4 :: Double
233 exclSize = 1 - inclSize
234 splitAt' n' = List.splitAt (round $ n' * listSizeLocal)
235
236 (monoScoredInclHead, monoScoredInclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredIncl
237 (monoScoredExclHead, monoScoredExclTail) = splitAt' (monoSize * inclSize / 2) $ List.sortOn (Down . _gt_score) monoScoredExcl
238
239 (multScoredInclHead, multScoredInclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredIncl
240 (multScoredExclHead, multScoredExclTail) = splitAt' (multSize * exclSize / 2) $ List.sortOn (Down . _gt_score) multScoredExcl
241
242
243 -- Final Step building the Typed list
244 -- (map (toGargList $ Just StopTerm) stopTerms) -- Removing stops (needs social score)
245 termListHead =
246 (map (\g -> g { _gt_listType = Just MapTerm} ) ( monoScoredInclHead
247 <> monoScoredExclHead
248 <> multScoredInclHead
249 <> multScoredExclHead
250 )
251 )
252 <> (map (\g -> g { _gt_listType = Just CandidateTerm }) ( monoScoredInclTail
253 <> monoScoredExclTail
254 <> multScoredInclTail
255 <> multScoredExclTail
256 )
257 )
258
259 termListTail = map (\g -> g { _gt_listType = Just CandidateTerm }) ( groupedMonoTail <> groupedMultTail)
260
261 -- printDebug "monoScoredInclHead" monoScoredInclHead
262 -- printDebug "monoScoredExclHead" monoScoredExclTail
263 --
264 printDebug "multScoredInclHead" multScoredInclHead
265 printDebug "multScoredExclTail" multScoredExclTail
266
267 let result = Map.unionsWith (<>)
268 [ Map.fromList [(
269 NgramsTerms, (List.concat $ map toNgramsElement $ termListHead)
270 <> (List.concat $ map toNgramsElement $ termListTail)
271 <> (List.concat $ map toNgramsElement $ stopTerms)
272 )]
273 ]
274 -- printDebug "\n result \n" r
275 pure result
276
277 groupStems :: [(Stem, GroupedText Double)] -> [GroupedText Double]
278 groupStems = Map.elems . groupStems'
279
280 groupStems' :: [(Stem, GroupedText Double)] -> Map Stem (GroupedText Double)
281 groupStems' = Map.fromListWith grouping
282 where
283 grouping (GroupedText lt1 label1 score1 group1 s1 stem1 nodes1)
284 (GroupedText lt2 label2 score2 group2 s2 stem2 nodes2)
285 | score1 >= score2 = GroupedText lt label1 score1 (Set.insert label2 gr) s1 stem1 nodes
286 | otherwise = GroupedText lt label2 score2 (Set.insert label1 gr) s2 stem2 nodes
287 where
288 lt = lt1 <> lt2
289 gr = Set.union group1 group2
290 nodes = Set.union nodes1 nodes2
291
292
293
294
295 toNgramsElement :: GroupedText a -> [NgramsElement]
296 toNgramsElement (GroupedText listType label _ setNgrams _ _ _) =
297 [parentElem] <> childrenElems
298 where
299 parent = label
300 children = Set.toList setNgrams
301 parentElem = mkNgramsElement (NgramsTerm parent)
302 (fromMaybe CandidateTerm listType)
303 Nothing
304 (mSetFromList (NgramsTerm <$> children))
305 childrenElems = map (\t -> mkNgramsElement t (fromMaybe CandidateTerm $ listType)
306 (Just $ RootParent (NgramsTerm parent) (NgramsTerm parent))
307 (mSetFromList [])
308 ) (NgramsTerm <$> children)
309
310
311 toGargList :: Maybe ListType -> b -> (Maybe ListType, b)
312 toGargList l n = (l,n)
313
314
315 isStopTerm :: StopSize -> Text -> Bool
316 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
317 where
318 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)
319
320 ------------------------------------------------------------------------------