]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
[Backup][WIP]
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByContext.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByContext
17 where
18
19 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map (Map)
22 import Data.Set (Set)
23 import Data.Text (Text)
24 import Data.Tuple.Extra (first, second, swap)
25 import Database.PostgreSQL.Simple.SqlQQ (sql)
26 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
27 -- import Debug.Trace (trace)
28 import Gargantext.Core
29 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Database.PostgreSQL.Simple as DPS
39
40 -- | fst is size of Supra Corpus
41 -- snd is Texts and size of Occurrences (different docs)
42
43 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
44 -> HashMap NgramsTerm (Set ContextId)
45 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
46 countContextsByNgramsWith f m = (total, m')
47 where
48 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
49 m' = HM.map ( swap . second (fromIntegral . Set.size))
50 $ groupContextsByNgramsWith f m
51
52
53 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
54 -> HashMap NgramsTerm (Set NodeId)
55 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
56 groupContextsByNgramsWith f' m'' =
57 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
58 $ HM.toList m''
59
60 ------------------------------------------------------------------------
61
62 getContextsByNgramsUser :: HasDBid NodeType
63 => CorpusId
64 -> NgramsType
65 -> Cmd err (HashMap NgramsTerm (Set ContextId))
66 getContextsByNgramsUser cId nt =
67 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
68 <$> selectNgramsByContextUser cId nt
69 where
70
71 selectNgramsByContextUser :: HasDBid NodeType
72 => CorpusId
73 -> NgramsType
74 -> Cmd err [(NodeId, Text)]
75 selectNgramsByContextUser cId' nt' =
76 runPGSQuery queryNgramsByContextUser
77 ( cId'
78 , toDBid NodeDocument
79 , ngramsTypeId nt'
80 -- , 100 :: Int -- limit
81 -- , 0 :: Int -- offset
82 )
83
84 queryNgramsByContextUser :: DPS.Query
85 queryNgramsByContextUser = [sql|
86 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
87 JOIN ngrams ng ON cng.ngrams_id = ng.id
88 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
89 JOIN contexts c ON nc.context_id = c.id
90 WHERE nc.node_id = ? -- CorpusId
91 AND c.typename = ? -- toDBid
92 AND cng.ngrams_type = ? -- NgramsTypeId
93 AND nc.category > 0 -- is not in Trash
94 GROUP BY cng.context_id, ng.terms
95 |]
96
97
98 ------------------------------------------------------------------------
99 -- TODO add groups
100
101 {-
102 getOccByNgramsOnlyFast :: HasDBid NodeType
103 => CorpusId
104 -> NgramsType
105 -> [NgramsTerm]
106 -> Cmd err (HashMap NgramsTerm Int)
107 getOccByNgramsOnlyFast cId nt ngs =
108 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser cId nt ngs
109 -}
110
111 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
112 => CorpusId
113 -> Int
114 -> NgramsType
115 -> [NgramsTerm]
116 -> Cmd err (HashMap NgramsTerm Int)
117 getOccByNgramsOnlyFast_withSample cId int nt ngs =
118 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
119
120
121 getOccByNgramsOnlyFast' :: CorpusId
122 -> ListId
123 -> NgramsType
124 -> [NgramsTerm]
125 -> Cmd err (HashMap NgramsTerm Int)
126 getOccByNgramsOnlyFast' cId lId nt tms = -- trace (show (cId, lId)) $
127 HM.fromListWith (+) <$> map (second round) <$> run cId lId nt tms
128
129 where
130 fields = [QualifiedIdentifier Nothing "text"]
131
132 run :: CorpusId
133 -> ListId
134 -> NgramsType
135 -> [NgramsTerm]
136 -> Cmd err [(NgramsTerm, Double)]
137 run cId' lId' nt' tms' = map (first NgramsTerm) <$> runPGSQuery query
138 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms')
139 , cId'
140 , lId'
141 , ngramsTypeId nt'
142 )
143
144 query :: DPS.Query
145 query = [sql|
146 WITH input_rows(terms) AS (?)
147 SELECT ng.terms, nng.weight FROM nodes_contexts nc
148 JOIN node_node_ngrams nng ON nng.node1_id = nc.node_id
149 JOIN ngrams ng ON nng.ngrams_id = ng.id
150 JOIN input_rows ir ON ir.terms = ng.terms
151 WHERE nng.node1_id = ? -- CorpusId
152 AND nng.node2_id = ? -- ListId
153 AND nng.ngrams_type = ? -- NgramsTypeId
154 AND nc.category > 0 -- Not trash
155 GROUP BY ng.terms, nng.weight
156 |]
157
158
159 {-
160 -- just slower than getOccByNgramsOnlyFast
161 getOccByNgramsOnlySlow :: HasDBid NodeType
162 => NodeType
163 -> CorpusId
164 -> [ListId]
165 -> NgramsType
166 -> [NgramsTerm]
167 -> Cmd err (HashMap NgramsTerm Int)
168 getOccByNgramsOnlySlow t cId ls nt ngs =
169 HM.map Set.size <$> getScore' t cId ls nt ngs
170 where
171 getScore' NodeCorpus = getContextsByNgramsOnlyUser
172 getScore' NodeDocument = getNgramsByDocOnlyUser
173 getScore' _ = getContextsByNgramsOnlyUser
174
175 getOccByNgramsOnlySafe :: HasDBid NodeType
176 => CorpusId
177 -> [ListId]
178 -> NgramsType
179 -> [NgramsTerm]
180 -> Cmd err (HashMap NgramsTerm Int)
181 getOccByNgramsOnlySafe cId ls nt ngs = do
182 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
183 fast <- getOccByNgramsOnlyFast cId nt ngs
184 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
185 when (fast /= slow) $
186 printDebug "getOccByNgramsOnlySafe: difference"
187 (HM.difference slow fast, HM.difference fast slow)
188 -- diff slow fast :: PatchMap Text (Replace (Maybe Int))
189 pure slow
190
191
192 selectNgramsOccurrencesOnlyByContextUser :: HasDBid NodeType
193 => CorpusId
194 -> NgramsType
195 -> [NgramsTerm]
196 -> Cmd err [(NgramsTerm, Int)]
197 selectNgramsOccurrencesOnlyByContextUser cId nt tms =
198 fmap (first NgramsTerm) <$>
199 runPGSQuery queryNgramsOccurrencesOnlyByContextUser
200 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
201 , cId
202 , toDBid NodeDocument
203 , ngramsTypeId nt
204 )
205 where
206 fields = [QualifiedIdentifier Nothing "text"]
207
208 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
209 -- Question: with the grouping is the result exactly the same (since Set NodeId for
210 -- equivalent ngrams intersections are not empty)
211 queryNgramsOccurrencesOnlyByContextUser :: DPS.Query
212 queryNgramsOccurrencesOnlyByContextUser = [sql|
213 WITH input_rows(terms) AS (?)
214 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
215 JOIN ngrams ng ON cng.ngrams_id = ng.id
216 JOIN input_rows ir ON ir.terms = ng.terms
217 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
218 JOIN nodes n ON nn.node_id = n.id
219 WHERE nn.node_id = ? -- CorpusId
220 AND n.typename = ? -- toDBid
221 AND cng.ngrams_type = ? -- NgramsTypeId
222 AND nn.category > 0
223 GROUP BY cng.context_id, ng.terms
224 |]
225
226 -}
227
228 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
229 => CorpusId
230 -> Int
231 -> NgramsType
232 -> [NgramsTerm]
233 -> Cmd err [(NgramsTerm, Int)]
234 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
235 fmap (first NgramsTerm) <$>
236 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
237 ( int
238 , toDBid NodeDocument
239 , cId
240 , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
241 , cId
242 , ngramsTypeId nt
243 )
244 where
245 fields = [QualifiedIdentifier Nothing "text"]
246
247 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
248 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
249 WITH nodes_sample AS (SELECT id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
250 JOIN nodes_contexts nn ON n.id = nn.context_id
251 WHERE n.typename = ?
252 AND nn.node_id = ?),
253 input_rows(terms) AS (?)
254 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
255 JOIN ngrams ng ON cng.ngrams_id = ng.id
256 JOIN input_rows ir ON ir.terms = ng.terms
257 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
258 JOIN nodes_sample n ON nn.context_id = n.id
259 WHERE nn.node_id = ? -- CorpusId
260 AND cng.ngrams_type = ? -- NgramsTypeId
261 AND nn.category > 0
262 GROUP BY cng.node_id, ng.terms
263 |]
264
265
266 {-
267 queryNgramsOccurrencesOnlyByContextUser' :: DPS.Query
268 queryNgramsOccurrencesOnlyByContextUser' = [sql|
269 WITH input_rows(terms) AS (?)
270 SELECT ng.terms, COUNT(cng.node_id) FROM context_node_ngrams cng
271 JOIN ngrams ng ON cng.ngrams_id = ng.id
272 JOIN input_rows ir ON ir.terms = ng.terms
273 JOIN nodes_nodes nn ON nn.node2_id = cng.node_id
274 JOIN nodes n ON nn.node2_id = n.id
275 WHERE nn.node1_id = ? -- CorpusId
276 AND n.typename = ? -- toDBid
277 AND cng.ngrams_type = ? -- NgramsTypeId
278 AND nn.category > 0
279 GROUP BY cng.node_id, ng.terms
280 |]
281 -}
282
283 ------------------------------------------------------------------------
284
285 getContextsByNgramsOnlyUser :: HasDBid NodeType
286 => CorpusId
287 -> [ListId]
288 -> NgramsType
289 -> [NgramsTerm]
290 -> Cmd err (HashMap NgramsTerm (Set NodeId))
291 getContextsByNgramsOnlyUser cId ls nt ngs =
292 HM.unionsWith (<>)
293 . map (HM.fromListWith (<>)
294 . map (second Set.singleton))
295 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
296 (splitEvery 1000 ngs)
297
298 getNgramsByContextOnlyUser :: HasDBid NodeType
299 => NodeId
300 -> [ListId]
301 -> NgramsType
302 -> [NgramsTerm]
303 -> Cmd err (Map NodeId (Set NgramsTerm))
304 getNgramsByContextOnlyUser cId ls nt ngs =
305 Map.unionsWith (<>)
306 . map ( Map.fromListWith (<>)
307 . map (second Set.singleton)
308 )
309 . map (map swap)
310 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
311 (splitEvery 1000 ngs)
312
313 ------------------------------------------------------------------------
314 -- used in G.Core.Text.List
315 selectNgramsOnlyByContextUser :: HasDBid NodeType
316 => CorpusId
317 -> [ListId]
318 -> NgramsType
319 -> [NgramsTerm]
320 -> Cmd err [(NgramsTerm, ContextId)]
321 selectNgramsOnlyByContextUser cId ls nt tms =
322 fmap (first NgramsTerm) <$>
323 runPGSQuery queryNgramsOnlyByContextUser
324 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
325 , Values [QualifiedIdentifier Nothing "int4"]
326 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
327 , cId
328 , toDBid NodeDocument
329 , ngramsTypeId nt
330 )
331 where
332 fields = [QualifiedIdentifier Nothing "text"]
333
334 queryNgramsOnlyByContextUser :: DPS.Query
335 queryNgramsOnlyByContextUser = [sql|
336 WITH input_rows(terms) AS (?),
337 input_list(id) AS (?)
338 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
339 JOIN ngrams ng ON cng.ngrams_id = ng.id
340 JOIN input_rows ir ON ir.terms = ng.terms
341 JOIN input_list il ON il.id = cng.node_id
342 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
343 JOIN contexts c ON nn.context_id = c.id
344 WHERE nn.node_id = ? -- CorpusId
345 AND c.typename = ? -- toDBid (maybe not useful with context table)
346 AND cng.ngrams_type = ? -- NgramsTypeId
347 AND nn.category > 0
348 GROUP BY ng.terms, cng.context_id
349 |]
350
351
352 {-
353 selectNgramsOnlyByContextUser' :: HasDBid NodeType
354 => CorpusId
355 -> [ListId]
356 -> NgramsType
357 -> [Text]
358 -> Cmd err [(Text, Int)]
359 selectNgramsOnlyByContextUser' cId ls nt tms =
360 runPGSQuery queryNgramsOnlyByContextUser
361 ( Values fields (DPS.Only <$> tms)
362 , Values [QualifiedIdentifier Nothing "int4"]
363 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
364 , cId
365 , toDBid NodeDocument
366 , ngramsTypeId nt
367 )
368 where
369 fields = [QualifiedIdentifier Nothing "text"]
370
371 queryNgramsOnlyByContextUser' :: DPS.Query
372 queryNgramsOnlyByContextUser' = [sql|
373 WITH input_rows(terms) AS (?),
374 input_list(id) AS (?)
375 SELECT ng.terms, cng.weight FROM context_node_ngrams cng
376 JOIN ngrams ng ON cng.ngrams_id = ng.id
377 JOIN input_rows ir ON ir.terms = ng.terms
378 JOIN input_list il ON il.id = cng.node_id
379 WHERE cng.context_id = ? -- CorpusId
380 AND cng.ngrams_type = ? -- NgramsTypeId
381 -- AND nn.category > 0
382 GROUP BY ng.terms, cng.weight
383 |]
384 -}
385
386 getNgramsByDocOnlyUser :: DocId
387 -> [ListId]
388 -> NgramsType
389 -> [NgramsTerm]
390 -> Cmd err (HashMap NgramsTerm (Set NodeId))
391 getNgramsByDocOnlyUser cId ls nt ngs =
392 HM.unionsWith (<>)
393 . map (HM.fromListWith (<>) . map (second Set.singleton))
394 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
395
396
397 selectNgramsOnlyByDocUser :: DocId
398 -> [ListId]
399 -> NgramsType
400 -> [NgramsTerm]
401 -> Cmd err [(NgramsTerm, NodeId)]
402 selectNgramsOnlyByDocUser dId ls nt tms =
403 fmap (first NgramsTerm) <$>
404 runPGSQuery queryNgramsOnlyByDocUser
405 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
406 , Values [QualifiedIdentifier Nothing "int4"]
407 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
408 , dId
409 , ngramsTypeId nt
410 )
411 where
412 fields = [QualifiedIdentifier Nothing "text"]
413
414
415 queryNgramsOnlyByDocUser :: DPS.Query
416 queryNgramsOnlyByDocUser = [sql|
417 WITH input_rows(terms) AS (?),
418 input_list(id) AS (?)
419 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
420 JOIN ngrams ng ON cng.ngrams_id = ng.id
421 JOIN input_rows ir ON ir.terms = ng.terms
422 JOIN input_list il ON il.id = cng.context_id
423 WHERE cng.node_id = ? -- DocId
424 AND cng.ngrams_type = ? -- NgramsTypeId
425 GROUP BY ng.terms, cng.node_id
426 |]
427
428 ------------------------------------------------------------------------
429 -- | TODO filter by language, database, any social field
430 getContextsByNgramsMaster :: HasDBid NodeType
431 => UserCorpusId
432 -> MasterCorpusId
433 -> Cmd err (HashMap Text (Set NodeId))
434 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
435 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
436 -- . takeWhile (not . List.null)
437 -- . takeWhile (\l -> List.length l > 3)
438 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
439
440 selectNgramsByContextMaster :: HasDBid NodeType
441 => Int
442 -> UserCorpusId
443 -> MasterCorpusId
444 -> Int
445 -> Cmd err [(NodeId, Text)]
446 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
447 queryNgramsByContextMaster'
448 ( ucId
449 , ngramsTypeId NgramsTerms
450 , toDBid NodeDocument
451 , p
452 , toDBid NodeDocument
453 , p
454 , n
455 , mcId
456 , toDBid NodeDocument
457 , ngramsTypeId NgramsTerms
458 )
459
460 -- | TODO fix context_node_ngrams relation
461 queryNgramsByContextMaster' :: DPS.Query
462 queryNgramsByContextMaster' = [sql|
463 WITH contextsByNgramsUser AS (
464
465 SELECT n.id, ng.terms FROM contexts n
466 JOIN nodes_contexts nn ON n.id = nn.context_id
467 JOIN context_node_ngrams cng ON cng.context_id = n.id
468 JOIN ngrams ng ON cng.ngrams_id = ng.id
469 WHERE nn.node_id = ? -- UserCorpusId
470 -- AND n.typename = ? -- toDBid
471 AND cng.ngrams_type = ? -- NgramsTypeId
472 AND nn.category > 0
473 AND node_pos(n.id,?) >= ?
474 AND node_pos(n.id,?) < ?
475 GROUP BY n.id, ng.terms
476
477 ),
478
479 contextsByNgramsMaster AS (
480
481 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
482 JOIN context_node_ngrams cng ON n.id = cng.context_id
483 JOIN ngrams ng ON ng.id = cng.ngrams_id
484
485 WHERE n.parent_id = ? -- Master Corpus toDBid
486 AND n.typename = ? -- toDBid
487 AND cng.ngrams_type = ? -- NgramsTypeId
488 GROUP BY n.id, ng.terms
489 )
490
491 SELECT m.id, m.terms FROM nodesByNgramsMaster m
492 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id
493 |]