2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByContext
19 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
20 import Data.HashMap.Strict (HashMap)
23 import Data.Text (Text)
24 import Data.Tuple.Extra (first, second, swap)
25 import Database.PostgreSQL.Simple.SqlQQ (sql)
26 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
27 -- import Debug.Trace (trace)
28 import Gargantext.Core
29 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Database.PostgreSQL.Simple as DPS
40 -- | fst is size of Supra Corpus
41 -- snd is Texts and size of Occurrences (different docs)
43 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
44 -> HashMap NgramsTerm (Set ContextId)
45 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
46 countContextsByNgramsWith f m = (total, m')
48 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
49 m' = HM.map ( swap . second (fromIntegral . Set.size))
50 $ groupContextsByNgramsWith f m
53 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
54 -> HashMap NgramsTerm (Set NodeId)
55 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
56 groupContextsByNgramsWith f' m'' =
57 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
60 ------------------------------------------------------------------------
62 getContextsByNgramsUser :: HasDBid NodeType
65 -> Cmd err (HashMap NgramsTerm (Set ContextId))
66 getContextsByNgramsUser cId nt =
67 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
68 <$> selectNgramsByContextUser cId nt
71 selectNgramsByContextUser :: HasDBid NodeType
74 -> Cmd err [(NodeId, Text)]
75 selectNgramsByContextUser cId' nt' =
76 runPGSQuery queryNgramsByContextUser
80 -- , 100 :: Int -- limit
81 -- , 0 :: Int -- offset
84 queryNgramsByContextUser :: DPS.Query
85 queryNgramsByContextUser = [sql|
86 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
87 JOIN ngrams ng ON cng.ngrams_id = ng.id
88 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
89 JOIN contexts c ON nc.context_id = c.id
90 WHERE nc.node_id = ? -- CorpusId
91 AND c.typename = ? -- toDBid
92 AND cng.ngrams_type = ? -- NgramsTypeId
93 AND nc.category > 0 -- is not in Trash
94 GROUP BY cng.context_id, ng.terms
98 ------------------------------------------------------------------------
102 getOccByNgramsOnlyFast :: HasDBid NodeType
106 -> Cmd err (HashMap NgramsTerm Int)
107 getOccByNgramsOnlyFast cId nt ngs =
108 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser cId nt ngs
111 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
116 -> Cmd err (HashMap NgramsTerm Int)
117 getOccByNgramsOnlyFast_withSample cId int nt ngs =
118 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
121 getOccByNgramsOnlyFast' :: CorpusId
125 -> Cmd err (HashMap NgramsTerm Int)
126 getOccByNgramsOnlyFast' cId lId nt tms = -- trace (show (cId, lId)) $
127 HM.fromListWith (+) <$> map (second round) <$> run cId lId nt tms
130 fields = [QualifiedIdentifier Nothing "text"]
136 -> Cmd err [(NgramsTerm, Double)]
137 run cId' lId' nt' tms' = map (first NgramsTerm) <$> runPGSQuery query
138 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms')
146 WITH input_rows(terms) AS (?)
147 SELECT ng.terms, nng.weight FROM nodes_contexts nc
148 JOIN node_node_ngrams nng ON nng.node1_id = nc.node_id
149 JOIN ngrams ng ON nng.ngrams_id = ng.id
150 JOIN input_rows ir ON ir.terms = ng.terms
151 WHERE nng.node1_id = ? -- CorpusId
152 AND nng.node2_id = ? -- ListId
153 AND nng.ngrams_type = ? -- NgramsTypeId
154 AND nc.category > 0 -- Not trash
155 GROUP BY ng.terms, nng.weight
160 -- just slower than getOccByNgramsOnlyFast
161 getOccByNgramsOnlySlow :: HasDBid NodeType
167 -> Cmd err (HashMap NgramsTerm Int)
168 getOccByNgramsOnlySlow t cId ls nt ngs =
169 HM.map Set.size <$> getScore' t cId ls nt ngs
171 getScore' NodeCorpus = getContextsByNgramsOnlyUser
172 getScore' NodeDocument = getNgramsByDocOnlyUser
173 getScore' _ = getContextsByNgramsOnlyUser
175 getOccByNgramsOnlySafe :: HasDBid NodeType
180 -> Cmd err (HashMap NgramsTerm Int)
181 getOccByNgramsOnlySafe cId ls nt ngs = do
182 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
183 fast <- getOccByNgramsOnlyFast cId nt ngs
184 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
185 when (fast /= slow) $
186 printDebug "getOccByNgramsOnlySafe: difference"
187 (HM.difference slow fast, HM.difference fast slow)
188 -- diff slow fast :: PatchMap Text (Replace (Maybe Int))
192 selectNgramsOccurrencesOnlyByContextUser :: HasDBid NodeType
196 -> Cmd err [(NgramsTerm, Int)]
197 selectNgramsOccurrencesOnlyByContextUser cId nt tms =
198 fmap (first NgramsTerm) <$>
199 runPGSQuery queryNgramsOccurrencesOnlyByContextUser
200 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
202 , toDBid NodeDocument
206 fields = [QualifiedIdentifier Nothing "text"]
208 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
209 -- Question: with the grouping is the result exactly the same (since Set NodeId for
210 -- equivalent ngrams intersections are not empty)
211 queryNgramsOccurrencesOnlyByContextUser :: DPS.Query
212 queryNgramsOccurrencesOnlyByContextUser = [sql|
213 WITH input_rows(terms) AS (?)
214 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
215 JOIN ngrams ng ON cng.ngrams_id = ng.id
216 JOIN input_rows ir ON ir.terms = ng.terms
217 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
218 JOIN nodes n ON nn.node_id = n.id
219 WHERE nn.node_id = ? -- CorpusId
220 AND n.typename = ? -- toDBid
221 AND cng.ngrams_type = ? -- NgramsTypeId
223 GROUP BY cng.context_id, ng.terms
228 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
233 -> Cmd err [(NgramsTerm, Int)]
234 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
235 fmap (first NgramsTerm) <$>
236 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
238 , toDBid NodeDocument
240 , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
245 fields = [QualifiedIdentifier Nothing "text"]
247 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
248 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
249 WITH nodes_sample AS (SELECT id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
250 JOIN nodes_contexts nn ON n.id = nn.context_id
253 input_rows(terms) AS (?)
254 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
255 JOIN ngrams ng ON cng.ngrams_id = ng.id
256 JOIN input_rows ir ON ir.terms = ng.terms
257 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
258 JOIN nodes_sample n ON nn.context_id = n.id
259 WHERE nn.node_id = ? -- CorpusId
260 AND cng.ngrams_type = ? -- NgramsTypeId
262 GROUP BY cng.node_id, ng.terms
267 queryNgramsOccurrencesOnlyByContextUser' :: DPS.Query
268 queryNgramsOccurrencesOnlyByContextUser' = [sql|
269 WITH input_rows(terms) AS (?)
270 SELECT ng.terms, COUNT(cng.node_id) FROM context_node_ngrams cng
271 JOIN ngrams ng ON cng.ngrams_id = ng.id
272 JOIN input_rows ir ON ir.terms = ng.terms
273 JOIN nodes_nodes nn ON nn.node2_id = cng.node_id
274 JOIN nodes n ON nn.node2_id = n.id
275 WHERE nn.node1_id = ? -- CorpusId
276 AND n.typename = ? -- toDBid
277 AND cng.ngrams_type = ? -- NgramsTypeId
279 GROUP BY cng.node_id, ng.terms
283 ------------------------------------------------------------------------
285 getContextsByNgramsOnlyUser :: HasDBid NodeType
290 -> Cmd err (HashMap NgramsTerm (Set NodeId))
291 getContextsByNgramsOnlyUser cId ls nt ngs =
293 . map (HM.fromListWith (<>)
294 . map (second Set.singleton))
295 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
296 (splitEvery 1000 ngs)
298 getNgramsByContextOnlyUser :: HasDBid NodeType
303 -> Cmd err (Map NodeId (Set NgramsTerm))
304 getNgramsByContextOnlyUser cId ls nt ngs =
306 . map ( Map.fromListWith (<>)
307 . map (second Set.singleton)
310 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
311 (splitEvery 1000 ngs)
313 ------------------------------------------------------------------------
314 -- used in G.Core.Text.List
315 selectNgramsOnlyByContextUser :: HasDBid NodeType
320 -> Cmd err [(NgramsTerm, ContextId)]
321 selectNgramsOnlyByContextUser cId ls nt tms =
322 fmap (first NgramsTerm) <$>
323 runPGSQuery queryNgramsOnlyByContextUser
324 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
325 , Values [QualifiedIdentifier Nothing "int4"]
326 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
328 , toDBid NodeDocument
332 fields = [QualifiedIdentifier Nothing "text"]
334 queryNgramsOnlyByContextUser :: DPS.Query
335 queryNgramsOnlyByContextUser = [sql|
336 WITH input_rows(terms) AS (?),
337 input_list(id) AS (?)
338 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
339 JOIN ngrams ng ON cng.ngrams_id = ng.id
340 JOIN input_rows ir ON ir.terms = ng.terms
341 JOIN input_list il ON il.id = cng.node_id
342 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
343 JOIN contexts c ON nn.context_id = c.id
344 WHERE nn.node_id = ? -- CorpusId
345 AND c.typename = ? -- toDBid (maybe not useful with context table)
346 AND cng.ngrams_type = ? -- NgramsTypeId
348 GROUP BY ng.terms, cng.context_id
353 selectNgramsOnlyByContextUser' :: HasDBid NodeType
358 -> Cmd err [(Text, Int)]
359 selectNgramsOnlyByContextUser' cId ls nt tms =
360 runPGSQuery queryNgramsOnlyByContextUser
361 ( Values fields (DPS.Only <$> tms)
362 , Values [QualifiedIdentifier Nothing "int4"]
363 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
365 , toDBid NodeDocument
369 fields = [QualifiedIdentifier Nothing "text"]
371 queryNgramsOnlyByContextUser' :: DPS.Query
372 queryNgramsOnlyByContextUser' = [sql|
373 WITH input_rows(terms) AS (?),
374 input_list(id) AS (?)
375 SELECT ng.terms, cng.weight FROM context_node_ngrams cng
376 JOIN ngrams ng ON cng.ngrams_id = ng.id
377 JOIN input_rows ir ON ir.terms = ng.terms
378 JOIN input_list il ON il.id = cng.node_id
379 WHERE cng.context_id = ? -- CorpusId
380 AND cng.ngrams_type = ? -- NgramsTypeId
381 -- AND nn.category > 0
382 GROUP BY ng.terms, cng.weight
386 getNgramsByDocOnlyUser :: DocId
390 -> Cmd err (HashMap NgramsTerm (Set NodeId))
391 getNgramsByDocOnlyUser cId ls nt ngs =
393 . map (HM.fromListWith (<>) . map (second Set.singleton))
394 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
397 selectNgramsOnlyByDocUser :: DocId
401 -> Cmd err [(NgramsTerm, NodeId)]
402 selectNgramsOnlyByDocUser dId ls nt tms =
403 fmap (first NgramsTerm) <$>
404 runPGSQuery queryNgramsOnlyByDocUser
405 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
406 , Values [QualifiedIdentifier Nothing "int4"]
407 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
412 fields = [QualifiedIdentifier Nothing "text"]
415 queryNgramsOnlyByDocUser :: DPS.Query
416 queryNgramsOnlyByDocUser = [sql|
417 WITH input_rows(terms) AS (?),
418 input_list(id) AS (?)
419 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
420 JOIN ngrams ng ON cng.ngrams_id = ng.id
421 JOIN input_rows ir ON ir.terms = ng.terms
422 JOIN input_list il ON il.id = cng.context_id
423 WHERE cng.node_id = ? -- DocId
424 AND cng.ngrams_type = ? -- NgramsTypeId
425 GROUP BY ng.terms, cng.node_id
428 ------------------------------------------------------------------------
429 -- | TODO filter by language, database, any social field
430 getContextsByNgramsMaster :: HasDBid NodeType
433 -> Cmd err (HashMap Text (Set NodeId))
434 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
435 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
436 -- . takeWhile (not . List.null)
437 -- . takeWhile (\l -> List.length l > 3)
438 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
440 selectNgramsByContextMaster :: HasDBid NodeType
445 -> Cmd err [(NodeId, Text)]
446 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
447 queryNgramsByContextMaster'
449 , ngramsTypeId NgramsTerms
450 , toDBid NodeDocument
452 , toDBid NodeDocument
456 , toDBid NodeDocument
457 , ngramsTypeId NgramsTerms
460 -- | TODO fix context_node_ngrams relation
461 queryNgramsByContextMaster' :: DPS.Query
462 queryNgramsByContextMaster' = [sql|
463 WITH contextsByNgramsUser AS (
465 SELECT n.id, ng.terms FROM contexts n
466 JOIN nodes_contexts nn ON n.id = nn.context_id
467 JOIN context_node_ngrams cng ON cng.context_id = n.id
468 JOIN ngrams ng ON cng.ngrams_id = ng.id
469 WHERE nn.node_id = ? -- UserCorpusId
470 -- AND n.typename = ? -- toDBid
471 AND cng.ngrams_type = ? -- NgramsTypeId
473 AND node_pos(n.id,?) >= ?
474 AND node_pos(n.id,?) < ?
475 GROUP BY n.id, ng.terms
479 contextsByNgramsMaster AS (
481 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
482 JOIN context_node_ngrams cng ON n.id = cng.context_id
483 JOIN ngrams ng ON ng.id = cng.ngrams_id
485 WHERE n.parent_id = ? -- Master Corpus toDBid
486 AND n.typename = ? -- toDBid
487 AND cng.ngrams_type = ? -- NgramsTypeId
488 GROUP BY n.id, ng.terms
491 SELECT m.id, m.terms FROM nodesByNgramsMaster m
492 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id