2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByNode
19 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
20 import Data.HashMap.Strict (HashMap)
23 import Data.Text (Text)
24 import Data.Tuple.Extra (first, second, swap)
25 import Database.PostgreSQL.Simple.SqlQQ (sql)
26 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
27 import Debug.Trace (trace)
28 import Gargantext.Core
29 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Database.PostgreSQL.Simple as DPS
40 -- | fst is size of Supra Corpus
41 -- snd is Texts and size of Occurrences (different docs)
42 countNodesByNgramsWith :: (NgramsTerm -> NgramsTerm)
43 -> HashMap NgramsTerm (Set NodeId)
44 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
45 countNodesByNgramsWith f m = (total, m')
47 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
48 m' = HM.map ( swap . second (fromIntegral . Set.size))
49 $ groupNodesByNgramsWith f m
52 groupNodesByNgramsWith :: (NgramsTerm -> NgramsTerm)
53 -> HashMap NgramsTerm (Set NodeId)
54 -> HashMap NgramsTerm (Set NgramsTerm, Set NodeId)
55 groupNodesByNgramsWith f m =
56 HM.fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
59 ------------------------------------------------------------------------
60 getNodesByNgramsUser :: HasDBid NodeType
63 -> Cmd err (HashMap NgramsTerm (Set NodeId))
64 getNodesByNgramsUser cId nt =
65 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
66 <$> selectNgramsByNodeUser cId nt
69 selectNgramsByNodeUser :: HasDBid NodeType
72 -> Cmd err [(NodeId, Text)]
73 selectNgramsByNodeUser cId' nt' =
74 runPGSQuery queryNgramsByNodeUser
78 -- , 100 :: Int -- limit
79 -- , 0 :: Int -- offset
82 queryNgramsByNodeUser :: DPS.Query
83 queryNgramsByNodeUser = [sql|
84 SELECT cng.node_id, ng.terms FROM context_node_ngrams cng
85 JOIN ngrams ng ON cng.ngrams_id = ng.id
86 JOIN nodes_nodes nn ON nn.node2_id = cng.node_id
87 JOIN nodes n ON nn.node2_id = n.id
88 WHERE nn.node1_id = ? -- CorpusId
89 AND n.typename = ? -- toDBid
90 AND cng.ngrams_type = ? -- NgramsTypeId
92 GROUP BY cng.node_id, ng.terms
93 ORDER BY (cng.node_id, ng.terms) DESC
97 ------------------------------------------------------------------------
99 getOccByNgramsOnlyFast :: HasDBid NodeType
103 -> Cmd err (HashMap NgramsTerm Int)
104 getOccByNgramsOnlyFast cId nt ngs =
105 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
108 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
113 -> Cmd err (HashMap NgramsTerm Int)
114 getOccByNgramsOnlyFast_withSample cId int nt ngs =
115 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser_withSample cId int nt ngs
120 getOccByNgramsOnlyFast' :: CorpusId
124 -> Cmd err (HashMap NgramsTerm Int)
125 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
126 HM.fromListWith (+) <$> map (second round) <$> run cId lId nt tms
129 fields = [QualifiedIdentifier Nothing "text"]
135 -> Cmd err [(NgramsTerm, Double)]
136 run cId' lId' nt' tms' = fmap (first NgramsTerm) <$> runPGSQuery query
137 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms')
145 WITH input_rows(terms) AS (?)
146 SELECT ng.terms, cng.weight FROM context_node_ngrams cng
147 JOIN ngrams ng ON cng.ngrams_id = ng.id
148 JOIN input_rows ir ON ir.terms = ng.terms
149 WHERE cng.context_id = ? -- CorpusId
150 AND cng.node_id = ? -- ListId
151 AND cng.ngrams_type = ? -- NgramsTypeId
152 -- AND nn.category > 0 -- TODO
153 GROUP BY ng.terms, cng.weight
157 -- just slower than getOccByNgramsOnlyFast
158 getOccByNgramsOnlySlow :: HasDBid NodeType
164 -> Cmd err (HashMap NgramsTerm Int)
165 getOccByNgramsOnlySlow t cId ls nt ngs =
166 HM.map Set.size <$> getScore' t cId ls nt ngs
168 getScore' NodeCorpus = getNodesByNgramsOnlyUser
169 getScore' NodeDocument = getNgramsByDocOnlyUser
170 getScore' _ = getNodesByNgramsOnlyUser
172 getOccByNgramsOnlySafe :: HasDBid NodeType
177 -> Cmd err (HashMap NgramsTerm Int)
178 getOccByNgramsOnlySafe cId ls nt ngs = do
179 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
180 fast <- getOccByNgramsOnlyFast cId nt ngs
181 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
182 when (fast /= slow) $
183 printDebug "getOccByNgramsOnlySafe: difference"
184 (HM.difference slow fast, HM.difference fast slow)
185 -- diff slow fast :: PatchMap Text (Replace (Maybe Int))
189 selectNgramsOccurrencesOnlyByNodeUser :: HasDBid NodeType
193 -> Cmd err [(NgramsTerm, Int)]
194 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
195 fmap (first NgramsTerm) <$>
196 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
197 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
199 , toDBid NodeDocument
203 fields = [QualifiedIdentifier Nothing "text"]
207 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
208 -- Question: with the grouping is the result exactly the same (since Set NodeId for
209 -- equivalent ngrams intersections are not empty)
210 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
211 queryNgramsOccurrencesOnlyByNodeUser = [sql|
212 WITH input_rows(terms) AS (?)
213 SELECT ng.terms, COUNT(cng.node_id) FROM context_node_ngrams cng
214 JOIN ngrams ng ON cng.ngrams_id = ng.id
215 JOIN input_rows ir ON ir.terms = ng.terms
216 JOIN nodes_nodes nn ON nn.node_id = cng.node_id
217 JOIN nodes n ON nn.node_id = n.id
218 WHERE nn.node1_id = ? -- CorpusId
219 AND n.typename = ? -- toDBid
220 AND cng.ngrams_type = ? -- NgramsTypeId
222 GROUP BY cng.node_id, ng.terms
226 selectNgramsOccurrencesOnlyByNodeUser_withSample :: HasDBid NodeType
231 -> Cmd err [(NgramsTerm, Int)]
232 selectNgramsOccurrencesOnlyByNodeUser_withSample cId int nt tms =
233 fmap (first NgramsTerm) <$>
234 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser_withSample
236 , toDBid NodeDocument
238 , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
243 fields = [QualifiedIdentifier Nothing "text"]
245 queryNgramsOccurrencesOnlyByNodeUser_withSample :: DPS.Query
246 queryNgramsOccurrencesOnlyByNodeUser_withSample = [sql|
247 WITH nodes_sample AS (SELECT id FROM nodes n TABLESAMPLE SYSTEM_ROWS (?)
248 JOIN nodes_nodes nn ON n.id = nn.node2_id
250 AND nn.node1_id = ?),
251 input_rows(terms) AS (?)
252 SELECT ng.terms, COUNT(cng.node_id) FROM context_node_ngrams cng
253 JOIN ngrams ng ON cng.ngrams_id = ng.id
254 JOIN input_rows ir ON ir.terms = ng.terms
255 JOIN nodes_nodes nn ON nn.node2_id = cng.node_id
256 JOIN nodes_sample n ON nn.node2_id = n.id
257 WHERE nn.node1_id = ? -- CorpusId
258 AND cng.ngrams_type = ? -- NgramsTypeId
260 GROUP BY cng.node_id, ng.terms
265 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
266 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
267 WITH input_rows(terms) AS (?)
268 SELECT ng.terms, COUNT(cng.node_id) FROM context_node_ngrams cng
269 JOIN ngrams ng ON cng.ngrams_id = ng.id
270 JOIN input_rows ir ON ir.terms = ng.terms
271 JOIN nodes_nodes nn ON nn.node2_id = cng.node_id
272 JOIN nodes n ON nn.node2_id = n.id
273 WHERE nn.node1_id = ? -- CorpusId
274 AND n.typename = ? -- toDBid
275 AND cng.ngrams_type = ? -- NgramsTypeId
277 GROUP BY cng.node_id, ng.terms
280 ------------------------------------------------------------------------
281 getNodesByNgramsOnlyUser :: HasDBid NodeType
286 -> Cmd err (HashMap NgramsTerm (Set NodeId))
287 getNodesByNgramsOnlyUser cId ls nt ngs =
289 . map (HM.fromListWith (<>)
290 . map (second Set.singleton))
291 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
292 (splitEvery 1000 ngs)
295 getNgramsByNodeOnlyUser :: HasDBid NodeType
300 -> Cmd err (Map NodeId (Set NgramsTerm))
301 getNgramsByNodeOnlyUser cId ls nt ngs =
303 . map ( Map.fromListWith (<>)
304 . map (second Set.singleton)
307 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
308 (splitEvery 1000 ngs)
310 ------------------------------------------------------------------------
311 selectNgramsOnlyByNodeUser :: HasDBid NodeType
316 -> Cmd err [(NgramsTerm, NodeId)]
317 selectNgramsOnlyByNodeUser cId ls nt tms =
318 fmap (first NgramsTerm) <$>
319 runPGSQuery queryNgramsOnlyByNodeUser
320 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
321 , Values [QualifiedIdentifier Nothing "int4"]
322 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
324 , toDBid NodeDocument
328 fields = [QualifiedIdentifier Nothing "text"]
330 queryNgramsOnlyByNodeUser :: DPS.Query
331 queryNgramsOnlyByNodeUser = [sql|
332 WITH input_rows(terms) AS (?),
333 input_list(id) AS (?)
334 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
335 JOIN ngrams ng ON cng.ngrams_id = ng.id
336 JOIN input_rows ir ON ir.terms = ng.terms
337 JOIN input_list il ON il.id = cng.context_id
338 JOIN nodes_nodes nn ON nn.node2_id = cng.node_id
339 JOIN nodes n ON nn.node2_id = n.id
340 WHERE nn.node1_id = ? -- CorpusId
341 AND n.typename = ? -- toDBid
342 AND cng.ngrams_type = ? -- NgramsTypeId
344 GROUP BY ng.terms, cng.node_id
348 selectNgramsOnlyByNodeUser' :: HasDBid NodeType
353 -> Cmd err [(Text, Int)]
354 selectNgramsOnlyByNodeUser' cId ls nt tms =
355 runPGSQuery queryNgramsOnlyByNodeUser
356 ( Values fields (DPS.Only <$> tms)
357 , Values [QualifiedIdentifier Nothing "int4"]
358 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
360 , toDBid NodeDocument
364 fields = [QualifiedIdentifier Nothing "text"]
366 queryNgramsOnlyByNodeUser' :: DPS.Query
367 queryNgramsOnlyByNodeUser' = [sql|
368 WITH input_rows(terms) AS (?),
369 input_list(id) AS (?)
370 SELECT ng.terms, cng.weight FROM context_node_ngrams cng
371 JOIN ngrams ng ON cng.ngrams_id = ng.id
372 JOIN input_rows ir ON ir.terms = ng.terms
373 JOIN input_list il ON il.id = cng.node_id
374 WHERE cng.context_id = ? -- CorpusId
375 AND cng.ngrams_type = ? -- NgramsTypeId
376 -- AND nn.category > 0
377 GROUP BY ng.terms, cng.weight
381 getNgramsByDocOnlyUser :: DocId
385 -> Cmd err (HashMap NgramsTerm (Set NodeId))
386 getNgramsByDocOnlyUser cId ls nt ngs =
388 . map (HM.fromListWith (<>) . map (second Set.singleton))
389 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
392 selectNgramsOnlyByDocUser :: DocId
396 -> Cmd err [(NgramsTerm, NodeId)]
397 selectNgramsOnlyByDocUser dId ls nt tms =
398 fmap (first NgramsTerm) <$>
399 runPGSQuery queryNgramsOnlyByDocUser
400 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
401 , Values [QualifiedIdentifier Nothing "int4"]
402 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
407 fields = [QualifiedIdentifier Nothing "text"]
410 queryNgramsOnlyByDocUser :: DPS.Query
411 queryNgramsOnlyByDocUser = [sql|
412 WITH input_rows(terms) AS (?),
413 input_list(id) AS (?)
414 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
415 JOIN ngrams ng ON cng.ngrams_id = ng.id
416 JOIN input_rows ir ON ir.terms = ng.terms
417 JOIN input_list il ON il.id = cng.context_id
418 WHERE cng.node_id = ? -- DocId
419 AND cng.ngrams_type = ? -- NgramsTypeId
420 GROUP BY ng.terms, cng.node_id
423 ------------------------------------------------------------------------
424 -- | TODO filter by language, database, any social field
425 getNodesByNgramsMaster :: HasDBid NodeType
426 => UserCorpusId -> MasterCorpusId -> Cmd err (HashMap Text (Set NodeId))
427 getNodesByNgramsMaster ucId mcId = unionsWith (<>)
428 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
429 -- . takeWhile (not . List.null)
430 -- . takeWhile (\l -> List.length l > 3)
431 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
433 selectNgramsByNodeMaster :: HasDBid NodeType
438 -> Cmd err [(NodeId, Text)]
439 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
440 queryNgramsByNodeMaster'
442 , ngramsTypeId NgramsTerms
443 , toDBid NodeDocument
445 , toDBid NodeDocument
449 , toDBid NodeDocument
450 , ngramsTypeId NgramsTerms
453 -- | TODO fix context_node_ngrams relation
454 queryNgramsByNodeMaster' :: DPS.Query
455 queryNgramsByNodeMaster' = [sql|
456 WITH nodesByNgramsUser AS (
458 SELECT n.id, ng.terms FROM nodes n
459 JOIN nodes_nodes nn ON n.id = nn.node2_id
460 JOIN context_node_ngrams cng ON cng.node_id = n.id
461 JOIN ngrams ng ON cng.ngrams_id = ng.id
462 WHERE nn.node1_id = ? -- UserCorpusId
463 -- AND n.typename = ? -- toDBid
464 AND cng.ngrams_type = ? -- NgramsTypeId
466 AND node_pos(n.id,?) >= ?
467 AND node_pos(n.id,?) < ?
468 GROUP BY n.id, ng.terms
472 nodesByNgramsMaster AS (
474 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
475 JOIN context_node_ngrams cng ON n.id = cng.node_id
476 JOIN ngrams ng ON ng.id = cng.ngrams_id
478 WHERE n.parent_id = ? -- Master Corpus toDBid
479 AND n.typename = ? -- toDBid
480 AND cng.ngrams_type = ? -- NgramsTypeId
481 GROUP BY n.id, ng.terms
484 SELECT m.id, m.terms FROM nodesByNgramsMaster m
485 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id