Module      : Gargantext.Database.Action.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByContext
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 -- import Control.Monad (void)
22 import Data.HashMap.Strict (HashMap)
23 import Data.Map.Strict (Map)
25 import Data.Text (Text)
26 import Data.Tuple.Extra (first, second, swap)
27 import Database.PostgreSQL.Simple.SqlQQ (sql)
28 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
29 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
30 import Gargantext.Core
31 import Gargantext.Data.HashMap.Strict.Utils as HM
32 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
33 import Gargantext.Database.Prelude (Cmd, runPGSQuery) -- , execPGSQuery)
34 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
35 import Gargantext.Prelude
36 import qualified Data.HashMap.Strict as HM
37 import qualified Data.List as List
38 import qualified Data.Map.Strict as Map
39 import qualified Data.Set as Set
40 import qualified Database.PostgreSQL.Simple as DPS
41 import qualified Database.PostgreSQL.Simple.Types as DPST
-- | Count contexts per group of ngrams.
--
-- The first component of the result is the number of distinct contexts found
-- across every entry of the input map.  The second component maps each group
-- key (the image of a term under @f@) to the number of distinct contexts of
-- that group, paired with the set of original terms merged into the group.
countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
                          -> HashMap NgramsTerm (Set ContextId)
                          -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
countContextsByNgramsWith f m = (corpusSize, perGroup)
  where
    -- Union first so that a context shared by several terms is counted once.
    allContexts = Set.unions (HM.elems m)
    corpusSize  = fromIntegral (Set.size allContexts)

    perGroup = HM.map summarise (groupContextsByNgramsWith f m)

    -- (terms of the group, contexts of the group) -> (context count, terms)
    summarise (terms, contexts) = (fromIntegral (Set.size contexts), terms)
56 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
57 -> HashMap NgramsTerm (Set NodeId)
58 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
59 groupContextsByNgramsWith f' m'' =
60 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
63 ------------------------------------------------------------------------
64 getContextsByNgramsUser :: HasDBid NodeType
67 -> Cmd err (HashMap NgramsTerm (Set ContextId))
-- Index the (context, term) rows selected for the corpus by term: every
-- 'NgramsTerm' is mapped to the set of contexts it was returned with.
getContextsByNgramsUser cId nt =
  indexByTerm <$> selectNgramsByContextUser cId nt
  where
    -- Rows sharing a term have their singleton context sets merged.
    indexByTerm rows =
      HM.fromListWith (<>)
        [ (NgramsTerm term, Set.singleton ctx) | (ctx, term) <- rows ]
73 selectNgramsByContextUser :: HasDBid NodeType
76 -> Cmd err [(NodeId, Text)]
77 selectNgramsByContextUser cId' nt' =
78 runPGSQuery queryNgramsByContextUser
82 -- , 100 :: Int -- limit
83 -- , 0 :: Int -- offset
86 queryNgramsByContextUser :: DPS.Query
87 queryNgramsByContextUser = [sql|
88 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
89 JOIN ngrams ng ON cng.ngrams_id = ng.id
90 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
91 JOIN contexts c ON nc.context_id = c.id
92 WHERE nc.node_id = ? -- CorpusId
93 AND c.typename = ? -- toDBid
94 AND cng.ngrams_type = ? -- NgramsTypeId
95 AND nc.category > 0 -- is not in Trash
96 GROUP BY cng.context_id, ng.terms
100 ------------------------------------------------------------------------
101 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
106 -> Cmd err (HashMap NgramsTerm Int)
-- Fold the sampled (term, count) rows into a map; counts of rows that share
-- the same term are added together.
getOccByNgramsOnlyFast_withSample cId int nt ngs =
  fmap (HM.fromListWith (+))
       (selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs)
111 getOccByNgramsOnlyFast :: CorpusId
114 -> Cmd err (HashMap NgramsTerm [ContextId])
115 getOccByNgramsOnlyFast cId lId nt = do
116 --HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
117 HM.fromList <$> map (\(t, ns) -> (NgramsTerm t, NodeId <$> DPST.fromPGArray ns)) <$> run cId lId nt
123 -> Cmd err [(Text, DPST.PGArray Int)]
124 run cId' lId' nt' = runPGSQuery query
133 ( SELECT DISTINCT context_node_ngrams.context_id,
134 context_node_ngrams.ngrams_id,
135 nodes_contexts.node_id
137 JOIN context_node_ngrams ON context_node_ngrams.context_id = nodes_contexts.context_id
140 (SELECT context_id, ngrams_id, terms
142 JOIN ngrams ON cnnv.ngrams_id = ngrams.id
146 (SELECT ngrams_id, terms, array_agg(DISTINCT context_id) AS agg
147 FROM node_context_ids
148 GROUP BY (ngrams_id, terms)),
150 (SELECT ngrams_id, terms
152 JOIN ngrams ON ngrams_id = ngrams.id
153 WHERE node_id = ? AND ngrams_type_id = ?
156 SELECT ns.terms, CASE WHEN agg IS NULL THEN '{}' ELSE agg END
158 LEFT JOIN ncids_agg ON ns.ngrams_id = ncids_agg.ngrams_id
161 -- WITH node_context_ids AS
162 -- (select context_id, ngrams_id
163 -- FROM context_node_ngrams_view
166 -- (select ngrams_id FROM node_stories
167 -- WHERE node_id = ? AND ngrams_type_id = ?
171 -- ARRAY ( SELECT DISTINCT context_id
172 -- FROM node_context_ids
173 -- WHERE ns.ngrams_id = node_context_ids.ngrams_id
177 -- JOIN ns ON ng.id = ns.ngrams_id
181 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
186 -> Cmd err [(NgramsTerm, Int)]
187 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
188 fmap (first NgramsTerm) <$>
189 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
191 , toDBid NodeDocument
193 , Values fields ((DPS.Only . unNgramsTerm) <$> (List.take 10000 tms))
198 fields = [QualifiedIdentifier Nothing "text"]
200 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
201 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
202 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
203 JOIN nodes_contexts nn ON n.id = nn.context_id
206 input_rows(terms) AS (?)
207 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
208 JOIN ngrams ng ON cng.ngrams_id = ng.id
209 JOIN input_rows ir ON ir.terms = ng.terms
210 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
211 JOIN nodes_sample n ON nn.context_id = n.id
212 WHERE nn.node_id = ? -- CorpusId
213 AND cng.ngrams_type = ? -- NgramsTypeId
215 GROUP BY cng.node_id, ng.terms
218 selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
222 -> Cmd err [(NgramsTerm, Int)]
-- Sampled occurrence counts without an explicit list of terms.
--
-- This variant has to run the primed query: the unprimed query additionally
-- declares an @input_rows(terms) AS (?)@ parameter, and this function receives
-- no terms to fill it with, so the placeholders and the supplied parameters
-- would not line up.
selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
  fmap (first NgramsTerm) <$>
  runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample'
227 , toDBid NodeDocument
233 queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
234 queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
235 WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
236 JOIN nodes_contexts nc ON c.id = nc.context_id
239 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
240 JOIN ngrams ng ON cng.ngrams_id = ng.id
241 JOIN node_stories ns ON ns.ngrams_id = ng.id
242 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
243 JOIN contexts_sample c ON nc.context_id = c.id
244 WHERE nc.node_id = ? -- CorpusId
245 AND cng.ngrams_type = ? -- NgramsTypeId
250 ------------------------------------------------------------------------
251 getContextsByNgramsOnlyUser :: HasDBid NodeType
256 -> Cmd err (HashMap NgramsTerm (Set NodeId))
257 getContextsByNgramsOnlyUser cId ls nt ngs =
259 . map (HM.fromListWith (<>)
260 . map (second Set.singleton))
261 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
262 (splitEvery 1000 ngs)
264 getNgramsByContextOnlyUser :: HasDBid NodeType
269 -> Cmd err (Map NodeId (Set NgramsTerm))
270 getNgramsByContextOnlyUser cId ls nt ngs =
272 . map ( Map.fromListWith (<>)
273 . map (second Set.singleton)
276 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
277 (splitEvery 1000 ngs)
279 ------------------------------------------------------------------------
280 selectNgramsOnlyByContextUser :: HasDBid NodeType
285 -> Cmd err [(NgramsTerm, ContextId)]
286 selectNgramsOnlyByContextUser cId ls nt tms =
287 fmap (first NgramsTerm) <$>
288 runPGSQuery queryNgramsOnlyByContextUser
289 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
290 , Values [QualifiedIdentifier Nothing "int4"]
291 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
293 , toDBid NodeDocument
297 fields = [QualifiedIdentifier Nothing "text"]
299 queryNgramsOnlyByContextUser :: DPS.Query
300 queryNgramsOnlyByContextUser = [sql|
301 WITH input_rows(terms) AS (?),
302 input_list(id) AS (?)
303 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
304 JOIN ngrams ng ON cng.ngrams_id = ng.id
305 JOIN input_rows ir ON ir.terms = ng.terms
306 JOIN input_list il ON il.id = cng.node_id
307 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
308 JOIN contexts c ON nc.context_id = c.id
309 WHERE nc.node_id = ? -- CorpusId
310 AND c.typename = ? -- toDBid (maybe not useful with context table)
311 AND cng.ngrams_type = ? -- NgramsTypeId
313 GROUP BY ng.terms, cng.context_id
316 getNgramsByDocOnlyUser :: DocId
320 -> Cmd err (HashMap NgramsTerm (Set NodeId))
321 getNgramsByDocOnlyUser cId ls nt ngs =
323 . map (HM.fromListWith (<>) . map (second Set.singleton))
324 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
327 selectNgramsOnlyByDocUser :: DocId
331 -> Cmd err [(NgramsTerm, NodeId)]
332 selectNgramsOnlyByDocUser dId ls nt tms =
333 fmap (first NgramsTerm) <$>
334 runPGSQuery queryNgramsOnlyByDocUser
335 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
336 , Values [QualifiedIdentifier Nothing "int4"]
337 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
342 fields = [QualifiedIdentifier Nothing "text"]
345 queryNgramsOnlyByDocUser :: DPS.Query
346 queryNgramsOnlyByDocUser = [sql|
347 WITH input_rows(terms) AS (?),
348 input_list(id) AS (?)
349 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
350 JOIN ngrams ng ON cng.ngrams_id = ng.id
351 JOIN input_rows ir ON ir.terms = ng.terms
352 JOIN input_list il ON il.id = cng.context_id
353 WHERE cng.node_id = ? -- DocId
354 AND cng.ngrams_type = ? -- NgramsTypeId
355 GROUP BY ng.terms, cng.node_id
358 ------------------------------------------------------------------------
359 -- | TODO filter by language, database, any social field
360 getContextsByNgramsMaster :: HasDBid NodeType
363 -> Cmd err (HashMap Text (Set NodeId))
-- Fetch the master ngrams page by page (offsets 0, 500 .. 10000, each request
-- asking for 1000) and merge all pages into a single term -> contexts map.
getContextsByNgramsMaster ucId mcId =
  mergePages <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
  where
    -- NOTE: every page is merged, empty or not; an early cut-off such as
    -- @takeWhile (not . List.null)@ is deliberately left disabled here.
    mergePages pages = unionsWith (<>) (map pageToMap pages)

    -- One page of (context, term) rows indexed by term.
    pageToMap rows =
      HM.fromListWith (<>) [ (t, Set.singleton n) | (n, t) <- rows ]
370 selectNgramsByContextMaster :: HasDBid NodeType
375 -> Cmd err [(NodeId, Text)]
376 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
377 queryNgramsByContextMaster'
379 , ngramsTypeId NgramsTerms
380 , toDBid NodeDocument
382 , toDBid NodeDocument
386 , toDBid NodeDocument
387 , ngramsTypeId NgramsTerms
390 -- | TODO fix context_node_ngrams relation
391 queryNgramsByContextMaster' :: DPS.Query
392 queryNgramsByContextMaster' = [sql|
393 WITH contextsByNgramsUser AS (
395 SELECT n.id, ng.terms FROM contexts n
396 JOIN nodes_contexts nn ON n.id = nn.context_id
397 JOIN context_node_ngrams cng ON cng.context_id = n.id
398 JOIN ngrams ng ON cng.ngrams_id = ng.id
399 WHERE nn.node_id = ? -- UserCorpusId
400 -- AND n.typename = ? -- toDBid
401 AND cng.ngrams_type = ? -- NgramsTypeId
403 AND node_pos(n.id,?) >= ?
404 AND node_pos(n.id,?) < ?
405 GROUP BY n.id, ng.terms
409 contextsByNgramsMaster AS (
411 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
412 JOIN context_node_ngrams cng ON n.id = cng.context_id
413 JOIN ngrams ng ON ng.id = cng.ngrams_id
415 WHERE n.parent_id = ? -- Master Corpus toDBid
416 AND n.typename = ? -- toDBid
417 AND cng.ngrams_type = ? -- NgramsTypeId
418 GROUP BY n.id, ng.terms
-- m must range over the contextsByNgramsMaster CTE declared in the WITH clause
SELECT m.id, m.terms FROM contextsByNgramsMaster m
  RIGHT JOIN contextsByNgramsUser u ON u.id = m.id
-- | Refreshes the "context_node_ngrams_view" materialized view.
426 -- This function will be run :
428 -- - at reindex stage
429 -- - at the end of each text flow
431 -- refreshNgramsMaterialized :: Cmd err ()
432 -- refreshNgramsMaterialized = void $ execPGSQuery refreshNgramsMaterializedQuery ()
434 -- refreshNgramsMaterializedQuery :: DPS.Query
435 -- refreshNgramsMaterializedQuery =
436 -- [sql| REFRESH MATERIALIZED VIEW CONCURRENTLY context_node_ngrams_view; |]