2 Module : Gargantext.Database.Action.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Grouping ngrams by node enables contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByContext
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Control.Monad (void)
22 import Data.HashMap.Strict (HashMap)
23 import Data.Map.Strict (Map)
25 import Data.Text (Text)
26 import Data.Tuple.Extra (first, second, swap)
27 import Database.PostgreSQL.Simple.SqlQQ (sql)
28 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
29 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
30 import Gargantext.Core
31 import Gargantext.Data.HashMap.Strict.Utils as HM
32 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
33 import Gargantext.Database.Prelude (Cmd, runPGSQuery, execPGSQuery)
34 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
35 import Gargantext.Prelude
36 import qualified Data.HashMap.Strict as HM
37 import qualified Data.List as List
38 import qualified Data.Map.Strict as Map
39 import qualified Data.Set as Set
40 import qualified Database.PostgreSQL.Simple as DPS
41 import qualified Database.PostgreSQL.Simple.Types as DPST
43 -- | Count the distinct contexts of every group of ngrams.
44 --
-- The first component of the result is the number of distinct contexts over
-- the whole input map (size of the union of all its sets). The second
-- component is derived from 'groupContextsByNgramsWith': the set of contexts
-- of each group is replaced by its size and the pair is swapped, so every
-- value reads (number of contexts, set of grouped terms).
46 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
47 -> HashMap NgramsTerm (Set ContextId)
48 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
49 countContextsByNgramsWith f m = (total, m')
51 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
52 m' = HM.map ( swap . second (fromIntegral . Set.size))
53 $ groupContextsByNgramsWith f m
-- | Regroup a term-to-contexts map under new keys.
--
-- Each (term, contexts) pair is re-keyed by applying the given function to the
-- term; its value keeps the original term as a singleton set next to the
-- contexts, and pairs landing on the same key are merged with '<>'.
-- NOTE(review): the list the 'map' is applied to is not visible in this
-- excerpt — presumably the entries of the input map; confirm.
56 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
57 -> HashMap NgramsTerm (Set NodeId)
58 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
59 groupContextsByNgramsWith f' m'' =
60 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
63 ------------------------------------------------------------------------
-- | Contexts indexed by ngrams term.
--
-- Turns the (context id, term text) rows returned by
-- 'selectNgramsByContextUser' into a map from 'NgramsTerm' to a set of context
-- ids; rows sharing a term have their sets merged with '<>'.
64 getContextsByNgramsUser :: HasDBid NodeType
67 -> Cmd err (HashMap NgramsTerm (Set ContextId))
68 getContextsByNgramsUser cId nt =
69 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
70 <$> selectNgramsByContextUser cId nt
-- | Run 'queryNgramsByContextUser' through 'runPGSQuery' and return its rows
-- as (node id, term text) pairs.
-- NOTE(review): the parameter tuple is not fully visible in this excerpt; the
-- commented-out entries below suggest a limit/offset that is currently unused.
73 selectNgramsByContextUser :: HasDBid NodeType
76 -> Cmd err [(NodeId, Text)]
77 selectNgramsByContextUser cId' nt' =
78 runPGSQuery queryNgramsByContextUser
82 -- , 100 :: Int -- limit
83 -- , 0 :: Int -- offset
-- | SQL listing the distinct (context_id, terms) pairs of
-- @context_node_ngrams@ for one corpus.
--
-- Visible positional parameters, in order: @nc.node_id@ (corpus id),
-- @c.typename@ and @cng.ngrams_type@. Only rows with @nc.category > 0@ are
-- kept (annotated "is not in Trash" in the query itself).
86 queryNgramsByContextUser :: DPS.Query
87 queryNgramsByContextUser = [sql|
88 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
89 JOIN ngrams ng ON cng.ngrams_id = ng.id
90 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
91 JOIN contexts c ON nc.context_id = c.id
92 WHERE nc.node_id = ? -- CorpusId
93 AND c.typename = ? -- toDBid
94 AND cng.ngrams_type = ? -- NgramsTypeId
95 AND nc.category > 0 -- is not in Trash
96 GROUP BY cng.context_id, ng.terms
100 ------------------------------------------------------------------------
-- | Occurrence counts per term.
--
-- Collects the (term, count) rows of
-- 'selectNgramsOccurrencesOnlyByContextUser_withSample' into a map, adding up
-- the counts of rows that share the same term.
101 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
106 -> Cmd err (HashMap NgramsTerm Int)
107 getOccByNgramsOnlyFast_withSample cId int nt ngs =
108 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
-- | For each term, the list of context ids aggregated by the local SQL query.
--
-- The query returns one row per term with @array_agg(DISTINCT context_id)@;
-- each array is unpacked with 'DPST.fromPGArray' and its elements are wrapped
-- in 'NodeId'. The ngrams considered are the ones selected from
-- @node_stories@ (filtered on @node_id@ and @ngrams_type_id@).
-- The trailing commented-out block appears to be an alternative formulation
-- of the same query, kept for reference.
111 getOccByNgramsOnlyFast :: CorpusId
114 -> Cmd err (HashMap NgramsTerm [ContextId])
115 getOccByNgramsOnlyFast cId lId nt = do
116 --HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
117 HM.fromList <$> map (\(t, ns) -> (NgramsTerm t, NodeId <$> DPST.fromPGArray ns)) <$> run cId lId nt
123 -> Cmd err [(Text, DPST.PGArray Int)]
124 run cId' lId' nt' = runPGSQuery query
133 ( SELECT DISTINCT context_node_ngrams.context_id,
134 context_node_ngrams.ngrams_id,
135 nodes_contexts.node_id
137 JOIN context_node_ngrams ON context_node_ngrams.context_id = nodes_contexts.context_id
138 ), node_context_ids AS
139 (SELECT context_id, ngrams_id, terms
141 JOIN ngrams ON context_node_ngrams_view.ngrams_id = ngrams.id
144 (SELECT ngrams_id FROM node_stories
145 WHERE node_id = ? AND ngrams_type_id = ?
148 SELECT terms, array_agg(DISTINCT context_id)
150 JOIN node_context_ids ON ns.ngrams_id = node_context_ids.ngrams_id
154 -- WITH node_context_ids AS
155 -- (select context_id, ngrams_id
156 -- FROM context_node_ngrams_view
159 -- (select ngrams_id FROM node_stories
160 -- WHERE node_id = ? AND ngrams_type_id = ?
164 -- ARRAY ( SELECT DISTINCT context_id
165 -- FROM node_context_ids
166 -- WHERE ns.ngrams_id = node_context_ids.ngrams_id
170 -- JOIN ns ON ng.id = ns.ngrams_id
-- | Run 'queryNgramsOccurrencesOnlyByContextUser_withSample' for the given
-- terms and wrap the text of every returned row in 'NgramsTerm'.
--
-- The terms are sent as a one-column @text@ 'Values' table; at most the first
-- 10000 of them are used ('List.take').
174 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
179 -> Cmd err [(NgramsTerm, Int)]
180 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
181 fmap (first NgramsTerm) <$>
182 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
184 , toDBid NodeDocument
186 , Values fields ((DPS.Only . unNgramsTerm) <$> (List.take 10000 tms))
191 fields = [QualifiedIdentifier Nothing "text"]
-- | SQL counting @context_id@ occurrences per term over a sample of contexts.
--
-- The @nodes_sample@ CTE draws rows from @contexts@ with
-- @TABLESAMPLE SYSTEM_ROWS (?)@ and the @input_rows@ CTE receives the terms to
-- look up. Rows are grouped by @cng.node_id, ng.terms@, so the same term can
-- come back on several rows.
193 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
194 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
195 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
196 JOIN nodes_contexts nn ON n.id = nn.context_id
199 input_rows(terms) AS (?)
200 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
201 JOIN ngrams ng ON cng.ngrams_id = ng.id
202 JOIN input_rows ir ON ir.terms = ng.terms
203 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
204 JOIN nodes_sample n ON nn.context_id = n.id
205 WHERE nn.node_id = ? -- CorpusId
206 AND cng.ngrams_type = ? -- NgramsTypeId
208 GROUP BY cng.node_id, ng.terms
-- | Variant of 'selectNgramsOccurrencesOnlyByContextUser_withSample' that
-- takes no explicit list of terms.
--
-- Runs 'queryNgramsOccurrencesOnlyByContextUser_withSample'' and wraps the
-- text of every returned (term, count) row in 'NgramsTerm'.
211 selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
215 -> Cmd err [(NgramsTerm, Int)]
216 selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
217 fmap (first NgramsTerm) <$>
-- The primed query is the one to use here: the unprimed query expects a list
-- of terms for its @input_rows@ CTE, which this function does not receive.
218 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample'
220 , toDBid NodeDocument
-- | SQL counting @context_id@ occurrences per term over a sample of contexts,
-- restricted to ngrams that have a row in @node_stories@.
--
-- The @contexts_sample@ CTE draws rows from @contexts@ with
-- @TABLESAMPLE SYSTEM_ROWS (?)@. Unlike the unprimed query, no @input_rows@
-- CTE is visible in this excerpt.
226 queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
227 queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
228 WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
229 JOIN nodes_contexts nc ON c.id = nc.context_id
232 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
233 JOIN ngrams ng ON cng.ngrams_id = ng.id
234 JOIN node_stories ns ON ns.ngrams_id = ng.id
235 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
236 JOIN contexts_sample c ON nc.context_id = c.id
237 WHERE nc.node_id = ? -- CorpusId
238 AND cng.ngrams_type = ? -- NgramsTypeId
243 ------------------------------------------------------------------------
-- | Contexts per ngrams term, for the given terms only.
--
-- The terms are processed in chunks of 1000 ('splitEvery'): each chunk is
-- looked up with 'selectNgramsOnlyByContextUser' and its (term, id) rows are
-- collected into a map of sets merged with '<>'.
-- NOTE(review): the step that combines the per-chunk maps is not visible in
-- this excerpt.
244 getContextsByNgramsOnlyUser :: HasDBid NodeType
249 -> Cmd err (HashMap NgramsTerm (Set NodeId))
250 getContextsByNgramsOnlyUser cId ls nt ngs =
252 . map (HM.fromListWith (<>)
253 . map (second Set.singleton))
254 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
255 (splitEvery 1000 ngs)
-- | Ngrams terms per node id, for the given terms only.
--
-- The terms are processed in chunks of 1000 ('splitEvery'), each chunk being
-- looked up with 'selectNgramsOnlyByContextUser'; the rows of a chunk are
-- collected into a 'Map' of sets merged with '<>'.
-- NOTE(review): part of the per-row transformation and the step combining the
-- per-chunk maps are not visible in this excerpt.
257 getNgramsByContextOnlyUser :: HasDBid NodeType
262 -> Cmd err (Map NodeId (Set NgramsTerm))
263 getNgramsByContextOnlyUser cId ls nt ngs =
265 . map ( Map.fromListWith (<>)
266 . map (second Set.singleton)
269 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
270 (splitEvery 1000 ngs)
272 ------------------------------------------------------------------------
-- | Run 'queryNgramsOnlyByContextUser' and wrap the text of every returned row
-- in 'NgramsTerm'.
--
-- The terms are sent as a one-column @text@ 'Values' table and the ids in
-- @ls@ as a one-column @int4@ 'Values' table. The remaining parameters are
-- not all visible in this excerpt.
273 selectNgramsOnlyByContextUser :: HasDBid NodeType
278 -> Cmd err [(NgramsTerm, ContextId)]
279 selectNgramsOnlyByContextUser cId ls nt tms =
280 fmap (first NgramsTerm) <$>
281 runPGSQuery queryNgramsOnlyByContextUser
282 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
283 , Values [QualifiedIdentifier Nothing "int4"]
284 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
286 , toDBid NodeDocument
290 fields = [QualifiedIdentifier Nothing "text"]
-- | SQL returning the distinct (terms, context_id) pairs of
-- @context_node_ngrams@ for a set of terms and a set of ids.
--
-- @input_rows@ receives the terms and @input_list@ the ids, which are matched
-- against @cng.node_id@. The visible filters are @nc.node_id@ (corpus id),
-- @c.typename@ and @cng.ngrams_type@.
292 queryNgramsOnlyByContextUser :: DPS.Query
293 queryNgramsOnlyByContextUser = [sql|
294 WITH input_rows(terms) AS (?),
295 input_list(id) AS (?)
296 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
297 JOIN ngrams ng ON cng.ngrams_id = ng.id
298 JOIN input_rows ir ON ir.terms = ng.terms
299 JOIN input_list il ON il.id = cng.node_id
300 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
301 JOIN contexts c ON nc.context_id = c.id
302 WHERE nc.node_id = ? -- CorpusId
303 AND c.typename = ? -- toDBid (maybe not useful with context table)
304 AND cng.ngrams_type = ? -- NgramsTypeId
306 GROUP BY ng.terms, cng.context_id
-- | Node ids per ngrams term for a single document, for the given terms only.
--
-- The terms are processed in chunks of 1000 ('splitEvery'): each chunk is
-- looked up with 'selectNgramsOnlyByDocUser' and its (term, node id) rows are
-- collected into a map of sets merged with '<>'.
-- NOTE(review): the step that combines the per-chunk maps is not visible in
-- this excerpt.
309 getNgramsByDocOnlyUser :: DocId
313 -> Cmd err (HashMap NgramsTerm (Set NodeId))
314 getNgramsByDocOnlyUser cId ls nt ngs =
316 . map (HM.fromListWith (<>) . map (second Set.singleton))
317 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
-- | Run 'queryNgramsOnlyByDocUser' and wrap the text of every returned row in
-- 'NgramsTerm'.
--
-- The terms are sent as a one-column @text@ 'Values' table and the ids in
-- @ls@ as a one-column @int4@ 'Values' table. The remaining parameters are
-- not visible in this excerpt.
320 selectNgramsOnlyByDocUser :: DocId
324 -> Cmd err [(NgramsTerm, NodeId)]
325 selectNgramsOnlyByDocUser dId ls nt tms =
326 fmap (first NgramsTerm) <$>
327 runPGSQuery queryNgramsOnlyByDocUser
328 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
329 , Values [QualifiedIdentifier Nothing "int4"]
330 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
335 fields = [QualifiedIdentifier Nothing "text"]
-- | SQL returning the distinct (terms, node_id) pairs of
-- @context_node_ngrams@ for a set of terms, a set of ids and one document.
--
-- @input_rows@ receives the terms and @input_list@ the ids.
-- NOTE(review): here the ids of @input_list@ are matched against
-- @cng.context_id@ and the document id against @cng.node_id@, whereas
-- 'queryNgramsOnlyByContextUser' matches its @input_list@ against
-- @cng.node_id@ — confirm the two columns are not swapped.
338 queryNgramsOnlyByDocUser :: DPS.Query
339 queryNgramsOnlyByDocUser = [sql|
340 WITH input_rows(terms) AS (?),
341 input_list(id) AS (?)
342 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
343 JOIN ngrams ng ON cng.ngrams_id = ng.id
344 JOIN input_rows ir ON ir.terms = ng.terms
345 JOIN input_list il ON il.id = cng.context_id
346 WHERE cng.node_id = ? -- DocId
347 AND cng.ngrams_type = ? -- NgramsTypeId
348 GROUP BY ng.terms, cng.node_id
351 ------------------------------------------------------------------------
352 -- | TODO filter by language, database, any social field
--
-- Node ids indexed by term text, built from 'selectNgramsByContextMaster'.
-- The selection is run once for every value of @[0,500..10000]@, with 1000 as
-- its first argument; the (id, term) rows of each run are collected into a
-- map of sets and the per-run maps are merged with 'unionsWith'.
353 getContextsByNgramsMaster :: HasDBid NodeType
356 -> Cmd err (HashMap Text (Set NodeId))
357 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
358 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
359 -- . takeWhile (not . List.null)
360 -- . takeWhile (\l -> List.length l > 3)
361 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
-- | Run 'queryNgramsByContextMaster'' through 'runPGSQuery' and return its
-- rows as (node id, term text) pairs.
-- NOTE(review): only part of the parameter tuple is visible in this excerpt;
-- its order must follow the positional parameters of the query.
363 selectNgramsByContextMaster :: HasDBid NodeType
368 -> Cmd err [(NodeId, Text)]
369 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
370 queryNgramsByContextMaster'
372 , ngramsTypeId NgramsTerms
373 , toDBid NodeDocument
375 , toDBid NodeDocument
379 , toDBid NodeDocument
380 , ngramsTypeId NgramsTerms
383 -- | SQL matching the ngrams of user-corpus contexts against a sample of
-- master-corpus contexts.
--
-- Two CTEs are built. @contextsByNgramsUser@ lists the (id, terms) pairs of
-- the contexts attached to the user corpus, filtered on the ngrams type and on
-- a @node_pos@ range. @contextsByNgramsMaster@ lists the (id, terms) pairs of
-- contexts drawn with @TABLESAMPLE SYSTEM_ROWS(?)@, filtered on @parent_id@,
-- @typename@ and the ngrams type. The final SELECT right-joins the master CTE
-- to the user CTE on @id@.
--
-- The final SELECT reads from the @contextsByNgramsMaster@ CTE declared in
-- this query (the name @nodesByNgramsMaster@ is not defined anywhere in it).
--
-- TODO fix context_node_ngrams relation
384 queryNgramsByContextMaster' :: DPS.Query
385 queryNgramsByContextMaster' = [sql|
386 WITH contextsByNgramsUser AS (
388 SELECT n.id, ng.terms FROM contexts n
389 JOIN nodes_contexts nn ON n.id = nn.context_id
390 JOIN context_node_ngrams cng ON cng.context_id = n.id
391 JOIN ngrams ng ON cng.ngrams_id = ng.id
392 WHERE nn.node_id = ? -- UserCorpusId
393 -- AND n.typename = ? -- toDBid
394 AND cng.ngrams_type = ? -- NgramsTypeId
396 AND node_pos(n.id,?) >= ?
397 AND node_pos(n.id,?) < ?
398 GROUP BY n.id, ng.terms
402 contextsByNgramsMaster AS (
404 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
405 JOIN context_node_ngrams cng ON n.id = cng.context_id
406 JOIN ngrams ng ON ng.id = cng.ngrams_id
408 WHERE n.parent_id = ? -- Master Corpus toDBid
409 AND n.typename = ? -- toDBid
410 AND cng.ngrams_type = ? -- NgramsTypeId
411 GROUP BY n.id, ng.terms
414 SELECT m.id, m.terms FROM contextsByNgramsMaster m
415 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id
418 -- | Refresh the @context_node_ngrams_view@ materialized view by executing
419 -- 'refreshNgramsMaterializedQuery'; the value returned by the execution is
-- discarded. Meant to be run:
421 -- - at the reindex stage
422 -- - at the end of each text flow
424 refreshNgramsMaterialized :: Cmd err ()
425 refreshNgramsMaterialized = void (execPGSQuery refreshNgramsMaterializedQuery ())
-- | SQL statement refreshing the @context_node_ngrams_view@ materialized view
-- with the @CONCURRENTLY@ option.
427 refreshNgramsMaterializedQuery :: DPS.Query
428 refreshNgramsMaterializedQuery = [sql| REFRESH MATERIALIZED VIEW CONCURRENTLY context_node_ngrams_view; |]