2 Module : Gargantext.Database.Action.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByContext
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Control.Monad (void)
22 import Data.HashMap.Strict (HashMap)
23 import Data.Map.Strict (Map)
25 import Data.Text (Text)
26 import Data.Tuple.Extra (first, second, swap)
27 import Database.PostgreSQL.Simple.SqlQQ (sql)
28 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
29 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
30 import Gargantext.Core
31 import Gargantext.Data.HashMap.Strict.Utils as HM
32 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
33 import Gargantext.Database.Prelude (Cmd, runPGSQuery, execPGSQuery)
34 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
35 import Gargantext.Prelude
36 import qualified Data.HashMap.Strict as HM
37 import qualified Data.List as List
38 import qualified Data.Map.Strict as Map
39 import qualified Data.Set as Set
40 import qualified Database.PostgreSQL.Simple as DPS
41 import qualified Database.PostgreSQL.Simple.Types as DPST
43 -- | fst is size of Supra Corpus
44 -- snd is Texts and size of Occurrences (different docs)
--
-- Groups the terms of @m@ with @f@ (through 'groupContextsByNgramsWith') and
-- returns a pair:
--
--  * the number of distinct context ids found across all values of @m@,
--    converted to 'Double';
--  * for every group key, the number of distinct context ids of the group
--    (as 'Double') paired with the set of original terms merged into it.
46 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
47 -> HashMap NgramsTerm (Set ContextId)
48 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
49 countContextsByNgramsWith f m = (total, m')
-- size of the union of every context set in m
51 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
-- (terms, contexts) becomes (terms, count), then swapped to (count, terms)
52 m' = HM.map ( swap . second (fromIntegral . Set.size))
53 $ groupContextsByNgramsWith f m
-- | Merge terms that share the same image under @f'@.
--
-- Each result key is @f' t@; its value accumulates, with '<>', the set of
-- original terms @t@ mapped to that key and the union of their id sets.
--
-- NOTE(review): the list being mapped over is not visible here; presumably it
-- is the list of entries of @m''@ — confirm.
-- NOTE(review): the argument is typed with @Set NodeId@ and the result with
-- @Set ContextId@; presumably 'ContextId' is a synonym of 'NodeId' — confirm.
56 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
57 -> HashMap NgramsTerm (Set NodeId)
58 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
59 groupContextsByNgramsWith f' m'' =
60 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
63 ------------------------------------------------------------------------
-- | Build a map from each ngrams term to the set of context ids in which it
-- occurs, from the rows returned by 'selectNgramsByContextUser' for @cId@ and
-- @nt@. Rows sharing the same term have their id sets merged with '<>'.
64 getContextsByNgramsUser :: HasDBid NodeType
67 -> Cmd err (HashMap NgramsTerm (Set ContextId))
68 getContextsByNgramsUser cId nt =
-- each row (id, term text) becomes (NgramsTerm, singleton id set)
69 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
70 <$> selectNgramsByContextUser cId nt
-- | Run 'queryNgramsByContextUser' and return its rows as
-- (context id, term text) pairs.
--
-- NOTE(review): the parameter tuple is not visible here; according to the
-- inline comments of the query it expects, in order, a corpus id, a node type
-- id and an ngrams type id — confirm.
73 selectNgramsByContextUser :: HasDBid NodeType
76 -> Cmd err [(NodeId, Text)]
77 selectNgramsByContextUser cId' nt' =
78 runPGSQuery queryNgramsByContextUser
82 -- , 100 :: Int -- limit
83 -- , 0 :: Int -- offset
-- | SQL selecting the distinct (context_id, terms) pairs of
-- @context_node_ngrams@ for the contexts linked to one node through
-- @nodes_contexts@, filtered by context typename and ngrams type, and keeping
-- only links whose @category@ is greater than 0 (i.e. not in Trash, as the
-- inline comment says).
--
-- Placeholders, in order: node id (corpus), typename id, ngrams type id.
86 queryNgramsByContextUser :: DPS.Query
87 queryNgramsByContextUser = [sql|
88 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
89 JOIN ngrams ng ON cng.ngrams_id = ng.id
90 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
91 JOIN contexts c ON nc.context_id = c.id
92 WHERE nc.node_id = ? -- CorpusId
93 AND c.typename = ? -- toDBid
94 AND cng.ngrams_type = ? -- NgramsTypeId
95 AND nc.category > 0 -- is not in Trash
96 GROUP BY cng.context_id, ng.terms
100 ------------------------------------------------------------------------
-- | Occurrence count per ngrams term, computed on a sample of contexts.
--
-- Forwards all four arguments to
-- 'selectNgramsOccurrencesOnlyByContextUser_withSample' and sums the counts
-- of rows that carry the same term.
101 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
106 -> Cmd err (HashMap NgramsTerm Int)
107 getOccByNgramsOnlyFast_withSample cId int nt ngs =
108 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
-- | For corpus @cId@, list @lId@ and ngrams type @nt@, return a map from each
-- term to a list of context ids.
--
-- The local @run@ query yields one (term text, PostgreSQL int array) row per
-- term; every array element is wrapped in 'NodeId'. The query reads the
-- @context_node_ngrams_view@ relation and the @node_stories@ table, and builds
-- each array from the DISTINCT context ids matching the term.
--
-- NOTE(review): @context_node_ngrams_view@ is the materialized view refreshed
-- by 'refreshNgramsMaterialized'; results presumably reflect the state at the
-- last refresh — confirm.
111 getOccByNgramsOnlyFast :: CorpusId
114 -> Cmd err (HashMap NgramsTerm [ContextId])
115 getOccByNgramsOnlyFast cId lId nt = do
116 --HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
117 HM.fromList <$> map (\(t, ns) -> (NgramsTerm t, NodeId <$> DPST.fromPGArray ns)) <$> run cId lId nt
123 -> Cmd err [(Text, DPST.PGArray Int)]
124 run cId' lId' nt' = runPGSQuery query
132 WITH node_context_ids AS
133 (select context_id, ngrams_id
134 FROM context_node_ngrams_view
137 (select ngrams_id FROM node_stories
138 WHERE node_id = ? AND ngrams_type_id = ?
142 ARRAY ( SELECT DISTINCT context_id
143 FROM node_context_ids
144 WHERE ns.ngrams_id = node_context_ids.ngrams_id
148 JOIN ns ON ng.id = ns.ngrams_id
-- | Run 'queryNgramsOccurrencesOnlyByContextUser_withSample' and wrap the
-- term column of every returned row in 'NgramsTerm'.
--
-- At most the first 10000 terms of @tms@ are sent to the database, as a
-- VALUES list whose single column is typed @text@; the node type id of
-- 'NodeDocument' is passed as well.
--
-- NOTE(review): the other tuple fields are not visible here; the query's
-- inline comments name a corpus id and an ngrams type id — confirm.
152 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
157 -> Cmd err [(NgramsTerm, Int)]
158 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
159 fmap (first NgramsTerm) <$>
160 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
162 , toDBid NodeDocument
-- hard cap on the number of terms shipped in the VALUES list
164 , Values fields ((DPS.Only . unNgramsTerm) <$> (List.take 10000 tms))
169 fields = [QualifiedIdentifier Nothing "text"]
-- | SQL counting @context_node_ngrams@ rows per term, restricted to the terms
-- listed in @input_rows@, to one ngrams type, to contexts linked to one node
-- through @nodes_contexts@, and to contexts drawn by
-- @TABLESAMPLE SYSTEM_ROWS (?)@ in the @nodes_sample@ CTE.
--
-- Rows are grouped by (cng.node_id, ng.terms), so a single term may appear in
-- several rows; 'getOccByNgramsOnlyFast_withSample' adds them up.
--
-- NOTE(review): SYSTEM_ROWS is provided by the PostgreSQL tsm_system_rows
-- extension — confirm it is installed on the target database.
171 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
172 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
173 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
174 JOIN nodes_contexts nn ON n.id = nn.context_id
177 input_rows(terms) AS (?)
178 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
179 JOIN ngrams ng ON cng.ngrams_id = ng.id
180 JOIN input_rows ir ON ir.terms = ng.terms
181 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
182 JOIN nodes_sample n ON nn.context_id = n.id
183 WHERE nn.node_id = ? -- CorpusId
184 AND cng.ngrams_type = ? -- NgramsTypeId
186 GROUP BY cng.node_id, ng.terms
-- | Sampled occurrence counts per ngrams term, without an explicit term list.
--
-- Runs the primed query @queryNgramsOccurrencesOnlyByContextUser_withSample'@
-- (which restricts terms through @node_stories@) and wraps the term column of
-- every returned row in 'NgramsTerm'.
--
-- The unprimed query must not be used here: it contains an @input_rows@
-- placeholder expecting a VALUES list of terms, and this function takes no
-- terms (@cId@, @int@ and @nt@ only), so the parameters could never match.
189 selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
193 -> Cmd err [(NgramsTerm, Int)]
194 selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
195 fmap (first NgramsTerm) <$>
196 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample'
198 , toDBid NodeDocument
-- | Variant of the sampled occurrence query that takes no explicit list of
-- terms: terms are restricted by the join on @node_stories@ instead of an
-- @input_rows@ VALUES list.
--
-- Counts @context_node_ngrams@ rows per term for one ngrams type, for
-- contexts linked to one node through @nodes_contexts@ and drawn by
-- @TABLESAMPLE SYSTEM_ROWS (?)@ in the @contexts_sample@ CTE.
--
-- NOTE(review): the join on @node_stories@ shows no node or ngrams-type
-- restriction in the visible lines — confirm that is intended.
204 queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
205 queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
206 WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
207 JOIN nodes_contexts nc ON c.id = nc.context_id
210 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
211 JOIN ngrams ng ON cng.ngrams_id = ng.id
212 JOIN node_stories ns ON ns.ngrams_id = ng.id
213 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
214 JOIN contexts_sample c ON nc.context_id = c.id
215 WHERE nc.node_id = ? -- CorpusId
216 AND cng.ngrams_type = ? -- NgramsTypeId
221 ------------------------------------------------------------------------
-- | Map each of the given terms to the set of ids returned for it by
-- 'selectNgramsOnlyByContextUser'.
--
-- The terms @ngs@ are split into batches of 1000 ('splitEvery'); one query is
-- run per batch and each batch result is turned into a map from term to id
-- set (ids of the same term merged with '<>').
--
-- NOTE(review): the step that combines the per-batch maps into the final map
-- is not visible here; presumably a union that merges the sets — confirm.
222 getContextsByNgramsOnlyUser :: HasDBid NodeType
227 -> Cmd err (HashMap NgramsTerm (Set NodeId))
228 getContextsByNgramsOnlyUser cId ls nt ngs =
230 . map (HM.fromListWith (<>)
231 . map (second Set.singleton))
232 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
233 (splitEvery 1000 ngs)
-- | Map ids to the set of terms found for them, using the same batched
-- queries as 'getContextsByNgramsOnlyUser' (batches of 1000 terms, one
-- 'selectNgramsOnlyByContextUser' call per batch).
--
-- NOTE(review): the query rows are (term, id) pairs while the result is keyed
-- by id; the step that swaps the pairs and the step that combines the
-- per-batch maps are not visible here — confirm both.
235 getNgramsByContextOnlyUser :: HasDBid NodeType
240 -> Cmd err (Map NodeId (Set NgramsTerm))
241 getNgramsByContextOnlyUser cId ls nt ngs =
243 . map ( Map.fromListWith (<>)
244 . map (second Set.singleton)
247 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
248 (splitEvery 1000 ngs)
250 ------------------------------------------------------------------------
-- | Run 'queryNgramsOnlyByContextUser' for one batch of terms and wrap the
-- term column of every returned row in 'NgramsTerm'.
--
-- The terms @tms@ are sent as a VALUES list typed @text@, and the ids in @ls@
-- (unwrapped from 'NodeId') as a VALUES list typed @int4@; the node type id
-- of 'NodeDocument' is passed as well.
--
-- NOTE(review): the remaining tuple fields are not visible here; the query's
-- inline comments name a corpus id and an ngrams type id — confirm.
251 selectNgramsOnlyByContextUser :: HasDBid NodeType
256 -> Cmd err [(NgramsTerm, ContextId)]
257 selectNgramsOnlyByContextUser cId ls nt tms =
258 fmap (first NgramsTerm) <$>
259 runPGSQuery queryNgramsOnlyByContextUser
260 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
261 , Values [QualifiedIdentifier Nothing "int4"]
262 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
264 , toDBid NodeDocument
268 fields = [QualifiedIdentifier Nothing "text"]
-- | SQL returning the distinct (terms, context_id) pairs of
-- @context_node_ngrams@ for the terms listed in @input_rows@ and the ids
-- listed in @input_list@ (matched against @cng.node_id@), restricted to
-- contexts linked to one node through @nodes_contexts@, to one context
-- typename and to one ngrams type.
--
-- Placeholders, in order: VALUES list of terms, VALUES list of ids, node id
-- (corpus), typename id, ngrams type id.
270 queryNgramsOnlyByContextUser :: DPS.Query
271 queryNgramsOnlyByContextUser = [sql|
272 WITH input_rows(terms) AS (?),
273 input_list(id) AS (?)
274 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
275 JOIN ngrams ng ON cng.ngrams_id = ng.id
276 JOIN input_rows ir ON ir.terms = ng.terms
277 JOIN input_list il ON il.id = cng.node_id
278 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
279 JOIN contexts c ON nc.context_id = c.id
280 WHERE nc.node_id = ? -- CorpusId
281 AND c.typename = ? -- toDBid (maybe not useful with context table)
282 AND cng.ngrams_type = ? -- NgramsTypeId
284 GROUP BY ng.terms, cng.context_id
-- | Map each of the given terms to the set of ids returned for it by
-- 'selectNgramsOnlyByDocUser' for one document.
--
-- The terms @ngs@ are split into batches of 1000 ('splitEvery'); one query is
-- run per batch and each batch result is turned into a map from term to id
-- set (ids of the same term merged with '<>').
--
-- NOTE(review): the first parameter is typed 'DocId' although it is named
-- @cId@ here. The step that combines the per-batch maps is not visible;
-- presumably a union that merges the sets — confirm.
287 getNgramsByDocOnlyUser :: DocId
291 -> Cmd err (HashMap NgramsTerm (Set NodeId))
292 getNgramsByDocOnlyUser cId ls nt ngs =
294 . map (HM.fromListWith (<>) . map (second Set.singleton))
295 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
-- | Run 'queryNgramsOnlyByDocUser' for one batch of terms and wrap the term
-- column of every returned row in 'NgramsTerm'.
--
-- The terms @tms@ are sent as a VALUES list typed @text@, and the ids in @ls@
-- (unwrapped from 'NodeId') as a VALUES list typed @int4@.
--
-- NOTE(review): the remaining tuple fields are not visible here; the query's
-- inline comments name a document id and an ngrams type id — confirm.
298 selectNgramsOnlyByDocUser :: DocId
302 -> Cmd err [(NgramsTerm, NodeId)]
303 selectNgramsOnlyByDocUser dId ls nt tms =
304 fmap (first NgramsTerm) <$>
305 runPGSQuery queryNgramsOnlyByDocUser
306 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
307 , Values [QualifiedIdentifier Nothing "int4"]
308 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
313 fields = [QualifiedIdentifier Nothing "text"]
-- | SQL returning the distinct (terms, node_id) pairs of
-- @context_node_ngrams@ for the terms listed in @input_rows@, one ngrams type
-- and one value of @cng.node_id@, with the ids of @input_list@ matched
-- against @cng.context_id@.
--
-- Placeholders, in order: VALUES list of terms, VALUES list of ids, document
-- id, ngrams type id.
--
-- NOTE(review): here the list ids are joined on @cng.context_id@ and the
-- document id is compared with @cng.node_id@, whereas
-- 'queryNgramsOnlyByContextUser' joins its list ids on @cng.node_id@. The two
-- columns look swapped in this query — confirm against the schema.
316 queryNgramsOnlyByDocUser :: DPS.Query
317 queryNgramsOnlyByDocUser = [sql|
318 WITH input_rows(terms) AS (?),
319 input_list(id) AS (?)
320 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
321 JOIN ngrams ng ON cng.ngrams_id = ng.id
322 JOIN input_rows ir ON ir.terms = ng.terms
323 JOIN input_list il ON il.id = cng.context_id
324 WHERE cng.node_id = ? -- DocId
325 AND cng.ngrams_type = ? -- NgramsTypeId
326 GROUP BY ng.terms, cng.node_id
329 ------------------------------------------------------------------------
330 -- | TODO filter by language, database, any social field
--
-- Builds a map from term text to the set of ids returned by
-- 'selectNgramsByContextMaster'. The query is run once per value of
-- @[0,500..10000]@ (21 runs), always with 1000 as first argument; the rows of
-- each run are grouped by term and the per-run maps are merged with
-- @unionsWith (<>)@.
--
-- NOTE(review): the list steps by 500 while the first argument is 1000;
-- whether consecutive runs overlap depends on how the query uses these two
-- values — confirm.
331 getContextsByNgramsMaster :: HasDBid NodeType
334 -> Cmd err (HashMap Text (Set NodeId))
335 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
336 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
337 -- . takeWhile (not . List.null)
338 -- . takeWhile (\l -> List.length l > 3)
339 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
-- | Run @queryNgramsByContextMaster'@ and return its rows as (id, term text)
-- pairs.
--
-- The visible parameters pass the ngrams type id of 'NgramsTerms' twice and
-- the node type id of 'NodeDocument' three times.
--
-- NOTE(review): the tuple fields carrying @n@, @ucId@, @mcId@ and @p@ are not
-- visible here. The order of the whole tuple must follow the order of the
-- @?@ placeholders in the query — confirm, including the @?@ that sits inside
-- a commented-out SQL condition.
341 selectNgramsByContextMaster :: HasDBid NodeType
346 -> Cmd err [(NodeId, Text)]
347 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
348 queryNgramsByContextMaster'
350 , ngramsTypeId NgramsTerms
351 , toDBid NodeDocument
353 , toDBid NodeDocument
357 , toDBid NodeDocument
358 , ngramsTypeId NgramsTerms
361 -- | TODO fix context_node_ngrams relation
--
-- Query built from two CTEs:
--
--  * @contextsByNgramsUser@: distinct (context id, term) pairs for contexts
--    linked to one node through @nodes_contexts@, for one ngrams type and
--    within two @node_pos@ bounds;
--  * @contextsByNgramsMaster@: distinct (context id, term) pairs for contexts
--    drawn by @TABLESAMPLE SYSTEM_ROWS(?)@, with a given @parent_id@,
--    typename and ngrams type.
--
-- The final SELECT right-joins the master pairs onto the user pairs by
-- context id. It has to read from the CTE @contextsByNgramsMaster@: that is
-- the only master relation this query defines.
--
-- NOTE(review): postgresql-simple substitutes every @?@ of the template and
-- does not appear to skip SQL comments, so the @?@ in the commented-out
-- typename condition presumably consumes a parameter too — confirm the
-- parameter tuple of 'selectNgramsByContextMaster' accounts for it.
-- NOTE(review): with a RIGHT JOIN, user rows without a master match yield
-- NULL for @m.id@ and @m.terms@ — confirm the row decoder tolerates that.
362 queryNgramsByContextMaster' :: DPS.Query
363 queryNgramsByContextMaster' = [sql|
364 WITH contextsByNgramsUser AS (
366 SELECT n.id, ng.terms FROM contexts n
367 JOIN nodes_contexts nn ON n.id = nn.context_id
368 JOIN context_node_ngrams cng ON cng.context_id = n.id
369 JOIN ngrams ng ON cng.ngrams_id = ng.id
370 WHERE nn.node_id = ? -- UserCorpusId
371 -- AND n.typename = ? -- toDBid
372 AND cng.ngrams_type = ? -- NgramsTypeId
374 AND node_pos(n.id,?) >= ?
375 AND node_pos(n.id,?) < ?
376 GROUP BY n.id, ng.terms
380 contextsByNgramsMaster AS (
382 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
383 JOIN context_node_ngrams cng ON n.id = cng.context_id
384 JOIN ngrams ng ON ng.id = cng.ngrams_id
386 WHERE n.parent_id = ? -- Master Corpus toDBid
387 AND n.typename = ? -- toDBid
388 AND cng.ngrams_type = ? -- NgramsTypeId
389 GROUP BY n.id, ng.terms
392 SELECT m.id, m.terms FROM contextsByNgramsMaster m
393 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id
-- | Refresh the @context_node_ngrams_view@ materialized view by executing
-- 'refreshNgramsMaterializedQuery' with no parameters; the value returned by
-- the execution is discarded.
--
-- Meant to be run:
--
--  - at the reindex stage
--  - at the end of each text flow
refreshNgramsMaterialized :: Cmd err ()
refreshNgramsMaterialized =
  void (execPGSQuery refreshNgramsMaterializedQuery ())
-- | SQL statement that refreshes the @context_node_ngrams_view@ materialized
-- view. It takes no parameters.
405 refreshNgramsMaterializedQuery :: DPS.Query
406 refreshNgramsMaterializedQuery = [sql| refresh materialized view context_node_ngrams_view; |]