]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
[ngrams] try terms query without the materialized view
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByContext.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByContext
17 where
18
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Control.Monad (void)
22 import Data.HashMap.Strict (HashMap)
23 import Data.Map.Strict (Map)
24 import Data.Set (Set)
25 import Data.Text (Text)
26 import Data.Tuple.Extra (first, second, swap)
27 import Database.PostgreSQL.Simple.SqlQQ (sql)
28 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
29 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
30 import Gargantext.Core
31 import Gargantext.Data.HashMap.Strict.Utils as HM
32 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
33 import Gargantext.Database.Prelude (Cmd, runPGSQuery, execPGSQuery)
34 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
35 import Gargantext.Prelude
36 import qualified Data.HashMap.Strict as HM
37 import qualified Data.List as List
38 import qualified Data.Map.Strict as Map
39 import qualified Data.Set as Set
40 import qualified Database.PostgreSQL.Simple as DPS
41 import qualified Database.PostgreSQL.Simple.Types as DPST
42
43 -- | fst is size of Supra Corpus
44 -- snd is Texts and size of Occurrences (different docs)
45
46 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
47 -> HashMap NgramsTerm (Set ContextId)
48 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
49 countContextsByNgramsWith f m = (total, m')
50 where
51 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
52 m' = HM.map ( swap . second (fromIntegral . Set.size))
53 $ groupContextsByNgramsWith f m
54
55
56 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
57 -> HashMap NgramsTerm (Set NodeId)
58 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
59 groupContextsByNgramsWith f' m'' =
60 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
61 $ HM.toList m''
62
63 ------------------------------------------------------------------------
64 getContextsByNgramsUser :: HasDBid NodeType
65 => CorpusId
66 -> NgramsType
67 -> Cmd err (HashMap NgramsTerm (Set ContextId))
68 getContextsByNgramsUser cId nt =
69 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
70 <$> selectNgramsByContextUser cId nt
71 where
72
73 selectNgramsByContextUser :: HasDBid NodeType
74 => CorpusId
75 -> NgramsType
76 -> Cmd err [(NodeId, Text)]
77 selectNgramsByContextUser cId' nt' =
78 runPGSQuery queryNgramsByContextUser
79 ( cId'
80 , toDBid NodeDocument
81 , ngramsTypeId nt'
82 -- , 100 :: Int -- limit
83 -- , 0 :: Int -- offset
84 )
85
86 queryNgramsByContextUser :: DPS.Query
87 queryNgramsByContextUser = [sql|
88 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
89 JOIN ngrams ng ON cng.ngrams_id = ng.id
90 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
91 JOIN contexts c ON nc.context_id = c.id
92 WHERE nc.node_id = ? -- CorpusId
93 AND c.typename = ? -- toDBid
94 AND cng.ngrams_type = ? -- NgramsTypeId
95 AND nc.category > 0 -- is not in Trash
96 GROUP BY cng.context_id, ng.terms
97 |]
98
99
100 ------------------------------------------------------------------------
101 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
102 => CorpusId
103 -> Int
104 -> NgramsType
105 -> [NgramsTerm]
106 -> Cmd err (HashMap NgramsTerm Int)
107 getOccByNgramsOnlyFast_withSample cId int nt ngs =
108 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
109
110
111 getOccByNgramsOnlyFast :: CorpusId
112 -> ListId
113 -> NgramsType
114 -> Cmd err (HashMap NgramsTerm [ContextId])
115 getOccByNgramsOnlyFast cId lId nt = do
116 --HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
117 HM.fromList <$> map (\(t, ns) -> (NgramsTerm t, NodeId <$> DPST.fromPGArray ns)) <$> run cId lId nt
118 where
119
120 run :: CorpusId
121 -> ListId
122 -> NgramsType
123 -> Cmd err [(Text, DPST.PGArray Int)]
124 run cId' lId' nt' = runPGSQuery query
125 ( cId'
126 , lId'
127 , ngramsTypeId nt'
128 )
129
130 query :: DPS.Query
131 query = [sql|
132 WITH cnnv AS
133 ( SELECT DISTINCT context_node_ngrams.context_id,
134 context_node_ngrams.ngrams_id,
135 nodes_contexts.node_id
136 FROM nodes_contexts
137 JOIN context_node_ngrams ON context_node_ngrams.context_id = nodes_contexts.context_id
138 ), node_context_ids AS
139 (SELECT context_id, ngrams_id, terms
140 FROM cnnv
141 JOIN ngrams ON context_node_ngrams_view.ngrams_id = ngrams.id
142 WHERE node_id = ?
143 ), ns AS
144 (SELECT ngrams_id FROM node_stories
145 WHERE node_id = ? AND ngrams_type_id = ?
146 )
147
148 SELECT terms, array_agg(DISTINCT context_id)
149 FROM ns
150 JOIN node_context_ids ON ns.ngrams_id = node_context_ids.ngrams_id
151 GROUP BY terms
152 |]
153 -- query = [sql|
154 -- WITH node_context_ids AS
155 -- (select context_id, ngrams_id
156 -- FROM context_node_ngrams_view
157 -- WHERE node_id = ?
158 -- ), ns AS
159 -- (select ngrams_id FROM node_stories
160 -- WHERE node_id = ? AND ngrams_type_id = ?
161 -- )
162
163 -- SELECT ng.terms,
164 -- ARRAY ( SELECT DISTINCT context_id
165 -- FROM node_context_ids
166 -- WHERE ns.ngrams_id = node_context_ids.ngrams_id
167 -- )
168 -- AS context_ids
169 -- FROM ngrams ng
170 -- JOIN ns ON ng.id = ns.ngrams_id
171 -- |]
172
173
174 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
175 => CorpusId
176 -> Int
177 -> NgramsType
178 -> [NgramsTerm]
179 -> Cmd err [(NgramsTerm, Int)]
180 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
181 fmap (first NgramsTerm) <$>
182 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
183 ( int
184 , toDBid NodeDocument
185 , cId
186 , Values fields ((DPS.Only . unNgramsTerm) <$> (List.take 10000 tms))
187 , cId
188 , ngramsTypeId nt
189 )
190 where
191 fields = [QualifiedIdentifier Nothing "text"]
192
193 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
194 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
195 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
196 JOIN nodes_contexts nn ON n.id = nn.context_id
197 WHERE n.typename = ?
198 AND nn.node_id = ?),
199 input_rows(terms) AS (?)
200 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
201 JOIN ngrams ng ON cng.ngrams_id = ng.id
202 JOIN input_rows ir ON ir.terms = ng.terms
203 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
204 JOIN nodes_sample n ON nn.context_id = n.id
205 WHERE nn.node_id = ? -- CorpusId
206 AND cng.ngrams_type = ? -- NgramsTypeId
207 AND nn.category > 0
208 GROUP BY cng.node_id, ng.terms
209 |]
210
211 selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
212 => CorpusId
213 -> Int
214 -> NgramsType
215 -> Cmd err [(NgramsTerm, Int)]
216 selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
217 fmap (first NgramsTerm) <$>
218 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
219 ( int
220 , toDBid NodeDocument
221 , cId
222 , cId
223 , ngramsTypeId nt
224 )
225
226 queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
227 queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
228 WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
229 JOIN nodes_contexts nc ON c.id = nc.context_id
230 WHERE c.typename = ?
231 AND nc.node_id = ?)
232 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
233 JOIN ngrams ng ON cng.ngrams_id = ng.id
234 JOIN node_stories ns ON ns.ngrams_id = ng.id
235 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
236 JOIN contexts_sample c ON nc.context_id = c.id
237 WHERE nc.node_id = ? -- CorpusId
238 AND cng.ngrams_type = ? -- NgramsTypeId
239 AND nc.category > 0
240 GROUP BY ng.id
241 |]
242
243 ------------------------------------------------------------------------
244 getContextsByNgramsOnlyUser :: HasDBid NodeType
245 => CorpusId
246 -> [ListId]
247 -> NgramsType
248 -> [NgramsTerm]
249 -> Cmd err (HashMap NgramsTerm (Set NodeId))
250 getContextsByNgramsOnlyUser cId ls nt ngs =
251 HM.unionsWith (<>)
252 . map (HM.fromListWith (<>)
253 . map (second Set.singleton))
254 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
255 (splitEvery 1000 ngs)
256
257 getNgramsByContextOnlyUser :: HasDBid NodeType
258 => NodeId
259 -> [ListId]
260 -> NgramsType
261 -> [NgramsTerm]
262 -> Cmd err (Map NodeId (Set NgramsTerm))
263 getNgramsByContextOnlyUser cId ls nt ngs =
264 Map.unionsWith (<>)
265 . map ( Map.fromListWith (<>)
266 . map (second Set.singleton)
267 )
268 . map (map swap)
269 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
270 (splitEvery 1000 ngs)
271
272 ------------------------------------------------------------------------
273 selectNgramsOnlyByContextUser :: HasDBid NodeType
274 => CorpusId
275 -> [ListId]
276 -> NgramsType
277 -> [NgramsTerm]
278 -> Cmd err [(NgramsTerm, ContextId)]
279 selectNgramsOnlyByContextUser cId ls nt tms =
280 fmap (first NgramsTerm) <$>
281 runPGSQuery queryNgramsOnlyByContextUser
282 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
283 , Values [QualifiedIdentifier Nothing "int4"]
284 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
285 , cId
286 , toDBid NodeDocument
287 , ngramsTypeId nt
288 )
289 where
290 fields = [QualifiedIdentifier Nothing "text"]
291
292 queryNgramsOnlyByContextUser :: DPS.Query
293 queryNgramsOnlyByContextUser = [sql|
294 WITH input_rows(terms) AS (?),
295 input_list(id) AS (?)
296 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
297 JOIN ngrams ng ON cng.ngrams_id = ng.id
298 JOIN input_rows ir ON ir.terms = ng.terms
299 JOIN input_list il ON il.id = cng.node_id
300 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
301 JOIN contexts c ON nc.context_id = c.id
302 WHERE nc.node_id = ? -- CorpusId
303 AND c.typename = ? -- toDBid (maybe not useful with context table)
304 AND cng.ngrams_type = ? -- NgramsTypeId
305 AND nc.category > 0
306 GROUP BY ng.terms, cng.context_id
307 |]
308
309 getNgramsByDocOnlyUser :: DocId
310 -> [ListId]
311 -> NgramsType
312 -> [NgramsTerm]
313 -> Cmd err (HashMap NgramsTerm (Set NodeId))
314 getNgramsByDocOnlyUser cId ls nt ngs =
315 HM.unionsWith (<>)
316 . map (HM.fromListWith (<>) . map (second Set.singleton))
317 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
318
319
320 selectNgramsOnlyByDocUser :: DocId
321 -> [ListId]
322 -> NgramsType
323 -> [NgramsTerm]
324 -> Cmd err [(NgramsTerm, NodeId)]
325 selectNgramsOnlyByDocUser dId ls nt tms =
326 fmap (first NgramsTerm) <$>
327 runPGSQuery queryNgramsOnlyByDocUser
328 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
329 , Values [QualifiedIdentifier Nothing "int4"]
330 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
331 , dId
332 , ngramsTypeId nt
333 )
334 where
335 fields = [QualifiedIdentifier Nothing "text"]
336
337
338 queryNgramsOnlyByDocUser :: DPS.Query
339 queryNgramsOnlyByDocUser = [sql|
340 WITH input_rows(terms) AS (?),
341 input_list(id) AS (?)
342 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
343 JOIN ngrams ng ON cng.ngrams_id = ng.id
344 JOIN input_rows ir ON ir.terms = ng.terms
345 JOIN input_list il ON il.id = cng.context_id
346 WHERE cng.node_id = ? -- DocId
347 AND cng.ngrams_type = ? -- NgramsTypeId
348 GROUP BY ng.terms, cng.node_id
349 |]
350
351 ------------------------------------------------------------------------
352 -- | TODO filter by language, database, any social field
353 getContextsByNgramsMaster :: HasDBid NodeType
354 => UserCorpusId
355 -> MasterCorpusId
356 -> Cmd err (HashMap Text (Set NodeId))
357 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
358 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
359 -- . takeWhile (not . List.null)
360 -- . takeWhile (\l -> List.length l > 3)
361 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
362
363 selectNgramsByContextMaster :: HasDBid NodeType
364 => Int
365 -> UserCorpusId
366 -> MasterCorpusId
367 -> Int
368 -> Cmd err [(NodeId, Text)]
369 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
370 queryNgramsByContextMaster'
371 ( ucId
372 , ngramsTypeId NgramsTerms
373 , toDBid NodeDocument
374 , p
375 , toDBid NodeDocument
376 , p
377 , n
378 , mcId
379 , toDBid NodeDocument
380 , ngramsTypeId NgramsTerms
381 )
382
383 -- | TODO fix context_node_ngrams relation
384 queryNgramsByContextMaster' :: DPS.Query
385 queryNgramsByContextMaster' = [sql|
386 WITH contextsByNgramsUser AS (
387
388 SELECT n.id, ng.terms FROM contexts n
389 JOIN nodes_contexts nn ON n.id = nn.context_id
390 JOIN context_node_ngrams cng ON cng.context_id = n.id
391 JOIN ngrams ng ON cng.ngrams_id = ng.id
392 WHERE nn.node_id = ? -- UserCorpusId
393 -- AND n.typename = ? -- toDBid
394 AND cng.ngrams_type = ? -- NgramsTypeId
395 AND nn.category > 0
396 AND node_pos(n.id,?) >= ?
397 AND node_pos(n.id,?) < ?
398 GROUP BY n.id, ng.terms
399
400 ),
401
402 contextsByNgramsMaster AS (
403
404 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
405 JOIN context_node_ngrams cng ON n.id = cng.context_id
406 JOIN ngrams ng ON ng.id = cng.ngrams_id
407
408 WHERE n.parent_id = ? -- Master Corpus toDBid
409 AND n.typename = ? -- toDBid
410 AND cng.ngrams_type = ? -- NgramsTypeId
411 GROUP BY n.id, ng.terms
412 )
413
414 SELECT m.id, m.terms FROM nodesByNgramsMaster m
415 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id
416 |]
417
418 -- | Refreshes the \"context_node_ngrams_view\" materialized view.
419 -- This function will be run :
420 -- - periodically
421 -- - at reindex stage
422 -- - at the end of each text flow
423
424 refreshNgramsMaterialized :: Cmd err ()
425 refreshNgramsMaterialized = void $ execPGSQuery refreshNgramsMaterializedQuery ()
426 where
427 refreshNgramsMaterializedQuery :: DPS.Query
428 refreshNgramsMaterializedQuery =
429 [sql| REFRESH MATERIALIZED VIEW CONCURRENTLY context_node_ngrams_view; |]