]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
[MERGE] Phylo
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByContext.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByContext
17 where
18
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 -- import Control.Monad (void)
22 import Data.HashMap.Strict (HashMap)
23 import Data.Map.Strict (Map)
24 import Data.Set (Set)
25 import Data.Text (Text)
26 import Data.Tuple.Extra (first, second, swap)
27 import Database.PostgreSQL.Simple.SqlQQ (sql)
28 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
29 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
30 import Gargantext.Core
31 import Gargantext.Data.HashMap.Strict.Utils as HM
32 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
33 import Gargantext.Database.Prelude (Cmd, runPGSQuery) -- , execPGSQuery)
34 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
35 import Gargantext.Prelude
36 import qualified Data.HashMap.Strict as HM
37 import qualified Data.List as List
38 import qualified Data.Map.Strict as Map
39 import qualified Data.Set as Set
40 import qualified Database.PostgreSQL.Simple as DPS
41 import qualified Database.PostgreSQL.Simple.Types as DPST
42
43 -- | fst is size of Supra Corpus
44 -- snd is Texts and size of Occurrences (different docs)
45
46 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
47 -> HashMap NgramsTerm (Set ContextId)
48 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
49 countContextsByNgramsWith f m = (total, m')
50 where
51 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
52 m' = HM.map ( swap . second (fromIntegral . Set.size))
53 $ groupContextsByNgramsWith f m
54
55
56 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
57 -> HashMap NgramsTerm (Set NodeId)
58 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
59 groupContextsByNgramsWith f' m'' =
60 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
61 $ HM.toList m''
62
63 ------------------------------------------------------------------------
64 getContextsByNgramsUser :: HasDBid NodeType
65 => CorpusId
66 -> NgramsType
67 -> Cmd err (HashMap NgramsTerm (Set ContextId))
68 getContextsByNgramsUser cId nt =
69 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
70 <$> selectNgramsByContextUser cId nt
71 where
72
73 selectNgramsByContextUser :: HasDBid NodeType
74 => CorpusId
75 -> NgramsType
76 -> Cmd err [(NodeId, Text)]
77 selectNgramsByContextUser cId' nt' =
78 runPGSQuery queryNgramsByContextUser
79 ( cId'
80 , toDBid NodeDocument
81 , ngramsTypeId nt'
82 -- , 100 :: Int -- limit
83 -- , 0 :: Int -- offset
84 )
85
86 queryNgramsByContextUser :: DPS.Query
87 queryNgramsByContextUser = [sql|
88 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
89 JOIN ngrams ng ON cng.ngrams_id = ng.id
90 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
91 JOIN contexts c ON nc.context_id = c.id
92 WHERE nc.node_id = ? -- CorpusId
93 AND c.typename = ? -- toDBid
94 AND cng.ngrams_type = ? -- NgramsTypeId
95 AND nc.category > 0 -- is not in Trash
96 GROUP BY cng.context_id, ng.terms
97 |]
98
99
100 ------------------------------------------------------------------------
101 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
102 => CorpusId
103 -> Int
104 -> NgramsType
105 -> [NgramsTerm]
106 -> Cmd err (HashMap NgramsTerm Int)
107 getOccByNgramsOnlyFast_withSample cId int nt ngs =
108 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
109
110
111 getOccByNgramsOnlyFast :: CorpusId
112 -> ListId
113 -> NgramsType
114 -> Cmd err (HashMap NgramsTerm [ContextId])
115 getOccByNgramsOnlyFast cId lId nt = do
116 --HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
117 HM.fromList <$> map (\(t, ns) -> (NgramsTerm t, NodeId <$> DPST.fromPGArray ns)) <$> run cId lId nt
118 where
119
120 run :: CorpusId
121 -> ListId
122 -> NgramsType
123 -> Cmd err [(Text, DPST.PGArray Int)]
124 run cId' lId' nt' = runPGSQuery query
125 ( cId'
126 , lId'
127 , ngramsTypeId nt'
128 )
129
130 query :: DPS.Query
131 query = [sql|
132 WITH cnnv AS
133 ( SELECT DISTINCT context_node_ngrams.context_id,
134 context_node_ngrams.ngrams_id,
135 nodes_contexts.node_id
136 FROM nodes_contexts
137 JOIN context_node_ngrams ON context_node_ngrams.context_id = nodes_contexts.context_id
138 ),
139 node_context_ids AS
140 (SELECT context_id, ngrams_id, terms
141 FROM cnnv
142 JOIN ngrams ON cnnv.ngrams_id = ngrams.id
143 WHERE node_id = ?
144 ),
145 ncids_agg AS
146 (SELECT ngrams_id, terms, array_agg(DISTINCT context_id) AS agg
147 FROM node_context_ids
148 GROUP BY (ngrams_id, terms)),
149 ns AS
150 (SELECT ngrams_id, terms
151 FROM node_stories
152 JOIN ngrams ON ngrams_id = ngrams.id
153 WHERE node_id = ? AND ngrams_type_id = ?
154 )
155
156 SELECT ns.terms, CASE WHEN agg IS NULL THEN '{}' ELSE agg END
157 FROM ns
158 LEFT JOIN ncids_agg ON ns.ngrams_id = ncids_agg.ngrams_id
159 |]
160 -- query = [sql|
161 -- WITH node_context_ids AS
162 -- (select context_id, ngrams_id
163 -- FROM context_node_ngrams_view
164 -- WHERE node_id = ?
165 -- ), ns AS
166 -- (select ngrams_id FROM node_stories
167 -- WHERE node_id = ? AND ngrams_type_id = ?
168 -- )
169
170 -- SELECT ng.terms,
171 -- ARRAY ( SELECT DISTINCT context_id
172 -- FROM node_context_ids
173 -- WHERE ns.ngrams_id = node_context_ids.ngrams_id
174 -- )
175 -- AS context_ids
176 -- FROM ngrams ng
177 -- JOIN ns ON ng.id = ns.ngrams_id
178 -- |]
179
180
181 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
182 => CorpusId
183 -> Int
184 -> NgramsType
185 -> [NgramsTerm]
186 -> Cmd err [(NgramsTerm, Int)]
187 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
188 fmap (first NgramsTerm) <$>
189 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
190 ( int
191 , toDBid NodeDocument
192 , cId
193 , Values fields ((DPS.Only . unNgramsTerm) <$> (List.take 10000 tms))
194 , cId
195 , ngramsTypeId nt
196 )
197 where
198 fields = [QualifiedIdentifier Nothing "text"]
199
200 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
201 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
202 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
203 JOIN nodes_contexts nn ON n.id = nn.context_id
204 WHERE n.typename = ?
205 AND nn.node_id = ?),
206 input_rows(terms) AS (?)
207 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
208 JOIN ngrams ng ON cng.ngrams_id = ng.id
209 JOIN input_rows ir ON ir.terms = ng.terms
210 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
211 JOIN nodes_sample n ON nn.context_id = n.id
212 WHERE nn.node_id = ? -- CorpusId
213 AND cng.ngrams_type = ? -- NgramsTypeId
214 AND nn.category > 0
215 GROUP BY cng.node_id, ng.terms
216 |]
217
218 selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
219 => CorpusId
220 -> Int
221 -> NgramsType
222 -> Cmd err [(NgramsTerm, Int)]
223 selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
224 fmap (first NgramsTerm) <$>
225 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
226 ( int
227 , toDBid NodeDocument
228 , cId
229 , cId
230 , ngramsTypeId nt
231 )
232
233 queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
234 queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
235 WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
236 JOIN nodes_contexts nc ON c.id = nc.context_id
237 WHERE c.typename = ?
238 AND nc.node_id = ?)
239 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
240 JOIN ngrams ng ON cng.ngrams_id = ng.id
241 JOIN node_stories ns ON ns.ngrams_id = ng.id
242 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
243 JOIN contexts_sample c ON nc.context_id = c.id
244 WHERE nc.node_id = ? -- CorpusId
245 AND cng.ngrams_type = ? -- NgramsTypeId
246 AND nc.category > 0
247 GROUP BY ng.id
248 |]
249
250 ------------------------------------------------------------------------
251 getContextsByNgramsOnlyUser :: HasDBid NodeType
252 => CorpusId
253 -> [ListId]
254 -> NgramsType
255 -> [NgramsTerm]
256 -> Cmd err (HashMap NgramsTerm (Set NodeId))
257 getContextsByNgramsOnlyUser cId ls nt ngs =
258 HM.unionsWith (<>)
259 . map (HM.fromListWith (<>)
260 . map (second Set.singleton))
261 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
262 (splitEvery 1000 ngs)
263
264 getNgramsByContextOnlyUser :: HasDBid NodeType
265 => NodeId
266 -> [ListId]
267 -> NgramsType
268 -> [NgramsTerm]
269 -> Cmd err (Map NodeId (Set NgramsTerm))
270 getNgramsByContextOnlyUser cId ls nt ngs =
271 Map.unionsWith (<>)
272 . map ( Map.fromListWith (<>)
273 . map (second Set.singleton)
274 )
275 . map (map swap)
276 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
277 (splitEvery 1000 ngs)
278
279 ------------------------------------------------------------------------
280 selectNgramsOnlyByContextUser :: HasDBid NodeType
281 => CorpusId
282 -> [ListId]
283 -> NgramsType
284 -> [NgramsTerm]
285 -> Cmd err [(NgramsTerm, ContextId)]
286 selectNgramsOnlyByContextUser cId ls nt tms =
287 fmap (first NgramsTerm) <$>
288 runPGSQuery queryNgramsOnlyByContextUser
289 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
290 , Values [QualifiedIdentifier Nothing "int4"]
291 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
292 , cId
293 , toDBid NodeDocument
294 , ngramsTypeId nt
295 )
296 where
297 fields = [QualifiedIdentifier Nothing "text"]
298
299 queryNgramsOnlyByContextUser :: DPS.Query
300 queryNgramsOnlyByContextUser = [sql|
301 WITH input_rows(terms) AS (?),
302 input_list(id) AS (?)
303 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
304 JOIN ngrams ng ON cng.ngrams_id = ng.id
305 JOIN input_rows ir ON ir.terms = ng.terms
306 JOIN input_list il ON il.id = cng.node_id
307 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
308 JOIN contexts c ON nc.context_id = c.id
309 WHERE nc.node_id = ? -- CorpusId
310 AND c.typename = ? -- toDBid (maybe not useful with context table)
311 AND cng.ngrams_type = ? -- NgramsTypeId
312 AND nc.category > 0
313 GROUP BY ng.terms, cng.context_id
314 |]
315
316 getNgramsByDocOnlyUser :: DocId
317 -> [ListId]
318 -> NgramsType
319 -> [NgramsTerm]
320 -> Cmd err (HashMap NgramsTerm (Set NodeId))
321 getNgramsByDocOnlyUser cId ls nt ngs =
322 HM.unionsWith (<>)
323 . map (HM.fromListWith (<>) . map (second Set.singleton))
324 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
325
326
327 selectNgramsOnlyByDocUser :: DocId
328 -> [ListId]
329 -> NgramsType
330 -> [NgramsTerm]
331 -> Cmd err [(NgramsTerm, NodeId)]
332 selectNgramsOnlyByDocUser dId ls nt tms =
333 fmap (first NgramsTerm) <$>
334 runPGSQuery queryNgramsOnlyByDocUser
335 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
336 , Values [QualifiedIdentifier Nothing "int4"]
337 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
338 , dId
339 , ngramsTypeId nt
340 )
341 where
342 fields = [QualifiedIdentifier Nothing "text"]
343
344
345 queryNgramsOnlyByDocUser :: DPS.Query
346 queryNgramsOnlyByDocUser = [sql|
347 WITH input_rows(terms) AS (?),
348 input_list(id) AS (?)
349 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
350 JOIN ngrams ng ON cng.ngrams_id = ng.id
351 JOIN input_rows ir ON ir.terms = ng.terms
352 JOIN input_list il ON il.id = cng.context_id
353 WHERE cng.node_id = ? -- DocId
354 AND cng.ngrams_type = ? -- NgramsTypeId
355 GROUP BY ng.terms, cng.node_id
356 |]
357
358 ------------------------------------------------------------------------
359 -- | TODO filter by language, database, any social field
360 getContextsByNgramsMaster :: HasDBid NodeType
361 => UserCorpusId
362 -> MasterCorpusId
363 -> Cmd err (HashMap Text (Set NodeId))
364 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
365 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
366 -- . takeWhile (not . List.null)
367 -- . takeWhile (\l -> List.length l > 3)
368 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
369
370 selectNgramsByContextMaster :: HasDBid NodeType
371 => Int
372 -> UserCorpusId
373 -> MasterCorpusId
374 -> Int
375 -> Cmd err [(NodeId, Text)]
376 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
377 queryNgramsByContextMaster'
378 ( ucId
379 , ngramsTypeId NgramsTerms
380 , toDBid NodeDocument
381 , p
382 , toDBid NodeDocument
383 , p
384 , n
385 , mcId
386 , toDBid NodeDocument
387 , ngramsTypeId NgramsTerms
388 )
389
390 -- | TODO fix context_node_ngrams relation
391 queryNgramsByContextMaster' :: DPS.Query
392 queryNgramsByContextMaster' = [sql|
393 WITH contextsByNgramsUser AS (
394
395 SELECT n.id, ng.terms FROM contexts n
396 JOIN nodes_contexts nn ON n.id = nn.context_id
397 JOIN context_node_ngrams cng ON cng.context_id = n.id
398 JOIN ngrams ng ON cng.ngrams_id = ng.id
399 WHERE nn.node_id = ? -- UserCorpusId
400 -- AND n.typename = ? -- toDBid
401 AND cng.ngrams_type = ? -- NgramsTypeId
402 AND nn.category > 0
403 AND node_pos(n.id,?) >= ?
404 AND node_pos(n.id,?) < ?
405 GROUP BY n.id, ng.terms
406
407 ),
408
409 contextsByNgramsMaster AS (
410
411 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
412 JOIN context_node_ngrams cng ON n.id = cng.context_id
413 JOIN ngrams ng ON ng.id = cng.ngrams_id
414
415 WHERE n.parent_id = ? -- Master Corpus toDBid
416 AND n.typename = ? -- toDBid
417 AND cng.ngrams_type = ? -- NgramsTypeId
418 GROUP BY n.id, ng.terms
419 )
420
421 SELECT m.id, m.terms FROM nodesByNgramsMaster m
422 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id
423 |]
424
425 -- | Refreshes the \"context_node_ngrams_view\" materialized view.
426 -- This function will be run :
427 -- - periodically
428 -- - at reindex stage
429 -- - at the end of each text flow
430
431 -- refreshNgramsMaterialized :: Cmd err ()
432 -- refreshNgramsMaterialized = void $ execPGSQuery refreshNgramsMaterializedQuery ()
433 -- where
434 -- refreshNgramsMaterializedQuery :: DPS.Query
435 -- refreshNgramsMaterializedQuery =
436 -- [sql| REFRESH MATERIALIZED VIEW CONCURRENTLY context_node_ngrams_view; |]