]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByNode.hs
[searx] custom flow for searx docs insert
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByNode.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByNode
17 where
18
19 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map (Map)
22 import Data.Set (Set)
23 import Data.Text (Text)
24 import Data.Tuple.Extra (first, second, swap)
25 import Database.PostgreSQL.Simple.SqlQQ (sql)
26 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
27 import Debug.Trace (trace)
28 import Gargantext.Core
29 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Database.PostgreSQL.Simple as DPS
39
40 -- | fst is size of Supra Corpus
41 -- snd is Texts and size of Occurrences (different docs)
42 countNodesByNgramsWith :: (NgramsTerm -> NgramsTerm)
43 -> HashMap NgramsTerm (Set NodeId)
44 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
45 countNodesByNgramsWith f m = (total, m')
46 where
47 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
48 m' = HM.map ( swap . second (fromIntegral . Set.size))
49 $ groupNodesByNgramsWith f m
50
51
52 groupNodesByNgramsWith :: (NgramsTerm -> NgramsTerm)
53 -> HashMap NgramsTerm (Set NodeId)
54 -> HashMap NgramsTerm (Set NgramsTerm, Set NodeId)
55 groupNodesByNgramsWith f m =
56 HM.fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
57 $ HM.toList m
58
59 ------------------------------------------------------------------------
60 getNodesByNgramsUser :: HasDBid NodeType
61 => CorpusId
62 -> NgramsType
63 -> Cmd err (HashMap NgramsTerm (Set NodeId))
64 getNodesByNgramsUser cId nt =
65 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
66 <$> selectNgramsByNodeUser cId nt
67 where
68
69 selectNgramsByNodeUser :: HasDBid NodeType
70 => CorpusId
71 -> NgramsType
72 -> Cmd err [(NodeId, Text)]
73 selectNgramsByNodeUser cId' nt' =
74 runPGSQuery queryNgramsByNodeUser
75 ( cId'
76 , toDBid NodeDocument
77 , ngramsTypeId nt'
78 -- , 100 :: Int -- limit
79 -- , 0 :: Int -- offset
80 )
81
82 queryNgramsByNodeUser :: DPS.Query
83 queryNgramsByNodeUser = [sql|
84 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
85 JOIN ngrams ng ON nng.ngrams_id = ng.id
86 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
87 JOIN nodes n ON nn.node2_id = n.id
88 WHERE nn.node1_id = ? -- CorpusId
89 AND n.typename = ? -- toDBid
90 AND nng.ngrams_type = ? -- NgramsTypeId
91 AND nn.category > 0
92 GROUP BY nng.node2_id, ng.terms
93 ORDER BY (nng.node2_id, ng.terms) DESC
94 -- LIMIT ?
95 -- OFFSET ?
96 |]
97 ------------------------------------------------------------------------
98 -- TODO add groups
99 getOccByNgramsOnlyFast :: HasDBid NodeType
100 => CorpusId
101 -> NgramsType
102 -> [NgramsTerm]
103 -> Cmd err (HashMap NgramsTerm Int)
104 getOccByNgramsOnlyFast cId nt ngs =
105 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
106
107
108 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
109 => CorpusId
110 -> Int
111 -> NgramsType
112 -> [NgramsTerm]
113 -> Cmd err (HashMap NgramsTerm Int)
114 getOccByNgramsOnlyFast_withSample cId int nt ngs =
115 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser_withSample cId int nt ngs
116
117
118
119
120 getOccByNgramsOnlyFast' :: CorpusId
121 -> ListId
122 -> NgramsType
123 -> [NgramsTerm]
124 -> Cmd err (HashMap NgramsTerm Int)
125 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
126 HM.fromListWith (+) <$> map (second round) <$> run cId lId nt tms
127
128 where
129 fields = [QualifiedIdentifier Nothing "text"]
130
131 run :: CorpusId
132 -> ListId
133 -> NgramsType
134 -> [NgramsTerm]
135 -> Cmd err [(NgramsTerm, Double)]
136 run cId' lId' nt' tms' = fmap (first NgramsTerm) <$> runPGSQuery query
137 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms')
138 , cId'
139 , lId'
140 , ngramsTypeId nt'
141 )
142
143 query :: DPS.Query
144 query = [sql|
145 WITH input_rows(terms) AS (?)
146 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
147 JOIN ngrams ng ON nng.ngrams_id = ng.id
148 JOIN input_rows ir ON ir.terms = ng.terms
149 WHERE nng.node1_id = ? -- CorpusId
150 AND nng.node2_id = ? -- ListId
151 AND nng.ngrams_type = ? -- NgramsTypeId
152 -- AND nn.category > 0 -- TODO
153 GROUP BY ng.terms, nng.weight
154 |]
155
156
157 -- just slower than getOccByNgramsOnlyFast
158 getOccByNgramsOnlySlow :: HasDBid NodeType
159 => NodeType
160 -> CorpusId
161 -> [ListId]
162 -> NgramsType
163 -> [NgramsTerm]
164 -> Cmd err (HashMap NgramsTerm Int)
165 getOccByNgramsOnlySlow t cId ls nt ngs =
166 HM.map Set.size <$> getScore' t cId ls nt ngs
167 where
168 getScore' NodeCorpus = getNodesByNgramsOnlyUser
169 getScore' NodeDocument = getNgramsByDocOnlyUser
170 getScore' _ = getNodesByNgramsOnlyUser
171
172 getOccByNgramsOnlySafe :: HasDBid NodeType
173 => CorpusId
174 -> [ListId]
175 -> NgramsType
176 -> [NgramsTerm]
177 -> Cmd err (HashMap NgramsTerm Int)
178 getOccByNgramsOnlySafe cId ls nt ngs = do
179 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
180 fast <- getOccByNgramsOnlyFast cId nt ngs
181 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
182 when (fast /= slow) $
183 printDebug "getOccByNgramsOnlySafe: difference"
184 (HM.difference slow fast, HM.difference fast slow)
185 -- diff slow fast :: PatchMap Text (Replace (Maybe Int))
186 pure slow
187
188
189 selectNgramsOccurrencesOnlyByNodeUser :: HasDBid NodeType
190 => CorpusId
191 -> NgramsType
192 -> [NgramsTerm]
193 -> Cmd err [(NgramsTerm, Int)]
194 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
195 fmap (first NgramsTerm) <$>
196 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
197 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
198 , cId
199 , toDBid NodeDocument
200 , ngramsTypeId nt
201 )
202 where
203 fields = [QualifiedIdentifier Nothing "text"]
204
205
206
207 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
208 -- Question: with the grouping is the result exactly the same (since Set NodeId for
209 -- equivalent ngrams intersections are not empty)
210 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
211 queryNgramsOccurrencesOnlyByNodeUser = [sql|
212 WITH input_rows(terms) AS (?)
213 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
214 JOIN ngrams ng ON nng.ngrams_id = ng.id
215 JOIN input_rows ir ON ir.terms = ng.terms
216 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
217 JOIN nodes n ON nn.node2_id = n.id
218 WHERE nn.node1_id = ? -- CorpusId
219 AND n.typename = ? -- toDBid
220 AND nng.ngrams_type = ? -- NgramsTypeId
221 AND nn.category > 0
222 GROUP BY nng.node2_id, ng.terms
223 |]
224
225
226 selectNgramsOccurrencesOnlyByNodeUser_withSample :: HasDBid NodeType
227 => CorpusId
228 -> Int
229 -> NgramsType
230 -> [NgramsTerm]
231 -> Cmd err [(NgramsTerm, Int)]
232 selectNgramsOccurrencesOnlyByNodeUser_withSample cId int nt tms =
233 fmap (first NgramsTerm) <$>
234 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser_withSample
235 ( int
236 , toDBid NodeDocument
237 , cId
238 , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
239 , cId
240 , ngramsTypeId nt
241 )
242 where
243 fields = [QualifiedIdentifier Nothing "text"]
244
245 queryNgramsOccurrencesOnlyByNodeUser_withSample :: DPS.Query
246 queryNgramsOccurrencesOnlyByNodeUser_withSample = [sql|
247 WITH nodes_sample AS (SELECT id FROM nodes n TABLESAMPLE SYSTEM_ROWS (?)
248 JOIN nodes_nodes nn ON n.id = nn.node2_id
249 WHERE n.typename = ?
250 AND nn.node1_id = ?),
251 input_rows(terms) AS (?)
252 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
253 JOIN ngrams ng ON nng.ngrams_id = ng.id
254 JOIN input_rows ir ON ir.terms = ng.terms
255 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
256 JOIN nodes_sample n ON nn.node2_id = n.id
257 WHERE nn.node1_id = ? -- CorpusId
258 AND nng.ngrams_type = ? -- NgramsTypeId
259 AND nn.category > 0
260 GROUP BY nng.node2_id, ng.terms
261 |]
262
263
264
265 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
266 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
267 WITH input_rows(terms) AS (?)
268 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
269 JOIN ngrams ng ON nng.ngrams_id = ng.id
270 JOIN input_rows ir ON ir.terms = ng.terms
271 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
272 JOIN nodes n ON nn.node2_id = n.id
273 WHERE nn.node1_id = ? -- CorpusId
274 AND n.typename = ? -- toDBid
275 AND nng.ngrams_type = ? -- NgramsTypeId
276 AND nn.category > 0
277 GROUP BY nng.node2_id, ng.terms
278 |]
279
280 ------------------------------------------------------------------------
281 getNodesByNgramsOnlyUser :: HasDBid NodeType
282 => CorpusId
283 -> [ListId]
284 -> NgramsType
285 -> [NgramsTerm]
286 -> Cmd err (HashMap NgramsTerm (Set NodeId))
287 getNodesByNgramsOnlyUser cId ls nt ngs =
288 HM.unionsWith (<>)
289 . map (HM.fromListWith (<>)
290 . map (second Set.singleton))
291 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
292 (splitEvery 1000 ngs)
293
294
295 getNgramsByNodeOnlyUser :: HasDBid NodeType
296 => NodeId
297 -> [ListId]
298 -> NgramsType
299 -> [NgramsTerm]
300 -> Cmd err (Map NodeId (Set NgramsTerm))
301 getNgramsByNodeOnlyUser cId ls nt ngs =
302 Map.unionsWith (<>)
303 . map ( Map.fromListWith (<>)
304 . map (second Set.singleton)
305 )
306 . map (map swap)
307 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
308 (splitEvery 1000 ngs)
309
310 ------------------------------------------------------------------------
311 selectNgramsOnlyByNodeUser :: HasDBid NodeType
312 => CorpusId
313 -> [ListId]
314 -> NgramsType
315 -> [NgramsTerm]
316 -> Cmd err [(NgramsTerm, NodeId)]
317 selectNgramsOnlyByNodeUser cId ls nt tms =
318 fmap (first NgramsTerm) <$>
319 runPGSQuery queryNgramsOnlyByNodeUser
320 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
321 , Values [QualifiedIdentifier Nothing "int4"]
322 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
323 , cId
324 , toDBid NodeDocument
325 , ngramsTypeId nt
326 )
327 where
328 fields = [QualifiedIdentifier Nothing "text"]
329
330 queryNgramsOnlyByNodeUser :: DPS.Query
331 queryNgramsOnlyByNodeUser = [sql|
332 WITH input_rows(terms) AS (?),
333 input_list(id) AS (?)
334 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
335 JOIN ngrams ng ON nng.ngrams_id = ng.id
336 JOIN input_rows ir ON ir.terms = ng.terms
337 JOIN input_list il ON il.id = nng.node1_id
338 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
339 JOIN nodes n ON nn.node2_id = n.id
340 WHERE nn.node1_id = ? -- CorpusId
341 AND n.typename = ? -- toDBid
342 AND nng.ngrams_type = ? -- NgramsTypeId
343 AND nn.category > 0
344 GROUP BY ng.terms, nng.node2_id
345 |]
346
347
348 selectNgramsOnlyByNodeUser' :: HasDBid NodeType
349 => CorpusId
350 -> [ListId]
351 -> NgramsType
352 -> [Text]
353 -> Cmd err [(Text, Int)]
354 selectNgramsOnlyByNodeUser' cId ls nt tms =
355 runPGSQuery queryNgramsOnlyByNodeUser
356 ( Values fields (DPS.Only <$> tms)
357 , Values [QualifiedIdentifier Nothing "int4"]
358 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
359 , cId
360 , toDBid NodeDocument
361 , ngramsTypeId nt
362 )
363 where
364 fields = [QualifiedIdentifier Nothing "text"]
365
366 queryNgramsOnlyByNodeUser' :: DPS.Query
367 queryNgramsOnlyByNodeUser' = [sql|
368 WITH input_rows(terms) AS (?),
369 input_list(id) AS (?)
370 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
371 JOIN ngrams ng ON nng.ngrams_id = ng.id
372 JOIN input_rows ir ON ir.terms = ng.terms
373 JOIN input_list il ON il.id = nng.node2_id
374 WHERE nng.node1_id = ? -- CorpusId
375 AND nng.ngrams_type = ? -- NgramsTypeId
376 -- AND nn.category > 0
377 GROUP BY ng.terms, nng.weight
378 |]
379
380
381 getNgramsByDocOnlyUser :: DocId
382 -> [ListId]
383 -> NgramsType
384 -> [NgramsTerm]
385 -> Cmd err (HashMap NgramsTerm (Set NodeId))
386 getNgramsByDocOnlyUser cId ls nt ngs =
387 HM.unionsWith (<>)
388 . map (HM.fromListWith (<>) . map (second Set.singleton))
389 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
390
391
392 selectNgramsOnlyByDocUser :: DocId
393 -> [ListId]
394 -> NgramsType
395 -> [NgramsTerm]
396 -> Cmd err [(NgramsTerm, NodeId)]
397 selectNgramsOnlyByDocUser dId ls nt tms =
398 fmap (first NgramsTerm) <$>
399 runPGSQuery queryNgramsOnlyByDocUser
400 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
401 , Values [QualifiedIdentifier Nothing "int4"]
402 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
403 , dId
404 , ngramsTypeId nt
405 )
406 where
407 fields = [QualifiedIdentifier Nothing "text"]
408
409
410 queryNgramsOnlyByDocUser :: DPS.Query
411 queryNgramsOnlyByDocUser = [sql|
412 WITH input_rows(terms) AS (?),
413 input_list(id) AS (?)
414 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
415 JOIN ngrams ng ON nng.ngrams_id = ng.id
416 JOIN input_rows ir ON ir.terms = ng.terms
417 JOIN input_list il ON il.id = nng.node1_id
418 WHERE nng.node2_id = ? -- DocId
419 AND nng.ngrams_type = ? -- NgramsTypeId
420 GROUP BY ng.terms, nng.node2_id
421 |]
422
423 ------------------------------------------------------------------------
424 -- | TODO filter by language, database, any social field
425 getNodesByNgramsMaster :: HasDBid NodeType
426 => UserCorpusId -> MasterCorpusId -> Cmd err (HashMap Text (Set NodeId))
427 getNodesByNgramsMaster ucId mcId = unionsWith (<>)
428 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
429 -- . takeWhile (not . List.null)
430 -- . takeWhile (\l -> List.length l > 3)
431 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
432
433 selectNgramsByNodeMaster :: HasDBid NodeType
434 => Int
435 -> UserCorpusId
436 -> MasterCorpusId
437 -> Int
438 -> Cmd err [(NodeId, Text)]
439 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
440 queryNgramsByNodeMaster'
441 ( ucId
442 , ngramsTypeId NgramsTerms
443 , toDBid NodeDocument
444 , p
445 , toDBid NodeDocument
446 , p
447 , n
448 , mcId
449 , toDBid NodeDocument
450 , ngramsTypeId NgramsTerms
451 )
452
453 -- | TODO fix node_node_ngrams relation
454 queryNgramsByNodeMaster' :: DPS.Query
455 queryNgramsByNodeMaster' = [sql|
456 WITH nodesByNgramsUser AS (
457
458 SELECT n.id, ng.terms FROM nodes n
459 JOIN nodes_nodes nn ON n.id = nn.node2_id
460 JOIN node_node_ngrams nng ON nng.node2_id = n.id
461 JOIN ngrams ng ON nng.ngrams_id = ng.id
462 WHERE nn.node1_id = ? -- UserCorpusId
463 -- AND n.typename = ? -- toDBid
464 AND nng.ngrams_type = ? -- NgramsTypeId
465 AND nn.category > 0
466 AND node_pos(n.id,?) >= ?
467 AND node_pos(n.id,?) < ?
468 GROUP BY n.id, ng.terms
469
470 ),
471
472 nodesByNgramsMaster AS (
473
474 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
475 JOIN node_node_ngrams nng ON n.id = nng.node2_id
476 JOIN ngrams ng ON ng.id = nng.ngrams_id
477
478 WHERE n.parent_id = ? -- Master Corpus toDBid
479 AND n.typename = ? -- toDBid
480 AND nng.ngrams_type = ? -- NgramsTypeId
481 GROUP BY n.id, ng.terms
482 )
483
484 SELECT m.id, m.terms FROM nodesByNgramsMaster m
485 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id
486 |]