]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByNode.hs
[OPTIM] Insert ngrams
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByNode.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByNode
17 where
18
19 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map (Map)
22 import Data.Set (Set)
23 import Data.Text (Text)
24 import Data.Tuple.Extra (first, second, swap)
25 import Database.PostgreSQL.Simple.SqlQQ (sql)
26 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
27 import Debug.Trace (trace)
28 import Gargantext.Core
29 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Database.PostgreSQL.Simple as DPS
39
40 -- | fst is size of Supra Corpus
41 -- snd is Texts and size of Occurrences (different docs)
42 countNodesByNgramsWith :: (NgramsTerm -> NgramsTerm)
43 -> HashMap NgramsTerm (Set NodeId)
44 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
45 countNodesByNgramsWith f m = (total, m')
46 where
47 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
48 m' = HM.map ( swap . second (fromIntegral . Set.size))
49 $ groupNodesByNgramsWith f m
50
51
52 groupNodesByNgramsWith :: (NgramsTerm -> NgramsTerm)
53 -> HashMap NgramsTerm (Set NodeId)
54 -> HashMap NgramsTerm (Set NgramsTerm, Set NodeId)
55 groupNodesByNgramsWith f m =
56 HM.fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
57 $ HM.toList m
58
59 ------------------------------------------------------------------------
60 getNodesByNgramsUser :: HasDBid NodeType
61 => CorpusId
62 -> NgramsType
63 -> Cmd err (HashMap NgramsTerm (Set NodeId))
64 getNodesByNgramsUser cId nt =
65 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
66 <$> selectNgramsByNodeUser cId nt
67 where
68
69 selectNgramsByNodeUser :: HasDBid NodeType
70 => CorpusId
71 -> NgramsType
72 -> Cmd err [(NodeId, Text)]
73 selectNgramsByNodeUser cId' nt' =
74 runPGSQuery queryNgramsByNodeUser
75 ( cId'
76 , toDBid NodeDocument
77 , ngramsTypeId nt'
78 -- , 100 :: Int -- limit
79 -- , 0 :: Int -- offset
80 )
81
82 queryNgramsByNodeUser :: DPS.Query
83 queryNgramsByNodeUser = [sql|
84 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
85 JOIN ngrams ng ON nng.ngrams_id = ng.id
86 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
87 JOIN nodes n ON nn.node2_id = n.id
88 WHERE nn.node1_id = ? -- CorpusId
89 AND n.typename = ? -- toDBid
90 AND nng.ngrams_type = ? -- NgramsTypeId
91 AND nn.category > 0
92 GROUP BY nng.node2_id, ng.terms
93 ORDER BY (nng.node2_id, ng.terms) DESC
94 -- LIMIT ?
95 -- OFFSET ?
96 |]
97 ------------------------------------------------------------------------
98 -- TODO add groups
99 getOccByNgramsOnlyFast :: HasDBid NodeType
100 => CorpusId
101 -> NgramsType
102 -> [NgramsTerm]
103 -> Cmd err (HashMap NgramsTerm Int)
104 getOccByNgramsOnlyFast cId nt ngs =
105 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
106
107
108 getOccByNgramsOnlyFast' :: CorpusId
109 -> ListId
110 -> NgramsType
111 -> [NgramsTerm]
112 -> Cmd err (HashMap NgramsTerm Int)
113 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
114 HM.fromListWith (+) <$> map (second round) <$> run cId lId nt tms
115
116 where
117 fields = [QualifiedIdentifier Nothing "text"]
118
119 run :: CorpusId
120 -> ListId
121 -> NgramsType
122 -> [NgramsTerm]
123 -> Cmd err [(NgramsTerm, Double)]
124 run cId' lId' nt' tms' = fmap (first NgramsTerm) <$> runPGSQuery query
125 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms')
126 , cId'
127 , lId'
128 , ngramsTypeId nt'
129 )
130
131 query :: DPS.Query
132 query = [sql|
133 WITH input_rows(terms) AS (?)
134 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
135 JOIN ngrams ng ON nng.ngrams_id = ng.id
136 JOIN input_rows ir ON ir.terms = ng.terms
137 WHERE nng.node1_id = ? -- CorpusId
138 AND nng.node2_id = ? -- ListId
139 AND nng.ngrams_type = ? -- NgramsTypeId
140 -- AND nn.category > 0 -- TODO
141 GROUP BY ng.terms, nng.weight
142 |]
143
144
145 -- just slower than getOccByNgramsOnlyFast
146 getOccByNgramsOnlySlow :: HasDBid NodeType
147 => NodeType
148 -> CorpusId
149 -> [ListId]
150 -> NgramsType
151 -> [NgramsTerm]
152 -> Cmd err (HashMap NgramsTerm Int)
153 getOccByNgramsOnlySlow t cId ls nt ngs =
154 HM.map Set.size <$> getScore' t cId ls nt ngs
155 where
156 getScore' NodeCorpus = getNodesByNgramsOnlyUser
157 getScore' NodeDocument = getNgramsByDocOnlyUser
158 getScore' _ = getNodesByNgramsOnlyUser
159
160 getOccByNgramsOnlySafe :: HasDBid NodeType
161 => CorpusId
162 -> [ListId]
163 -> NgramsType
164 -> [NgramsTerm]
165 -> Cmd err (HashMap NgramsTerm Int)
166 getOccByNgramsOnlySafe cId ls nt ngs = do
167 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
168 fast <- getOccByNgramsOnlyFast cId nt ngs
169 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
170 when (fast /= slow) $
171 printDebug "getOccByNgramsOnlySafe: difference"
172 (HM.difference slow fast, HM.difference fast slow)
173 -- diff slow fast :: PatchMap Text (Replace (Maybe Int))
174 pure slow
175
176
177 selectNgramsOccurrencesOnlyByNodeUser :: HasDBid NodeType
178 => CorpusId
179 -> NgramsType
180 -> [NgramsTerm]
181 -> Cmd err [(NgramsTerm, Int)]
182 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
183 fmap (first NgramsTerm) <$>
184 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
185 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
186 , cId
187 , toDBid NodeDocument
188 , ngramsTypeId nt
189 )
190 where
191 fields = [QualifiedIdentifier Nothing "text"]
192
193 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
194 -- Question: with the grouping is the result exactly the same (since Set NodeId for
195 -- equivalent ngrams intersections are not empty)
196 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
197 queryNgramsOccurrencesOnlyByNodeUser = [sql|
198 WITH input_rows(terms) AS (?)
199 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
200 JOIN ngrams ng ON nng.ngrams_id = ng.id
201 JOIN input_rows ir ON ir.terms = ng.terms
202 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
203 JOIN nodes n ON nn.node2_id = n.id
204 WHERE nn.node1_id = ? -- CorpusId
205 AND n.typename = ? -- toDBid
206 AND nng.ngrams_type = ? -- NgramsTypeId
207 AND nn.category > 0
208 GROUP BY nng.node2_id, ng.terms
209 |]
210
211 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
212 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
213 WITH input_rows(terms) AS (?)
214 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
215 JOIN ngrams ng ON nng.ngrams_id = ng.id
216 JOIN input_rows ir ON ir.terms = ng.terms
217 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
218 JOIN nodes n ON nn.node2_id = n.id
219 WHERE nn.node1_id = ? -- CorpusId
220 AND n.typename = ? -- toDBid
221 AND nng.ngrams_type = ? -- NgramsTypeId
222 AND nn.category > 0
223 GROUP BY nng.node2_id, ng.terms
224 |]
225
226 ------------------------------------------------------------------------
227 getNodesByNgramsOnlyUser :: HasDBid NodeType
228 => CorpusId
229 -> [ListId]
230 -> NgramsType
231 -> [NgramsTerm]
232 -> Cmd err (HashMap NgramsTerm (Set NodeId))
233 getNodesByNgramsOnlyUser cId ls nt ngs =
234 HM.unionsWith (<>)
235 . map (HM.fromListWith (<>)
236 . map (second Set.singleton))
237 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
238 (splitEvery 1000 ngs)
239
240
241 getNgramsByNodeOnlyUser :: HasDBid NodeType
242 => NodeId
243 -> [ListId]
244 -> NgramsType
245 -> [NgramsTerm]
246 -> Cmd err (Map NodeId (Set NgramsTerm))
247 getNgramsByNodeOnlyUser cId ls nt ngs =
248 Map.unionsWith (<>)
249 . map ( Map.fromListWith (<>)
250 . map (second Set.singleton)
251 )
252 . map (map swap)
253 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
254 (splitEvery 1000 ngs)
255
256 ------------------------------------------------------------------------
257 selectNgramsOnlyByNodeUser :: HasDBid NodeType
258 => CorpusId
259 -> [ListId]
260 -> NgramsType
261 -> [NgramsTerm]
262 -> Cmd err [(NgramsTerm, NodeId)]
263 selectNgramsOnlyByNodeUser cId ls nt tms =
264 fmap (first NgramsTerm) <$>
265 runPGSQuery queryNgramsOnlyByNodeUser
266 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
267 , Values [QualifiedIdentifier Nothing "int4"]
268 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
269 , cId
270 , toDBid NodeDocument
271 , ngramsTypeId nt
272 )
273 where
274 fields = [QualifiedIdentifier Nothing "text"]
275
276 queryNgramsOnlyByNodeUser :: DPS.Query
277 queryNgramsOnlyByNodeUser = [sql|
278 WITH input_rows(terms) AS (?),
279 input_list(id) AS (?)
280 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
281 JOIN ngrams ng ON nng.ngrams_id = ng.id
282 JOIN input_rows ir ON ir.terms = ng.terms
283 JOIN input_list il ON il.id = nng.node1_id
284 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
285 JOIN nodes n ON nn.node2_id = n.id
286 WHERE nn.node1_id = ? -- CorpusId
287 AND n.typename = ? -- toDBid
288 AND nng.ngrams_type = ? -- NgramsTypeId
289 AND nn.category > 0
290 GROUP BY ng.terms, nng.node2_id
291 |]
292
293
294 selectNgramsOnlyByNodeUser' :: HasDBid NodeType
295 => CorpusId
296 -> [ListId]
297 -> NgramsType
298 -> [Text]
299 -> Cmd err [(Text, Int)]
300 selectNgramsOnlyByNodeUser' cId ls nt tms =
301 runPGSQuery queryNgramsOnlyByNodeUser
302 ( Values fields (DPS.Only <$> tms)
303 , Values [QualifiedIdentifier Nothing "int4"]
304 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
305 , cId
306 , toDBid NodeDocument
307 , ngramsTypeId nt
308 )
309 where
310 fields = [QualifiedIdentifier Nothing "text"]
311
312 queryNgramsOnlyByNodeUser' :: DPS.Query
313 queryNgramsOnlyByNodeUser' = [sql|
314 WITH input_rows(terms) AS (?),
315 input_list(id) AS (?)
316 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
317 JOIN ngrams ng ON nng.ngrams_id = ng.id
318 JOIN input_rows ir ON ir.terms = ng.terms
319 JOIN input_list il ON il.id = nng.node2_id
320 WHERE nng.node1_id = ? -- CorpusId
321 AND nng.ngrams_type = ? -- NgramsTypeId
322 -- AND nn.category > 0
323 GROUP BY ng.terms, nng.weight
324 |]
325
326
327 getNgramsByDocOnlyUser :: DocId
328 -> [ListId]
329 -> NgramsType
330 -> [NgramsTerm]
331 -> Cmd err (HashMap NgramsTerm (Set NodeId))
332 getNgramsByDocOnlyUser cId ls nt ngs =
333 HM.unionsWith (<>)
334 . map (HM.fromListWith (<>) . map (second Set.singleton))
335 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
336
337
338 selectNgramsOnlyByDocUser :: DocId
339 -> [ListId]
340 -> NgramsType
341 -> [NgramsTerm]
342 -> Cmd err [(NgramsTerm, NodeId)]
343 selectNgramsOnlyByDocUser dId ls nt tms =
344 fmap (first NgramsTerm) <$>
345 runPGSQuery queryNgramsOnlyByDocUser
346 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
347 , Values [QualifiedIdentifier Nothing "int4"]
348 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
349 , dId
350 , ngramsTypeId nt
351 )
352 where
353 fields = [QualifiedIdentifier Nothing "text"]
354
355
356 queryNgramsOnlyByDocUser :: DPS.Query
357 queryNgramsOnlyByDocUser = [sql|
358 WITH input_rows(terms) AS (?),
359 input_list(id) AS (?)
360 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
361 JOIN ngrams ng ON nng.ngrams_id = ng.id
362 JOIN input_rows ir ON ir.terms = ng.terms
363 JOIN input_list il ON il.id = nng.node1_id
364 WHERE nng.node2_id = ? -- DocId
365 AND nng.ngrams_type = ? -- NgramsTypeId
366 GROUP BY ng.terms, nng.node2_id
367 |]
368
369 ------------------------------------------------------------------------
370 -- | TODO filter by language, database, any social field
371 getNodesByNgramsMaster :: HasDBid NodeType
372 => UserCorpusId -> MasterCorpusId -> Cmd err (HashMap Text (Set NodeId))
373 getNodesByNgramsMaster ucId mcId = unionsWith (<>)
374 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
375 -- . takeWhile (not . List.null)
376 -- . takeWhile (\l -> List.length l > 3)
377 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
378
379 selectNgramsByNodeMaster :: HasDBid NodeType
380 => Int
381 -> UserCorpusId
382 -> MasterCorpusId
383 -> Int
384 -> Cmd err [(NodeId, Text)]
385 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
386 queryNgramsByNodeMaster'
387 ( ucId
388 , ngramsTypeId NgramsTerms
389 , toDBid NodeDocument
390 , p
391 , toDBid NodeDocument
392 , p
393 , n
394 , mcId
395 , toDBid NodeDocument
396 , ngramsTypeId NgramsTerms
397 )
398
399 -- | TODO fix node_node_ngrams relation
400 queryNgramsByNodeMaster' :: DPS.Query
401 queryNgramsByNodeMaster' = [sql|
402 WITH nodesByNgramsUser AS (
403
404 SELECT n.id, ng.terms FROM nodes n
405 JOIN nodes_nodes nn ON n.id = nn.node2_id
406 JOIN node_node_ngrams nng ON nng.node2_id = n.id
407 JOIN ngrams ng ON nng.ngrams_id = ng.id
408 WHERE nn.node1_id = ? -- UserCorpusId
409 -- AND n.typename = ? -- toDBid
410 AND nng.ngrams_type = ? -- NgramsTypeId
411 AND nn.category > 0
412 AND node_pos(n.id,?) >= ?
413 AND node_pos(n.id,?) < ?
414 GROUP BY n.id, ng.terms
415
416 ),
417
418 nodesByNgramsMaster AS (
419
420 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
421 JOIN node_node_ngrams nng ON n.id = nng.node2_id
422 JOIN ngrams ng ON ng.id = nng.ngrams_id
423
424 WHERE n.parent_id = ? -- Master Corpus toDBid
425 AND n.typename = ? -- toDBid
426 AND nng.ngrams_type = ? -- NgramsTypeId
427 GROUP BY n.id, ng.terms
428 )
429
430 SELECT m.id, m.terms FROM nodesByNgramsMaster m
431 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id
432 |]