]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByNode.hs
[FEAT] SQL fun to insert postagging
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByNode.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByNode
17 where
18
19 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map (Map)
22 import Data.Set (Set)
23 import Data.Text (Text)
24 import Data.Tuple.Extra (first, second, swap)
25 import Database.PostgreSQL.Simple.SqlQQ (sql)
26 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
27 import Debug.Trace (trace)
28 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
29 import Gargantext.Data.HashMap.Strict.Utils as HM
30 import Gargantext.Database.Admin.Config (nodeTypeId)
31 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Database.PostgreSQL.Simple as DPS
39
40 -- | fst is size of Supra Corpus
41 -- snd is Texts and size of Occurrences (different docs)
42 countNodesByNgramsWith :: (NgramsTerm -> NgramsTerm)
43 -> HashMap NgramsTerm (Set NodeId)
44 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
45 countNodesByNgramsWith f m = (total, m')
46 where
47 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
48 m' = HM.map ( swap . second (fromIntegral . Set.size))
49 $ groupNodesByNgramsWith f m
50
51
52 groupNodesByNgramsWith :: (NgramsTerm -> NgramsTerm)
53 -> HashMap NgramsTerm (Set NodeId)
54 -> HashMap NgramsTerm (Set NgramsTerm, Set NodeId)
55 groupNodesByNgramsWith f m =
56 HM.fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
57 $ HM.toList m
58
59 ------------------------------------------------------------------------
60 getNodesByNgramsUser :: CorpusId
61 -> NgramsType
62 -> Cmd err (HashMap NgramsTerm (Set NodeId))
63 getNodesByNgramsUser cId nt =
64 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
65 <$> selectNgramsByNodeUser cId nt
66 where
67
68 selectNgramsByNodeUser :: CorpusId
69 -> NgramsType
70 -> Cmd err [(NodeId, Text)]
71 selectNgramsByNodeUser cId' nt' =
72 runPGSQuery queryNgramsByNodeUser
73 ( cId'
74 , nodeTypeId NodeDocument
75 , ngramsTypeId nt'
76 -- , 100 :: Int -- limit
77 -- , 0 :: Int -- offset
78 )
79
80 queryNgramsByNodeUser :: DPS.Query
81 queryNgramsByNodeUser = [sql|
82 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
83 JOIN ngrams ng ON nng.ngrams_id = ng.id
84 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
85 JOIN nodes n ON nn.node2_id = n.id
86 WHERE nn.node1_id = ? -- CorpusId
87 AND n.typename = ? -- NodeTypeId
88 AND nng.ngrams_type = ? -- NgramsTypeId
89 AND nn.category > 0
90 GROUP BY nng.node2_id, ng.terms
91 ORDER BY (nng.node2_id, ng.terms) DESC
92 -- LIMIT ?
93 -- OFFSET ?
94 |]
95 ------------------------------------------------------------------------
96 -- TODO add groups
97 getOccByNgramsOnlyFast :: CorpusId
98 -> NgramsType
99 -> [NgramsTerm]
100 -> Cmd err (HashMap NgramsTerm Int)
101 getOccByNgramsOnlyFast cId nt ngs =
102 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
103
104
105 getOccByNgramsOnlyFast' :: CorpusId
106 -> ListId
107 -> NgramsType
108 -> [NgramsTerm]
109 -> Cmd err (HashMap NgramsTerm Int)
110 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
111 HM.fromListWith (+) <$> map (second round) <$> run cId lId nt tms
112
113 where
114 fields = [QualifiedIdentifier Nothing "text"]
115
116 run :: CorpusId
117 -> ListId
118 -> NgramsType
119 -> [NgramsTerm]
120 -> Cmd err [(NgramsTerm, Double)]
121 run cId' lId' nt' tms' = fmap (first NgramsTerm) <$> runPGSQuery query
122 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms')
123 , cId'
124 , lId'
125 , ngramsTypeId nt'
126 )
127
128 query :: DPS.Query
129 query = [sql|
130 WITH input_rows(terms) AS (?)
131 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
132 JOIN ngrams ng ON nng.ngrams_id = ng.id
133 JOIN input_rows ir ON ir.terms = ng.terms
134 WHERE nng.node1_id = ? -- CorpusId
135 AND nng.node2_id = ? -- ListId
136 AND nng.ngrams_type = ? -- NgramsTypeId
137 -- AND nn.category > 0 -- TODO
138 GROUP BY ng.terms, nng.weight
139 |]
140
141
142 -- just slower than getOccByNgramsOnlyFast
143 getOccByNgramsOnlySlow :: NodeType
144 -> CorpusId
145 -> [ListId]
146 -> NgramsType
147 -> [NgramsTerm]
148 -> Cmd err (HashMap NgramsTerm Int)
149 getOccByNgramsOnlySlow t cId ls nt ngs =
150 HM.map Set.size <$> getScore' t cId ls nt ngs
151 where
152 getScore' NodeCorpus = getNodesByNgramsOnlyUser
153 getScore' NodeDocument = getNgramsByDocOnlyUser
154 getScore' _ = getNodesByNgramsOnlyUser
155
156 getOccByNgramsOnlySafe :: CorpusId
157 -> [ListId]
158 -> NgramsType
159 -> [NgramsTerm]
160 -> Cmd err (HashMap NgramsTerm Int)
161 getOccByNgramsOnlySafe cId ls nt ngs = do
162 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
163 fast <- getOccByNgramsOnlyFast cId nt ngs
164 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
165 when (fast /= slow) $
166 printDebug "getOccByNgramsOnlySafe: difference"
167 (HM.difference slow fast, HM.difference fast slow)
168 -- diff slow fast :: PatchMap Text (Replace (Maybe Int))
169 pure slow
170
171
172 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
173 -> NgramsType
174 -> [NgramsTerm]
175 -> Cmd err [(NgramsTerm, Int)]
176 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
177 fmap (first NgramsTerm) <$>
178 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
179 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
180 , cId
181 , nodeTypeId NodeDocument
182 , ngramsTypeId nt
183 )
184 where
185 fields = [QualifiedIdentifier Nothing "text"]
186
187 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
188 -- Question: with the grouping is the result exactly the same (since Set NodeId for
189 -- equivalent ngrams intersections are not empty)
190 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
191 queryNgramsOccurrencesOnlyByNodeUser = [sql|
192 WITH input_rows(terms) AS (?)
193 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
194 JOIN ngrams ng ON nng.ngrams_id = ng.id
195 JOIN input_rows ir ON ir.terms = ng.terms
196 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
197 JOIN nodes n ON nn.node2_id = n.id
198 WHERE nn.node1_id = ? -- CorpusId
199 AND n.typename = ? -- NodeTypeId
200 AND nng.ngrams_type = ? -- NgramsTypeId
201 AND nn.category > 0
202 GROUP BY nng.node2_id, ng.terms
203 |]
204
205 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
206 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
207 WITH input_rows(terms) AS (?)
208 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
209 JOIN ngrams ng ON nng.ngrams_id = ng.id
210 JOIN input_rows ir ON ir.terms = ng.terms
211 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
212 JOIN nodes n ON nn.node2_id = n.id
213 WHERE nn.node1_id = ? -- CorpusId
214 AND n.typename = ? -- NodeTypeId
215 AND nng.ngrams_type = ? -- NgramsTypeId
216 AND nn.category > 0
217 GROUP BY nng.node2_id, ng.terms
218 |]
219
220 ------------------------------------------------------------------------
221 getNodesByNgramsOnlyUser :: CorpusId
222 -> [ListId]
223 -> NgramsType
224 -> [NgramsTerm]
225 -> Cmd err (HashMap NgramsTerm (Set NodeId))
226 getNodesByNgramsOnlyUser cId ls nt ngs =
227 HM.unionsWith (<>)
228 . map (HM.fromListWith (<>)
229 . map (second Set.singleton))
230 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
231 (splitEvery 1000 ngs)
232
233
234 getNgramsByNodeOnlyUser :: NodeId
235 -> [ListId]
236 -> NgramsType
237 -> [NgramsTerm]
238 -> Cmd err (Map NodeId (Set NgramsTerm))
239 getNgramsByNodeOnlyUser cId ls nt ngs =
240 Map.unionsWith (<>)
241 . map ( Map.fromListWith (<>)
242 . map (second Set.singleton)
243 )
244 . map (map swap)
245 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
246 (splitEvery 1000 ngs)
247
248 ------------------------------------------------------------------------
249 selectNgramsOnlyByNodeUser :: CorpusId
250 -> [ListId]
251 -> NgramsType
252 -> [NgramsTerm]
253 -> Cmd err [(NgramsTerm, NodeId)]
254 selectNgramsOnlyByNodeUser cId ls nt tms =
255 fmap (first NgramsTerm) <$>
256 runPGSQuery queryNgramsOnlyByNodeUser
257 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
258 , Values [QualifiedIdentifier Nothing "int4"]
259 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
260 , cId
261 , nodeTypeId NodeDocument
262 , ngramsTypeId nt
263 )
264 where
265 fields = [QualifiedIdentifier Nothing "text"]
266
267 queryNgramsOnlyByNodeUser :: DPS.Query
268 queryNgramsOnlyByNodeUser = [sql|
269 WITH input_rows(terms) AS (?),
270 input_list(id) AS (?)
271 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
272 JOIN ngrams ng ON nng.ngrams_id = ng.id
273 JOIN input_rows ir ON ir.terms = ng.terms
274 JOIN input_list il ON il.id = nng.node1_id
275 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
276 JOIN nodes n ON nn.node2_id = n.id
277 WHERE nn.node1_id = ? -- CorpusId
278 AND n.typename = ? -- NodeTypeId
279 AND nng.ngrams_type = ? -- NgramsTypeId
280 AND nn.category > 0
281 GROUP BY ng.terms, nng.node2_id
282 |]
283
284
285 selectNgramsOnlyByNodeUser' :: CorpusId
286 -> [ListId]
287 -> NgramsType
288 -> [Text]
289 -> Cmd err [(Text, Int)]
290 selectNgramsOnlyByNodeUser' cId ls nt tms =
291 runPGSQuery queryNgramsOnlyByNodeUser
292 ( Values fields (DPS.Only <$> tms)
293 , Values [QualifiedIdentifier Nothing "int4"]
294 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
295 , cId
296 , nodeTypeId NodeDocument
297 , ngramsTypeId nt
298 )
299 where
300 fields = [QualifiedIdentifier Nothing "text"]
301
302 queryNgramsOnlyByNodeUser' :: DPS.Query
303 queryNgramsOnlyByNodeUser' = [sql|
304 WITH input_rows(terms) AS (?),
305 input_list(id) AS (?)
306 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
307 JOIN ngrams ng ON nng.ngrams_id = ng.id
308 JOIN input_rows ir ON ir.terms = ng.terms
309 JOIN input_list il ON il.id = nng.node2_id
310 WHERE nng.node1_id = ? -- CorpusId
311 AND nng.ngrams_type = ? -- NgramsTypeId
312 -- AND nn.category > 0
313 GROUP BY ng.terms, nng.weight
314 |]
315
316
317 getNgramsByDocOnlyUser :: DocId
318 -> [ListId]
319 -> NgramsType
320 -> [NgramsTerm]
321 -> Cmd err (HashMap NgramsTerm (Set NodeId))
322 getNgramsByDocOnlyUser cId ls nt ngs =
323 HM.unionsWith (<>)
324 . map (HM.fromListWith (<>) . map (second Set.singleton))
325 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
326
327
328 selectNgramsOnlyByDocUser :: DocId
329 -> [ListId]
330 -> NgramsType
331 -> [NgramsTerm]
332 -> Cmd err [(NgramsTerm, NodeId)]
333 selectNgramsOnlyByDocUser dId ls nt tms =
334 fmap (first NgramsTerm) <$>
335 runPGSQuery queryNgramsOnlyByDocUser
336 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
337 , Values [QualifiedIdentifier Nothing "int4"]
338 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
339 , dId
340 , ngramsTypeId nt
341 )
342 where
343 fields = [QualifiedIdentifier Nothing "text"]
344
345
346 queryNgramsOnlyByDocUser :: DPS.Query
347 queryNgramsOnlyByDocUser = [sql|
348 WITH input_rows(terms) AS (?),
349 input_list(id) AS (?)
350 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
351 JOIN ngrams ng ON nng.ngrams_id = ng.id
352 JOIN input_rows ir ON ir.terms = ng.terms
353 JOIN input_list il ON il.id = nng.node1_id
354 WHERE nng.node2_id = ? -- DocId
355 AND nng.ngrams_type = ? -- NgramsTypeId
356 GROUP BY ng.terms, nng.node2_id
357 |]
358
359 ------------------------------------------------------------------------
360 -- | TODO filter by language, database, any social field
361 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (HashMap Text (Set NodeId))
362 getNodesByNgramsMaster ucId mcId = unionsWith (<>)
363 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
364 -- . takeWhile (not . List.null)
365 -- . takeWhile (\l -> List.length l > 3)
366 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
367
368 selectNgramsByNodeMaster :: Int
369 -> UserCorpusId
370 -> MasterCorpusId
371 -> Int
372 -> Cmd err [(NodeId, Text)]
373 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
374 queryNgramsByNodeMaster'
375 ( ucId
376 , ngramsTypeId NgramsTerms
377 , nodeTypeId NodeDocument
378 , p
379 , nodeTypeId NodeDocument
380 , p
381 , n
382 , mcId
383 , nodeTypeId NodeDocument
384 , ngramsTypeId NgramsTerms
385 )
386
387 -- | TODO fix node_node_ngrams relation
388 queryNgramsByNodeMaster' :: DPS.Query
389 queryNgramsByNodeMaster' = [sql|
390 WITH nodesByNgramsUser AS (
391
392 SELECT n.id, ng.terms FROM nodes n
393 JOIN nodes_nodes nn ON n.id = nn.node2_id
394 JOIN node_node_ngrams nng ON nng.node2_id = n.id
395 JOIN ngrams ng ON nng.ngrams_id = ng.id
396 WHERE nn.node1_id = ? -- UserCorpusId
397 -- AND n.typename = ? -- NodeTypeId
398 AND nng.ngrams_type = ? -- NgramsTypeId
399 AND nn.category > 0
400 AND node_pos(n.id,?) >= ?
401 AND node_pos(n.id,?) < ?
402 GROUP BY n.id, ng.terms
403
404 ),
405
406 nodesByNgramsMaster AS (
407
408 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
409 JOIN node_node_ngrams nng ON n.id = nng.node2_id
410 JOIN ngrams ng ON ng.id = nng.ngrams_id
411
412 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
413 AND n.typename = ? -- NodeTypeId
414 AND nng.ngrams_type = ? -- NgramsTypeId
415 GROUP BY n.id, ng.terms
416 )
417
418 SELECT m.id, m.terms FROM nodesByNgramsMaster m
419 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id
420 |]