]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByNode.hs
[REFACT] Group fun and types
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByNode.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByNode
17 where
18
19 import Data.Map.Strict (Map, fromListWith, elems, toList)
20 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.Set (Set)
22 import Data.Text (Text)
23 import Data.Tuple.Extra (second, swap)
24 import Database.PostgreSQL.Simple.SqlQQ (sql)
25 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
26 import Debug.Trace (trace)
27 import qualified Data.List as List
28 import qualified Data.Map.Strict as Map
29 import qualified Data.Set as Set
30 import qualified Data.Text as Text
31 import qualified Database.PostgreSQL.Simple as DPS
32
33 import Gargantext.Core (Lang(..))
34 import Gargantext.Database.Admin.Config (nodeTypeId)
35 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
36 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
37 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
38 import Gargantext.Prelude
39
40
41
42 -- | fst is size of Supra Corpus
43 -- snd is Texts and size of Occurrences (different docs)
44 countNodesByNgramsWith :: (Text -> Text)
45 -> Map Text (Set NodeId)
46 -> (Double, Map Text (Double, Set Text))
47 countNodesByNgramsWith f m = (total, m')
48 where
49 total = fromIntegral $ Set.size $ Set.unions $ elems m
50 m' = Map.map ( swap . second (fromIntegral . Set.size))
51 $ groupNodesByNgramsWith f m
52
53
54 groupNodesByNgramsWith :: (Text -> Text)
55 -> Map Text (Set NodeId)
56 -> Map Text (Set Text, Set NodeId)
57 groupNodesByNgramsWith f m =
58 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
59 $ toList m
60
61 ------------------------------------------------------------------------
62 getNodesByNgramsUser :: CorpusId
63 -> NgramsType
64 -> Cmd err (Map Text (Set NodeId))
65 getNodesByNgramsUser cId nt =
66 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
67 <$> selectNgramsByNodeUser cId nt
68 where
69
70 selectNgramsByNodeUser :: CorpusId
71 -> NgramsType
72 -> Cmd err [(NodeId, Text)]
73 selectNgramsByNodeUser cId' nt' =
74 runPGSQuery queryNgramsByNodeUser
75 ( cId'
76 , nodeTypeId NodeDocument
77 , ngramsTypeId nt'
78 -- , 100 :: Int -- limit
79 -- , 0 :: Int -- offset
80 )
81
82 queryNgramsByNodeUser :: DPS.Query
83 queryNgramsByNodeUser = [sql|
84 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
85 JOIN ngrams ng ON nng.ngrams_id = ng.id
86 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
87 JOIN nodes n ON nn.node2_id = n.id
88 WHERE nn.node1_id = ? -- CorpusId
89 AND n.typename = ? -- NodeTypeId
90 AND nng.ngrams_type = ? -- NgramsTypeId
91 AND nn.category > 0
92 GROUP BY nng.node2_id, ng.terms
93 ORDER BY (nng.node2_id, ng.terms) DESC
94 -- LIMIT ?
95 -- OFFSET ?
96 |]
97 ------------------------------------------------------------------------
98 -- TODO add groups
99 getOccByNgramsOnlyFast :: CorpusId
100 -> NgramsType
101 -> [Text]
102 -> Cmd err (Map Text Int)
103 getOccByNgramsOnlyFast cId nt ngs =
104 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
105
106
107 getOccByNgramsOnlyFast' :: CorpusId
108 -> ListId
109 -> NgramsType
110 -> [Text]
111 -> Cmd err (Map Text Int)
112 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
113 fromListWith (+) <$> map (second round) <$> run cId lId nt tms
114
115 where
116 fields = [QualifiedIdentifier Nothing "text"]
117
118 run :: CorpusId
119 -> ListId
120 -> NgramsType
121 -> [Text]
122 -> Cmd err [(Text, Double)]
123 run cId' lId' nt' tms' = runPGSQuery query
124 ( Values fields (DPS.Only <$> tms')
125 , cId'
126 , lId'
127 , ngramsTypeId nt'
128 )
129
130 query :: DPS.Query
131 query = [sql|
132 WITH input_rows(terms) AS (?)
133 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
134 JOIN ngrams ng ON nng.ngrams_id = ng.id
135 JOIN input_rows ir ON ir.terms = ng.terms
136 WHERE nng.node1_id = ? -- CorpusId
137 AND nng.node2_id = ? -- ListId
138 AND nng.ngrams_type = ? -- NgramsTypeId
139 -- AND nn.category > 0 -- TODO
140 GROUP BY ng.terms, nng.weight
141 |]
142
143
144 -- just slower than getOccByNgramsOnlyFast
145 getOccByNgramsOnlySlow :: NodeType
146 -> CorpusId
147 -> [ListId]
148 -> NgramsType
149 -> [Text]
150 -> Cmd err (Map Text Int)
151 getOccByNgramsOnlySlow t cId ls nt ngs =
152 Map.map Set.size <$> getScore' t cId ls nt ngs
153 where
154 getScore' NodeCorpus = getNodesByNgramsOnlyUser
155 getScore' NodeDocument = getNgramsByDocOnlyUser
156 getScore' _ = getNodesByNgramsOnlyUser
157
158 getOccByNgramsOnlySafe :: CorpusId
159 -> [ListId]
160 -> NgramsType
161 -> [Text]
162 -> Cmd err (Map Text Int)
163 getOccByNgramsOnlySafe cId ls nt ngs = do
164 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
165 fast <- getOccByNgramsOnlyFast cId nt ngs
166 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
167 when (fast /= slow) $
168 printDebug "getOccByNgramsOnlySafe: difference"
169 (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
170 pure slow
171
172
173 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
174 -> NgramsType
175 -> [Text]
176 -> Cmd err [(Text, Int)]
177 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
178 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
179 ( Values fields (DPS.Only <$> tms)
180 , cId
181 , nodeTypeId NodeDocument
182 , ngramsTypeId nt
183 )
184 where
185 fields = [QualifiedIdentifier Nothing "text"]
186
187 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
188 -- Question: with the grouping is the result exactly the same (since Set NodeId for
189 -- equivalent ngrams intersections are not empty)
190 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
191 queryNgramsOccurrencesOnlyByNodeUser = [sql|
192 WITH input_rows(terms) AS (?)
193 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
194 JOIN ngrams ng ON nng.ngrams_id = ng.id
195 JOIN input_rows ir ON ir.terms = ng.terms
196 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
197 JOIN nodes n ON nn.node2_id = n.id
198 WHERE nn.node1_id = ? -- CorpusId
199 AND n.typename = ? -- NodeTypeId
200 AND nng.ngrams_type = ? -- NgramsTypeId
201 AND nn.category > 0
202 GROUP BY nng.node2_id, ng.terms
203 |]
204
205 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
206 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
207 WITH input_rows(terms) AS (?)
208 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
209 JOIN ngrams ng ON nng.ngrams_id = ng.id
210 JOIN input_rows ir ON ir.terms = ng.terms
211 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
212 JOIN nodes n ON nn.node2_id = n.id
213 WHERE nn.node1_id = ? -- CorpusId
214 AND n.typename = ? -- NodeTypeId
215 AND nng.ngrams_type = ? -- NgramsTypeId
216 AND nn.category > 0
217 GROUP BY nng.node2_id, ng.terms
218 |]
219
220 ------------------------------------------------------------------------
221 getNodesByNgramsOnlyUser :: CorpusId
222 -> [ListId]
223 -> NgramsType
224 -> [Text]
225 -> Cmd err (Map Text (Set NodeId))
226 getNodesByNgramsOnlyUser cId ls nt ngs =
227 Map.unionsWith (<>)
228 . map (fromListWith (<>)
229 . map (second Set.singleton))
230 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
231 (splitEvery 1000 ngs)
232
233
234 getNgramsByNodeOnlyUser :: NodeId
235 -> [ListId]
236 -> NgramsType
237 -> [Text]
238 -> Cmd err (Map NodeId (Set Text))
239 getNgramsByNodeOnlyUser cId ls nt ngs =
240 Map.unionsWith (<>)
241 . map (fromListWith (<>)
242 . map (second Set.singleton))
243 . map (map swap)
244 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
245 (splitEvery 1000 ngs)
246
247 ------------------------------------------------------------------------
248 selectNgramsOnlyByNodeUser :: CorpusId
249 -> [ListId]
250 -> NgramsType
251 -> [Text]
252 -> Cmd err [(Text, NodeId)]
253 selectNgramsOnlyByNodeUser cId ls nt tms =
254 runPGSQuery queryNgramsOnlyByNodeUser
255 ( Values fields (DPS.Only <$> tms)
256 , Values [QualifiedIdentifier Nothing "int4"]
257 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
258 , cId
259 , nodeTypeId NodeDocument
260 , ngramsTypeId nt
261 )
262 where
263 fields = [QualifiedIdentifier Nothing "text"]
264
265 queryNgramsOnlyByNodeUser :: DPS.Query
266 queryNgramsOnlyByNodeUser = [sql|
267 WITH input_rows(terms) AS (?),
268 input_list(id) AS (?)
269 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
270 JOIN ngrams ng ON nng.ngrams_id = ng.id
271 JOIN input_rows ir ON ir.terms = ng.terms
272 JOIN input_list il ON il.id = nng.node1_id
273 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
274 JOIN nodes n ON nn.node2_id = n.id
275 WHERE nn.node1_id = ? -- CorpusId
276 AND n.typename = ? -- NodeTypeId
277 AND nng.ngrams_type = ? -- NgramsTypeId
278 AND nn.category > 0
279 GROUP BY ng.terms, nng.node2_id
280 |]
281
282
283 selectNgramsOnlyByNodeUser' :: CorpusId
284 -> [ListId]
285 -> NgramsType
286 -> [Text]
287 -> Cmd err [(Text, Int)]
288 selectNgramsOnlyByNodeUser' cId ls nt tms =
289 runPGSQuery queryNgramsOnlyByNodeUser
290 ( Values fields (DPS.Only <$> tms)
291 , Values [QualifiedIdentifier Nothing "int4"]
292 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
293 , cId
294 , nodeTypeId NodeDocument
295 , ngramsTypeId nt
296 )
297 where
298 fields = [QualifiedIdentifier Nothing "text"]
299
300 queryNgramsOnlyByNodeUser' :: DPS.Query
301 queryNgramsOnlyByNodeUser' = [sql|
302 WITH input_rows(terms) AS (?),
303 input_list(id) AS (?)
304 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
305 JOIN ngrams ng ON nng.ngrams_id = ng.id
306 JOIN input_rows ir ON ir.terms = ng.terms
307 JOIN input_list il ON il.id = nng.node2_id
308 WHERE nng.node1_id = ? -- CorpusId
309 AND nng.ngrams_type = ? -- NgramsTypeId
310 -- AND nn.category > 0
311 GROUP BY ng.terms, nng.weight
312 |]
313
314
315 getNgramsByDocOnlyUser :: DocId
316 -> [ListId]
317 -> NgramsType
318 -> [Text]
319 -> Cmd err (Map Text (Set NodeId))
320 getNgramsByDocOnlyUser cId ls nt ngs =
321 Map.unionsWith (<>)
322 . map (fromListWith (<>) . map (second Set.singleton))
323 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
324
325
326 selectNgramsOnlyByDocUser :: DocId
327 -> [ListId]
328 -> NgramsType
329 -> [Text]
330 -> Cmd err [(Text, NodeId)]
331 selectNgramsOnlyByDocUser dId ls nt tms =
332 runPGSQuery queryNgramsOnlyByDocUser
333 ( Values fields (DPS.Only <$> tms)
334 , Values [QualifiedIdentifier Nothing "int4"]
335 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
336 , dId
337 , ngramsTypeId nt
338 )
339 where
340 fields = [QualifiedIdentifier Nothing "text"]
341
342
343 queryNgramsOnlyByDocUser :: DPS.Query
344 queryNgramsOnlyByDocUser = [sql|
345 WITH input_rows(terms) AS (?),
346 input_list(id) AS (?)
347 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
348 JOIN ngrams ng ON nng.ngrams_id = ng.id
349 JOIN input_rows ir ON ir.terms = ng.terms
350 JOIN input_list il ON il.id = nng.node1_id
351 WHERE nng.node2_id = ? -- DocId
352 AND nng.ngrams_type = ? -- NgramsTypeId
353 GROUP BY ng.terms, nng.node2_id
354 |]
355
356 ------------------------------------------------------------------------
357 -- | TODO filter by language, database, any social field
358 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
359 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
360 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
361 -- . takeWhile (not . List.null)
362 -- . takeWhile (\l -> List.length l > 3)
363 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
364
365 selectNgramsByNodeMaster :: Int
366 -> UserCorpusId
367 -> MasterCorpusId
368 -> Int
369 -> Cmd err [(NodeId, Text)]
370 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
371 queryNgramsByNodeMaster'
372 ( ucId
373 , ngramsTypeId NgramsTerms
374 , nodeTypeId NodeDocument
375 , p
376 , nodeTypeId NodeDocument
377 , p
378 , n
379 , mcId
380 , nodeTypeId NodeDocument
381 , ngramsTypeId NgramsTerms
382 )
383
384 -- | TODO fix node_node_ngrams relation
385 queryNgramsByNodeMaster' :: DPS.Query
386 queryNgramsByNodeMaster' = [sql|
387 WITH nodesByNgramsUser AS (
388
389 SELECT n.id, ng.terms FROM nodes n
390 JOIN nodes_nodes nn ON n.id = nn.node2_id
391 JOIN node_node_ngrams nng ON nng.node2_id = n.id
392 JOIN ngrams ng ON nng.ngrams_id = ng.id
393 WHERE nn.node1_id = ? -- UserCorpusId
394 -- AND n.typename = ? -- NodeTypeId
395 AND nng.ngrams_type = ? -- NgramsTypeId
396 AND nn.category > 0
397 AND node_pos(n.id,?) >= ?
398 AND node_pos(n.id,?) < ?
399 GROUP BY n.id, ng.terms
400
401 ),
402
403 nodesByNgramsMaster AS (
404
405 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
406 JOIN node_node_ngrams nng ON n.id = nng.node2_id
407 JOIN ngrams ng ON ng.id = nng.ngrams_id
408
409 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
410 AND n.typename = ? -- NodeTypeId
411 AND nng.ngrams_type = ? -- NgramsTypeId
412 GROUP BY n.id, ng.terms
413 )
414
415 SELECT m.id, m.terms FROM nodesByNgramsMaster m
416 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id
417 |]