]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByNode.hs
fix
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByNode.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByNode
17 where
18
19
20 import Data.Map.Strict (Map, fromListWith, elems, toList)
21 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
22 import Data.Set (Set)
23 import Data.Text (Text)
24 import Data.Tuple.Extra (second, swap)
25 import Database.PostgreSQL.Simple.SqlQQ (sql)
26 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
27 import Debug.Trace (trace)
28 import Gargantext.Database.Admin.Config (nodeTypeId)
29 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
30 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
31 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
32 import Gargantext.Prelude
33 import qualified Data.Map.Strict as Map
34 import qualified Data.Set as Set
35 import qualified Database.PostgreSQL.Simple as DPS
36
37
38
39 -- | fst is size of Supra Corpus
40 -- snd is Texts and size of Occurrences (different docs)
41 countNodesByNgramsWith :: (Text -> Text)
42 -> Map Text (Set NodeId)
43 -> (Double, Map Text (Double, Set Text))
44 countNodesByNgramsWith f m = (total, m')
45 where
46 total = fromIntegral $ Set.size $ Set.unions $ elems m
47 m' = Map.map ( swap . second (fromIntegral . Set.size))
48 $ groupNodesByNgramsWith f m
49
50
51 groupNodesByNgramsWith :: (Text -> Text)
52 -> Map Text (Set NodeId)
53 -> Map Text (Set Text, Set NodeId)
54 groupNodesByNgramsWith f m =
55 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
56 $ toList m
57
58 ------------------------------------------------------------------------
59 getNodesByNgramsUser :: CorpusId
60 -> NgramsType
61 -> Cmd err (Map Text (Set NodeId))
62 getNodesByNgramsUser cId nt =
63 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
64 <$> selectNgramsByNodeUser cId nt
65 where
66
67 selectNgramsByNodeUser :: CorpusId
68 -> NgramsType
69 -> Cmd err [(NodeId, Text)]
70 selectNgramsByNodeUser cId' nt' =
71 runPGSQuery queryNgramsByNodeUser
72 ( cId'
73 , nodeTypeId NodeDocument
74 , ngramsTypeId nt'
75 -- , 100 :: Int -- limit
76 -- , 0 :: Int -- offset
77 )
78
79 queryNgramsByNodeUser :: DPS.Query
80 queryNgramsByNodeUser = [sql|
81 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
82 JOIN ngrams ng ON nng.ngrams_id = ng.id
83 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
84 JOIN nodes n ON nn.node2_id = n.id
85 WHERE nn.node1_id = ? -- CorpusId
86 AND n.typename = ? -- NodeTypeId
87 AND nng.ngrams_type = ? -- NgramsTypeId
88 AND nn.category > 0
89 GROUP BY nng.node2_id, ng.terms
90 ORDER BY (nng.node2_id, ng.terms) DESC
91 -- LIMIT ?
92 -- OFFSET ?
93 |]
94 ------------------------------------------------------------------------
95 -- TODO add groups
96 getOccByNgramsOnlyFast :: CorpusId
97 -> NgramsType
98 -> [Text]
99 -> Cmd err (Map Text Int)
100 getOccByNgramsOnlyFast cId nt ngs =
101 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
102
103
104 getOccByNgramsOnlyFast' :: CorpusId
105 -> ListId
106 -> NgramsType
107 -> [Text]
108 -> Cmd err (Map Text Int)
109 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
110 fromListWith (+) <$> map (second round) <$> run cId lId nt tms
111
112 where
113 fields = [QualifiedIdentifier Nothing "text"]
114
115 run :: CorpusId
116 -> ListId
117 -> NgramsType
118 -> [Text]
119 -> Cmd err [(Text, Double)]
120 run cId' lId' nt' tms' = runPGSQuery query
121 ( Values fields (DPS.Only <$> tms')
122 , cId'
123 , lId'
124 , ngramsTypeId nt'
125 )
126
127 query :: DPS.Query
128 query = [sql|
129 WITH input_rows(terms) AS (?)
130 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
131 JOIN ngrams ng ON nng.ngrams_id = ng.id
132 JOIN input_rows ir ON ir.terms = ng.terms
133 WHERE nng.node1_id = ? -- CorpusId
134 AND nng.node2_id = ? -- ListId
135 AND nng.ngrams_type = ? -- NgramsTypeId
136 -- AND nn.category > 0 -- TODO
137 GROUP BY ng.terms, nng.weight
138 |]
139
140
141 -- just slower than getOccByNgramsOnlyFast
142 getOccByNgramsOnlySlow :: NodeType
143 -> CorpusId
144 -> [ListId]
145 -> NgramsType
146 -> [Text]
147 -> Cmd err (Map Text Int)
148 getOccByNgramsOnlySlow t cId ls nt ngs =
149 Map.map Set.size <$> getScore' t cId ls nt ngs
150 where
151 getScore' NodeCorpus = getNodesByNgramsOnlyUser
152 getScore' NodeDocument = getNgramsByDocOnlyUser
153 getScore' _ = getNodesByNgramsOnlyUser
154
155 getOccByNgramsOnlySafe :: CorpusId
156 -> [ListId]
157 -> NgramsType
158 -> [Text]
159 -> Cmd err (Map Text Int)
160 getOccByNgramsOnlySafe cId ls nt ngs = do
161 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
162 fast <- getOccByNgramsOnlyFast cId nt ngs
163 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
164 when (fast /= slow) $
165 printDebug "getOccByNgramsOnlySafe: difference"
166 (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
167 pure slow
168
169
170 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
171 -> NgramsType
172 -> [Text]
173 -> Cmd err [(Text, Int)]
174 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
175 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
176 ( Values fields (DPS.Only <$> tms)
177 , cId
178 , nodeTypeId NodeDocument
179 , ngramsTypeId nt
180 )
181 where
182 fields = [QualifiedIdentifier Nothing "text"]
183
184 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
185 -- Question: with the grouping is the result exactly the same (since Set NodeId for
186 -- equivalent ngrams intersections are not empty)
187 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
188 queryNgramsOccurrencesOnlyByNodeUser = [sql|
189 WITH input_rows(terms) AS (?)
190 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
191 JOIN ngrams ng ON nng.ngrams_id = ng.id
192 JOIN input_rows ir ON ir.terms = ng.terms
193 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
194 JOIN nodes n ON nn.node2_id = n.id
195 WHERE nn.node1_id = ? -- CorpusId
196 AND n.typename = ? -- NodeTypeId
197 AND nng.ngrams_type = ? -- NgramsTypeId
198 AND nn.category > 0
199 GROUP BY nng.node2_id, ng.terms
200 |]
201
202 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
203 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
204 WITH input_rows(terms) AS (?)
205 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
206 JOIN ngrams ng ON nng.ngrams_id = ng.id
207 JOIN input_rows ir ON ir.terms = ng.terms
208 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
209 JOIN nodes n ON nn.node2_id = n.id
210 WHERE nn.node1_id = ? -- CorpusId
211 AND n.typename = ? -- NodeTypeId
212 AND nng.ngrams_type = ? -- NgramsTypeId
213 AND nn.category > 0
214 GROUP BY nng.node2_id, ng.terms
215 |]
216
217 ------------------------------------------------------------------------
218 getNodesByNgramsOnlyUser :: CorpusId
219 -> [ListId]
220 -> NgramsType
221 -> [Text]
222 -> Cmd err (Map Text (Set NodeId))
223 getNodesByNgramsOnlyUser cId ls nt ngs =
224 Map.unionsWith (<>)
225 . map (fromListWith (<>)
226 . map (second Set.singleton))
227 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
228 (splitEvery 1000 ngs)
229
230
231 getNgramsByNodeOnlyUser :: NodeId
232 -> [ListId]
233 -> NgramsType
234 -> [Text]
235 -> Cmd err (Map NodeId (Set Text))
236 getNgramsByNodeOnlyUser cId ls nt ngs =
237 Map.unionsWith (<>)
238 . map (fromListWith (<>)
239 . map (second Set.singleton))
240 . map (map swap)
241 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
242 (splitEvery 1000 ngs)
243
244 ------------------------------------------------------------------------
245 selectNgramsOnlyByNodeUser :: CorpusId
246 -> [ListId]
247 -> NgramsType
248 -> [Text]
249 -> Cmd err [(Text, NodeId)]
250 selectNgramsOnlyByNodeUser cId ls nt tms =
251 runPGSQuery queryNgramsOnlyByNodeUser
252 ( Values fields (DPS.Only <$> tms)
253 , Values [QualifiedIdentifier Nothing "int4"]
254 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
255 , cId
256 , nodeTypeId NodeDocument
257 , ngramsTypeId nt
258 )
259 where
260 fields = [QualifiedIdentifier Nothing "text"]
261
262 queryNgramsOnlyByNodeUser :: DPS.Query
263 queryNgramsOnlyByNodeUser = [sql|
264 WITH input_rows(terms) AS (?),
265 input_list(id) AS (?)
266 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
267 JOIN ngrams ng ON nng.ngrams_id = ng.id
268 JOIN input_rows ir ON ir.terms = ng.terms
269 JOIN input_list il ON il.id = nng.node1_id
270 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
271 JOIN nodes n ON nn.node2_id = n.id
272 WHERE nn.node1_id = ? -- CorpusId
273 AND n.typename = ? -- NodeTypeId
274 AND nng.ngrams_type = ? -- NgramsTypeId
275 AND nn.category > 0
276 GROUP BY ng.terms, nng.node2_id
277 |]
278
279
280 selectNgramsOnlyByNodeUser' :: CorpusId
281 -> [ListId]
282 -> NgramsType
283 -> [Text]
284 -> Cmd err [(Text, Int)]
285 selectNgramsOnlyByNodeUser' cId ls nt tms =
286 runPGSQuery queryNgramsOnlyByNodeUser
287 ( Values fields (DPS.Only <$> tms)
288 , Values [QualifiedIdentifier Nothing "int4"]
289 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
290 , cId
291 , nodeTypeId NodeDocument
292 , ngramsTypeId nt
293 )
294 where
295 fields = [QualifiedIdentifier Nothing "text"]
296
297 queryNgramsOnlyByNodeUser' :: DPS.Query
298 queryNgramsOnlyByNodeUser' = [sql|
299 WITH input_rows(terms) AS (?),
300 input_list(id) AS (?)
301 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
302 JOIN ngrams ng ON nng.ngrams_id = ng.id
303 JOIN input_rows ir ON ir.terms = ng.terms
304 JOIN input_list il ON il.id = nng.node2_id
305 WHERE nng.node1_id = ? -- CorpusId
306 AND nng.ngrams_type = ? -- NgramsTypeId
307 -- AND nn.category > 0
308 GROUP BY ng.terms, nng.weight
309 |]
310
311
312 getNgramsByDocOnlyUser :: DocId
313 -> [ListId]
314 -> NgramsType
315 -> [Text]
316 -> Cmd err (Map Text (Set NodeId))
317 getNgramsByDocOnlyUser cId ls nt ngs =
318 Map.unionsWith (<>)
319 . map (fromListWith (<>) . map (second Set.singleton))
320 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
321
322
323 selectNgramsOnlyByDocUser :: DocId
324 -> [ListId]
325 -> NgramsType
326 -> [Text]
327 -> Cmd err [(Text, NodeId)]
328 selectNgramsOnlyByDocUser dId ls nt tms =
329 runPGSQuery queryNgramsOnlyByDocUser
330 ( Values fields (DPS.Only <$> tms)
331 , Values [QualifiedIdentifier Nothing "int4"]
332 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
333 , dId
334 , ngramsTypeId nt
335 )
336 where
337 fields = [QualifiedIdentifier Nothing "text"]
338
339
340 queryNgramsOnlyByDocUser :: DPS.Query
341 queryNgramsOnlyByDocUser = [sql|
342 WITH input_rows(terms) AS (?),
343 input_list(id) AS (?)
344 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
345 JOIN ngrams ng ON nng.ngrams_id = ng.id
346 JOIN input_rows ir ON ir.terms = ng.terms
347 JOIN input_list il ON il.id = nng.node1_id
348 WHERE nng.node2_id = ? -- DocId
349 AND nng.ngrams_type = ? -- NgramsTypeId
350 GROUP BY ng.terms, nng.node2_id
351 |]
352
353 ------------------------------------------------------------------------
354 -- | TODO filter by language, database, any social field
355 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
356 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
357 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
358 -- . takeWhile (not . List.null)
359 -- . takeWhile (\l -> List.length l > 3)
360 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
361
362 selectNgramsByNodeMaster :: Int
363 -> UserCorpusId
364 -> MasterCorpusId
365 -> Int
366 -> Cmd err [(NodeId, Text)]
367 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
368 queryNgramsByNodeMaster'
369 ( ucId
370 , ngramsTypeId NgramsTerms
371 , nodeTypeId NodeDocument
372 , p
373 , nodeTypeId NodeDocument
374 , p
375 , n
376 , mcId
377 , nodeTypeId NodeDocument
378 , ngramsTypeId NgramsTerms
379 )
380
381 -- | TODO fix node_node_ngrams relation
382 queryNgramsByNodeMaster' :: DPS.Query
383 queryNgramsByNodeMaster' = [sql|
384 WITH nodesByNgramsUser AS (
385
386 SELECT n.id, ng.terms FROM nodes n
387 JOIN nodes_nodes nn ON n.id = nn.node2_id
388 JOIN node_node_ngrams nng ON nng.node2_id = n.id
389 JOIN ngrams ng ON nng.ngrams_id = ng.id
390 WHERE nn.node1_id = ? -- UserCorpusId
391 -- AND n.typename = ? -- NodeTypeId
392 AND nng.ngrams_type = ? -- NgramsTypeId
393 AND nn.category > 0
394 AND node_pos(n.id,?) >= ?
395 AND node_pos(n.id,?) < ?
396 GROUP BY n.id, ng.terms
397
398 ),
399
400 nodesByNgramsMaster AS (
401
402 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
403 JOIN node_node_ngrams nng ON n.id = nng.node2_id
404 JOIN ngrams ng ON ng.id = nng.ngrams_id
405
406 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
407 AND n.typename = ? -- NodeTypeId
408 AND nng.ngrams_type = ? -- NgramsTypeId
409 GROUP BY n.id, ng.terms
410 )
411
412 SELECT m.id, m.terms FROM nodesByNgramsMaster m
413 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id
414 |]