2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByNode
20 import Data.Map.Strict (Map, fromListWith, elems, toList)
21 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
23 import Data.Text (Text)
24 import Data.Tuple.Extra (second, swap)
25 import Database.PostgreSQL.Simple.SqlQQ (sql)
26 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
27 import Debug.Trace (trace)
28 import Gargantext.Database.Admin.Config (nodeTypeId)
29 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
30 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
31 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
32 import Gargantext.Prelude
33 import qualified Data.Map.Strict as Map
34 import qualified Data.Set as Set
35 import qualified Database.PostgreSQL.Simple as DPS
39 -- | fst is size of Supra Corpus
40 -- snd is Texts and size of Occurrences (different docs)
41 countNodesByNgramsWith :: (Text -> Text)
42 -> Map Text (Set NodeId)
43 -> (Double, Map Text (Double, Set Text))
44 countNodesByNgramsWith f m = (total, m')
46 total = fromIntegral $ Set.size $ Set.unions $ elems m
47 m' = Map.map ( swap . second (fromIntegral . Set.size))
48 $ groupNodesByNgramsWith f m
51 groupNodesByNgramsWith :: (Text -> Text)
52 -> Map Text (Set NodeId)
53 -> Map Text (Set Text, Set NodeId)
54 groupNodesByNgramsWith f m =
55 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
58 ------------------------------------------------------------------------
59 getNodesByNgramsUser :: CorpusId
61 -> Cmd err (Map Text (Set NodeId))
62 getNodesByNgramsUser cId nt =
63 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
64 <$> selectNgramsByNodeUser cId nt
67 selectNgramsByNodeUser :: CorpusId
69 -> Cmd err [(NodeId, Text)]
70 selectNgramsByNodeUser cId' nt' =
71 runPGSQuery queryNgramsByNodeUser
73 , nodeTypeId NodeDocument
75 -- , 100 :: Int -- limit
76 -- , 0 :: Int -- offset
79 queryNgramsByNodeUser :: DPS.Query
80 queryNgramsByNodeUser = [sql|
81 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
82 JOIN ngrams ng ON nng.ngrams_id = ng.id
83 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
84 JOIN nodes n ON nn.node2_id = n.id
85 WHERE nn.node1_id = ? -- CorpusId
86 AND n.typename = ? -- NodeTypeId
87 AND nng.ngrams_type = ? -- NgramsTypeId
89 GROUP BY nng.node2_id, ng.terms
90 ORDER BY (nng.node2_id, ng.terms) DESC
94 ------------------------------------------------------------------------
96 getOccByNgramsOnlyFast :: CorpusId
99 -> Cmd err (Map Text Int)
100 getOccByNgramsOnlyFast cId nt ngs =
101 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
104 getOccByNgramsOnlyFast' :: CorpusId
108 -> Cmd err (Map Text Int)
109 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
110 fromListWith (+) <$> map (second round) <$> run cId lId nt tms
113 fields = [QualifiedIdentifier Nothing "text"]
119 -> Cmd err [(Text, Double)]
120 run cId' lId' nt' tms' = runPGSQuery query
121 ( Values fields (DPS.Only <$> tms')
129 WITH input_rows(terms) AS (?)
130 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
131 JOIN ngrams ng ON nng.ngrams_id = ng.id
132 JOIN input_rows ir ON ir.terms = ng.terms
133 WHERE nng.node1_id = ? -- CorpusId
134 AND nng.node2_id = ? -- ListId
135 AND nng.ngrams_type = ? -- NgramsTypeId
136 -- AND nn.category > 0 -- TODO
137 GROUP BY ng.terms, nng.weight
141 -- just slower than getOccByNgramsOnlyFast
142 getOccByNgramsOnlySlow :: NodeType
147 -> Cmd err (Map Text Int)
148 getOccByNgramsOnlySlow t cId ls nt ngs =
149 Map.map Set.size <$> getScore' t cId ls nt ngs
151 getScore' NodeCorpus = getNodesByNgramsOnlyUser
152 getScore' NodeDocument = getNgramsByDocOnlyUser
153 getScore' _ = getNodesByNgramsOnlyUser
155 getOccByNgramsOnlySafe :: CorpusId
159 -> Cmd err (Map Text Int)
160 getOccByNgramsOnlySafe cId ls nt ngs = do
161 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
162 fast <- getOccByNgramsOnlyFast cId nt ngs
163 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
164 when (fast /= slow) $
165 printDebug "getOccByNgramsOnlySafe: difference"
166 (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
170 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
173 -> Cmd err [(Text, Int)]
174 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
175 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
176 ( Values fields (DPS.Only <$> tms)
178 , nodeTypeId NodeDocument
182 fields = [QualifiedIdentifier Nothing "text"]
184 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
185 -- Question: with the grouping is the result exactly the same (since Set NodeId for
186 -- equivalent ngrams intersections are not empty)
187 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
188 queryNgramsOccurrencesOnlyByNodeUser = [sql|
189 WITH input_rows(terms) AS (?)
190 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
191 JOIN ngrams ng ON nng.ngrams_id = ng.id
192 JOIN input_rows ir ON ir.terms = ng.terms
193 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
194 JOIN nodes n ON nn.node2_id = n.id
195 WHERE nn.node1_id = ? -- CorpusId
196 AND n.typename = ? -- NodeTypeId
197 AND nng.ngrams_type = ? -- NgramsTypeId
199 GROUP BY nng.node2_id, ng.terms
202 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
203 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
204 WITH input_rows(terms) AS (?)
205 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
206 JOIN ngrams ng ON nng.ngrams_id = ng.id
207 JOIN input_rows ir ON ir.terms = ng.terms
208 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
209 JOIN nodes n ON nn.node2_id = n.id
210 WHERE nn.node1_id = ? -- CorpusId
211 AND n.typename = ? -- NodeTypeId
212 AND nng.ngrams_type = ? -- NgramsTypeId
214 GROUP BY nng.node2_id, ng.terms
217 ------------------------------------------------------------------------
218 getNodesByNgramsOnlyUser :: CorpusId
222 -> Cmd err (Map Text (Set NodeId))
223 getNodesByNgramsOnlyUser cId ls nt ngs =
225 . map (fromListWith (<>)
226 . map (second Set.singleton))
227 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
228 (splitEvery 1000 ngs)
231 getNgramsByNodeOnlyUser :: NodeId
235 -> Cmd err (Map NodeId (Set Text))
236 getNgramsByNodeOnlyUser cId ls nt ngs =
238 . map (fromListWith (<>)
239 . map (second Set.singleton))
241 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
242 (splitEvery 1000 ngs)
244 ------------------------------------------------------------------------
245 selectNgramsOnlyByNodeUser :: CorpusId
249 -> Cmd err [(Text, NodeId)]
250 selectNgramsOnlyByNodeUser cId ls nt tms =
251 runPGSQuery queryNgramsOnlyByNodeUser
252 ( Values fields (DPS.Only <$> tms)
253 , Values [QualifiedIdentifier Nothing "int4"]
254 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
256 , nodeTypeId NodeDocument
260 fields = [QualifiedIdentifier Nothing "text"]
262 queryNgramsOnlyByNodeUser :: DPS.Query
263 queryNgramsOnlyByNodeUser = [sql|
264 WITH input_rows(terms) AS (?),
265 input_list(id) AS (?)
266 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
267 JOIN ngrams ng ON nng.ngrams_id = ng.id
268 JOIN input_rows ir ON ir.terms = ng.terms
269 JOIN input_list il ON il.id = nng.node1_id
270 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
271 JOIN nodes n ON nn.node2_id = n.id
272 WHERE nn.node1_id = ? -- CorpusId
273 AND n.typename = ? -- NodeTypeId
274 AND nng.ngrams_type = ? -- NgramsTypeId
276 GROUP BY ng.terms, nng.node2_id
280 selectNgramsOnlyByNodeUser' :: CorpusId
284 -> Cmd err [(Text, Int)]
285 selectNgramsOnlyByNodeUser' cId ls nt tms =
286 runPGSQuery queryNgramsOnlyByNodeUser
287 ( Values fields (DPS.Only <$> tms)
288 , Values [QualifiedIdentifier Nothing "int4"]
289 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
291 , nodeTypeId NodeDocument
295 fields = [QualifiedIdentifier Nothing "text"]
297 queryNgramsOnlyByNodeUser' :: DPS.Query
298 queryNgramsOnlyByNodeUser' = [sql|
299 WITH input_rows(terms) AS (?),
300 input_list(id) AS (?)
301 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
302 JOIN ngrams ng ON nng.ngrams_id = ng.id
303 JOIN input_rows ir ON ir.terms = ng.terms
304 JOIN input_list il ON il.id = nng.node2_id
305 WHERE nng.node1_id = ? -- CorpusId
306 AND nng.ngrams_type = ? -- NgramsTypeId
307 -- AND nn.category > 0
308 GROUP BY ng.terms, nng.weight
312 getNgramsByDocOnlyUser :: DocId
316 -> Cmd err (Map Text (Set NodeId))
317 getNgramsByDocOnlyUser cId ls nt ngs =
319 . map (fromListWith (<>) . map (second Set.singleton))
320 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
323 selectNgramsOnlyByDocUser :: DocId
327 -> Cmd err [(Text, NodeId)]
328 selectNgramsOnlyByDocUser dId ls nt tms =
329 runPGSQuery queryNgramsOnlyByDocUser
330 ( Values fields (DPS.Only <$> tms)
331 , Values [QualifiedIdentifier Nothing "int4"]
332 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
337 fields = [QualifiedIdentifier Nothing "text"]
340 queryNgramsOnlyByDocUser :: DPS.Query
341 queryNgramsOnlyByDocUser = [sql|
342 WITH input_rows(terms) AS (?),
343 input_list(id) AS (?)
344 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
345 JOIN ngrams ng ON nng.ngrams_id = ng.id
346 JOIN input_rows ir ON ir.terms = ng.terms
347 JOIN input_list il ON il.id = nng.node1_id
348 WHERE nng.node2_id = ? -- DocId
349 AND nng.ngrams_type = ? -- NgramsTypeId
350 GROUP BY ng.terms, nng.node2_id
353 ------------------------------------------------------------------------
354 -- | TODO filter by language, database, any social field
355 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
356 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
357 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
358 -- . takeWhile (not . List.null)
359 -- . takeWhile (\l -> List.length l > 3)
360 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
362 selectNgramsByNodeMaster :: Int
366 -> Cmd err [(NodeId, Text)]
367 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
368 queryNgramsByNodeMaster'
370 , ngramsTypeId NgramsTerms
371 , nodeTypeId NodeDocument
373 , nodeTypeId NodeDocument
377 , nodeTypeId NodeDocument
378 , ngramsTypeId NgramsTerms
381 -- | TODO fix node_node_ngrams relation
382 queryNgramsByNodeMaster' :: DPS.Query
383 queryNgramsByNodeMaster' = [sql|
384 WITH nodesByNgramsUser AS (
386 SELECT n.id, ng.terms FROM nodes n
387 JOIN nodes_nodes nn ON n.id = nn.node2_id
388 JOIN node_node_ngrams nng ON nng.node2_id = n.id
389 JOIN ngrams ng ON nng.ngrams_id = ng.id
390 WHERE nn.node1_id = ? -- UserCorpusId
391 -- AND n.typename = ? -- NodeTypeId
392 AND nng.ngrams_type = ? -- NgramsTypeId
394 AND node_pos(n.id,?) >= ?
395 AND node_pos(n.id,?) < ?
396 GROUP BY n.id, ng.terms
400 nodesByNgramsMaster AS (
402 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
403 JOIN node_node_ngrams nng ON n.id = nng.node2_id
404 JOIN ngrams ng ON ng.id = nng.ngrams_id
406 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
407 AND n.typename = ? -- NodeTypeId
408 AND nng.ngrams_type = ? -- NgramsTypeId
409 GROUP BY n.id, ng.terms
412 SELECT m.id, m.terms FROM nodesByNgramsMaster m
413 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id