2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByNode
19 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
20 import Data.HashMap.Strict (HashMap)
23 import Data.Text (Text)
24 import Data.Tuple.Extra (first, second, swap)
25 import Database.PostgreSQL.Simple.SqlQQ (sql)
26 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
27 import Debug.Trace (trace)
28 import Gargantext.Core
29 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Database.PostgreSQL.Simple as DPS
40 -- | fst is size of Supra Corpus
41 -- snd is Texts and size of Occurrences (different docs)
42 countNodesByNgramsWith :: (NgramsTerm -> NgramsTerm)
43 -> HashMap NgramsTerm (Set NodeId)
44 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
45 countNodesByNgramsWith f m = (total, m')
47 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
48 m' = HM.map ( swap . second (fromIntegral . Set.size))
49 $ groupNodesByNgramsWith f m
52 groupNodesByNgramsWith :: (NgramsTerm -> NgramsTerm)
53 -> HashMap NgramsTerm (Set NodeId)
54 -> HashMap NgramsTerm (Set NgramsTerm, Set NodeId)
55 groupNodesByNgramsWith f m =
56 HM.fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
59 ------------------------------------------------------------------------
60 getNodesByNgramsUser :: HasDBid NodeType
63 -> Cmd err (HashMap NgramsTerm (Set NodeId))
64 getNodesByNgramsUser cId nt =
65 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
66 <$> selectNgramsByNodeUser cId nt
69 selectNgramsByNodeUser :: HasDBid NodeType
72 -> Cmd err [(NodeId, Text)]
73 selectNgramsByNodeUser cId' nt' =
74 runPGSQuery queryNgramsByNodeUser
78 -- , 100 :: Int -- limit
79 -- , 0 :: Int -- offset
82 queryNgramsByNodeUser :: DPS.Query
83 queryNgramsByNodeUser = [sql|
84 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
85 JOIN ngrams ng ON nng.ngrams_id = ng.id
86 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
87 JOIN nodes n ON nn.node2_id = n.id
88 WHERE nn.node1_id = ? -- CorpusId
89 AND n.typename = ? -- toDBid
90 AND nng.ngrams_type = ? -- NgramsTypeId
92 GROUP BY nng.node2_id, ng.terms
93 ORDER BY (nng.node2_id, ng.terms) DESC
97 ------------------------------------------------------------------------
99 getOccByNgramsOnlyFast :: HasDBid NodeType
103 -> Cmd err (HashMap NgramsTerm Int)
104 getOccByNgramsOnlyFast cId nt ngs =
105 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
108 getOccByNgramsOnlyFast' :: CorpusId
112 -> Cmd err (HashMap NgramsTerm Int)
113 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
114 HM.fromListWith (+) <$> map (second round) <$> run cId lId nt tms
117 fields = [QualifiedIdentifier Nothing "text"]
123 -> Cmd err [(NgramsTerm, Double)]
124 run cId' lId' nt' tms' = fmap (first NgramsTerm) <$> runPGSQuery query
125 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms')
133 WITH input_rows(terms) AS (?)
134 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
135 JOIN ngrams ng ON nng.ngrams_id = ng.id
136 JOIN input_rows ir ON ir.terms = ng.terms
137 WHERE nng.node1_id = ? -- CorpusId
138 AND nng.node2_id = ? -- ListId
139 AND nng.ngrams_type = ? -- NgramsTypeId
140 -- AND nn.category > 0 -- TODO
141 GROUP BY ng.terms, nng.weight
145 -- just slower than getOccByNgramsOnlyFast
146 getOccByNgramsOnlySlow :: HasDBid NodeType
152 -> Cmd err (HashMap NgramsTerm Int)
153 getOccByNgramsOnlySlow t cId ls nt ngs =
154 HM.map Set.size <$> getScore' t cId ls nt ngs
156 getScore' NodeCorpus = getNodesByNgramsOnlyUser
157 getScore' NodeDocument = getNgramsByDocOnlyUser
158 getScore' _ = getNodesByNgramsOnlyUser
160 getOccByNgramsOnlySafe :: HasDBid NodeType
165 -> Cmd err (HashMap NgramsTerm Int)
166 getOccByNgramsOnlySafe cId ls nt ngs = do
167 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
168 fast <- getOccByNgramsOnlyFast cId nt ngs
169 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
170 when (fast /= slow) $
171 printDebug "getOccByNgramsOnlySafe: difference"
172 (HM.difference slow fast, HM.difference fast slow)
173 -- diff slow fast :: PatchMap Text (Replace (Maybe Int))
177 selectNgramsOccurrencesOnlyByNodeUser :: HasDBid NodeType
181 -> Cmd err [(NgramsTerm, Int)]
182 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
183 fmap (first NgramsTerm) <$>
184 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
185 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
187 , toDBid NodeDocument
191 fields = [QualifiedIdentifier Nothing "text"]
193 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
194 -- Question: with the grouping is the result exactly the same (since Set NodeId for
195 -- equivalent ngrams intersections are not empty)
196 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
197 queryNgramsOccurrencesOnlyByNodeUser = [sql|
198 WITH input_rows(terms) AS (?)
199 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
200 JOIN ngrams ng ON nng.ngrams_id = ng.id
201 JOIN input_rows ir ON ir.terms = ng.terms
202 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
203 JOIN nodes n ON nn.node2_id = n.id
204 WHERE nn.node1_id = ? -- CorpusId
205 AND n.typename = ? -- toDBid
206 AND nng.ngrams_type = ? -- NgramsTypeId
208 GROUP BY nng.node2_id, ng.terms
211 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
212 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
213 WITH input_rows(terms) AS (?)
214 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
215 JOIN ngrams ng ON nng.ngrams_id = ng.id
216 JOIN input_rows ir ON ir.terms = ng.terms
217 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
218 JOIN nodes n ON nn.node2_id = n.id
219 WHERE nn.node1_id = ? -- CorpusId
220 AND n.typename = ? -- toDBid
221 AND nng.ngrams_type = ? -- NgramsTypeId
223 GROUP BY nng.node2_id, ng.terms
226 ------------------------------------------------------------------------
227 getNodesByNgramsOnlyUser :: HasDBid NodeType
232 -> Cmd err (HashMap NgramsTerm (Set NodeId))
233 getNodesByNgramsOnlyUser cId ls nt ngs =
235 . map (HM.fromListWith (<>)
236 . map (second Set.singleton))
237 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
238 (splitEvery 1000 ngs)
241 getNgramsByNodeOnlyUser :: HasDBid NodeType
246 -> Cmd err (Map NodeId (Set NgramsTerm))
247 getNgramsByNodeOnlyUser cId ls nt ngs =
249 . map ( Map.fromListWith (<>)
250 . map (second Set.singleton)
253 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
254 (splitEvery 1000 ngs)
256 ------------------------------------------------------------------------
257 selectNgramsOnlyByNodeUser :: HasDBid NodeType
262 -> Cmd err [(NgramsTerm, NodeId)]
263 selectNgramsOnlyByNodeUser cId ls nt tms =
264 fmap (first NgramsTerm) <$>
265 runPGSQuery queryNgramsOnlyByNodeUser
266 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
267 , Values [QualifiedIdentifier Nothing "int4"]
268 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
270 , toDBid NodeDocument
274 fields = [QualifiedIdentifier Nothing "text"]
276 queryNgramsOnlyByNodeUser :: DPS.Query
277 queryNgramsOnlyByNodeUser = [sql|
278 WITH input_rows(terms) AS (?),
279 input_list(id) AS (?)
280 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
281 JOIN ngrams ng ON nng.ngrams_id = ng.id
282 JOIN input_rows ir ON ir.terms = ng.terms
283 JOIN input_list il ON il.id = nng.node1_id
284 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
285 JOIN nodes n ON nn.node2_id = n.id
286 WHERE nn.node1_id = ? -- CorpusId
287 AND n.typename = ? -- toDBid
288 AND nng.ngrams_type = ? -- NgramsTypeId
290 GROUP BY ng.terms, nng.node2_id
294 selectNgramsOnlyByNodeUser' :: HasDBid NodeType
299 -> Cmd err [(Text, Int)]
300 selectNgramsOnlyByNodeUser' cId ls nt tms =
301 runPGSQuery queryNgramsOnlyByNodeUser
302 ( Values fields (DPS.Only <$> tms)
303 , Values [QualifiedIdentifier Nothing "int4"]
304 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
306 , toDBid NodeDocument
310 fields = [QualifiedIdentifier Nothing "text"]
312 queryNgramsOnlyByNodeUser' :: DPS.Query
313 queryNgramsOnlyByNodeUser' = [sql|
314 WITH input_rows(terms) AS (?),
315 input_list(id) AS (?)
316 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
317 JOIN ngrams ng ON nng.ngrams_id = ng.id
318 JOIN input_rows ir ON ir.terms = ng.terms
319 JOIN input_list il ON il.id = nng.node2_id
320 WHERE nng.node1_id = ? -- CorpusId
321 AND nng.ngrams_type = ? -- NgramsTypeId
322 -- AND nn.category > 0
323 GROUP BY ng.terms, nng.weight
327 getNgramsByDocOnlyUser :: DocId
331 -> Cmd err (HashMap NgramsTerm (Set NodeId))
332 getNgramsByDocOnlyUser cId ls nt ngs =
334 . map (HM.fromListWith (<>) . map (second Set.singleton))
335 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
338 selectNgramsOnlyByDocUser :: DocId
342 -> Cmd err [(NgramsTerm, NodeId)]
343 selectNgramsOnlyByDocUser dId ls nt tms =
344 fmap (first NgramsTerm) <$>
345 runPGSQuery queryNgramsOnlyByDocUser
346 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
347 , Values [QualifiedIdentifier Nothing "int4"]
348 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
353 fields = [QualifiedIdentifier Nothing "text"]
356 queryNgramsOnlyByDocUser :: DPS.Query
357 queryNgramsOnlyByDocUser = [sql|
358 WITH input_rows(terms) AS (?),
359 input_list(id) AS (?)
360 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
361 JOIN ngrams ng ON nng.ngrams_id = ng.id
362 JOIN input_rows ir ON ir.terms = ng.terms
363 JOIN input_list il ON il.id = nng.node1_id
364 WHERE nng.node2_id = ? -- DocId
365 AND nng.ngrams_type = ? -- NgramsTypeId
366 GROUP BY ng.terms, nng.node2_id
369 ------------------------------------------------------------------------
370 -- | TODO filter by language, database, any social field
371 getNodesByNgramsMaster :: HasDBid NodeType
372 => UserCorpusId -> MasterCorpusId -> Cmd err (HashMap Text (Set NodeId))
373 getNodesByNgramsMaster ucId mcId = unionsWith (<>)
374 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
375 -- . takeWhile (not . List.null)
376 -- . takeWhile (\l -> List.length l > 3)
377 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
379 selectNgramsByNodeMaster :: HasDBid NodeType
384 -> Cmd err [(NodeId, Text)]
385 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
386 queryNgramsByNodeMaster'
388 , ngramsTypeId NgramsTerms
389 , toDBid NodeDocument
391 , toDBid NodeDocument
395 , toDBid NodeDocument
396 , ngramsTypeId NgramsTerms
399 -- | TODO fix node_node_ngrams relation
400 queryNgramsByNodeMaster' :: DPS.Query
401 queryNgramsByNodeMaster' = [sql|
402 WITH nodesByNgramsUser AS (
404 SELECT n.id, ng.terms FROM nodes n
405 JOIN nodes_nodes nn ON n.id = nn.node2_id
406 JOIN node_node_ngrams nng ON nng.node2_id = n.id
407 JOIN ngrams ng ON nng.ngrams_id = ng.id
408 WHERE nn.node1_id = ? -- UserCorpusId
409 -- AND n.typename = ? -- toDBid
410 AND nng.ngrams_type = ? -- NgramsTypeId
412 AND node_pos(n.id,?) >= ?
413 AND node_pos(n.id,?) < ?
414 GROUP BY n.id, ng.terms
418 nodesByNgramsMaster AS (
420 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
421 JOIN node_node_ngrams nng ON n.id = nng.node2_id
422 JOIN ngrams ng ON ng.id = nng.ngrams_id
424 WHERE n.parent_id = ? -- Master Corpus toDBid
425 AND n.typename = ? -- toDBid
426 AND nng.ngrams_type = ? -- NgramsTypeId
427 GROUP BY n.id, ng.terms
430 SELECT m.id, m.terms FROM nodesByNgramsMaster m
431 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id