2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByNode
19 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
20 import Data.HashMap.Strict (HashMap)
23 import Data.Text (Text)
24 import Data.Tuple.Extra (first, second, swap)
25 import Database.PostgreSQL.Simple.SqlQQ (sql)
26 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
27 import Debug.Trace (trace)
28 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
29 import Gargantext.Data.HashMap.Strict.Utils as HM
30 import Gargantext.Database.Admin.Config (nodeTypeId)
31 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Database.PostgreSQL.Simple as DPS
40 -- | fst is size of Supra Corpus
41 -- snd is Texts and size of Occurrences (different docs)
42 countNodesByNgramsWith :: (NgramsTerm -> NgramsTerm)
43 -> HashMap NgramsTerm (Set NodeId)
44 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
45 countNodesByNgramsWith f m = (total, m')
47 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
48 m' = HM.map ( swap . second (fromIntegral . Set.size))
49 $ groupNodesByNgramsWith f m
52 groupNodesByNgramsWith :: (NgramsTerm -> NgramsTerm)
53 -> HashMap NgramsTerm (Set NodeId)
54 -> HashMap NgramsTerm (Set NgramsTerm, Set NodeId)
55 groupNodesByNgramsWith f m =
56 HM.fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
59 ------------------------------------------------------------------------
60 getNodesByNgramsUser :: CorpusId
62 -> Cmd err (HashMap NgramsTerm (Set NodeId))
63 getNodesByNgramsUser cId nt =
64 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
65 <$> selectNgramsByNodeUser cId nt
68 selectNgramsByNodeUser :: CorpusId
70 -> Cmd err [(NodeId, Text)]
71 selectNgramsByNodeUser cId' nt' =
72 runPGSQuery queryNgramsByNodeUser
74 , nodeTypeId NodeDocument
76 -- , 100 :: Int -- limit
77 -- , 0 :: Int -- offset
80 queryNgramsByNodeUser :: DPS.Query
81 queryNgramsByNodeUser = [sql|
82 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
83 JOIN ngrams ng ON nng.ngrams_id = ng.id
84 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
85 JOIN nodes n ON nn.node2_id = n.id
86 WHERE nn.node1_id = ? -- CorpusId
87 AND n.typename = ? -- NodeTypeId
88 AND nng.ngrams_type = ? -- NgramsTypeId
90 GROUP BY nng.node2_id, ng.terms
91 ORDER BY (nng.node2_id, ng.terms) DESC
95 ------------------------------------------------------------------------
97 getOccByNgramsOnlyFast :: CorpusId
100 -> Cmd err (HashMap NgramsTerm Int)
101 getOccByNgramsOnlyFast cId nt ngs =
102 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
105 getOccByNgramsOnlyFast' :: CorpusId
109 -> Cmd err (HashMap NgramsTerm Int)
110 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
111 HM.fromListWith (+) <$> map (second round) <$> run cId lId nt tms
114 fields = [QualifiedIdentifier Nothing "text"]
120 -> Cmd err [(NgramsTerm, Double)]
121 run cId' lId' nt' tms' = fmap (first NgramsTerm) <$> runPGSQuery query
122 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms')
130 WITH input_rows(terms) AS (?)
131 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
132 JOIN ngrams ng ON nng.ngrams_id = ng.id
133 JOIN input_rows ir ON ir.terms = ng.terms
134 WHERE nng.node1_id = ? -- CorpusId
135 AND nng.node2_id = ? -- ListId
136 AND nng.ngrams_type = ? -- NgramsTypeId
137 -- AND nn.category > 0 -- TODO
138 GROUP BY ng.terms, nng.weight
142 -- just slower than getOccByNgramsOnlyFast
143 getOccByNgramsOnlySlow :: NodeType
148 -> Cmd err (HashMap NgramsTerm Int)
149 getOccByNgramsOnlySlow t cId ls nt ngs =
150 HM.map Set.size <$> getScore' t cId ls nt ngs
152 getScore' NodeCorpus = getNodesByNgramsOnlyUser
153 getScore' NodeDocument = getNgramsByDocOnlyUser
154 getScore' _ = getNodesByNgramsOnlyUser
156 getOccByNgramsOnlySafe :: CorpusId
160 -> Cmd err (HashMap NgramsTerm Int)
161 getOccByNgramsOnlySafe cId ls nt ngs = do
162 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
163 fast <- getOccByNgramsOnlyFast cId nt ngs
164 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
165 when (fast /= slow) $
166 printDebug "getOccByNgramsOnlySafe: difference"
167 (HM.difference slow fast, HM.difference fast slow)
168 -- diff slow fast :: PatchMap Text (Replace (Maybe Int))
172 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
175 -> Cmd err [(NgramsTerm, Int)]
176 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
177 fmap (first NgramsTerm) <$>
178 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
179 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
181 , nodeTypeId NodeDocument
185 fields = [QualifiedIdentifier Nothing "text"]
187 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
188 -- Question: with the grouping is the result exactly the same (since Set NodeId for
189 -- equivalent ngrams intersections are not empty)
190 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
191 queryNgramsOccurrencesOnlyByNodeUser = [sql|
192 WITH input_rows(terms) AS (?)
193 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
194 JOIN ngrams ng ON nng.ngrams_id = ng.id
195 JOIN input_rows ir ON ir.terms = ng.terms
196 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
197 JOIN nodes n ON nn.node2_id = n.id
198 WHERE nn.node1_id = ? -- CorpusId
199 AND n.typename = ? -- NodeTypeId
200 AND nng.ngrams_type = ? -- NgramsTypeId
202 GROUP BY nng.node2_id, ng.terms
205 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
206 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
207 WITH input_rows(terms) AS (?)
208 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
209 JOIN ngrams ng ON nng.ngrams_id = ng.id
210 JOIN input_rows ir ON ir.terms = ng.terms
211 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
212 JOIN nodes n ON nn.node2_id = n.id
213 WHERE nn.node1_id = ? -- CorpusId
214 AND n.typename = ? -- NodeTypeId
215 AND nng.ngrams_type = ? -- NgramsTypeId
217 GROUP BY nng.node2_id, ng.terms
220 ------------------------------------------------------------------------
221 getNodesByNgramsOnlyUser :: CorpusId
225 -> Cmd err (HashMap NgramsTerm (Set NodeId))
226 getNodesByNgramsOnlyUser cId ls nt ngs =
228 . map (HM.fromListWith (<>)
229 . map (second Set.singleton))
230 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
231 (splitEvery 1000 ngs)
234 getNgramsByNodeOnlyUser :: NodeId
238 -> Cmd err (Map NodeId (Set NgramsTerm))
239 getNgramsByNodeOnlyUser cId ls nt ngs =
241 . map ( Map.fromListWith (<>)
242 . map (second Set.singleton)
245 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
246 (splitEvery 1000 ngs)
248 ------------------------------------------------------------------------
249 selectNgramsOnlyByNodeUser :: CorpusId
253 -> Cmd err [(NgramsTerm, NodeId)]
254 selectNgramsOnlyByNodeUser cId ls nt tms =
255 fmap (first NgramsTerm) <$>
256 runPGSQuery queryNgramsOnlyByNodeUser
257 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
258 , Values [QualifiedIdentifier Nothing "int4"]
259 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
261 , nodeTypeId NodeDocument
265 fields = [QualifiedIdentifier Nothing "text"]
267 queryNgramsOnlyByNodeUser :: DPS.Query
268 queryNgramsOnlyByNodeUser = [sql|
269 WITH input_rows(terms) AS (?),
270 input_list(id) AS (?)
271 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
272 JOIN ngrams ng ON nng.ngrams_id = ng.id
273 JOIN input_rows ir ON ir.terms = ng.terms
274 JOIN input_list il ON il.id = nng.node1_id
275 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
276 JOIN nodes n ON nn.node2_id = n.id
277 WHERE nn.node1_id = ? -- CorpusId
278 AND n.typename = ? -- NodeTypeId
279 AND nng.ngrams_type = ? -- NgramsTypeId
281 GROUP BY ng.terms, nng.node2_id
285 selectNgramsOnlyByNodeUser' :: CorpusId
289 -> Cmd err [(Text, Int)]
290 selectNgramsOnlyByNodeUser' cId ls nt tms =
291 runPGSQuery queryNgramsOnlyByNodeUser
292 ( Values fields (DPS.Only <$> tms)
293 , Values [QualifiedIdentifier Nothing "int4"]
294 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
296 , nodeTypeId NodeDocument
300 fields = [QualifiedIdentifier Nothing "text"]
302 queryNgramsOnlyByNodeUser' :: DPS.Query
303 queryNgramsOnlyByNodeUser' = [sql|
304 WITH input_rows(terms) AS (?),
305 input_list(id) AS (?)
306 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
307 JOIN ngrams ng ON nng.ngrams_id = ng.id
308 JOIN input_rows ir ON ir.terms = ng.terms
309 JOIN input_list il ON il.id = nng.node2_id
310 WHERE nng.node1_id = ? -- CorpusId
311 AND nng.ngrams_type = ? -- NgramsTypeId
312 -- AND nn.category > 0
313 GROUP BY ng.terms, nng.weight
317 getNgramsByDocOnlyUser :: DocId
321 -> Cmd err (HashMap NgramsTerm (Set NodeId))
322 getNgramsByDocOnlyUser cId ls nt ngs =
324 . map (HM.fromListWith (<>) . map (second Set.singleton))
325 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
328 selectNgramsOnlyByDocUser :: DocId
332 -> Cmd err [(NgramsTerm, NodeId)]
333 selectNgramsOnlyByDocUser dId ls nt tms =
334 fmap (first NgramsTerm) <$>
335 runPGSQuery queryNgramsOnlyByDocUser
336 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
337 , Values [QualifiedIdentifier Nothing "int4"]
338 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
343 fields = [QualifiedIdentifier Nothing "text"]
346 queryNgramsOnlyByDocUser :: DPS.Query
347 queryNgramsOnlyByDocUser = [sql|
348 WITH input_rows(terms) AS (?),
349 input_list(id) AS (?)
350 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
351 JOIN ngrams ng ON nng.ngrams_id = ng.id
352 JOIN input_rows ir ON ir.terms = ng.terms
353 JOIN input_list il ON il.id = nng.node1_id
354 WHERE nng.node2_id = ? -- DocId
355 AND nng.ngrams_type = ? -- NgramsTypeId
356 GROUP BY ng.terms, nng.node2_id
359 ------------------------------------------------------------------------
360 -- | TODO filter by language, database, any social field
361 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (HashMap Text (Set NodeId))
362 getNodesByNgramsMaster ucId mcId = unionsWith (<>)
363 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
364 -- . takeWhile (not . List.null)
365 -- . takeWhile (\l -> List.length l > 3)
366 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
368 selectNgramsByNodeMaster :: Int
372 -> Cmd err [(NodeId, Text)]
373 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
374 queryNgramsByNodeMaster'
376 , ngramsTypeId NgramsTerms
377 , nodeTypeId NodeDocument
379 , nodeTypeId NodeDocument
383 , nodeTypeId NodeDocument
384 , ngramsTypeId NgramsTerms
387 -- | TODO fix node_node_ngrams relation
388 queryNgramsByNodeMaster' :: DPS.Query
389 queryNgramsByNodeMaster' = [sql|
390 WITH nodesByNgramsUser AS (
392 SELECT n.id, ng.terms FROM nodes n
393 JOIN nodes_nodes nn ON n.id = nn.node2_id
394 JOIN node_node_ngrams nng ON nng.node2_id = n.id
395 JOIN ngrams ng ON nng.ngrams_id = ng.id
396 WHERE nn.node1_id = ? -- UserCorpusId
397 -- AND n.typename = ? -- NodeTypeId
398 AND nng.ngrams_type = ? -- NgramsTypeId
400 AND node_pos(n.id,?) >= ?
401 AND node_pos(n.id,?) < ?
402 GROUP BY n.id, ng.terms
406 nodesByNgramsMaster AS (
408 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
409 JOIN node_node_ngrams nng ON n.id = nng.node2_id
410 JOIN ngrams ng ON ng.id = nng.ngrams_id
412 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
413 AND n.typename = ? -- NodeTypeId
414 AND nng.ngrams_type = ? -- NgramsTypeId
415 GROUP BY n.id, ng.terms
418 SELECT m.id, m.terms FROM nodesByNgramsMaster m
419 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id