2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByNode
19 import Data.Map.Strict (Map, fromListWith, elems, toList)
20 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
22 import Data.Text (Text)
23 import Data.Tuple.Extra (second, swap)
24 import Database.PostgreSQL.Simple.SqlQQ (sql)
25 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
26 import Debug.Trace (trace)
27 import qualified Data.List as List
28 import qualified Data.Map.Strict as Map
29 import qualified Data.Set as Set
30 import qualified Data.Text as Text
31 import qualified Database.PostgreSQL.Simple as DPS
33 import Gargantext.Core (Lang(..))
34 import Gargantext.Database.Admin.Config (nodeTypeId)
35 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
36 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
37 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
38 import Gargantext.Prelude
42 -- | fst is size of Supra Corpus
43 -- snd is Texts and size of Occurrences (different docs)
44 countNodesByNgramsWith :: (Text -> Text)
45 -> Map Text (Set NodeId)
46 -> (Double, Map Text (Double, Set Text))
47 countNodesByNgramsWith f m = (total, m')
49 total = fromIntegral $ Set.size $ Set.unions $ elems m
50 m' = Map.map ( swap . second (fromIntegral . Set.size))
51 $ groupNodesByNgramsWith f m
54 groupNodesByNgramsWith :: (Text -> Text)
55 -> Map Text (Set NodeId)
56 -> Map Text (Set Text, Set NodeId)
57 groupNodesByNgramsWith f m =
58 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
61 ------------------------------------------------------------------------
62 getNodesByNgramsUser :: CorpusId
64 -> Cmd err (Map Text (Set NodeId))
65 getNodesByNgramsUser cId nt =
66 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
67 <$> selectNgramsByNodeUser cId nt
70 selectNgramsByNodeUser :: CorpusId
72 -> Cmd err [(NodeId, Text)]
73 selectNgramsByNodeUser cId' nt' =
74 runPGSQuery queryNgramsByNodeUser
76 , nodeTypeId NodeDocument
78 -- , 100 :: Int -- limit
79 -- , 0 :: Int -- offset
82 queryNgramsByNodeUser :: DPS.Query
83 queryNgramsByNodeUser = [sql|
84 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
85 JOIN ngrams ng ON nng.ngrams_id = ng.id
86 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
87 JOIN nodes n ON nn.node2_id = n.id
88 WHERE nn.node1_id = ? -- CorpusId
89 AND n.typename = ? -- NodeTypeId
90 AND nng.ngrams_type = ? -- NgramsTypeId
92 GROUP BY nng.node2_id, ng.terms
93 ORDER BY (nng.node2_id, ng.terms) DESC
97 ------------------------------------------------------------------------
99 getOccByNgramsOnlyFast :: CorpusId
102 -> Cmd err (Map Text Int)
103 getOccByNgramsOnlyFast cId nt ngs =
104 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
107 getOccByNgramsOnlyFast' :: CorpusId
111 -> Cmd err (Map Text Int)
112 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
113 fromListWith (+) <$> map (second round) <$> run cId lId nt tms
116 fields = [QualifiedIdentifier Nothing "text"]
122 -> Cmd err [(Text, Double)]
123 run cId' lId' nt' tms' = runPGSQuery query
124 ( Values fields (DPS.Only <$> tms')
132 WITH input_rows(terms) AS (?)
133 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
134 JOIN ngrams ng ON nng.ngrams_id = ng.id
135 JOIN input_rows ir ON ir.terms = ng.terms
136 WHERE nng.node1_id = ? -- CorpusId
137 AND nng.node2_id = ? -- ListId
138 AND nng.ngrams_type = ? -- NgramsTypeId
139 -- AND nn.category > 0 -- TODO
140 GROUP BY ng.terms, nng.weight
144 -- just slower than getOccByNgramsOnlyFast
145 getOccByNgramsOnlySlow :: NodeType
150 -> Cmd err (Map Text Int)
151 getOccByNgramsOnlySlow t cId ls nt ngs =
152 Map.map Set.size <$> getScore' t cId ls nt ngs
154 getScore' NodeCorpus = getNodesByNgramsOnlyUser
155 getScore' NodeDocument = getNgramsByDocOnlyUser
156 getScore' _ = getNodesByNgramsOnlyUser
158 getOccByNgramsOnlySafe :: CorpusId
162 -> Cmd err (Map Text Int)
163 getOccByNgramsOnlySafe cId ls nt ngs = do
164 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
165 fast <- getOccByNgramsOnlyFast cId nt ngs
166 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
167 when (fast /= slow) $
168 printDebug "getOccByNgramsOnlySafe: difference"
169 (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
173 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
176 -> Cmd err [(Text, Int)]
177 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
178 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
179 ( Values fields (DPS.Only <$> tms)
181 , nodeTypeId NodeDocument
185 fields = [QualifiedIdentifier Nothing "text"]
187 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
188 -- Question: with the grouping is the result exactly the same (since Set NodeId for
189 -- equivalent ngrams intersections are not empty)
190 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
191 queryNgramsOccurrencesOnlyByNodeUser = [sql|
192 WITH input_rows(terms) AS (?)
193 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
194 JOIN ngrams ng ON nng.ngrams_id = ng.id
195 JOIN input_rows ir ON ir.terms = ng.terms
196 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
197 JOIN nodes n ON nn.node2_id = n.id
198 WHERE nn.node1_id = ? -- CorpusId
199 AND n.typename = ? -- NodeTypeId
200 AND nng.ngrams_type = ? -- NgramsTypeId
202 GROUP BY nng.node2_id, ng.terms
205 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
206 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
207 WITH input_rows(terms) AS (?)
208 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
209 JOIN ngrams ng ON nng.ngrams_id = ng.id
210 JOIN input_rows ir ON ir.terms = ng.terms
211 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
212 JOIN nodes n ON nn.node2_id = n.id
213 WHERE nn.node1_id = ? -- CorpusId
214 AND n.typename = ? -- NodeTypeId
215 AND nng.ngrams_type = ? -- NgramsTypeId
217 GROUP BY nng.node2_id, ng.terms
220 ------------------------------------------------------------------------
221 getNodesByNgramsOnlyUser :: CorpusId
225 -> Cmd err (Map Text (Set NodeId))
226 getNodesByNgramsOnlyUser cId ls nt ngs =
228 . map (fromListWith (<>)
229 . map (second Set.singleton))
230 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
231 (splitEvery 1000 ngs)
234 getNgramsByNodeOnlyUser :: NodeId
238 -> Cmd err (Map NodeId (Set Text))
239 getNgramsByNodeOnlyUser cId ls nt ngs =
241 . map (fromListWith (<>)
242 . map (second Set.singleton))
244 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
245 (splitEvery 1000 ngs)
247 ------------------------------------------------------------------------
248 selectNgramsOnlyByNodeUser :: CorpusId
252 -> Cmd err [(Text, NodeId)]
253 selectNgramsOnlyByNodeUser cId ls nt tms =
254 runPGSQuery queryNgramsOnlyByNodeUser
255 ( Values fields (DPS.Only <$> tms)
256 , Values [QualifiedIdentifier Nothing "int4"]
257 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
259 , nodeTypeId NodeDocument
263 fields = [QualifiedIdentifier Nothing "text"]
265 queryNgramsOnlyByNodeUser :: DPS.Query
266 queryNgramsOnlyByNodeUser = [sql|
267 WITH input_rows(terms) AS (?),
268 input_list(id) AS (?)
269 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
270 JOIN ngrams ng ON nng.ngrams_id = ng.id
271 JOIN input_rows ir ON ir.terms = ng.terms
272 JOIN input_list il ON il.id = nng.node1_id
273 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
274 JOIN nodes n ON nn.node2_id = n.id
275 WHERE nn.node1_id = ? -- CorpusId
276 AND n.typename = ? -- NodeTypeId
277 AND nng.ngrams_type = ? -- NgramsTypeId
279 GROUP BY ng.terms, nng.node2_id
283 selectNgramsOnlyByNodeUser' :: CorpusId
287 -> Cmd err [(Text, Int)]
288 selectNgramsOnlyByNodeUser' cId ls nt tms =
289 runPGSQuery queryNgramsOnlyByNodeUser
290 ( Values fields (DPS.Only <$> tms)
291 , Values [QualifiedIdentifier Nothing "int4"]
292 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
294 , nodeTypeId NodeDocument
298 fields = [QualifiedIdentifier Nothing "text"]
300 queryNgramsOnlyByNodeUser' :: DPS.Query
301 queryNgramsOnlyByNodeUser' = [sql|
302 WITH input_rows(terms) AS (?),
303 input_list(id) AS (?)
304 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
305 JOIN ngrams ng ON nng.ngrams_id = ng.id
306 JOIN input_rows ir ON ir.terms = ng.terms
307 JOIN input_list il ON il.id = nng.node2_id
308 WHERE nng.node1_id = ? -- CorpusId
309 AND nng.ngrams_type = ? -- NgramsTypeId
310 -- AND nn.category > 0
311 GROUP BY ng.terms, nng.weight
315 getNgramsByDocOnlyUser :: DocId
319 -> Cmd err (Map Text (Set NodeId))
320 getNgramsByDocOnlyUser cId ls nt ngs =
322 . map (fromListWith (<>) . map (second Set.singleton))
323 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
326 selectNgramsOnlyByDocUser :: DocId
330 -> Cmd err [(Text, NodeId)]
331 selectNgramsOnlyByDocUser dId ls nt tms =
332 runPGSQuery queryNgramsOnlyByDocUser
333 ( Values fields (DPS.Only <$> tms)
334 , Values [QualifiedIdentifier Nothing "int4"]
335 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
340 fields = [QualifiedIdentifier Nothing "text"]
343 queryNgramsOnlyByDocUser :: DPS.Query
344 queryNgramsOnlyByDocUser = [sql|
345 WITH input_rows(terms) AS (?),
346 input_list(id) AS (?)
347 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
348 JOIN ngrams ng ON nng.ngrams_id = ng.id
349 JOIN input_rows ir ON ir.terms = ng.terms
350 JOIN input_list il ON il.id = nng.node1_id
351 WHERE nng.node2_id = ? -- DocId
352 AND nng.ngrams_type = ? -- NgramsTypeId
353 GROUP BY ng.terms, nng.node2_id
356 ------------------------------------------------------------------------
357 -- | TODO filter by language, database, any social field
358 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
359 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
360 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
361 -- . takeWhile (not . List.null)
362 -- . takeWhile (\l -> List.length l > 3)
363 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
365 selectNgramsByNodeMaster :: Int
369 -> Cmd err [(NodeId, Text)]
370 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
371 queryNgramsByNodeMaster'
373 , ngramsTypeId NgramsTerms
374 , nodeTypeId NodeDocument
376 , nodeTypeId NodeDocument
380 , nodeTypeId NodeDocument
381 , ngramsTypeId NgramsTerms
384 -- | TODO fix node_node_ngrams relation
385 queryNgramsByNodeMaster' :: DPS.Query
386 queryNgramsByNodeMaster' = [sql|
387 WITH nodesByNgramsUser AS (
389 SELECT n.id, ng.terms FROM nodes n
390 JOIN nodes_nodes nn ON n.id = nn.node2_id
391 JOIN node_node_ngrams nng ON nng.node2_id = n.id
392 JOIN ngrams ng ON nng.ngrams_id = ng.id
393 WHERE nn.node1_id = ? -- UserCorpusId
394 -- AND n.typename = ? -- NodeTypeId
395 AND nng.ngrams_type = ? -- NgramsTypeId
397 AND node_pos(n.id,?) >= ?
398 AND node_pos(n.id,?) < ?
399 GROUP BY n.id, ng.terms
403 nodesByNgramsMaster AS (
405 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
406 JOIN node_node_ngrams nng ON n.id = nng.node2_id
407 JOIN ngrams ng ON ng.id = nng.ngrams_id
409 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
410 AND n.typename = ? -- NodeTypeId
411 AND nng.ngrams_type = ? -- NgramsTypeId
412 GROUP BY n.id, ng.terms
415 SELECT m.id, m.terms FROM nodesByNgramsMaster m
416 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id