2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByNode
19 import Data.Map.Strict (Map, fromListWith, elems, toList)
20 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
22 import Data.Text (Text)
23 import Data.Tuple.Extra (second, swap)
24 import Database.PostgreSQL.Simple.SqlQQ (sql)
25 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
26 import Debug.Trace (trace)
27 import qualified Data.List as List
28 import qualified Data.Map.Strict as Map
29 import qualified Data.Set as Set
30 import qualified Data.Text as Text
31 import qualified Database.PostgreSQL.Simple as DPS
33 import Gargantext.Core (Lang(..))
34 import Gargantext.Database.Admin.Config (nodeTypeId)
35 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
36 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
37 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
38 import Gargantext.Prelude
39 import Gargantext.Core.Text.Terms.Mono.Stem (stem)
41 -- | TODO: group with 2 terms only can be
42 -- discussed. Main purpose of this is offering
43 -- a first grouping option to user and get some
44 -- enriched data to better learn and improve that algo
50 ngramsGroup l _m _n = Text.intercalate " "
54 -- . (List.filter (\t -> Text.length t > m))
56 . Text.replace "-" " "
60 -- | fst is size of Supra Corpus
61 -- snd is Texts and size of Occurrences (different docs)
62 countNodesByNgramsWith :: (Text -> Text)
63 -> Map Text (Set NodeId)
64 -> (Double, Map Text (Double, Set Text))
65 countNodesByNgramsWith f m = (total, m')
67 total = fromIntegral $ Set.size $ Set.unions $ elems m
68 m' = Map.map ( swap . second (fromIntegral . Set.size))
69 $ groupNodesByNgramsWith f m
72 groupNodesByNgramsWith :: (Text -> Text)
73 -> Map Text (Set NodeId)
74 -> Map Text (Set Text, Set NodeId)
75 groupNodesByNgramsWith f m =
76 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
79 ------------------------------------------------------------------------
80 getNodesByNgramsUser :: CorpusId
82 -> Cmd err (Map Text (Set NodeId))
83 getNodesByNgramsUser cId nt =
84 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
85 <$> selectNgramsByNodeUser cId nt
88 selectNgramsByNodeUser :: CorpusId
90 -> Cmd err [(NodeId, Text)]
91 selectNgramsByNodeUser cId' nt' =
92 runPGSQuery queryNgramsByNodeUser
94 , nodeTypeId NodeDocument
96 -- , 100 :: Int -- limit
97 -- , 0 :: Int -- offset
100 queryNgramsByNodeUser :: DPS.Query
101 queryNgramsByNodeUser = [sql|
102 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
103 JOIN ngrams ng ON nng.ngrams_id = ng.id
104 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
105 JOIN nodes n ON nn.node2_id = n.id
106 WHERE nn.node1_id = ? -- CorpusId
107 AND n.typename = ? -- NodeTypeId
108 AND nng.ngrams_type = ? -- NgramsTypeId
110 GROUP BY nng.node2_id, ng.terms
111 ORDER BY (nng.node2_id, ng.terms) DESC
115 ------------------------------------------------------------------------
117 getOccByNgramsOnlyFast :: CorpusId
120 -> Cmd err (Map Text Int)
121 getOccByNgramsOnlyFast cId nt ngs =
122 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
125 getOccByNgramsOnlyFast' :: CorpusId
129 -> Cmd err (Map Text Int)
130 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
131 fromListWith (+) <$> map (second round) <$> run cId lId nt tms
134 fields = [QualifiedIdentifier Nothing "text"]
140 -> Cmd err [(Text, Double)]
141 run cId' lId' nt' tms' = runPGSQuery query
142 ( Values fields (DPS.Only <$> tms')
150 WITH input_rows(terms) AS (?)
151 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
152 JOIN ngrams ng ON nng.ngrams_id = ng.id
153 JOIN input_rows ir ON ir.terms = ng.terms
154 WHERE nng.node1_id = ? -- CorpusId
155 AND nng.node2_id = ? -- ListId
156 AND nng.ngrams_type = ? -- NgramsTypeId
157 -- AND nn.category > 0 -- TODO
158 GROUP BY ng.terms, nng.weight
162 -- just slower than getOccByNgramsOnlyFast
163 getOccByNgramsOnlySlow :: NodeType
168 -> Cmd err (Map Text Int)
169 getOccByNgramsOnlySlow t cId ls nt ngs =
170 Map.map Set.size <$> getScore' t cId ls nt ngs
172 getScore' NodeCorpus = getNodesByNgramsOnlyUser
173 getScore' NodeDocument = getNgramsByDocOnlyUser
174 getScore' _ = getNodesByNgramsOnlyUser
176 getOccByNgramsOnlySafe :: CorpusId
180 -> Cmd err (Map Text Int)
181 getOccByNgramsOnlySafe cId ls nt ngs = do
182 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
183 fast <- getOccByNgramsOnlyFast cId nt ngs
184 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
185 when (fast /= slow) $
186 printDebug "getOccByNgramsOnlySafe: difference"
187 (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
191 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
194 -> Cmd err [(Text, Int)]
195 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
196 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
197 ( Values fields (DPS.Only <$> tms)
199 , nodeTypeId NodeDocument
203 fields = [QualifiedIdentifier Nothing "text"]
205 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
206 -- Question: with the grouping is the result exactly the same (since Set NodeId for
207 -- equivalent ngrams intersections are not empty)
208 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
209 queryNgramsOccurrencesOnlyByNodeUser = [sql|
210 WITH input_rows(terms) AS (?)
211 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
212 JOIN ngrams ng ON nng.ngrams_id = ng.id
213 JOIN input_rows ir ON ir.terms = ng.terms
214 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
215 JOIN nodes n ON nn.node2_id = n.id
216 WHERE nn.node1_id = ? -- CorpusId
217 AND n.typename = ? -- NodeTypeId
218 AND nng.ngrams_type = ? -- NgramsTypeId
220 GROUP BY nng.node2_id, ng.terms
223 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
224 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
225 WITH input_rows(terms) AS (?)
226 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
227 JOIN ngrams ng ON nng.ngrams_id = ng.id
228 JOIN input_rows ir ON ir.terms = ng.terms
229 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
230 JOIN nodes n ON nn.node2_id = n.id
231 WHERE nn.node1_id = ? -- CorpusId
232 AND n.typename = ? -- NodeTypeId
233 AND nng.ngrams_type = ? -- NgramsTypeId
235 GROUP BY nng.node2_id, ng.terms
238 ------------------------------------------------------------------------
239 getNodesByNgramsOnlyUser :: CorpusId
243 -> Cmd err (Map Text (Set NodeId))
244 getNodesByNgramsOnlyUser cId ls nt ngs =
246 . map (fromListWith (<>)
247 . map (second Set.singleton))
248 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
249 (splitEvery 1000 ngs)
252 getNgramsByNodeOnlyUser :: NodeId
256 -> Cmd err (Map NodeId (Set Text))
257 getNgramsByNodeOnlyUser cId ls nt ngs =
259 . map (fromListWith (<>)
260 . map (second Set.singleton))
262 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
263 (splitEvery 1000 ngs)
265 ------------------------------------------------------------------------
266 selectNgramsOnlyByNodeUser :: CorpusId
270 -> Cmd err [(Text, NodeId)]
271 selectNgramsOnlyByNodeUser cId ls nt tms =
272 runPGSQuery queryNgramsOnlyByNodeUser
273 ( Values fields (DPS.Only <$> tms)
274 , Values [QualifiedIdentifier Nothing "int4"]
275 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
277 , nodeTypeId NodeDocument
281 fields = [QualifiedIdentifier Nothing "text"]
283 queryNgramsOnlyByNodeUser :: DPS.Query
284 queryNgramsOnlyByNodeUser = [sql|
285 WITH input_rows(terms) AS (?),
286 input_list(id) AS (?)
287 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
288 JOIN ngrams ng ON nng.ngrams_id = ng.id
289 JOIN input_rows ir ON ir.terms = ng.terms
290 JOIN input_list il ON il.id = nng.node1_id
291 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
292 JOIN nodes n ON nn.node2_id = n.id
293 WHERE nn.node1_id = ? -- CorpusId
294 AND n.typename = ? -- NodeTypeId
295 AND nng.ngrams_type = ? -- NgramsTypeId
297 GROUP BY ng.terms, nng.node2_id
301 selectNgramsOnlyByNodeUser' :: CorpusId
305 -> Cmd err [(Text, Int)]
306 selectNgramsOnlyByNodeUser' cId ls nt tms =
307 runPGSQuery queryNgramsOnlyByNodeUser
308 ( Values fields (DPS.Only <$> tms)
309 , Values [QualifiedIdentifier Nothing "int4"]
310 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
312 , nodeTypeId NodeDocument
316 fields = [QualifiedIdentifier Nothing "text"]
318 queryNgramsOnlyByNodeUser' :: DPS.Query
319 queryNgramsOnlyByNodeUser' = [sql|
320 WITH input_rows(terms) AS (?),
321 input_list(id) AS (?)
322 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
323 JOIN ngrams ng ON nng.ngrams_id = ng.id
324 JOIN input_rows ir ON ir.terms = ng.terms
325 JOIN input_list il ON il.id = nng.node2_id
326 WHERE nng.node1_id = ? -- CorpusId
327 AND nng.ngrams_type = ? -- NgramsTypeId
328 -- AND nn.category > 0
329 GROUP BY ng.terms, nng.weight
333 getNgramsByDocOnlyUser :: DocId
337 -> Cmd err (Map Text (Set NodeId))
338 getNgramsByDocOnlyUser cId ls nt ngs =
340 . map (fromListWith (<>) . map (second Set.singleton))
341 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
344 selectNgramsOnlyByDocUser :: DocId
348 -> Cmd err [(Text, NodeId)]
349 selectNgramsOnlyByDocUser dId ls nt tms =
350 runPGSQuery queryNgramsOnlyByDocUser
351 ( Values fields (DPS.Only <$> tms)
352 , Values [QualifiedIdentifier Nothing "int4"]
353 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
358 fields = [QualifiedIdentifier Nothing "text"]
361 queryNgramsOnlyByDocUser :: DPS.Query
362 queryNgramsOnlyByDocUser = [sql|
363 WITH input_rows(terms) AS (?),
364 input_list(id) AS (?)
365 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
366 JOIN ngrams ng ON nng.ngrams_id = ng.id
367 JOIN input_rows ir ON ir.terms = ng.terms
368 JOIN input_list il ON il.id = nng.node1_id
369 WHERE nng.node2_id = ? -- DocId
370 AND nng.ngrams_type = ? -- NgramsTypeId
371 GROUP BY ng.terms, nng.node2_id
374 ------------------------------------------------------------------------
375 -- | TODO filter by language, database, any social field
376 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
377 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
378 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
379 -- . takeWhile (not . List.null)
380 -- . takeWhile (\l -> List.length l > 3)
381 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
383 selectNgramsByNodeMaster :: Int
387 -> Cmd err [(NodeId, Text)]
388 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
389 queryNgramsByNodeMaster'
391 , ngramsTypeId NgramsTerms
392 , nodeTypeId NodeDocument
394 , nodeTypeId NodeDocument
398 , nodeTypeId NodeDocument
399 , ngramsTypeId NgramsTerms
402 -- | TODO fix node_node_ngrams relation
403 queryNgramsByNodeMaster' :: DPS.Query
404 queryNgramsByNodeMaster' = [sql|
405 WITH nodesByNgramsUser AS (
407 SELECT n.id, ng.terms FROM nodes n
408 JOIN nodes_nodes nn ON n.id = nn.node2_id
409 JOIN node_node_ngrams nng ON nng.node2_id = n.id
410 JOIN ngrams ng ON nng.ngrams_id = ng.id
411 WHERE nn.node1_id = ? -- UserCorpusId
412 -- AND n.typename = ? -- NodeTypeId
413 AND nng.ngrams_type = ? -- NgramsTypeId
415 AND node_pos(n.id,?) >= ?
416 AND node_pos(n.id,?) < ?
417 GROUP BY n.id, ng.terms
421 nodesByNgramsMaster AS (
423 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
424 JOIN node_node_ngrams nng ON n.id = nng.node2_id
425 JOIN ngrams ng ON ng.id = nng.ngrams_id
427 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
428 AND n.typename = ? -- NodeTypeId
429 AND nng.ngrams_type = ? -- NgramsTypeId
430 GROUP BY n.id, ng.terms
433 SELECT m.id, m.terms FROM nodesByNgramsMaster m
434 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id