2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByNode
19 import Data.Map.Strict (Map, fromListWith, elems, toList)
20 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
22 import Data.Text (Text)
23 import Data.Tuple.Extra (second, swap)
24 import Database.PostgreSQL.Simple.SqlQQ (sql)
25 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
26 import Debug.Trace (trace)
27 import Gargantext.Core (Lang(..))
28 import Gargantext.Database.Admin.Config (nodeTypeId)
29 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
30 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
31 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
32 import Gargantext.Prelude
33 import Gargantext.Core.Text.Terms.Mono.Stem (stem)
34 import qualified Data.List as List
35 import qualified Data.Map.Strict as Map
36 import qualified Data.Set as Set
37 import qualified Data.Text as Text
38 import qualified Database.PostgreSQL.Simple as DPS
40 -- | TODO: group with 2 terms only can be
41 -- discussed. Main purpose of this is offering
42 -- a first grouping option to user and get some
43 -- enriched data to better learn and improve that algo
49 ngramsGroup l _m _n = Text.intercalate " "
53 -- . (List.filter (\t -> Text.length t > m))
55 . Text.replace "-" " "
59 -- | fst is size of Supra Corpus
60 -- snd is Texts and size of Occurrences (different docs)
61 countNodesByNgramsWith :: (Text -> Text)
62 -> Map Text (Set NodeId)
63 -> (Double, Map Text (Double, Set Text))
64 countNodesByNgramsWith f m = (total, m')
66 total = fromIntegral $ Set.size $ Set.unions $ elems m
67 m' = Map.map ( swap . second (fromIntegral . Set.size))
68 $ groupNodesByNgramsWith f m
71 groupNodesByNgramsWith :: (Text -> Text)
72 -> Map Text (Set NodeId)
73 -> Map Text (Set Text, Set NodeId)
74 groupNodesByNgramsWith f m =
75 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
78 ------------------------------------------------------------------------
79 getNodesByNgramsUser :: CorpusId
81 -> Cmd err (Map Text (Set NodeId))
82 getNodesByNgramsUser cId nt =
83 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
84 <$> selectNgramsByNodeUser cId nt
87 selectNgramsByNodeUser :: CorpusId
89 -> Cmd err [(NodeId, Text)]
90 selectNgramsByNodeUser cId' nt' =
91 runPGSQuery queryNgramsByNodeUser
93 , nodeTypeId NodeDocument
95 -- , 100 :: Int -- limit
96 -- , 0 :: Int -- offset
99 queryNgramsByNodeUser :: DPS.Query
100 queryNgramsByNodeUser = [sql|
101 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
102 JOIN ngrams ng ON nng.ngrams_id = ng.id
103 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
104 JOIN nodes n ON nn.node2_id = n.id
105 WHERE nn.node1_id = ? -- CorpusId
106 AND n.typename = ? -- NodeTypeId
107 AND nng.ngrams_type = ? -- NgramsTypeId
109 GROUP BY nng.node2_id, ng.terms
110 ORDER BY (nng.node2_id, ng.terms) DESC
114 ------------------------------------------------------------------------
116 getOccByNgramsOnlyFast :: CorpusId
119 -> Cmd err (Map Text Int)
120 getOccByNgramsOnlyFast cId nt ngs =
121 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
124 getOccByNgramsOnlyFast' :: CorpusId
128 -> Cmd err (Map Text Int)
129 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
130 fromListWith (+) <$> map (second round) <$> run cId lId nt tms
133 fields = [QualifiedIdentifier Nothing "text"]
139 -> Cmd err [(Text, Double)]
140 run cId' lId' nt' tms' = runPGSQuery query
141 ( Values fields (DPS.Only <$> tms')
149 WITH input_rows(terms) AS (?)
150 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
151 JOIN ngrams ng ON nng.ngrams_id = ng.id
152 JOIN input_rows ir ON ir.terms = ng.terms
153 WHERE nng.node1_id = ? -- CorpusId
154 AND nng.node2_id = ? -- ListId
155 AND nng.ngrams_type = ? -- NgramsTypeId
156 -- AND nn.category > 0 -- TODO
157 GROUP BY ng.terms, nng.weight
161 -- just slower than getOccByNgramsOnlyFast
162 getOccByNgramsOnlySlow :: NodeType
167 -> Cmd err (Map Text Int)
168 getOccByNgramsOnlySlow t cId ls nt ngs =
169 Map.map Set.size <$> getScore' t cId ls nt ngs
171 getScore' NodeCorpus = getNodesByNgramsOnlyUser
172 getScore' NodeDocument = getNgramsByDocOnlyUser
173 getScore' _ = getNodesByNgramsOnlyUser
175 getOccByNgramsOnlySafe :: CorpusId
179 -> Cmd err (Map Text Int)
180 getOccByNgramsOnlySafe cId ls nt ngs = do
181 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
182 fast <- getOccByNgramsOnlyFast cId nt ngs
183 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
184 when (fast /= slow) $
185 printDebug "getOccByNgramsOnlySafe: difference"
186 (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
190 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
193 -> Cmd err [(Text, Int)]
194 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
195 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
196 ( Values fields (DPS.Only <$> tms)
198 , nodeTypeId NodeDocument
202 fields = [QualifiedIdentifier Nothing "text"]
204 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
205 -- Question: with the grouping is the result exactly the same (since Set NodeId for
206 -- equivalent ngrams intersections are not empty)
207 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
208 queryNgramsOccurrencesOnlyByNodeUser = [sql|
209 WITH input_rows(terms) AS (?)
210 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
211 JOIN ngrams ng ON nng.ngrams_id = ng.id
212 JOIN input_rows ir ON ir.terms = ng.terms
213 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
214 JOIN nodes n ON nn.node2_id = n.id
215 WHERE nn.node1_id = ? -- CorpusId
216 AND n.typename = ? -- NodeTypeId
217 AND nng.ngrams_type = ? -- NgramsTypeId
219 GROUP BY nng.node2_id, ng.terms
222 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
223 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
224 WITH input_rows(terms) AS (?)
225 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
226 JOIN ngrams ng ON nng.ngrams_id = ng.id
227 JOIN input_rows ir ON ir.terms = ng.terms
228 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
229 JOIN nodes n ON nn.node2_id = n.id
230 WHERE nn.node1_id = ? -- CorpusId
231 AND n.typename = ? -- NodeTypeId
232 AND nng.ngrams_type = ? -- NgramsTypeId
234 GROUP BY nng.node2_id, ng.terms
237 ------------------------------------------------------------------------
238 getNodesByNgramsOnlyUser :: NodeId
242 -> Cmd err (Map Text (Set NodeId))
243 getNodesByNgramsOnlyUser cId ls nt ngs =
245 . map (fromListWith (<>)
246 . map (second Set.singleton))
247 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
248 (splitEvery 1000 ngs)
251 getNgramsByNodeOnlyUser :: NodeId
255 -> Cmd err (Map NodeId (Set Text))
256 getNgramsByNodeOnlyUser cId ls nt ngs =
258 . map (fromListWith (<>)
259 . map (second Set.singleton))
261 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
262 (splitEvery 1000 ngs)
264 ------------------------------------------------------------------------
265 selectNgramsOnlyByNodeUser :: CorpusId
269 -> Cmd err [(Text, NodeId)]
270 selectNgramsOnlyByNodeUser cId ls nt tms =
271 runPGSQuery queryNgramsOnlyByNodeUser
272 ( Values fields (DPS.Only <$> tms)
273 , Values [QualifiedIdentifier Nothing "int4"]
274 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
276 , nodeTypeId NodeDocument
280 fields = [QualifiedIdentifier Nothing "text"]
282 queryNgramsOnlyByNodeUser :: DPS.Query
283 queryNgramsOnlyByNodeUser = [sql|
284 WITH input_rows(terms) AS (?),
285 input_list(id) AS (?)
286 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
287 JOIN ngrams ng ON nng.ngrams_id = ng.id
288 JOIN input_rows ir ON ir.terms = ng.terms
289 JOIN input_list il ON il.id = nng.node1_id
290 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
291 JOIN nodes n ON nn.node2_id = n.id
292 WHERE nn.node1_id = ? -- CorpusId
293 AND n.typename = ? -- NodeTypeId
294 AND nng.ngrams_type = ? -- NgramsTypeId
296 GROUP BY ng.terms, nng.node2_id
300 selectNgramsOnlyByNodeUser' :: CorpusId
304 -> Cmd err [(Text, Int)]
305 selectNgramsOnlyByNodeUser' cId ls nt tms =
306 runPGSQuery queryNgramsOnlyByNodeUser
307 ( Values fields (DPS.Only <$> tms)
308 , Values [QualifiedIdentifier Nothing "int4"]
309 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
311 , nodeTypeId NodeDocument
315 fields = [QualifiedIdentifier Nothing "text"]
317 queryNgramsOnlyByNodeUser' :: DPS.Query
318 queryNgramsOnlyByNodeUser' = [sql|
319 WITH input_rows(terms) AS (?),
320 input_list(id) AS (?)
321 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
322 JOIN ngrams ng ON nng.ngrams_id = ng.id
323 JOIN input_rows ir ON ir.terms = ng.terms
324 JOIN input_list il ON il.id = nng.node2_id
325 WHERE nng.node1_id = ? -- CorpusId
326 AND nng.ngrams_type = ? -- NgramsTypeId
327 -- AND nn.category > 0
328 GROUP BY ng.terms, nng.weight
332 getNgramsByDocOnlyUser :: NodeId
336 -> Cmd err (Map Text (Set NodeId))
337 getNgramsByDocOnlyUser cId ls nt ngs =
339 . map (fromListWith (<>) . map (second Set.singleton))
340 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
343 selectNgramsOnlyByDocUser :: DocId
347 -> Cmd err [(Text, NodeId)]
348 selectNgramsOnlyByDocUser dId ls nt tms =
349 runPGSQuery queryNgramsOnlyByDocUser
350 ( Values fields (DPS.Only <$> tms)
351 , Values [QualifiedIdentifier Nothing "int4"]
352 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
357 fields = [QualifiedIdentifier Nothing "text"]
360 queryNgramsOnlyByDocUser :: DPS.Query
361 queryNgramsOnlyByDocUser = [sql|
362 WITH input_rows(terms) AS (?),
363 input_list(id) AS (?)
364 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
365 JOIN ngrams ng ON nng.ngrams_id = ng.id
366 JOIN input_rows ir ON ir.terms = ng.terms
367 JOIN input_list il ON il.id = nng.node1_id
368 WHERE nng.node2_id = ? -- DocId
369 AND nng.ngrams_type = ? -- NgramsTypeId
370 GROUP BY ng.terms, nng.node2_id
373 ------------------------------------------------------------------------
374 -- | TODO filter by language, database, any social field
375 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
376 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
377 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
378 -- . takeWhile (not . List.null)
379 -- . takeWhile (\l -> List.length l > 3)
380 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
382 selectNgramsByNodeMaster :: Int
386 -> Cmd err [(NodeId, Text)]
387 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
388 queryNgramsByNodeMaster'
390 , ngramsTypeId NgramsTerms
391 , nodeTypeId NodeDocument
393 , nodeTypeId NodeDocument
397 , nodeTypeId NodeDocument
398 , ngramsTypeId NgramsTerms
401 -- | TODO fix node_node_ngrams relation
402 queryNgramsByNodeMaster' :: DPS.Query
403 queryNgramsByNodeMaster' = [sql|
404 WITH nodesByNgramsUser AS (
406 SELECT n.id, ng.terms FROM nodes n
407 JOIN nodes_nodes nn ON n.id = nn.node2_id
408 JOIN node_node_ngrams nng ON nng.node2_id = n.id
409 JOIN ngrams ng ON nng.ngrams_id = ng.id
410 WHERE nn.node1_id = ? -- UserCorpusId
411 -- AND n.typename = ? -- NodeTypeId
412 AND nng.ngrams_type = ? -- NgramsTypeId
414 AND node_pos(n.id,?) >= ?
415 AND node_pos(n.id,?) < ?
416 GROUP BY n.id, ng.terms
420 nodesByNgramsMaster AS (
422 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
423 JOIN node_node_ngrams nng ON n.id = nng.node2_id
424 JOIN ngrams ng ON ng.id = nng.ngrams_id
426 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
427 AND n.typename = ? -- NodeTypeId
428 AND nng.ngrams_type = ? -- NgramsTypeId
429 GROUP BY n.id, ng.terms
432 SELECT m.id, m.terms FROM nodesByNgramsMaster m
433 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id