2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE FlexibleContexts #-}
15 {-# LANGUAGE QuasiQuotes #-}
16 {-# LANGUAGE NoImplicitPrelude #-}
17 {-# LANGUAGE OverloadedStrings #-}
18 {-# LANGUAGE RankNTypes #-}
20 module Gargantext.Database.Action.Metrics.NgramsByNode
23 import Data.Map.Strict (Map, fromListWith, elems, toList, fromList)
24 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
26 import Data.Text (Text)
27 import Data.Tuple.Extra (second, swap)
28 import Database.PostgreSQL.Simple.SqlQQ (sql)
29 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
30 import Debug.Trace (trace)
31 import Gargantext.Core (Lang(..))
32 import Gargantext.Database.Admin.Config (nodeTypeId)
33 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
34 import Gargantext.Database.Admin.Utils (Cmd, runPGSQuery)
35 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
36 import Gargantext.Prelude
37 import Gargantext.Text.Metrics.TFICF
38 import Gargantext.Text.Terms.Mono.Stem (stem)
39 import qualified Data.List as List
40 import qualified Data.Map.Strict as Map
41 import qualified Data.Set as Set
42 import qualified Data.Text as Text
43 import qualified Database.PostgreSQL.Simple as DPS
45 -- | TODO: group with 2 terms only can be
46 -- discussed. Main purpose of this is offering
47 -- a first grouping option to user and get some
48 -- enriched data to better learn and improve that algo
54 ngramsGroup l _m _n = Text.intercalate " "
58 -- . (List.filter (\t -> Text.length t > m))
60 . Text.replace "-" " "
63 sortTficf :: (Map Text (Double, Set Text))
64 -> [ (Text,(Double, Set Text))]
65 sortTficf = List.sortOn (fst . snd) . toList
68 getTficf :: UserCorpusId
72 -> Cmd err (Map Text (Double, Set Text))
73 getTficf u m nt f = do
74 u' <- getNodesByNgramsUser u nt
75 m' <- getNodesByNgramsMaster u m
77 pure $ toTficfData (countNodesByNgramsWith f u')
78 (countNodesByNgramsWith f m')
81 getTficfWith :: UserCorpusId
85 -> Map Text (Maybe Text)
86 -> Cmd err (Map Text (Double, Set Text))
87 getTficfWith u m ls nt mtxt = do
88 u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
89 m' <- getNodesByNgramsMaster u m
91 let f x = case Map.lookup x mtxt of
93 Just x' -> maybe x identity x'
95 pure $ toTficfData (countNodesByNgramsWith f u')
96 (countNodesByNgramsWith f m')
99 type Context = (Double, Map Text (Double, Set Text))
105 -> Map Text (Double, Set Text)
106 toTficfData (ti, mi) (ts, ms) =
107 fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
108 (TficfSupra (Count $ maybe 0 fst $ Map.lookup t ms)(Total ts))
112 | (t, (n,ns)) <- toList mi
116 -- | fst is size of Supra Corpus
117 -- snd is Texts and size of Occurrences (different docs)
118 countNodesByNgramsWith :: (Text -> Text)
119 -> Map Text (Set NodeId)
120 -> (Double, Map Text (Double, Set Text))
121 countNodesByNgramsWith f m = (total, m')
123 total = fromIntegral $ Set.size $ Set.unions $ elems m
124 m' = Map.map ( swap . second (fromIntegral . Set.size))
125 $ groupNodesByNgramsWith f m
128 groupNodesByNgramsWith :: (Text -> Text)
129 -> Map Text (Set NodeId)
130 -> Map Text (Set Text, Set NodeId)
131 groupNodesByNgramsWith f m =
132 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
135 ------------------------------------------------------------------------
136 getNodesByNgramsUser :: CorpusId
138 -> Cmd err (Map Text (Set NodeId))
139 getNodesByNgramsUser cId nt =
140 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
141 <$> selectNgramsByNodeUser cId nt
144 selectNgramsByNodeUser :: CorpusId
146 -> Cmd err [(NodeId, Text)]
147 selectNgramsByNodeUser cId' nt' =
148 runPGSQuery queryNgramsByNodeUser
150 , nodeTypeId NodeDocument
152 -- , 100 :: Int -- limit
153 -- , 0 :: Int -- offset
156 queryNgramsByNodeUser :: DPS.Query
157 queryNgramsByNodeUser = [sql|
158 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
159 JOIN ngrams ng ON nng.ngrams_id = ng.id
160 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
161 JOIN nodes n ON nn.node2_id = n.id
162 WHERE nn.node1_id = ? -- CorpusId
163 AND n.typename = ? -- NodeTypeId
164 AND nng.ngrams_type = ? -- NgramsTypeId
166 GROUP BY nng.node2_id, ng.terms
167 ORDER BY (nng.node2_id, ng.terms) DESC
171 ------------------------------------------------------------------------
173 getOccByNgramsOnlyFast :: CorpusId
176 -> Cmd err (Map Text Int)
177 getOccByNgramsOnlyFast cId nt ngs =
178 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
181 getOccByNgramsOnlyFast' :: CorpusId
185 -> Cmd err (Map Text Int)
186 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
187 fromListWith (+) <$> map (second round) <$> run cId lId nt tms
190 fields = [QualifiedIdentifier Nothing "text"]
196 -> Cmd err [(Text, Double)]
197 run cId' lId' nt' tms' = runPGSQuery query
198 ( Values fields (DPS.Only <$> tms')
206 WITH input_rows(terms) AS (?)
207 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
208 JOIN ngrams ng ON nng.ngrams_id = ng.id
209 JOIN input_rows ir ON ir.terms = ng.terms
210 WHERE nng.node1_id = ? -- CorpusId
211 AND nng.node2_id = ? -- ListId
212 AND nng.ngrams_type = ? -- NgramsTypeId
213 -- AND nn.category > 0 -- TODO
214 GROUP BY ng.terms, nng.weight
218 -- just slower than getOccByNgramsOnlyFast
219 getOccByNgramsOnlySlow :: NodeType
224 -> Cmd err (Map Text Int)
225 getOccByNgramsOnlySlow t cId ls nt ngs =
226 Map.map Set.size <$> getScore' t cId ls nt ngs
228 getScore' NodeCorpus = getNodesByNgramsOnlyUser
229 getScore' NodeDocument = getNgramsByDocOnlyUser
230 getScore' _ = getNodesByNgramsOnlyUser
232 getOccByNgramsOnlySafe :: CorpusId
236 -> Cmd err (Map Text Int)
237 getOccByNgramsOnlySafe cId ls nt ngs = do
238 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
239 fast <- getOccByNgramsOnlyFast cId nt ngs
240 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
241 when (fast /= slow) $
242 printDebug "getOccByNgramsOnlySafe: difference"
243 (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
247 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
250 -> Cmd err [(Text, Int)]
251 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
252 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
253 ( Values fields (DPS.Only <$> tms)
255 , nodeTypeId NodeDocument
259 fields = [QualifiedIdentifier Nothing "text"]
261 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
262 -- Question: with the grouping is the result exactly the same (since Set NodeId for
263 -- equivalent ngrams intersections are not empty)
264 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
265 queryNgramsOccurrencesOnlyByNodeUser = [sql|
266 WITH input_rows(terms) AS (?)
267 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
268 JOIN ngrams ng ON nng.ngrams_id = ng.id
269 JOIN input_rows ir ON ir.terms = ng.terms
270 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
271 JOIN nodes n ON nn.node2_id = n.id
272 WHERE nn.node1_id = ? -- CorpusId
273 AND n.typename = ? -- NodeTypeId
274 AND nng.ngrams_type = ? -- NgramsTypeId
276 GROUP BY nng.node2_id, ng.terms
279 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
280 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
281 WITH input_rows(terms) AS (?)
282 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
283 JOIN ngrams ng ON nng.ngrams_id = ng.id
284 JOIN input_rows ir ON ir.terms = ng.terms
285 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
286 JOIN nodes n ON nn.node2_id = n.id
287 WHERE nn.node1_id = ? -- CorpusId
288 AND n.typename = ? -- NodeTypeId
289 AND nng.ngrams_type = ? -- NgramsTypeId
291 GROUP BY nng.node2_id, ng.terms
294 ------------------------------------------------------------------------
295 getNodesByNgramsOnlyUser :: NodeId
299 -> Cmd err (Map Text (Set NodeId))
300 getNodesByNgramsOnlyUser cId ls nt ngs =
302 . map (fromListWith (<>)
303 . map (second Set.singleton))
304 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
305 (splitEvery 1000 ngs)
308 getNgramsByNodeOnlyUser :: NodeId
312 -> Cmd err (Map NodeId (Set Text))
313 getNgramsByNodeOnlyUser cId ls nt ngs =
315 . map (fromListWith (<>)
316 . map (second Set.singleton))
318 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
319 (splitEvery 1000 ngs)
321 ------------------------------------------------------------------------
322 selectNgramsOnlyByNodeUser :: CorpusId
326 -> Cmd err [(Text, NodeId)]
327 selectNgramsOnlyByNodeUser cId ls nt tms =
328 runPGSQuery queryNgramsOnlyByNodeUser
329 ( Values fields (DPS.Only <$> tms)
330 , Values [QualifiedIdentifier Nothing "int4"]
331 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
333 , nodeTypeId NodeDocument
337 fields = [QualifiedIdentifier Nothing "text"]
339 queryNgramsOnlyByNodeUser :: DPS.Query
340 queryNgramsOnlyByNodeUser = [sql|
341 WITH input_rows(terms) AS (?),
342 input_list(id) AS (?)
343 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
344 JOIN ngrams ng ON nng.ngrams_id = ng.id
345 JOIN input_rows ir ON ir.terms = ng.terms
346 JOIN input_list il ON il.id = nng.node1_id
347 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
348 JOIN nodes n ON nn.node2_id = n.id
349 WHERE nn.node1_id = ? -- CorpusId
350 AND n.typename = ? -- NodeTypeId
351 AND nng.ngrams_type = ? -- NgramsTypeId
353 GROUP BY ng.terms, nng.node2_id
357 selectNgramsOnlyByNodeUser' :: CorpusId
361 -> Cmd err [(Text, Int)]
362 selectNgramsOnlyByNodeUser' cId ls nt tms =
363 runPGSQuery queryNgramsOnlyByNodeUser
364 ( Values fields (DPS.Only <$> tms)
365 , Values [QualifiedIdentifier Nothing "int4"]
366 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
368 , nodeTypeId NodeDocument
372 fields = [QualifiedIdentifier Nothing "text"]
374 queryNgramsOnlyByNodeUser' :: DPS.Query
375 queryNgramsOnlyByNodeUser' = [sql|
376 WITH input_rows(terms) AS (?),
377 input_list(id) AS (?)
378 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
379 JOIN ngrams ng ON nng.ngrams_id = ng.id
380 JOIN input_rows ir ON ir.terms = ng.terms
381 JOIN input_list il ON il.id = nng.node2_id
382 WHERE nng.node1_id = ? -- CorpusId
383 AND nng.ngrams_type = ? -- NgramsTypeId
384 -- AND nn.category > 0
385 GROUP BY ng.terms, nng.weight
389 getNgramsByDocOnlyUser :: NodeId
393 -> Cmd err (Map Text (Set NodeId))
394 getNgramsByDocOnlyUser cId ls nt ngs =
396 . map (fromListWith (<>) . map (second Set.singleton))
397 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
400 selectNgramsOnlyByDocUser :: DocId
404 -> Cmd err [(Text, NodeId)]
405 selectNgramsOnlyByDocUser dId ls nt tms =
406 runPGSQuery queryNgramsOnlyByDocUser
407 ( Values fields (DPS.Only <$> tms)
408 , Values [QualifiedIdentifier Nothing "int4"]
409 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
414 fields = [QualifiedIdentifier Nothing "text"]
417 queryNgramsOnlyByDocUser :: DPS.Query
418 queryNgramsOnlyByDocUser = [sql|
419 WITH input_rows(terms) AS (?),
420 input_list(id) AS (?)
421 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
422 JOIN ngrams ng ON nng.ngrams_id = ng.id
423 JOIN input_rows ir ON ir.terms = ng.terms
424 JOIN input_list il ON il.id = nng.node1_id
425 WHERE nng.node2_id = ? -- DocId
426 AND nng.ngrams_type = ? -- NgramsTypeId
427 GROUP BY ng.terms, nng.node2_id
430 ------------------------------------------------------------------------
431 -- | TODO filter by language, database, any social field
432 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
433 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
434 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
435 -- . takeWhile (not . List.null)
436 -- . takeWhile (\l -> List.length l > 3)
437 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
439 selectNgramsByNodeMaster :: Int
443 -> Cmd err [(NodeId, Text)]
444 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
445 queryNgramsByNodeMaster'
447 , ngramsTypeId NgramsTerms
448 , nodeTypeId NodeDocument
450 , nodeTypeId NodeDocument
454 , nodeTypeId NodeDocument
455 , ngramsTypeId NgramsTerms
458 -- | TODO fix node_node_ngrams relation
459 queryNgramsByNodeMaster' :: DPS.Query
460 queryNgramsByNodeMaster' = [sql|
461 WITH nodesByNgramsUser AS (
463 SELECT n.id, ng.terms FROM nodes n
464 JOIN nodes_nodes nn ON n.id = nn.node2_id
465 JOIN node_node_ngrams nng ON nng.node2_id = n.id
466 JOIN ngrams ng ON nng.ngrams_id = ng.id
467 WHERE nn.node1_id = ? -- UserCorpusId
468 -- AND n.typename = ? -- NodeTypeId
469 AND nng.ngrams_type = ? -- NgramsTypeId
471 AND node_pos(n.id,?) >= ?
472 AND node_pos(n.id,?) < ?
473 GROUP BY n.id, ng.terms
477 nodesByNgramsMaster AS (
479 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
480 JOIN node_node_ngrams nng ON n.id = nng.node2_id
481 JOIN ngrams ng ON ng.id = nng.ngrams_id
483 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
484 AND n.typename = ? -- NodeTypeId
485 AND nng.ngrams_type = ? -- NgramsTypeId
486 GROUP BY n.id, ng.terms
489 SELECT m.id, m.terms FROM nodesByNgramsMaster m
490 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id