2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByNode
19 import Data.Map.Strict (Map, fromListWith, elems, toList, fromList)
20 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
22 import Data.Text (Text)
23 import Data.Tuple.Extra (second, swap)
24 import Database.PostgreSQL.Simple.SqlQQ (sql)
25 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
26 import Debug.Trace (trace)
27 import Gargantext.Core (Lang(..))
28 import Gargantext.Database.Admin.Config (nodeTypeId)
29 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
30 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
31 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
32 import Gargantext.Prelude
33 import Gargantext.Core.Text.Metrics.TFICF
34 import Gargantext.Core.Text.Terms.Mono.Stem (stem)
35 import qualified Data.List as List
36 import qualified Data.Map.Strict as Map
37 import qualified Data.Set as Set
38 import qualified Data.Text as Text
39 import qualified Database.PostgreSQL.Simple as DPS
41 -- | TODO: group with 2 terms only can be
42 -- discussed. Main purpose of this is offering
43 -- a first grouping option to user and get some
44 -- enriched data to better learn and improve that algo
50 ngramsGroup l _m _n = Text.intercalate " "
54 -- . (List.filter (\t -> Text.length t > m))
56 . Text.replace "-" " "
59 getTficf :: UserCorpusId
63 -> Cmd err (Map Text (Double, Set Text))
64 getTficf u m nt f = do
65 u' <- Map.filter (\s -> Set.size s > 1) <$> getNodesByNgramsUser u nt
66 m' <- Map.filter (\s -> Set.size s > 1) <$> getNodesByNgramsMaster u m
68 pure $ toTficfData (countNodesByNgramsWith f u')
69 (countNodesByNgramsWith f m')
72 getTficfWith :: UserCorpusId
76 -> Map Text (Maybe Text)
77 -> Cmd err (Map Text (Double, Set Text))
78 getTficfWith u m ls nt mtxt = do
79 u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
80 m' <- getNodesByNgramsMaster u m
82 let f x = case Map.lookup x mtxt of
84 Just x' -> maybe x identity x'
86 pure $ toTficfData (countNodesByNgramsWith f u') (countNodesByNgramsWith f m')
89 type Context = (Double, Map Text (Double, Set Text))
95 -> Map Text (Double, Set Text)
96 toTficfData (ti, mi) (ts, ms) =
97 fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
98 (TficfSupra (Count $ maybe 0 fst $ Map.lookup t ms)(Total ts))
102 | (t, (n,ns)) <- toList mi
106 -- | fst is size of Supra Corpus
107 -- snd is Texts and size of Occurrences (different docs)
108 countNodesByNgramsWith :: (Text -> Text)
109 -> Map Text (Set NodeId)
110 -> (Double, Map Text (Double, Set Text))
111 countNodesByNgramsWith f m = (total, m')
113 total = fromIntegral $ Set.size $ Set.unions $ elems m
114 m' = Map.map ( swap . second (fromIntegral . Set.size))
115 $ groupNodesByNgramsWith f m
118 groupNodesByNgramsWith :: (Text -> Text)
119 -> Map Text (Set NodeId)
120 -> Map Text (Set Text, Set NodeId)
121 groupNodesByNgramsWith f m =
122 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
125 ------------------------------------------------------------------------
126 getNodesByNgramsUser :: CorpusId
128 -> Cmd err (Map Text (Set NodeId))
129 getNodesByNgramsUser cId nt =
130 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
131 <$> selectNgramsByNodeUser cId nt
134 selectNgramsByNodeUser :: CorpusId
136 -> Cmd err [(NodeId, Text)]
137 selectNgramsByNodeUser cId' nt' =
138 runPGSQuery queryNgramsByNodeUser
140 , nodeTypeId NodeDocument
142 -- , 100 :: Int -- limit
143 -- , 0 :: Int -- offset
146 queryNgramsByNodeUser :: DPS.Query
147 queryNgramsByNodeUser = [sql|
148 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
149 JOIN ngrams ng ON nng.ngrams_id = ng.id
150 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
151 JOIN nodes n ON nn.node2_id = n.id
152 WHERE nn.node1_id = ? -- CorpusId
153 AND n.typename = ? -- NodeTypeId
154 AND nng.ngrams_type = ? -- NgramsTypeId
156 GROUP BY nng.node2_id, ng.terms
157 ORDER BY (nng.node2_id, ng.terms) DESC
161 ------------------------------------------------------------------------
163 getOccByNgramsOnlyFast :: CorpusId
166 -> Cmd err (Map Text Int)
167 getOccByNgramsOnlyFast cId nt ngs =
168 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
171 getOccByNgramsOnlyFast' :: CorpusId
175 -> Cmd err (Map Text Int)
176 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
177 fromListWith (+) <$> map (second round) <$> run cId lId nt tms
180 fields = [QualifiedIdentifier Nothing "text"]
186 -> Cmd err [(Text, Double)]
187 run cId' lId' nt' tms' = runPGSQuery query
188 ( Values fields (DPS.Only <$> tms')
196 WITH input_rows(terms) AS (?)
197 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
198 JOIN ngrams ng ON nng.ngrams_id = ng.id
199 JOIN input_rows ir ON ir.terms = ng.terms
200 WHERE nng.node1_id = ? -- CorpusId
201 AND nng.node2_id = ? -- ListId
202 AND nng.ngrams_type = ? -- NgramsTypeId
203 -- AND nn.category > 0 -- TODO
204 GROUP BY ng.terms, nng.weight
208 -- just slower than getOccByNgramsOnlyFast
209 getOccByNgramsOnlySlow :: NodeType
214 -> Cmd err (Map Text Int)
215 getOccByNgramsOnlySlow t cId ls nt ngs =
216 Map.map Set.size <$> getScore' t cId ls nt ngs
218 getScore' NodeCorpus = getNodesByNgramsOnlyUser
219 getScore' NodeDocument = getNgramsByDocOnlyUser
220 getScore' _ = getNodesByNgramsOnlyUser
222 getOccByNgramsOnlySafe :: CorpusId
226 -> Cmd err (Map Text Int)
227 getOccByNgramsOnlySafe cId ls nt ngs = do
228 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
229 fast <- getOccByNgramsOnlyFast cId nt ngs
230 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
231 when (fast /= slow) $
232 printDebug "getOccByNgramsOnlySafe: difference"
233 (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
237 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
240 -> Cmd err [(Text, Int)]
241 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
242 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
243 ( Values fields (DPS.Only <$> tms)
245 , nodeTypeId NodeDocument
249 fields = [QualifiedIdentifier Nothing "text"]
251 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
252 -- Question: with the grouping is the result exactly the same (since Set NodeId for
253 -- equivalent ngrams intersections are not empty)
254 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
255 queryNgramsOccurrencesOnlyByNodeUser = [sql|
256 WITH input_rows(terms) AS (?)
257 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
258 JOIN ngrams ng ON nng.ngrams_id = ng.id
259 JOIN input_rows ir ON ir.terms = ng.terms
260 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
261 JOIN nodes n ON nn.node2_id = n.id
262 WHERE nn.node1_id = ? -- CorpusId
263 AND n.typename = ? -- NodeTypeId
264 AND nng.ngrams_type = ? -- NgramsTypeId
266 GROUP BY nng.node2_id, ng.terms
269 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
270 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
271 WITH input_rows(terms) AS (?)
272 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
273 JOIN ngrams ng ON nng.ngrams_id = ng.id
274 JOIN input_rows ir ON ir.terms = ng.terms
275 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
276 JOIN nodes n ON nn.node2_id = n.id
277 WHERE nn.node1_id = ? -- CorpusId
278 AND n.typename = ? -- NodeTypeId
279 AND nng.ngrams_type = ? -- NgramsTypeId
281 GROUP BY nng.node2_id, ng.terms
284 ------------------------------------------------------------------------
285 getNodesByNgramsOnlyUser :: NodeId
289 -> Cmd err (Map Text (Set NodeId))
290 getNodesByNgramsOnlyUser cId ls nt ngs =
292 . map (fromListWith (<>)
293 . map (second Set.singleton))
294 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
295 (splitEvery 1000 ngs)
298 getNgramsByNodeOnlyUser :: NodeId
302 -> Cmd err (Map NodeId (Set Text))
303 getNgramsByNodeOnlyUser cId ls nt ngs =
305 . map (fromListWith (<>)
306 . map (second Set.singleton))
308 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
309 (splitEvery 1000 ngs)
311 ------------------------------------------------------------------------
312 selectNgramsOnlyByNodeUser :: CorpusId
316 -> Cmd err [(Text, NodeId)]
317 selectNgramsOnlyByNodeUser cId ls nt tms =
318 runPGSQuery queryNgramsOnlyByNodeUser
319 ( Values fields (DPS.Only <$> tms)
320 , Values [QualifiedIdentifier Nothing "int4"]
321 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
323 , nodeTypeId NodeDocument
327 fields = [QualifiedIdentifier Nothing "text"]
329 queryNgramsOnlyByNodeUser :: DPS.Query
330 queryNgramsOnlyByNodeUser = [sql|
331 WITH input_rows(terms) AS (?),
332 input_list(id) AS (?)
333 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
334 JOIN ngrams ng ON nng.ngrams_id = ng.id
335 JOIN input_rows ir ON ir.terms = ng.terms
336 JOIN input_list il ON il.id = nng.node1_id
337 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
338 JOIN nodes n ON nn.node2_id = n.id
339 WHERE nn.node1_id = ? -- CorpusId
340 AND n.typename = ? -- NodeTypeId
341 AND nng.ngrams_type = ? -- NgramsTypeId
343 GROUP BY ng.terms, nng.node2_id
347 selectNgramsOnlyByNodeUser' :: CorpusId
351 -> Cmd err [(Text, Int)]
352 selectNgramsOnlyByNodeUser' cId ls nt tms =
353 runPGSQuery queryNgramsOnlyByNodeUser
354 ( Values fields (DPS.Only <$> tms)
355 , Values [QualifiedIdentifier Nothing "int4"]
356 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
358 , nodeTypeId NodeDocument
362 fields = [QualifiedIdentifier Nothing "text"]
364 queryNgramsOnlyByNodeUser' :: DPS.Query
365 queryNgramsOnlyByNodeUser' = [sql|
366 WITH input_rows(terms) AS (?),
367 input_list(id) AS (?)
368 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
369 JOIN ngrams ng ON nng.ngrams_id = ng.id
370 JOIN input_rows ir ON ir.terms = ng.terms
371 JOIN input_list il ON il.id = nng.node2_id
372 WHERE nng.node1_id = ? -- CorpusId
373 AND nng.ngrams_type = ? -- NgramsTypeId
374 -- AND nn.category > 0
375 GROUP BY ng.terms, nng.weight
379 getNgramsByDocOnlyUser :: NodeId
383 -> Cmd err (Map Text (Set NodeId))
384 getNgramsByDocOnlyUser cId ls nt ngs =
386 . map (fromListWith (<>) . map (second Set.singleton))
387 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
390 selectNgramsOnlyByDocUser :: DocId
394 -> Cmd err [(Text, NodeId)]
395 selectNgramsOnlyByDocUser dId ls nt tms =
396 runPGSQuery queryNgramsOnlyByDocUser
397 ( Values fields (DPS.Only <$> tms)
398 , Values [QualifiedIdentifier Nothing "int4"]
399 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
404 fields = [QualifiedIdentifier Nothing "text"]
407 queryNgramsOnlyByDocUser :: DPS.Query
408 queryNgramsOnlyByDocUser = [sql|
409 WITH input_rows(terms) AS (?),
410 input_list(id) AS (?)
411 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
412 JOIN ngrams ng ON nng.ngrams_id = ng.id
413 JOIN input_rows ir ON ir.terms = ng.terms
414 JOIN input_list il ON il.id = nng.node1_id
415 WHERE nng.node2_id = ? -- DocId
416 AND nng.ngrams_type = ? -- NgramsTypeId
417 GROUP BY ng.terms, nng.node2_id
420 ------------------------------------------------------------------------
421 -- | TODO filter by language, database, any social field
422 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
423 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
424 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
425 -- . takeWhile (not . List.null)
426 -- . takeWhile (\l -> List.length l > 3)
427 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
429 selectNgramsByNodeMaster :: Int
433 -> Cmd err [(NodeId, Text)]
434 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
435 queryNgramsByNodeMaster'
437 , ngramsTypeId NgramsTerms
438 , nodeTypeId NodeDocument
440 , nodeTypeId NodeDocument
444 , nodeTypeId NodeDocument
445 , ngramsTypeId NgramsTerms
448 -- | TODO fix node_node_ngrams relation
449 queryNgramsByNodeMaster' :: DPS.Query
450 queryNgramsByNodeMaster' = [sql|
451 WITH nodesByNgramsUser AS (
453 SELECT n.id, ng.terms FROM nodes n
454 JOIN nodes_nodes nn ON n.id = nn.node2_id
455 JOIN node_node_ngrams nng ON nng.node2_id = n.id
456 JOIN ngrams ng ON nng.ngrams_id = ng.id
457 WHERE nn.node1_id = ? -- UserCorpusId
458 -- AND n.typename = ? -- NodeTypeId
459 AND nng.ngrams_type = ? -- NgramsTypeId
461 AND node_pos(n.id,?) >= ?
462 AND node_pos(n.id,?) < ?
463 GROUP BY n.id, ng.terms
467 nodesByNgramsMaster AS (
469 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
470 JOIN node_node_ngrams nng ON n.id = nng.node2_id
471 JOIN ngrams ng ON ng.id = nng.ngrams_id
473 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
474 AND n.typename = ? -- NodeTypeId
475 AND nng.ngrams_type = ? -- NgramsTypeId
476 GROUP BY n.id, ng.terms
479 SELECT m.id, m.terms FROM nodesByNgramsMaster m
480 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id