2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByNode
19 import Data.Map.Strict (Map, fromListWith, elems, toList, fromList)
20 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
22 import Data.Text (Text)
23 import Data.Tuple.Extra (second, swap)
24 import Database.PostgreSQL.Simple.SqlQQ (sql)
25 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
26 import Debug.Trace (trace)
27 import Gargantext.Core (Lang(..))
28 import Gargantext.Database.Admin.Config (nodeTypeId)
29 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
30 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
31 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
32 import Gargantext.Prelude
33 import Gargantext.Text.Metrics.TFICF
34 import Gargantext.Text.Terms.Mono.Stem (stem)
35 import qualified Data.List as List
36 import qualified Data.Map.Strict as Map
37 import qualified Data.Set as Set
38 import qualified Data.Text as Text
39 import qualified Database.PostgreSQL.Simple as DPS
41 -- | TODO: group with 2 terms only can be
42 -- discussed. Main purpose of this is offering
43 -- a first grouping option to user and get some
44 -- enriched data to better learn and improve that algo
50 ngramsGroup l _m _n = Text.intercalate " "
54 -- . (List.filter (\t -> Text.length t > m))
56 . Text.replace "-" " "
60 getTficf :: UserCorpusId
64 -> Cmd err (Map Text (Double, Set Text))
65 getTficf u m nt f = do
66 u' <- Map.filter (\s -> Set.size s > 1) <$> getNodesByNgramsUser u nt
67 m' <- Map.filter (\s -> Set.size s > 1) <$> getNodesByNgramsMaster u m
69 pure $ toTficfData (countNodesByNgramsWith f u')
70 (countNodesByNgramsWith f m')
73 getTficfWith :: UserCorpusId
77 -> Map Text (Maybe Text)
78 -> Cmd err (Map Text (Double, Set Text))
79 getTficfWith u m ls nt mtxt = do
80 u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
81 m' <- getNodesByNgramsMaster u m
83 let f x = case Map.lookup x mtxt of
85 Just x' -> maybe x identity x'
87 pure $ toTficfData (countNodesByNgramsWith f u') (countNodesByNgramsWith f m')
90 type Context = (Double, Map Text (Double, Set Text))
96 -> Map Text (Double, Set Text)
97 toTficfData (ti, mi) (ts, ms) =
98 fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
99 (TficfSupra (Count $ maybe 0 fst $ Map.lookup t ms)(Total ts))
103 | (t, (n,ns)) <- toList mi
107 -- | fst is size of Supra Corpus
108 -- snd is Texts and size of Occurrences (different docs)
109 countNodesByNgramsWith :: (Text -> Text)
110 -> Map Text (Set NodeId)
111 -> (Double, Map Text (Double, Set Text))
112 countNodesByNgramsWith f m = (total, m')
114 total = fromIntegral $ Set.size $ Set.unions $ elems m
115 m' = Map.map ( swap . second (fromIntegral . Set.size))
116 $ groupNodesByNgramsWith f m
119 groupNodesByNgramsWith :: (Text -> Text)
120 -> Map Text (Set NodeId)
121 -> Map Text (Set Text, Set NodeId)
122 groupNodesByNgramsWith f m =
123 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
126 ------------------------------------------------------------------------
127 getNodesByNgramsUser :: CorpusId
129 -> Cmd err (Map Text (Set NodeId))
130 getNodesByNgramsUser cId nt =
131 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
132 <$> selectNgramsByNodeUser cId nt
135 selectNgramsByNodeUser :: CorpusId
137 -> Cmd err [(NodeId, Text)]
138 selectNgramsByNodeUser cId' nt' =
139 runPGSQuery queryNgramsByNodeUser
141 , nodeTypeId NodeDocument
143 -- , 100 :: Int -- limit
144 -- , 0 :: Int -- offset
147 queryNgramsByNodeUser :: DPS.Query
148 queryNgramsByNodeUser = [sql|
149 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
150 JOIN ngrams ng ON nng.ngrams_id = ng.id
151 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
152 JOIN nodes n ON nn.node2_id = n.id
153 WHERE nn.node1_id = ? -- CorpusId
154 AND n.typename = ? -- NodeTypeId
155 AND nng.ngrams_type = ? -- NgramsTypeId
157 GROUP BY nng.node2_id, ng.terms
158 ORDER BY (nng.node2_id, ng.terms) DESC
162 ------------------------------------------------------------------------
164 getOccByNgramsOnlyFast :: CorpusId
167 -> Cmd err (Map Text Int)
168 getOccByNgramsOnlyFast cId nt ngs =
169 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
172 getOccByNgramsOnlyFast' :: CorpusId
176 -> Cmd err (Map Text Int)
177 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
178 fromListWith (+) <$> map (second round) <$> run cId lId nt tms
181 fields = [QualifiedIdentifier Nothing "text"]
187 -> Cmd err [(Text, Double)]
188 run cId' lId' nt' tms' = runPGSQuery query
189 ( Values fields (DPS.Only <$> tms')
197 WITH input_rows(terms) AS (?)
198 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
199 JOIN ngrams ng ON nng.ngrams_id = ng.id
200 JOIN input_rows ir ON ir.terms = ng.terms
201 WHERE nng.node1_id = ? -- CorpusId
202 AND nng.node2_id = ? -- ListId
203 AND nng.ngrams_type = ? -- NgramsTypeId
204 -- AND nn.category > 0 -- TODO
205 GROUP BY ng.terms, nng.weight
209 -- just slower than getOccByNgramsOnlyFast
210 getOccByNgramsOnlySlow :: NodeType
215 -> Cmd err (Map Text Int)
216 getOccByNgramsOnlySlow t cId ls nt ngs =
217 Map.map Set.size <$> getScore' t cId ls nt ngs
219 getScore' NodeCorpus = getNodesByNgramsOnlyUser
220 getScore' NodeDocument = getNgramsByDocOnlyUser
221 getScore' _ = getNodesByNgramsOnlyUser
223 getOccByNgramsOnlySafe :: CorpusId
227 -> Cmd err (Map Text Int)
228 getOccByNgramsOnlySafe cId ls nt ngs = do
229 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
230 fast <- getOccByNgramsOnlyFast cId nt ngs
231 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
232 when (fast /= slow) $
233 printDebug "getOccByNgramsOnlySafe: difference"
234 (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
238 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
241 -> Cmd err [(Text, Int)]
242 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
243 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
244 ( Values fields (DPS.Only <$> tms)
246 , nodeTypeId NodeDocument
250 fields = [QualifiedIdentifier Nothing "text"]
252 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
253 -- Question: with the grouping is the result exactly the same (since Set NodeId for
254 -- equivalent ngrams intersections are not empty)
255 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
256 queryNgramsOccurrencesOnlyByNodeUser = [sql|
257 WITH input_rows(terms) AS (?)
258 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
259 JOIN ngrams ng ON nng.ngrams_id = ng.id
260 JOIN input_rows ir ON ir.terms = ng.terms
261 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
262 JOIN nodes n ON nn.node2_id = n.id
263 WHERE nn.node1_id = ? -- CorpusId
264 AND n.typename = ? -- NodeTypeId
265 AND nng.ngrams_type = ? -- NgramsTypeId
267 GROUP BY nng.node2_id, ng.terms
270 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
271 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
272 WITH input_rows(terms) AS (?)
273 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
274 JOIN ngrams ng ON nng.ngrams_id = ng.id
275 JOIN input_rows ir ON ir.terms = ng.terms
276 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
277 JOIN nodes n ON nn.node2_id = n.id
278 WHERE nn.node1_id = ? -- CorpusId
279 AND n.typename = ? -- NodeTypeId
280 AND nng.ngrams_type = ? -- NgramsTypeId
282 GROUP BY nng.node2_id, ng.terms
285 ------------------------------------------------------------------------
286 getNodesByNgramsOnlyUser :: NodeId
290 -> Cmd err (Map Text (Set NodeId))
291 getNodesByNgramsOnlyUser cId ls nt ngs =
293 . map (fromListWith (<>)
294 . map (second Set.singleton))
295 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
296 (splitEvery 1000 ngs)
299 getNgramsByNodeOnlyUser :: NodeId
303 -> Cmd err (Map NodeId (Set Text))
304 getNgramsByNodeOnlyUser cId ls nt ngs =
306 . map (fromListWith (<>)
307 . map (second Set.singleton))
309 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
310 (splitEvery 1000 ngs)
312 ------------------------------------------------------------------------
313 selectNgramsOnlyByNodeUser :: CorpusId
317 -> Cmd err [(Text, NodeId)]
318 selectNgramsOnlyByNodeUser cId ls nt tms =
319 runPGSQuery queryNgramsOnlyByNodeUser
320 ( Values fields (DPS.Only <$> tms)
321 , Values [QualifiedIdentifier Nothing "int4"]
322 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
324 , nodeTypeId NodeDocument
328 fields = [QualifiedIdentifier Nothing "text"]
330 queryNgramsOnlyByNodeUser :: DPS.Query
331 queryNgramsOnlyByNodeUser = [sql|
332 WITH input_rows(terms) AS (?),
333 input_list(id) AS (?)
334 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
335 JOIN ngrams ng ON nng.ngrams_id = ng.id
336 JOIN input_rows ir ON ir.terms = ng.terms
337 JOIN input_list il ON il.id = nng.node1_id
338 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
339 JOIN nodes n ON nn.node2_id = n.id
340 WHERE nn.node1_id = ? -- CorpusId
341 AND n.typename = ? -- NodeTypeId
342 AND nng.ngrams_type = ? -- NgramsTypeId
344 GROUP BY ng.terms, nng.node2_id
348 selectNgramsOnlyByNodeUser' :: CorpusId
352 -> Cmd err [(Text, Int)]
353 selectNgramsOnlyByNodeUser' cId ls nt tms =
354 runPGSQuery queryNgramsOnlyByNodeUser
355 ( Values fields (DPS.Only <$> tms)
356 , Values [QualifiedIdentifier Nothing "int4"]
357 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
359 , nodeTypeId NodeDocument
363 fields = [QualifiedIdentifier Nothing "text"]
365 queryNgramsOnlyByNodeUser' :: DPS.Query
366 queryNgramsOnlyByNodeUser' = [sql|
367 WITH input_rows(terms) AS (?),
368 input_list(id) AS (?)
369 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
370 JOIN ngrams ng ON nng.ngrams_id = ng.id
371 JOIN input_rows ir ON ir.terms = ng.terms
372 JOIN input_list il ON il.id = nng.node2_id
373 WHERE nng.node1_id = ? -- CorpusId
374 AND nng.ngrams_type = ? -- NgramsTypeId
375 -- AND nn.category > 0
376 GROUP BY ng.terms, nng.weight
380 getNgramsByDocOnlyUser :: NodeId
384 -> Cmd err (Map Text (Set NodeId))
385 getNgramsByDocOnlyUser cId ls nt ngs =
387 . map (fromListWith (<>) . map (second Set.singleton))
388 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
391 selectNgramsOnlyByDocUser :: DocId
395 -> Cmd err [(Text, NodeId)]
396 selectNgramsOnlyByDocUser dId ls nt tms =
397 runPGSQuery queryNgramsOnlyByDocUser
398 ( Values fields (DPS.Only <$> tms)
399 , Values [QualifiedIdentifier Nothing "int4"]
400 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
405 fields = [QualifiedIdentifier Nothing "text"]
408 queryNgramsOnlyByDocUser :: DPS.Query
409 queryNgramsOnlyByDocUser = [sql|
410 WITH input_rows(terms) AS (?),
411 input_list(id) AS (?)
412 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
413 JOIN ngrams ng ON nng.ngrams_id = ng.id
414 JOIN input_rows ir ON ir.terms = ng.terms
415 JOIN input_list il ON il.id = nng.node1_id
416 WHERE nng.node2_id = ? -- DocId
417 AND nng.ngrams_type = ? -- NgramsTypeId
418 GROUP BY ng.terms, nng.node2_id
421 ------------------------------------------------------------------------
422 -- | TODO filter by language, database, any social field
423 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
424 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
425 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
426 -- . takeWhile (not . List.null)
427 -- . takeWhile (\l -> List.length l > 3)
428 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
430 selectNgramsByNodeMaster :: Int
434 -> Cmd err [(NodeId, Text)]
435 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
436 queryNgramsByNodeMaster'
438 , ngramsTypeId NgramsTerms
439 , nodeTypeId NodeDocument
441 , nodeTypeId NodeDocument
445 , nodeTypeId NodeDocument
446 , ngramsTypeId NgramsTerms
449 -- | TODO fix node_node_ngrams relation
450 queryNgramsByNodeMaster' :: DPS.Query
451 queryNgramsByNodeMaster' = [sql|
452 WITH nodesByNgramsUser AS (
454 SELECT n.id, ng.terms FROM nodes n
455 JOIN nodes_nodes nn ON n.id = nn.node2_id
456 JOIN node_node_ngrams nng ON nng.node2_id = n.id
457 JOIN ngrams ng ON nng.ngrams_id = ng.id
458 WHERE nn.node1_id = ? -- UserCorpusId
459 -- AND n.typename = ? -- NodeTypeId
460 AND nng.ngrams_type = ? -- NgramsTypeId
462 AND node_pos(n.id,?) >= ?
463 AND node_pos(n.id,?) < ?
464 GROUP BY n.id, ng.terms
468 nodesByNgramsMaster AS (
470 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
471 JOIN node_node_ngrams nng ON n.id = nng.node2_id
472 JOIN ngrams ng ON ng.id = nng.ngrams_id
474 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
475 AND n.typename = ? -- NodeTypeId
476 AND nng.ngrams_type = ? -- NgramsTypeId
477 GROUP BY n.id, ng.terms
480 SELECT m.id, m.terms FROM nodesByNgramsMaster m
481 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id