2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE FlexibleContexts #-}
15 {-# LANGUAGE QuasiQuotes #-}
16 {-# LANGUAGE NoImplicitPrelude #-}
17 {-# LANGUAGE OverloadedStrings #-}
18 {-# LANGUAGE RankNTypes #-}
20 module Gargantext.Database.Action.Metrics.NgramsByNode
23 import Data.Map.Strict (Map, fromListWith, elems, toList, fromList)
24 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
26 import qualified Data.Ord as DO (Down(..))
27 import Data.Text (Text)
28 import Data.Tuple.Extra (second, swap)
29 import Database.PostgreSQL.Simple.SqlQQ (sql)
30 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
31 import Debug.Trace (trace)
32 import Gargantext.Core (Lang(..))
33 import Gargantext.Core.Types (Ordering(..))
34 import Gargantext.Database.Admin.Config (nodeTypeId)
35 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
36 import Gargantext.Database.Admin.Utils (Cmd, runPGSQuery)
37 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
38 import Gargantext.Prelude
39 import Gargantext.Text.Metrics.TFICF
40 import Gargantext.Text.Terms.Mono.Stem (stem)
41 import qualified Data.List as List
42 import qualified Data.Map.Strict as Map
43 import qualified Data.Set as Set
44 import qualified Data.Text as Text
45 import qualified Database.PostgreSQL.Simple as DPS
47 -- | TODO: group with 2 terms only can be
48 -- discussed. Main purpose of this is offering
49 -- a first grouping option to user and get some
50 -- enriched data to better learn and improve that algo
56 ngramsGroup l _m _n = Text.intercalate " "
60 -- . (List.filter (\t -> Text.length t > m))
62 . Text.replace "-" " "
66 -> (Map Text (Double, Set Text))
67 -> [ (Text,(Double, Set Text))]
68 sortTficf Down = List.sortOn (DO.Down . fst . snd) . toList
69 sortTficf Up = List.sortOn (fst . snd) . toList
72 getTficf :: UserCorpusId
76 -> Cmd err (Map Text (Double, Set Text))
77 getTficf u m nt f = do
78 u' <- Map.filter (\s -> Set.size s > 1) <$> getNodesByNgramsUser u nt
79 m' <- Map.filter (\s -> Set.size s > 1) <$> getNodesByNgramsMaster u m
81 pure $ toTficfData (countNodesByNgramsWith f u')
82 (countNodesByNgramsWith f m')
85 getTficfWith :: UserCorpusId
89 -> Map Text (Maybe Text)
90 -> Cmd err (Map Text (Double, Set Text))
91 getTficfWith u m ls nt mtxt = do
92 u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
93 m' <- getNodesByNgramsMaster u m
95 let f x = case Map.lookup x mtxt of
97 Just x' -> maybe x identity x'
99 pure $ toTficfData (countNodesByNgramsWith f u') (countNodesByNgramsWith f m')
102 type Context = (Double, Map Text (Double, Set Text))
108 -> Map Text (Double, Set Text)
109 toTficfData (ti, mi) (ts, ms) =
110 fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
111 (TficfSupra (Count $ maybe 0 fst $ Map.lookup t ms)(Total ts))
115 | (t, (n,ns)) <- toList mi
119 -- | fst is size of Supra Corpus
120 -- snd is Texts and size of Occurrences (different docs)
121 countNodesByNgramsWith :: (Text -> Text)
122 -> Map Text (Set NodeId)
123 -> (Double, Map Text (Double, Set Text))
124 countNodesByNgramsWith f m = (total, m')
126 total = fromIntegral $ Set.size $ Set.unions $ elems m
127 m' = Map.map ( swap . second (fromIntegral . Set.size))
128 $ groupNodesByNgramsWith f m
131 groupNodesByNgramsWith :: (Text -> Text)
132 -> Map Text (Set NodeId)
133 -> Map Text (Set Text, Set NodeId)
134 groupNodesByNgramsWith f m =
135 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
138 ------------------------------------------------------------------------
139 getNodesByNgramsUser :: CorpusId
141 -> Cmd err (Map Text (Set NodeId))
142 getNodesByNgramsUser cId nt =
143 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
144 <$> selectNgramsByNodeUser cId nt
147 selectNgramsByNodeUser :: CorpusId
149 -> Cmd err [(NodeId, Text)]
150 selectNgramsByNodeUser cId' nt' =
151 runPGSQuery queryNgramsByNodeUser
153 , nodeTypeId NodeDocument
155 -- , 100 :: Int -- limit
156 -- , 0 :: Int -- offset
159 queryNgramsByNodeUser :: DPS.Query
160 queryNgramsByNodeUser = [sql|
161 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
162 JOIN ngrams ng ON nng.ngrams_id = ng.id
163 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
164 JOIN nodes n ON nn.node2_id = n.id
165 WHERE nn.node1_id = ? -- CorpusId
166 AND n.typename = ? -- NodeTypeId
167 AND nng.ngrams_type = ? -- NgramsTypeId
169 GROUP BY nng.node2_id, ng.terms
170 ORDER BY (nng.node2_id, ng.terms) DESC
174 ------------------------------------------------------------------------
176 getOccByNgramsOnlyFast :: CorpusId
179 -> Cmd err (Map Text Int)
180 getOccByNgramsOnlyFast cId nt ngs =
181 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
184 getOccByNgramsOnlyFast' :: CorpusId
188 -> Cmd err (Map Text Int)
189 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
190 fromListWith (+) <$> map (second round) <$> run cId lId nt tms
193 fields = [QualifiedIdentifier Nothing "text"]
199 -> Cmd err [(Text, Double)]
200 run cId' lId' nt' tms' = runPGSQuery query
201 ( Values fields (DPS.Only <$> tms')
209 WITH input_rows(terms) AS (?)
210 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
211 JOIN ngrams ng ON nng.ngrams_id = ng.id
212 JOIN input_rows ir ON ir.terms = ng.terms
213 WHERE nng.node1_id = ? -- CorpusId
214 AND nng.node2_id = ? -- ListId
215 AND nng.ngrams_type = ? -- NgramsTypeId
216 -- AND nn.category > 0 -- TODO
217 GROUP BY ng.terms, nng.weight
221 -- just slower than getOccByNgramsOnlyFast
222 getOccByNgramsOnlySlow :: NodeType
227 -> Cmd err (Map Text Int)
228 getOccByNgramsOnlySlow t cId ls nt ngs =
229 Map.map Set.size <$> getScore' t cId ls nt ngs
231 getScore' NodeCorpus = getNodesByNgramsOnlyUser
232 getScore' NodeDocument = getNgramsByDocOnlyUser
233 getScore' _ = getNodesByNgramsOnlyUser
235 getOccByNgramsOnlySafe :: CorpusId
239 -> Cmd err (Map Text Int)
240 getOccByNgramsOnlySafe cId ls nt ngs = do
241 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
242 fast <- getOccByNgramsOnlyFast cId nt ngs
243 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
244 when (fast /= slow) $
245 printDebug "getOccByNgramsOnlySafe: difference"
246 (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
250 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
253 -> Cmd err [(Text, Int)]
254 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
255 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
256 ( Values fields (DPS.Only <$> tms)
258 , nodeTypeId NodeDocument
262 fields = [QualifiedIdentifier Nothing "text"]
264 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
265 -- Question: with the grouping is the result exactly the same (since Set NodeId for
266 -- equivalent ngrams intersections are not empty)
267 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
268 queryNgramsOccurrencesOnlyByNodeUser = [sql|
269 WITH input_rows(terms) AS (?)
270 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
271 JOIN ngrams ng ON nng.ngrams_id = ng.id
272 JOIN input_rows ir ON ir.terms = ng.terms
273 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
274 JOIN nodes n ON nn.node2_id = n.id
275 WHERE nn.node1_id = ? -- CorpusId
276 AND n.typename = ? -- NodeTypeId
277 AND nng.ngrams_type = ? -- NgramsTypeId
279 GROUP BY nng.node2_id, ng.terms
282 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
283 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
284 WITH input_rows(terms) AS (?)
285 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
286 JOIN ngrams ng ON nng.ngrams_id = ng.id
287 JOIN input_rows ir ON ir.terms = ng.terms
288 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
289 JOIN nodes n ON nn.node2_id = n.id
290 WHERE nn.node1_id = ? -- CorpusId
291 AND n.typename = ? -- NodeTypeId
292 AND nng.ngrams_type = ? -- NgramsTypeId
294 GROUP BY nng.node2_id, ng.terms
297 ------------------------------------------------------------------------
298 getNodesByNgramsOnlyUser :: NodeId
302 -> Cmd err (Map Text (Set NodeId))
303 getNodesByNgramsOnlyUser cId ls nt ngs =
305 . map (fromListWith (<>)
306 . map (second Set.singleton))
307 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
308 (splitEvery 1000 ngs)
311 getNgramsByNodeOnlyUser :: NodeId
315 -> Cmd err (Map NodeId (Set Text))
316 getNgramsByNodeOnlyUser cId ls nt ngs =
318 . map (fromListWith (<>)
319 . map (second Set.singleton))
321 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
322 (splitEvery 1000 ngs)
324 ------------------------------------------------------------------------
325 selectNgramsOnlyByNodeUser :: CorpusId
329 -> Cmd err [(Text, NodeId)]
330 selectNgramsOnlyByNodeUser cId ls nt tms =
331 runPGSQuery queryNgramsOnlyByNodeUser
332 ( Values fields (DPS.Only <$> tms)
333 , Values [QualifiedIdentifier Nothing "int4"]
334 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
336 , nodeTypeId NodeDocument
340 fields = [QualifiedIdentifier Nothing "text"]
342 queryNgramsOnlyByNodeUser :: DPS.Query
343 queryNgramsOnlyByNodeUser = [sql|
344 WITH input_rows(terms) AS (?),
345 input_list(id) AS (?)
346 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
347 JOIN ngrams ng ON nng.ngrams_id = ng.id
348 JOIN input_rows ir ON ir.terms = ng.terms
349 JOIN input_list il ON il.id = nng.node1_id
350 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
351 JOIN nodes n ON nn.node2_id = n.id
352 WHERE nn.node1_id = ? -- CorpusId
353 AND n.typename = ? -- NodeTypeId
354 AND nng.ngrams_type = ? -- NgramsTypeId
356 GROUP BY ng.terms, nng.node2_id
360 selectNgramsOnlyByNodeUser' :: CorpusId
364 -> Cmd err [(Text, Int)]
365 selectNgramsOnlyByNodeUser' cId ls nt tms =
366 runPGSQuery queryNgramsOnlyByNodeUser
367 ( Values fields (DPS.Only <$> tms)
368 , Values [QualifiedIdentifier Nothing "int4"]
369 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
371 , nodeTypeId NodeDocument
375 fields = [QualifiedIdentifier Nothing "text"]
377 queryNgramsOnlyByNodeUser' :: DPS.Query
378 queryNgramsOnlyByNodeUser' = [sql|
379 WITH input_rows(terms) AS (?),
380 input_list(id) AS (?)
381 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
382 JOIN ngrams ng ON nng.ngrams_id = ng.id
383 JOIN input_rows ir ON ir.terms = ng.terms
384 JOIN input_list il ON il.id = nng.node2_id
385 WHERE nng.node1_id = ? -- CorpusId
386 AND nng.ngrams_type = ? -- NgramsTypeId
387 -- AND nn.category > 0
388 GROUP BY ng.terms, nng.weight
392 getNgramsByDocOnlyUser :: NodeId
396 -> Cmd err (Map Text (Set NodeId))
397 getNgramsByDocOnlyUser cId ls nt ngs =
399 . map (fromListWith (<>) . map (second Set.singleton))
400 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
403 selectNgramsOnlyByDocUser :: DocId
407 -> Cmd err [(Text, NodeId)]
408 selectNgramsOnlyByDocUser dId ls nt tms =
409 runPGSQuery queryNgramsOnlyByDocUser
410 ( Values fields (DPS.Only <$> tms)
411 , Values [QualifiedIdentifier Nothing "int4"]
412 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
417 fields = [QualifiedIdentifier Nothing "text"]
420 queryNgramsOnlyByDocUser :: DPS.Query
421 queryNgramsOnlyByDocUser = [sql|
422 WITH input_rows(terms) AS (?),
423 input_list(id) AS (?)
424 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
425 JOIN ngrams ng ON nng.ngrams_id = ng.id
426 JOIN input_rows ir ON ir.terms = ng.terms
427 JOIN input_list il ON il.id = nng.node1_id
428 WHERE nng.node2_id = ? -- DocId
429 AND nng.ngrams_type = ? -- NgramsTypeId
430 GROUP BY ng.terms, nng.node2_id
433 ------------------------------------------------------------------------
434 -- | TODO filter by language, database, any social field
435 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
436 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
437 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
438 -- . takeWhile (not . List.null)
439 -- . takeWhile (\l -> List.length l > 3)
440 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
442 selectNgramsByNodeMaster :: Int
446 -> Cmd err [(NodeId, Text)]
447 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
448 queryNgramsByNodeMaster'
450 , ngramsTypeId NgramsTerms
451 , nodeTypeId NodeDocument
453 , nodeTypeId NodeDocument
457 , nodeTypeId NodeDocument
458 , ngramsTypeId NgramsTerms
461 -- | TODO fix node_node_ngrams relation
462 queryNgramsByNodeMaster' :: DPS.Query
463 queryNgramsByNodeMaster' = [sql|
464 WITH nodesByNgramsUser AS (
466 SELECT n.id, ng.terms FROM nodes n
467 JOIN nodes_nodes nn ON n.id = nn.node2_id
468 JOIN node_node_ngrams nng ON nng.node2_id = n.id
469 JOIN ngrams ng ON nng.ngrams_id = ng.id
470 WHERE nn.node1_id = ? -- UserCorpusId
471 -- AND n.typename = ? -- NodeTypeId
472 AND nng.ngrams_type = ? -- NgramsTypeId
474 AND node_pos(n.id,?) >= ?
475 AND node_pos(n.id,?) < ?
476 GROUP BY n.id, ng.terms
480 nodesByNgramsMaster AS (
482 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
483 JOIN node_node_ngrams nng ON n.id = nng.node2_id
484 JOIN ngrams ng ON ng.id = nng.ngrams_id
486 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
487 AND n.typename = ? -- NodeTypeId
488 AND nng.ngrams_type = ? -- NgramsTypeId
489 GROUP BY n.id, ng.terms
492 SELECT m.id, m.terms FROM nodesByNgramsMaster m
493 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id