2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE OverloadedStrings #-}
17 {-# LANGUAGE RankNTypes #-}
19 module Gargantext.Database.Metrics.NgramsByNode
22 import Debug.Trace (trace)
23 import Data.Map.Strict (Map, fromListWith, elems, toList, fromList)
24 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
26 import Data.Text (Text)
27 import Data.Tuple.Extra (second, swap)
28 import Database.PostgreSQL.Simple.SqlQQ (sql)
29 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
30 import Gargantext.Core (Lang(..))
31 import Gargantext.Database.Config (nodeTypeId)
32 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
33 import Gargantext.Database.Types.Node -- (ListId, CorpusId, NodeId)
34 import Gargantext.Database.Utils (Cmd, runPGSQuery)
35 import Gargantext.Prelude
36 import Gargantext.Text.Metrics.TFICF
37 import Gargantext.Text.Terms.Mono.Stem (stem)
38 import qualified Data.List as List
39 import qualified Data.Map.Strict as Map
40 import qualified Data.Set as Set
41 import qualified Data.Text as Text
42 import qualified Database.PostgreSQL.Simple as DPS
44 -- | TODO: group with 2 terms only can be
45 -- discussed. Main purpose of this is offering
46 -- a first grouping option to user and get some
47 -- enriched data to better learn and improve that algo
48 ngramsGroup :: Lang -> Int -> Int -> Text -> Text
49 ngramsGroup l _m _n = Text.intercalate " "
53 -- . (List.filter (\t -> Text.length t > m))
55 . Text.replace "-" " "
58 sortTficf :: (Map Text (Double, Set Text))
59 -> [ (Text,(Double, Set Text))]
60 sortTficf = List.sortOn (fst . snd) . toList
63 getTficf' :: UserCorpusId -> MasterCorpusId -> NgramsType -> (Text -> Text)
64 -> Cmd err (Map Text (Double, Set Text))
65 getTficf' u m nt f = do
66 u' <- getNodesByNgramsUser u nt
67 m' <- getNodesByNgramsMaster u m
69 pure $ toTficfData (countNodesByNgramsWith f u')
70 (countNodesByNgramsWith f m')
72 getTficfWith :: UserCorpusId -> MasterCorpusId -> [ListId]
73 -> NgramsType -> Map Text (Maybe Text)
74 -> Cmd err (Map Text (Double, Set Text))
75 getTficfWith u m ls nt mtxt = do
76 u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
77 m' <- getNodesByNgramsMaster u m
79 let f x = case Map.lookup x mtxt of
81 Just x' -> maybe x identity x'
83 pure $ toTficfData (countNodesByNgramsWith f u')
84 (countNodesByNgramsWith f m')
87 type Context = (Double, Map Text (Double, Set Text))
91 toTficfData :: Infra -> Supra
92 -> Map Text (Double, Set Text)
93 toTficfData (ti, mi) (ts, ms) =
94 fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
95 (TficfSupra (Count $ maybe 0 fst $ Map.lookup t ms)(Total ts))
99 | (t, (n,ns)) <- toList mi
103 -- | fst is size of Supra Corpus
104 -- snd is Texts and size of Occurrences (different docs)
105 countNodesByNgramsWith :: (Text -> Text)
106 -> Map Text (Set NodeId)
107 -> (Double, Map Text (Double, Set Text))
108 countNodesByNgramsWith f m = (total, m')
110 total = fromIntegral $ Set.size $ Set.unions $ elems m
111 m' = Map.map ( swap . second (fromIntegral . Set.size))
112 $ groupNodesByNgramsWith f m
115 groupNodesByNgramsWith :: (Text -> Text)
116 -> Map Text (Set NodeId)
117 -> Map Text (Set Text, Set NodeId)
118 groupNodesByNgramsWith f m =
119 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
122 ------------------------------------------------------------------------
123 getNodesByNgramsUser :: CorpusId
125 -> Cmd err (Map Text (Set NodeId))
126 getNodesByNgramsUser cId nt =
127 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
128 <$> selectNgramsByNodeUser cId nt
131 selectNgramsByNodeUser :: CorpusId -> NgramsType
132 -> Cmd err [(NodeId, Text)]
133 selectNgramsByNodeUser cId' nt' =
134 runPGSQuery queryNgramsByNodeUser
136 , nodeTypeId NodeDocument
138 -- , 100 :: Int -- limit
139 -- , 0 :: Int -- offset
142 queryNgramsByNodeUser :: DPS.Query
143 queryNgramsByNodeUser = [sql|
144 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
145 JOIN ngrams ng ON nng.ngrams_id = ng.id
146 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
147 JOIN nodes n ON nn.node2_id = n.id
148 WHERE nn.node1_id = ? -- CorpusId
149 AND n.typename = ? -- NodeTypeId
150 AND nng.ngrams_type = ? -- NgramsTypeId
152 GROUP BY nng.node2_id, ng.terms
153 ORDER BY (nng.node2_id, ng.terms) DESC
157 ------------------------------------------------------------------------
159 getOccByNgramsOnlyFast :: CorpusId
162 -> Cmd err (Map Text Int)
163 getOccByNgramsOnlyFast cId nt ngs =
164 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
167 getOccByNgramsOnlyFast' :: CorpusId
171 -> Cmd err (Map Text Int)
172 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
173 fromListWith (+) <$> map (second round) <$> run cId lId nt tms
176 fields = [QualifiedIdentifier Nothing "text"]
182 -> Cmd err [(Text, Double)]
183 run cId' lId' _nt' tms' = runPGSQuery query
184 ( Values fields (DPS.Only <$> tms')
187 -- , ngramsTypeId nt'
192 WITH input_rows(terms) AS (?)
193 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
194 JOIN ngrams ng ON nng.ngrams_id = ng.id
195 JOIN input_rows ir ON ir.terms = ng.terms
196 WHERE nng.node1_id = ? -- CorpusId
198 -- AND nng.ngrams_type = ? -- NgramsTypeId
199 -- AND nn.category > 0
200 GROUP BY ng.terms, nng.weight
206 -- just slower than getOccByNgramsOnlyFast
207 getOccByNgramsOnlySlow :: NodeType
212 -> Cmd err (Map Text Int)
213 getOccByNgramsOnlySlow t cId ls nt ngs =
214 Map.map Set.size <$> getScore' t cId ls nt ngs
216 getScore' NodeCorpus = getNodesByNgramsOnlyUser
217 getScore' NodeDocument = getNgramsByDocOnlyUser
218 getScore' _ = getNodesByNgramsOnlyUser
220 getOccByNgramsOnlySafe :: CorpusId
224 -> Cmd err (Map Text Int)
225 getOccByNgramsOnlySafe cId ls nt ngs = do
226 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
227 fast <- getOccByNgramsOnlyFast cId nt ngs
228 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
229 when (fast /= slow) $
230 printDebug "getOccByNgramsOnlySafe: difference" (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
234 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId -> NgramsType -> [Text]
235 -> Cmd err [(Text, Int)]
236 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
237 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
238 ( Values fields (DPS.Only <$> tms)
240 , nodeTypeId NodeDocument
244 fields = [QualifiedIdentifier Nothing "text"]
246 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
247 -- Question: with the grouping is the result exactly the same (since Set NodeId for
248 -- equivalent ngrams intersections are not empty)
249 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
250 queryNgramsOccurrencesOnlyByNodeUser = [sql|
251 WITH input_rows(terms) AS (?)
252 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
253 JOIN ngrams ng ON nng.ngrams_id = ng.id
254 JOIN input_rows ir ON ir.terms = ng.terms
255 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
256 JOIN nodes n ON nn.node2_id = n.id
257 WHERE nn.node1_id = ? -- CorpusId
258 AND n.typename = ? -- NodeTypeId
259 AND nng.ngrams_type = ? -- NgramsTypeId
261 GROUP BY nng.node2_id, ng.terms
266 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
267 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
268 WITH input_rows(terms) AS (?)
269 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
270 JOIN ngrams ng ON nng.ngrams_id = ng.id
271 JOIN input_rows ir ON ir.terms = ng.terms
272 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
273 JOIN nodes n ON nn.node2_id = n.id
274 WHERE nn.node1_id = ? -- CorpusId
275 AND n.typename = ? -- NodeTypeId
276 AND nng.ngrams_type = ? -- NgramsTypeId
278 GROUP BY nng.node2_id, ng.terms
284 getNodesByNgramsOnlyUser :: NodeId -> [ListId] -> NgramsType -> [Text]
285 -> Cmd err (Map Text (Set NodeId))
286 getNodesByNgramsOnlyUser cId ls nt ngs =
288 . map (fromListWith (<>) . map (second Set.singleton))
289 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
290 (splitEvery 1000 ngs)
292 selectNgramsOnlyByNodeUser :: CorpusId -> [ListId] -> NgramsType -> [Text]
293 -> Cmd err [(Text, NodeId)]
294 selectNgramsOnlyByNodeUser cId ls nt tms =
295 runPGSQuery queryNgramsOnlyByNodeUser
296 ( Values fields (DPS.Only <$> tms)
297 , Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
299 , nodeTypeId NodeDocument
303 fields = [QualifiedIdentifier Nothing "text"]
305 queryNgramsOnlyByNodeUser :: DPS.Query
306 queryNgramsOnlyByNodeUser = [sql|
307 WITH input_rows(terms) AS (?),
308 input_list(id) AS (?)
309 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
310 JOIN ngrams ng ON nng.ngrams_id = ng.id
311 JOIN input_rows ir ON ir.terms = ng.terms
312 JOIN input_list il ON il.id = nng.node1_id
313 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
314 JOIN nodes n ON nn.node2_id = n.id
315 WHERE nn.node1_id = ? -- CorpusId
316 AND n.typename = ? -- NodeTypeId
317 AND nng.ngrams_type = ? -- NgramsTypeId
319 GROUP BY ng.terms, nng.node2_id
325 selectNgramsOnlyByNodeUser' :: CorpusId -> [ListId] -> NgramsType -> [Text]
326 -> Cmd err [(Text, Int)]
327 selectNgramsOnlyByNodeUser' cId ls nt tms =
328 runPGSQuery queryNgramsOnlyByNodeUser
329 ( Values fields (DPS.Only <$> tms)
330 , Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
332 , nodeTypeId NodeDocument
336 fields = [QualifiedIdentifier Nothing "text"]
338 queryNgramsOnlyByNodeUser' :: DPS.Query
339 queryNgramsOnlyByNodeUser' = [sql|
340 WITH input_rows(terms) AS (?),
341 input_list(id) AS (?)
342 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
343 JOIN ngrams ng ON nng.ngrams_id = ng.id
344 JOIN input_rows ir ON ir.terms = ng.terms
345 JOIN input_list il ON il.id = nng.node2_id
346 WHERE nng.node1_id = ? -- CorpusId
347 AND nng.ngrams_type = ? -- NgramsTypeId
348 -- AND nn.category > 0
349 GROUP BY ng.terms, nng.weight
355 getNgramsByDocOnlyUser :: NodeId -> [ListId] -> NgramsType -> [Text]
356 -> Cmd err (Map Text (Set NodeId))
357 getNgramsByDocOnlyUser cId ls nt ngs =
359 . map (fromListWith (<>) . map (second Set.singleton))
360 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
363 selectNgramsOnlyByDocUser :: DocId -> [ListId] -> NgramsType -> [Text]
364 -> Cmd err [(Text, NodeId)]
365 selectNgramsOnlyByDocUser dId ls nt tms =
366 runPGSQuery queryNgramsOnlyByDocUser
367 ( Values fields (DPS.Only <$> tms)
368 , Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
373 fields = [QualifiedIdentifier Nothing "text"]
375 queryNgramsOnlyByDocUser :: DPS.Query
376 queryNgramsOnlyByDocUser = [sql|
377 WITH input_rows(terms) AS (?),
378 input_list(id) AS (?)
379 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
380 JOIN ngrams ng ON nng.ngrams_id = ng.id
381 JOIN input_rows ir ON ir.terms = ng.terms
382 JOIN input_list il ON il.id = nng.node1_id
383 WHERE nng.node2_id = ? -- DocId
384 AND nng.ngrams_type = ? -- NgramsTypeId
385 GROUP BY ng.terms, nng.node2_id
388 ------------------------------------------------------------------------
389 -- | TODO filter by language, database, any social field
390 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
391 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
392 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
393 -- . takeWhile (not . List.null)
394 -- . takeWhile (\l -> List.length l > 3)
395 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
398 selectNgramsByNodeMaster :: Int -> UserCorpusId -> MasterCorpusId -> Int -> Cmd err [(NodeId, Text)]
399 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
400 queryNgramsByNodeMaster'
402 , ngramsTypeId NgramsTerms
403 , nodeTypeId NodeDocument
405 , nodeTypeId NodeDocument
409 , nodeTypeId NodeDocument
410 , ngramsTypeId NgramsTerms
413 -- | TODO fix node_node_ngrams relation
414 queryNgramsByNodeMaster' :: DPS.Query
415 queryNgramsByNodeMaster' = [sql|
416 WITH nodesByNgramsUser AS (
418 SELECT n.id, ng.terms FROM nodes n
419 JOIN nodes_nodes nn ON n.id = nn.node2_id
420 JOIN node_node_ngrams nng ON nng.node2_id = n.id
421 JOIN ngrams ng ON nng.ngrams_id = ng.id
422 WHERE nn.node1_id = ? -- UserCorpusId
423 -- AND n.typename = ? -- NodeTypeId
424 AND nng.ngrams_type = ? -- NgramsTypeId
426 AND node_pos(n.id,?) >= ?
427 AND node_pos(n.id,?) < ?
428 GROUP BY n.id, ng.terms
432 nodesByNgramsMaster AS (
434 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
435 JOIN node_node_ngrams nng ON n.id = nng.node2_id
436 JOIN ngrams ng ON ng.id = nng.ngrams_id
438 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
439 AND n.typename = ? -- NodeTypeId
440 AND nng.ngrams_type = ? -- NgramsTypeId
441 GROUP BY n.id, ng.terms
444 SELECT m.id, m.terms FROM nodesByNgramsMaster m
445 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id