2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByContext
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.HashMap.Strict (HashMap)
24 import Data.Text (Text)
25 import Data.Tuple.Extra (first, second, swap)
26 import Database.PostgreSQL.Simple.SqlQQ (sql)
27 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
28 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
29 import Gargantext.Core
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.List as List
37 import qualified Data.Map as Map
38 import qualified Data.Set as Set
39 import qualified Database.PostgreSQL.Simple as DPS
40 import qualified Database.PostgreSQL.Simple.Types as DPST
42 -- | fst is size of Supra Corpus
43 -- snd is Texts and size of Occurrences (different docs)
45 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
46 -> HashMap NgramsTerm (Set ContextId)
47 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
48 countContextsByNgramsWith f m = (total, m')
50 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
51 m' = HM.map ( swap . second (fromIntegral . Set.size))
52 $ groupContextsByNgramsWith f m
55 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
56 -> HashMap NgramsTerm (Set NodeId)
57 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
58 groupContextsByNgramsWith f' m'' =
59 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
62 ------------------------------------------------------------------------
63 getContextsByNgramsUser :: HasDBid NodeType
66 -> Cmd err (HashMap NgramsTerm (Set ContextId))
67 getContextsByNgramsUser cId nt =
68 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
69 <$> selectNgramsByContextUser cId nt
72 selectNgramsByContextUser :: HasDBid NodeType
75 -> Cmd err [(NodeId, Text)]
76 selectNgramsByContextUser cId' nt' =
77 runPGSQuery queryNgramsByContextUser
81 -- , 100 :: Int -- limit
82 -- , 0 :: Int -- offset
85 queryNgramsByContextUser :: DPS.Query
86 queryNgramsByContextUser = [sql|
87 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
88 JOIN ngrams ng ON cng.ngrams_id = ng.id
89 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
90 JOIN contexts c ON nc.context_id = c.id
91 WHERE nc.node_id = ? -- CorpusId
92 AND c.typename = ? -- toDBid
93 AND cng.ngrams_type = ? -- NgramsTypeId
94 AND nc.category > 0 -- is not in Trash
95 GROUP BY cng.context_id, ng.terms
99 ------------------------------------------------------------------------
100 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
105 -> Cmd err (HashMap NgramsTerm Int)
106 getOccByNgramsOnlyFast_withSample cId int nt ngs =
107 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
110 getOccByNgramsOnlyFast :: CorpusId
113 -> Cmd err (HashMap NgramsTerm [ContextId])
114 getOccByNgramsOnlyFast cId lId nt = do
115 --HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
116 HM.fromList <$> map (\(t, ns) -> (NgramsTerm t, NodeId <$> DPST.fromPGArray ns)) <$> run cId lId nt
122 -> Cmd err [(Text, DPST.PGArray Int)]
123 run cId' lId' nt' = runPGSQuery query
134 --, round(nng.weight)
136 SELECT DISTINCT context_node_ngrams.context_id
137 FROM context_node_ngrams
139 ON context_node_ngrams.context_id = nodes_contexts.context_id
140 WHERE ng.id = context_node_ngrams.ngrams_id
141 AND nodes_contexts.node_id = ?
145 -- , ns.ngrams_type_id
147 JOIN node_stories ns ON ng.id = ns.ngrams_id
148 JOIN node_node_ngrams nng ON ns.node_id = nng.node2_id
149 WHERE nng.node1_id = ?
151 AND nng.ngrams_type = ?
152 AND nng.ngrams_id = ng.id
153 AND nng.ngrams_type = ns.ngrams_type_id
158 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
163 -> Cmd err [(NgramsTerm, Int)]
164 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
165 fmap (first NgramsTerm) <$>
166 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
168 , toDBid NodeDocument
170 , Values fields ((DPS.Only . unNgramsTerm) <$> (List.take 10000 tms))
175 fields = [QualifiedIdentifier Nothing "text"]
177 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
178 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
179 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
180 JOIN nodes_contexts nn ON n.id = nn.context_id
183 input_rows(terms) AS (?)
184 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
185 JOIN ngrams ng ON cng.ngrams_id = ng.id
186 JOIN input_rows ir ON ir.terms = ng.terms
187 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
188 JOIN nodes_sample n ON nn.context_id = n.id
189 WHERE nn.node_id = ? -- CorpusId
190 AND cng.ngrams_type = ? -- NgramsTypeId
192 GROUP BY cng.node_id, ng.terms
195 selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
199 -> Cmd err [(NgramsTerm, Int)]
200 selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
201 fmap (first NgramsTerm) <$>
202 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
204 , toDBid NodeDocument
210 queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
211 queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
212 WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
213 JOIN nodes_contexts nc ON c.id = nc.context_id
216 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
217 JOIN ngrams ng ON cng.ngrams_id = ng.id
218 JOIN node_stories ns ON ns.ngrams_id = ng.id
219 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
220 JOIN contexts_sample c ON nc.context_id = c.id
221 WHERE nc.node_id = ? -- CorpusId
222 AND cng.ngrams_type = ? -- NgramsTypeId
233 ------------------------------------------------------------------------
234 getContextsByNgramsOnlyUser :: HasDBid NodeType
239 -> Cmd err (HashMap NgramsTerm (Set NodeId))
240 getContextsByNgramsOnlyUser cId ls nt ngs =
242 . map (HM.fromListWith (<>)
243 . map (second Set.singleton))
244 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
245 (splitEvery 1000 ngs)
247 getNgramsByContextOnlyUser :: HasDBid NodeType
252 -> Cmd err (Map NodeId (Set NgramsTerm))
253 getNgramsByContextOnlyUser cId ls nt ngs =
255 . map ( Map.fromListWith (<>)
256 . map (second Set.singleton)
259 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
260 (splitEvery 1000 ngs)
262 ------------------------------------------------------------------------
263 selectNgramsOnlyByContextUser :: HasDBid NodeType
268 -> Cmd err [(NgramsTerm, ContextId)]
269 selectNgramsOnlyByContextUser cId ls nt tms =
270 fmap (first NgramsTerm) <$>
271 runPGSQuery queryNgramsOnlyByContextUser
272 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
273 , Values [QualifiedIdentifier Nothing "int4"]
274 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
276 , toDBid NodeDocument
280 fields = [QualifiedIdentifier Nothing "text"]
282 queryNgramsOnlyByContextUser :: DPS.Query
283 queryNgramsOnlyByContextUser = [sql|
284 WITH input_rows(terms) AS (?),
285 input_list(id) AS (?)
286 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
287 JOIN ngrams ng ON cng.ngrams_id = ng.id
288 JOIN input_rows ir ON ir.terms = ng.terms
289 JOIN input_list il ON il.id = cng.node_id
290 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
291 JOIN contexts c ON nc.context_id = c.id
292 WHERE nc.node_id = ? -- CorpusId
293 AND c.typename = ? -- toDBid (maybe not useful with context table)
294 AND cng.ngrams_type = ? -- NgramsTypeId
296 GROUP BY ng.terms, cng.context_id
299 getNgramsByDocOnlyUser :: DocId
303 -> Cmd err (HashMap NgramsTerm (Set NodeId))
304 getNgramsByDocOnlyUser cId ls nt ngs =
306 . map (HM.fromListWith (<>) . map (second Set.singleton))
307 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
310 selectNgramsOnlyByDocUser :: DocId
314 -> Cmd err [(NgramsTerm, NodeId)]
315 selectNgramsOnlyByDocUser dId ls nt tms =
316 fmap (first NgramsTerm) <$>
317 runPGSQuery queryNgramsOnlyByDocUser
318 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
319 , Values [QualifiedIdentifier Nothing "int4"]
320 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
325 fields = [QualifiedIdentifier Nothing "text"]
328 queryNgramsOnlyByDocUser :: DPS.Query
329 queryNgramsOnlyByDocUser = [sql|
330 WITH input_rows(terms) AS (?),
331 input_list(id) AS (?)
332 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
333 JOIN ngrams ng ON cng.ngrams_id = ng.id
334 JOIN input_rows ir ON ir.terms = ng.terms
335 JOIN input_list il ON il.id = cng.context_id
336 WHERE cng.node_id = ? -- DocId
337 AND cng.ngrams_type = ? -- NgramsTypeId
338 GROUP BY ng.terms, cng.node_id
341 ------------------------------------------------------------------------
342 -- | TODO filter by language, database, any social field
343 getContextsByNgramsMaster :: HasDBid NodeType
346 -> Cmd err (HashMap Text (Set NodeId))
347 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
348 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
349 -- . takeWhile (not . List.null)
350 -- . takeWhile (\l -> List.length l > 3)
351 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
353 selectNgramsByContextMaster :: HasDBid NodeType
358 -> Cmd err [(NodeId, Text)]
359 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
360 queryNgramsByContextMaster'
362 , ngramsTypeId NgramsTerms
363 , toDBid NodeDocument
365 , toDBid NodeDocument
369 , toDBid NodeDocument
370 , ngramsTypeId NgramsTerms
373 -- | TODO fix context_node_ngrams relation
374 queryNgramsByContextMaster' :: DPS.Query
375 queryNgramsByContextMaster' = [sql|
376 WITH contextsByNgramsUser AS (
378 SELECT n.id, ng.terms FROM contexts n
379 JOIN nodes_contexts nn ON n.id = nn.context_id
380 JOIN context_node_ngrams cng ON cng.context_id = n.id
381 JOIN ngrams ng ON cng.ngrams_id = ng.id
382 WHERE nn.node_id = ? -- UserCorpusId
383 -- AND n.typename = ? -- toDBid
384 AND cng.ngrams_type = ? -- NgramsTypeId
386 AND node_pos(n.id,?) >= ?
387 AND node_pos(n.id,?) < ?
388 GROUP BY n.id, ng.terms
392 contextsByNgramsMaster AS (
394 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
395 JOIN context_node_ngrams cng ON n.id = cng.context_id
396 JOIN ngrams ng ON ng.id = cng.ngrams_id
398 WHERE n.parent_id = ? -- Master Corpus toDBid
399 AND n.typename = ? -- toDBid
400 AND cng.ngrams_type = ? -- NgramsTypeId
401 GROUP BY n.id, ng.terms
404 SELECT m.id, m.terms FROM nodesByNgramsMaster m
405 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id