2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByContext
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map.Strict (Map)
24 import Data.Text (Text)
25 import Data.Tuple.Extra (first, second, swap)
26 import Database.PostgreSQL.Simple.SqlQQ (sql)
27 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
28 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
29 import Gargantext.Core
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.List as List
37 import qualified Data.Map.Strict as Map
38 import qualified Data.Set as Set
39 import qualified Database.PostgreSQL.Simple as DPS
40 import qualified Database.PostgreSQL.Simple.Types as DPST
42 -- | fst is size of Supra Corpus
43 -- snd is Texts and size of Occurrences (different docs)
45 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
46 -> HashMap NgramsTerm (Set ContextId)
47 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
48 countContextsByNgramsWith f m = (total, m')
50 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
51 m' = HM.map ( swap . second (fromIntegral . Set.size))
52 $ groupContextsByNgramsWith f m
55 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
56 -> HashMap NgramsTerm (Set NodeId)
57 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
58 groupContextsByNgramsWith f' m'' =
59 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
62 ------------------------------------------------------------------------
63 getContextsByNgramsUser :: HasDBid NodeType
66 -> Cmd err (HashMap NgramsTerm (Set ContextId))
67 getContextsByNgramsUser cId nt =
68 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
69 <$> selectNgramsByContextUser cId nt
72 selectNgramsByContextUser :: HasDBid NodeType
75 -> Cmd err [(NodeId, Text)]
76 selectNgramsByContextUser cId' nt' =
77 runPGSQuery queryNgramsByContextUser
81 -- , 100 :: Int -- limit
82 -- , 0 :: Int -- offset
85 queryNgramsByContextUser :: DPS.Query
86 queryNgramsByContextUser = [sql|
87 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
88 JOIN ngrams ng ON cng.ngrams_id = ng.id
89 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
90 JOIN contexts c ON nc.context_id = c.id
91 WHERE nc.node_id = ? -- CorpusId
92 AND c.typename = ? -- toDBid
93 AND cng.ngrams_type = ? -- NgramsTypeId
94 AND nc.category > 0 -- is not in Trash
95 GROUP BY cng.context_id, ng.terms
99 ------------------------------------------------------------------------
100 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
105 -> Cmd err (HashMap NgramsTerm Int)
106 getOccByNgramsOnlyFast_withSample cId int nt ngs =
107 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
110 getOccByNgramsOnlyFast :: CorpusId
113 -> Cmd err (HashMap NgramsTerm [ContextId])
114 getOccByNgramsOnlyFast cId lId nt = do
115 --HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
116 HM.fromList <$> map (\(t, ns) -> (NgramsTerm t, NodeId <$> DPST.fromPGArray ns)) <$> run cId lId nt
122 -> Cmd err [(Text, DPST.PGArray Int)]
123 run cId' lId' nt' = runPGSQuery query
131 WITH node_context_ids AS
132 (select context_id, ngrams_id
133 FROM context_node_ngrams_view
136 (select ngrams_id FROM node_stories
137 WHERE node_id = ? AND ngrams_type_id = ?
141 ARRAY ( SELECT DISTINCT context_id
142 FROM node_context_ids
143 WHERE ns.ngrams_id = node_context_ids.ngrams_id
147 JOIN ns ON ng.id = ns.ngrams_id
151 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
156 -> Cmd err [(NgramsTerm, Int)]
157 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
158 fmap (first NgramsTerm) <$>
159 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
161 , toDBid NodeDocument
163 , Values fields ((DPS.Only . unNgramsTerm) <$> (List.take 10000 tms))
168 fields = [QualifiedIdentifier Nothing "text"]
170 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
171 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
172 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
173 JOIN nodes_contexts nn ON n.id = nn.context_id
176 input_rows(terms) AS (?)
177 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
178 JOIN ngrams ng ON cng.ngrams_id = ng.id
179 JOIN input_rows ir ON ir.terms = ng.terms
180 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
181 JOIN nodes_sample n ON nn.context_id = n.id
182 WHERE nn.node_id = ? -- CorpusId
183 AND cng.ngrams_type = ? -- NgramsTypeId
185 GROUP BY cng.node_id, ng.terms
188 selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
192 -> Cmd err [(NgramsTerm, Int)]
193 selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
194 fmap (first NgramsTerm) <$>
195 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
197 , toDBid NodeDocument
203 queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
204 queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
205 WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
206 JOIN nodes_contexts nc ON c.id = nc.context_id
209 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
210 JOIN ngrams ng ON cng.ngrams_id = ng.id
211 JOIN node_stories ns ON ns.ngrams_id = ng.id
212 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
213 JOIN contexts_sample c ON nc.context_id = c.id
214 WHERE nc.node_id = ? -- CorpusId
215 AND cng.ngrams_type = ? -- NgramsTypeId
226 ------------------------------------------------------------------------
227 getContextsByNgramsOnlyUser :: HasDBid NodeType
232 -> Cmd err (HashMap NgramsTerm (Set NodeId))
233 getContextsByNgramsOnlyUser cId ls nt ngs =
235 . map (HM.fromListWith (<>)
236 . map (second Set.singleton))
237 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
238 (splitEvery 1000 ngs)
240 getNgramsByContextOnlyUser :: HasDBid NodeType
245 -> Cmd err (Map NodeId (Set NgramsTerm))
246 getNgramsByContextOnlyUser cId ls nt ngs =
248 . map ( Map.fromListWith (<>)
249 . map (second Set.singleton)
252 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
253 (splitEvery 1000 ngs)
255 ------------------------------------------------------------------------
256 selectNgramsOnlyByContextUser :: HasDBid NodeType
261 -> Cmd err [(NgramsTerm, ContextId)]
262 selectNgramsOnlyByContextUser cId ls nt tms =
263 fmap (first NgramsTerm) <$>
264 runPGSQuery queryNgramsOnlyByContextUser
265 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
266 , Values [QualifiedIdentifier Nothing "int4"]
267 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
269 , toDBid NodeDocument
273 fields = [QualifiedIdentifier Nothing "text"]
275 queryNgramsOnlyByContextUser :: DPS.Query
276 queryNgramsOnlyByContextUser = [sql|
277 WITH input_rows(terms) AS (?),
278 input_list(id) AS (?)
279 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
280 JOIN ngrams ng ON cng.ngrams_id = ng.id
281 JOIN input_rows ir ON ir.terms = ng.terms
282 JOIN input_list il ON il.id = cng.node_id
283 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
284 JOIN contexts c ON nc.context_id = c.id
285 WHERE nc.node_id = ? -- CorpusId
286 AND c.typename = ? -- toDBid (maybe not useful with context table)
287 AND cng.ngrams_type = ? -- NgramsTypeId
289 GROUP BY ng.terms, cng.context_id
292 getNgramsByDocOnlyUser :: DocId
296 -> Cmd err (HashMap NgramsTerm (Set NodeId))
297 getNgramsByDocOnlyUser cId ls nt ngs =
299 . map (HM.fromListWith (<>) . map (second Set.singleton))
300 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
303 selectNgramsOnlyByDocUser :: DocId
307 -> Cmd err [(NgramsTerm, NodeId)]
308 selectNgramsOnlyByDocUser dId ls nt tms =
309 fmap (first NgramsTerm) <$>
310 runPGSQuery queryNgramsOnlyByDocUser
311 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
312 , Values [QualifiedIdentifier Nothing "int4"]
313 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
318 fields = [QualifiedIdentifier Nothing "text"]
321 queryNgramsOnlyByDocUser :: DPS.Query
322 queryNgramsOnlyByDocUser = [sql|
323 WITH input_rows(terms) AS (?),
324 input_list(id) AS (?)
325 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
326 JOIN ngrams ng ON cng.ngrams_id = ng.id
327 JOIN input_rows ir ON ir.terms = ng.terms
328 JOIN input_list il ON il.id = cng.context_id
329 WHERE cng.node_id = ? -- DocId
330 AND cng.ngrams_type = ? -- NgramsTypeId
331 GROUP BY ng.terms, cng.node_id
334 ------------------------------------------------------------------------
335 -- | TODO filter by language, database, any social field
336 getContextsByNgramsMaster :: HasDBid NodeType
339 -> Cmd err (HashMap Text (Set NodeId))
340 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
341 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
342 -- . takeWhile (not . List.null)
343 -- . takeWhile (\l -> List.length l > 3)
344 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
346 selectNgramsByContextMaster :: HasDBid NodeType
351 -> Cmd err [(NodeId, Text)]
352 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
353 queryNgramsByContextMaster'
355 , ngramsTypeId NgramsTerms
356 , toDBid NodeDocument
358 , toDBid NodeDocument
362 , toDBid NodeDocument
363 , ngramsTypeId NgramsTerms
366 -- | TODO fix context_node_ngrams relation
367 queryNgramsByContextMaster' :: DPS.Query
368 queryNgramsByContextMaster' = [sql|
369 WITH contextsByNgramsUser AS (
371 SELECT n.id, ng.terms FROM contexts n
372 JOIN nodes_contexts nn ON n.id = nn.context_id
373 JOIN context_node_ngrams cng ON cng.context_id = n.id
374 JOIN ngrams ng ON cng.ngrams_id = ng.id
375 WHERE nn.node_id = ? -- UserCorpusId
376 -- AND n.typename = ? -- toDBid
377 AND cng.ngrams_type = ? -- NgramsTypeId
379 AND node_pos(n.id,?) >= ?
380 AND node_pos(n.id,?) < ?
381 GROUP BY n.id, ng.terms
385 contextsByNgramsMaster AS (
387 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
388 JOIN context_node_ngrams cng ON n.id = cng.context_id
389 JOIN ngrams ng ON ng.id = cng.ngrams_id
391 WHERE n.parent_id = ? -- Master Corpus toDBid
392 AND n.typename = ? -- toDBid
393 AND cng.ngrams_type = ? -- NgramsTypeId
394 GROUP BY n.id, ng.terms
397 SELECT m.id, m.terms FROM nodesByNgramsMaster m
398 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id