2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByContext
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.HashMap.Strict (HashMap)
24 import Data.Text (Text)
25 import Data.Tuple.Extra (first, second, swap)
26 import Database.PostgreSQL.Simple.SqlQQ (sql)
27 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
28 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
29 import Gargantext.Core
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Database.PostgreSQL.Simple as DPS
39 import qualified Database.PostgreSQL.Simple.Types as DPST
41 -- | fst is size of Supra Corpus
42 -- snd is Texts and size of Occurrences (different docs)
44 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
45 -> HashMap NgramsTerm (Set ContextId)
46 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
47 countContextsByNgramsWith f m = (total, m')
49 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
50 m' = HM.map ( swap . second (fromIntegral . Set.size))
51 $ groupContextsByNgramsWith f m
54 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
55 -> HashMap NgramsTerm (Set NodeId)
56 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
57 groupContextsByNgramsWith f' m'' =
58 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
61 ------------------------------------------------------------------------
62 getContextsByNgramsUser :: HasDBid NodeType
65 -> Cmd err (HashMap NgramsTerm (Set ContextId))
66 getContextsByNgramsUser cId nt =
67 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
68 <$> selectNgramsByContextUser cId nt
71 selectNgramsByContextUser :: HasDBid NodeType
74 -> Cmd err [(NodeId, Text)]
75 selectNgramsByContextUser cId' nt' =
76 runPGSQuery queryNgramsByContextUser
80 -- , 100 :: Int -- limit
81 -- , 0 :: Int -- offset
84 queryNgramsByContextUser :: DPS.Query
85 queryNgramsByContextUser = [sql|
86 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
87 JOIN ngrams ng ON cng.ngrams_id = ng.id
88 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
89 JOIN contexts c ON nc.context_id = c.id
90 WHERE nc.node_id = ? -- CorpusId
91 AND c.typename = ? -- toDBid
92 AND cng.ngrams_type = ? -- NgramsTypeId
93 AND nc.category > 0 -- is not in Trash
94 GROUP BY cng.context_id, ng.terms
98 ------------------------------------------------------------------------
99 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
104 -> Cmd err (HashMap NgramsTerm Int)
105 getOccByNgramsOnlyFast_withSample cId int nt ngs =
106 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
109 getOccByNgramsOnlyFast :: CorpusId
112 -> Cmd err (HashMap NgramsTerm [ContextId])
113 getOccByNgramsOnlyFast cId lId nt = do
114 --HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
115 HM.fromList <$> map (\(t, ns) -> (NgramsTerm t, NodeId <$> DPST.fromPGArray ns)) <$> run cId lId nt
121 -> Cmd err [(Text, DPST.PGArray Int)]
122 run cId' lId' nt' = runPGSQuery query
133 --, round(nng.weight)
135 SELECT DISTINCT context_node_ngrams.context_id
136 FROM context_node_ngrams
138 ON context_node_ngrams.context_id = nodes_contexts.context_id
139 WHERE ng.id = context_node_ngrams.ngrams_id
140 AND nodes_contexts.node_id = ?
144 -- , ns.ngrams_type_id
146 JOIN node_stories ns ON ng.id = ns.ngrams_id
147 JOIN node_node_ngrams nng ON ns.node_id = nng.node2_id
148 WHERE nng.node1_id = ?
150 AND nng.ngrams_type = ?
151 AND nng.ngrams_id = ng.id
152 AND nng.ngrams_type = ns.ngrams_type_id
157 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
162 -> Cmd err [(NgramsTerm, Int)]
163 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
164 fmap (first NgramsTerm) <$>
165 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
167 , toDBid NodeDocument
169 , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
174 fields = [QualifiedIdentifier Nothing "text"]
176 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
177 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
178 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
179 JOIN nodes_contexts nn ON n.id = nn.context_id
182 input_rows(terms) AS (?)
183 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
184 JOIN ngrams ng ON cng.ngrams_id = ng.id
185 JOIN input_rows ir ON ir.terms = ng.terms
186 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
187 JOIN nodes_sample n ON nn.context_id = n.id
188 WHERE nn.node_id = ? -- CorpusId
189 AND cng.ngrams_type = ? -- NgramsTypeId
191 GROUP BY cng.node_id, ng.terms
194 selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
198 -> Cmd err [(NgramsTerm, Int)]
199 selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
200 fmap (first NgramsTerm) <$>
201 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
203 , toDBid NodeDocument
209 queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
210 queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
211 WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
212 JOIN nodes_contexts nc ON c.id = nc.context_id
215 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
216 JOIN ngrams ng ON cng.ngrams_id = ng.id
217 JOIN node_stories ns ON ns.ngrams_id = ng.id
218 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
219 JOIN contexts_sample c ON nc.context_id = c.id
220 WHERE nc.node_id = ? -- CorpusId
221 AND cng.ngrams_type = ? -- NgramsTypeId
232 ------------------------------------------------------------------------
233 getContextsByNgramsOnlyUser :: HasDBid NodeType
238 -> Cmd err (HashMap NgramsTerm (Set NodeId))
239 getContextsByNgramsOnlyUser cId ls nt ngs =
241 . map (HM.fromListWith (<>)
242 . map (second Set.singleton))
243 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
244 (splitEvery 1000 ngs)
246 getNgramsByContextOnlyUser :: HasDBid NodeType
251 -> Cmd err (Map NodeId (Set NgramsTerm))
252 getNgramsByContextOnlyUser cId ls nt ngs =
254 . map ( Map.fromListWith (<>)
255 . map (second Set.singleton)
258 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
259 (splitEvery 1000 ngs)
261 ------------------------------------------------------------------------
262 selectNgramsOnlyByContextUser :: HasDBid NodeType
267 -> Cmd err [(NgramsTerm, ContextId)]
268 selectNgramsOnlyByContextUser cId ls nt tms =
269 fmap (first NgramsTerm) <$>
270 runPGSQuery queryNgramsOnlyByContextUser
271 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
272 , Values [QualifiedIdentifier Nothing "int4"]
273 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
275 , toDBid NodeDocument
279 fields = [QualifiedIdentifier Nothing "text"]
281 queryNgramsOnlyByContextUser :: DPS.Query
282 queryNgramsOnlyByContextUser = [sql|
283 WITH input_rows(terms) AS (?),
284 input_list(id) AS (?)
285 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
286 JOIN ngrams ng ON cng.ngrams_id = ng.id
287 JOIN input_rows ir ON ir.terms = ng.terms
288 JOIN input_list il ON il.id = cng.node_id
289 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
290 JOIN contexts c ON nc.context_id = c.id
291 WHERE nc.node_id = ? -- CorpusId
292 AND c.typename = ? -- toDBid (maybe not useful with context table)
293 AND cng.ngrams_type = ? -- NgramsTypeId
295 GROUP BY ng.terms, cng.context_id
298 getNgramsByDocOnlyUser :: DocId
302 -> Cmd err (HashMap NgramsTerm (Set NodeId))
303 getNgramsByDocOnlyUser cId ls nt ngs =
305 . map (HM.fromListWith (<>) . map (second Set.singleton))
306 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
309 selectNgramsOnlyByDocUser :: DocId
313 -> Cmd err [(NgramsTerm, NodeId)]
314 selectNgramsOnlyByDocUser dId ls nt tms =
315 fmap (first NgramsTerm) <$>
316 runPGSQuery queryNgramsOnlyByDocUser
317 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
318 , Values [QualifiedIdentifier Nothing "int4"]
319 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
324 fields = [QualifiedIdentifier Nothing "text"]
327 queryNgramsOnlyByDocUser :: DPS.Query
328 queryNgramsOnlyByDocUser = [sql|
329 WITH input_rows(terms) AS (?),
330 input_list(id) AS (?)
331 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
332 JOIN ngrams ng ON cng.ngrams_id = ng.id
333 JOIN input_rows ir ON ir.terms = ng.terms
334 JOIN input_list il ON il.id = cng.context_id
335 WHERE cng.node_id = ? -- DocId
336 AND cng.ngrams_type = ? -- NgramsTypeId
337 GROUP BY ng.terms, cng.node_id
340 ------------------------------------------------------------------------
341 -- | TODO filter by language, database, any social field
342 getContextsByNgramsMaster :: HasDBid NodeType
345 -> Cmd err (HashMap Text (Set NodeId))
346 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
347 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
348 -- . takeWhile (not . List.null)
349 -- . takeWhile (\l -> List.length l > 3)
350 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
352 selectNgramsByContextMaster :: HasDBid NodeType
357 -> Cmd err [(NodeId, Text)]
358 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
359 queryNgramsByContextMaster'
361 , ngramsTypeId NgramsTerms
362 , toDBid NodeDocument
364 , toDBid NodeDocument
368 , toDBid NodeDocument
369 , ngramsTypeId NgramsTerms
372 -- | TODO fix context_node_ngrams relation
373 queryNgramsByContextMaster' :: DPS.Query
374 queryNgramsByContextMaster' = [sql|
375 WITH contextsByNgramsUser AS (
377 SELECT n.id, ng.terms FROM contexts n
378 JOIN nodes_contexts nn ON n.id = nn.context_id
379 JOIN context_node_ngrams cng ON cng.context_id = n.id
380 JOIN ngrams ng ON cng.ngrams_id = ng.id
381 WHERE nn.node_id = ? -- UserCorpusId
382 -- AND n.typename = ? -- toDBid
383 AND cng.ngrams_type = ? -- NgramsTypeId
385 AND node_pos(n.id,?) >= ?
386 AND node_pos(n.id,?) < ?
387 GROUP BY n.id, ng.terms
391 contextsByNgramsMaster AS (
393 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
394 JOIN context_node_ngrams cng ON n.id = cng.context_id
395 JOIN ngrams ng ON ng.id = cng.ngrams_id
397 WHERE n.parent_id = ? -- Master Corpus toDBid
398 AND n.typename = ? -- toDBid
399 AND cng.ngrams_type = ? -- NgramsTypeId
400 GROUP BY n.id, ng.terms
403 SELECT m.id, m.terms FROM nodesByNgramsMaster m
404 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id