2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByContext
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map.Strict (Map)
24 import Data.Text (Text)
25 import Data.Tuple.Extra (first, second, swap)
26 import Database.PostgreSQL.Simple.SqlQQ (sql)
27 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
28 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
29 import Gargantext.Core
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.List as List
37 import qualified Data.Map.Strict as Map
38 import qualified Data.Set as Set
39 import qualified Database.PostgreSQL.Simple as DPS
40 import qualified Database.PostgreSQL.Simple.Types as DPST
42 -- | fst is size of Supra Corpus
43 -- snd is Texts and size of Occurrences (different docs)
45 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
46 -> HashMap NgramsTerm (Set ContextId)
47 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
48 countContextsByNgramsWith f m = (total, m')
50 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
51 m' = HM.map ( swap . second (fromIntegral . Set.size))
52 $ groupContextsByNgramsWith f m
55 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
56 -> HashMap NgramsTerm (Set NodeId)
57 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
58 groupContextsByNgramsWith f' m'' =
59 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
62 ------------------------------------------------------------------------
63 getContextsByNgramsUser :: HasDBid NodeType
66 -> Cmd err (HashMap NgramsTerm (Set ContextId))
67 getContextsByNgramsUser cId nt =
68 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
69 <$> selectNgramsByContextUser cId nt
72 selectNgramsByContextUser :: HasDBid NodeType
75 -> Cmd err [(NodeId, Text)]
76 selectNgramsByContextUser cId' nt' =
77 runPGSQuery queryNgramsByContextUser
81 -- , 100 :: Int -- limit
82 -- , 0 :: Int -- offset
85 queryNgramsByContextUser :: DPS.Query
86 queryNgramsByContextUser = [sql|
87 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
88 JOIN ngrams ng ON cng.ngrams_id = ng.id
89 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
90 JOIN contexts c ON nc.context_id = c.id
91 WHERE nc.node_id = ? -- CorpusId
92 AND c.typename = ? -- toDBid
93 AND cng.ngrams_type = ? -- NgramsTypeId
94 AND nc.category > 0 -- is not in Trash
95 GROUP BY cng.context_id, ng.terms
99 ------------------------------------------------------------------------
100 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
105 -> Cmd err (HashMap NgramsTerm Int)
106 getOccByNgramsOnlyFast_withSample cId int nt ngs =
107 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
110 getOccByNgramsOnlyFast :: CorpusId
113 -> Cmd err (HashMap NgramsTerm [ContextId])
114 getOccByNgramsOnlyFast cId lId nt = do
115 --HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
116 HM.fromList <$> map (\(t, ns) -> (NgramsTerm t, NodeId <$> DPST.fromPGArray ns)) <$> run cId lId nt
122 -> Cmd err [(Text, DPST.PGArray Int)]
123 run cId' lId' nt' = runPGSQuery query
134 SELECT DISTINCT context_node_ngrams.context_id
135 FROM context_node_ngrams
137 ON context_node_ngrams.context_id = nodes_contexts.context_id
138 WHERE ng.id = context_node_ngrams.ngrams_id
139 AND nodes_contexts.node_id = ?
142 JOIN node_stories ns ON ng.id = ns.ngrams_id
143 JOIN node_node_ngrams nng ON ns.node_id = nng.node2_id
144 WHERE nng.node1_id = ?
146 AND nng.ngrams_type = ?
147 AND nng.ngrams_id = ng.id
148 AND nng.ngrams_type = ns.ngrams_type_id
153 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
158 -> Cmd err [(NgramsTerm, Int)]
159 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
160 fmap (first NgramsTerm) <$>
161 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
163 , toDBid NodeDocument
165 , Values fields ((DPS.Only . unNgramsTerm) <$> (List.take 10000 tms))
170 fields = [QualifiedIdentifier Nothing "text"]
172 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
173 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
174 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
175 JOIN nodes_contexts nn ON n.id = nn.context_id
178 input_rows(terms) AS (?)
179 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
180 JOIN ngrams ng ON cng.ngrams_id = ng.id
181 JOIN input_rows ir ON ir.terms = ng.terms
182 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
183 JOIN nodes_sample n ON nn.context_id = n.id
184 WHERE nn.node_id = ? -- CorpusId
185 AND cng.ngrams_type = ? -- NgramsTypeId
187 GROUP BY cng.node_id, ng.terms
190 selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
194 -> Cmd err [(NgramsTerm, Int)]
195 selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
196 fmap (first NgramsTerm) <$>
197 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
199 , toDBid NodeDocument
205 queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
206 queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
207 WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
208 JOIN nodes_contexts nc ON c.id = nc.context_id
211 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
212 JOIN ngrams ng ON cng.ngrams_id = ng.id
213 JOIN node_stories ns ON ns.ngrams_id = ng.id
214 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
215 JOIN contexts_sample c ON nc.context_id = c.id
216 WHERE nc.node_id = ? -- CorpusId
217 AND cng.ngrams_type = ? -- NgramsTypeId
228 ------------------------------------------------------------------------
229 getContextsByNgramsOnlyUser :: HasDBid NodeType
234 -> Cmd err (HashMap NgramsTerm (Set NodeId))
235 getContextsByNgramsOnlyUser cId ls nt ngs =
237 . map (HM.fromListWith (<>)
238 . map (second Set.singleton))
239 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
240 (splitEvery 1000 ngs)
242 getNgramsByContextOnlyUser :: HasDBid NodeType
247 -> Cmd err (Map NodeId (Set NgramsTerm))
248 getNgramsByContextOnlyUser cId ls nt ngs =
250 . map ( Map.fromListWith (<>)
251 . map (second Set.singleton)
254 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
255 (splitEvery 1000 ngs)
257 ------------------------------------------------------------------------
258 selectNgramsOnlyByContextUser :: HasDBid NodeType
263 -> Cmd err [(NgramsTerm, ContextId)]
264 selectNgramsOnlyByContextUser cId ls nt tms =
265 fmap (first NgramsTerm) <$>
266 runPGSQuery queryNgramsOnlyByContextUser
267 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
268 , Values [QualifiedIdentifier Nothing "int4"]
269 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
271 , toDBid NodeDocument
275 fields = [QualifiedIdentifier Nothing "text"]
277 queryNgramsOnlyByContextUser :: DPS.Query
278 queryNgramsOnlyByContextUser = [sql|
279 WITH input_rows(terms) AS (?),
280 input_list(id) AS (?)
281 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
282 JOIN ngrams ng ON cng.ngrams_id = ng.id
283 JOIN input_rows ir ON ir.terms = ng.terms
284 JOIN input_list il ON il.id = cng.node_id
285 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
286 JOIN contexts c ON nc.context_id = c.id
287 WHERE nc.node_id = ? -- CorpusId
288 AND c.typename = ? -- toDBid (maybe not useful with context table)
289 AND cng.ngrams_type = ? -- NgramsTypeId
291 GROUP BY ng.terms, cng.context_id
294 getNgramsByDocOnlyUser :: DocId
298 -> Cmd err (HashMap NgramsTerm (Set NodeId))
299 getNgramsByDocOnlyUser cId ls nt ngs =
301 . map (HM.fromListWith (<>) . map (second Set.singleton))
302 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
305 selectNgramsOnlyByDocUser :: DocId
309 -> Cmd err [(NgramsTerm, NodeId)]
310 selectNgramsOnlyByDocUser dId ls nt tms =
311 fmap (first NgramsTerm) <$>
312 runPGSQuery queryNgramsOnlyByDocUser
313 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
314 , Values [QualifiedIdentifier Nothing "int4"]
315 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
320 fields = [QualifiedIdentifier Nothing "text"]
323 queryNgramsOnlyByDocUser :: DPS.Query
324 queryNgramsOnlyByDocUser = [sql|
325 WITH input_rows(terms) AS (?),
326 input_list(id) AS (?)
327 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
328 JOIN ngrams ng ON cng.ngrams_id = ng.id
329 JOIN input_rows ir ON ir.terms = ng.terms
330 JOIN input_list il ON il.id = cng.context_id
331 WHERE cng.node_id = ? -- DocId
332 AND cng.ngrams_type = ? -- NgramsTypeId
333 GROUP BY ng.terms, cng.node_id
336 ------------------------------------------------------------------------
337 -- | TODO filter by language, database, any social field
338 getContextsByNgramsMaster :: HasDBid NodeType
341 -> Cmd err (HashMap Text (Set NodeId))
342 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
343 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
344 -- . takeWhile (not . List.null)
345 -- . takeWhile (\l -> List.length l > 3)
346 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
348 selectNgramsByContextMaster :: HasDBid NodeType
353 -> Cmd err [(NodeId, Text)]
354 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
355 queryNgramsByContextMaster'
357 , ngramsTypeId NgramsTerms
358 , toDBid NodeDocument
360 , toDBid NodeDocument
364 , toDBid NodeDocument
365 , ngramsTypeId NgramsTerms
368 -- | TODO fix context_node_ngrams relation
369 queryNgramsByContextMaster' :: DPS.Query
370 queryNgramsByContextMaster' = [sql|
371 WITH contextsByNgramsUser AS (
373 SELECT n.id, ng.terms FROM contexts n
374 JOIN nodes_contexts nn ON n.id = nn.context_id
375 JOIN context_node_ngrams cng ON cng.context_id = n.id
376 JOIN ngrams ng ON cng.ngrams_id = ng.id
377 WHERE nn.node_id = ? -- UserCorpusId
378 -- AND n.typename = ? -- toDBid
379 AND cng.ngrams_type = ? -- NgramsTypeId
381 AND node_pos(n.id,?) >= ?
382 AND node_pos(n.id,?) < ?
383 GROUP BY n.id, ng.terms
387 contextsByNgramsMaster AS (
389 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
390 JOIN context_node_ngrams cng ON n.id = cng.context_id
391 JOIN ngrams ng ON ng.id = cng.ngrams_id
393 WHERE n.parent_id = ? -- Master Corpus toDBid
394 AND n.typename = ? -- toDBid
395 AND cng.ngrams_type = ? -- NgramsTypeId
396 GROUP BY n.id, ng.terms
399 SELECT m.id, m.terms FROM nodesByNgramsMaster m
400 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id