2 Module : Gargantext.Database.Action.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByContext
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.HashMap.Strict (HashMap)
24 import Data.Text (Text)
25 import Data.Tuple.Extra (first, second, swap)
26 import Database.PostgreSQL.Simple.SqlQQ (sql)
27 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
28 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
29 import Gargantext.Core
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Database.PostgreSQL.Simple as DPS
39 import qualified Database.PostgreSQL.Simple.Types as DPST
41 -- | fst is size of Supra Corpus
42 -- snd is Texts and size of Occurrences (different docs)
--
-- The first component is the number of distinct context ids found across all
-- values of the input map, converted to 'Double'. The second component is
-- derived from 'groupContextsByNgramsWith': every group key maps to the
-- number of contexts in its group (as a 'Double') paired with the set of
-- original terms gathered under that key.
-- NOTE(review): the embedded numbering jumps from 47 to 49, so the line that
-- introduces the local bindings (presumably @where@) is not visible here.
44 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
45 -> HashMap NgramsTerm (Set ContextId)
46 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
47 countContextsByNgramsWith f m = (total, m')
49 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
50 m' = HM.map ( swap . second (fromIntegral . Set.size))
51 $ groupContextsByNgramsWith f m
-- | Regroup a term-indexed map under the keys produced by @f'@.
--
-- Each original term @t@ contributes the pair (singleton set of @t@, its
-- set of ids) under the key @f' t@; pairs landing on the same key are merged
-- with '<>'.
-- NOTE(review): the signature takes @Set NodeId@ but returns @Set ContextId@
-- while the lambda passes the sets through unchanged, so both names
-- presumably denote the same type -- confirm and settle on one spelling.
-- NOTE(review): the list this 'map' is applied to is not visible here (the
-- embedded numbering stops at 58); presumably the entries of @m''@.
54 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
55 -> HashMap NgramsTerm (Set NodeId)
56 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
57 groupContextsByNgramsWith f' m'' =
58 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
61 ------------------------------------------------------------------------
-- | Build a map from each 'NgramsTerm' to the set of ids paired with it in
-- the rows returned by 'selectNgramsByContextUser' for @cId@ and @nt@.
-- Ids belonging to rows that share a term are merged into a single set.
62 getContextsByNgramsUser :: HasDBid NodeType
65 -> Cmd err (HashMap NgramsTerm (Set ContextId))
66 getContextsByNgramsUser cId nt =
67 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
68 <$> selectNgramsByContextUser cId nt
-- | Run 'queryNgramsByContextUser' and return its rows as (id, term text)
-- pairs.
-- NOTE(review): the parameter tuple handed to 'runPGSQuery' is not visible
-- here; only the commented-out limit/offset entries remain. The query has
-- three placeholders (node id, typename, ngrams type) -- confirm the tuple
-- supplies them in that order.
71 selectNgramsByContextUser :: HasDBid NodeType
74 -> Cmd err [(NodeId, Text)]
75 selectNgramsByContextUser cId' nt' =
76 runPGSQuery queryNgramsByContextUser
80 -- , 100 :: Int -- limit
81 -- , 0 :: Int -- offset
-- | SQL selecting (context_id, terms) pairs from @context_node_ngrams@ joined
-- with @ngrams@, @nodes_contexts@ and @contexts@.
--
-- Placeholders, in order: @nc.node_id@, @c.typename@, @cng.ngrams_type@.
-- Only rows with @nc.category > 0@ are kept, and the result is grouped by
-- context id and term.
84 queryNgramsByContextUser :: DPS.Query
85 queryNgramsByContextUser = [sql|
86 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
87 JOIN ngrams ng ON cng.ngrams_id = ng.id
88 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
89 JOIN contexts c ON nc.context_id = c.id
90 WHERE nc.node_id = ? -- CorpusId
91 AND c.typename = ? -- toDBid
92 AND cng.ngrams_type = ? -- NgramsTypeId
93 AND nc.category > 0 -- is not in Trash
94 GROUP BY cng.context_id, ng.terms
98 ------------------------------------------------------------------------
-- | Collect the (term, count) rows returned by
-- 'selectNgramsOccurrencesOnlyByContextUser_withSample' into a map, adding
-- together the counts of rows that share the same term. All four arguments
-- are forwarded unchanged to that function.
99 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
104 -> Cmd err (HashMap NgramsTerm Int)
105 getOccByNgramsOnlyFast_withSample cId int nt ngs =
106 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
-- | Map each term returned by the local @run@ query to the list of ids held
-- in its PostgreSQL array column, each wrapped with 'NodeId'.
--
-- The map is built with 'HM.fromList', so rows sharing a term are not merged.
-- NOTE(review): the ARRAY subquery filters @context_node_ngrams@ only on the
-- ngrams id; whether it should also be restricted to the corpus cannot be
-- told from the visible lines -- confirm.
-- NOTE(review): the parameter tuple of @run@ and parts of the SQL text are
-- not visible here (gaps in the embedded numbering).
109 getOccByNgramsOnlyFast :: CorpusId
112 -> Cmd err (HashMap NgramsTerm [ContextId])
113 getOccByNgramsOnlyFast cId lId nt = do
114 --HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
115 HM.fromList <$> map (\(t, ns) -> (NgramsTerm t, NodeId <$> DPST.fromPGArray ns)) <$> run cId lId nt
121 -> Cmd err [(Text, DPST.PGArray Int)]
122 run cId' lId' nt' = runPGSQuery query
132 --, round(nng.weight)
133 , ARRAY(SELECT DISTINCT context_node_ngrams.context_id FROM context_node_ngrams WHERE ng.id = ngrams_id) AS context_ids
136 -- , ns.ngrams_type_id
138 JOIN node_stories ns ON ng.id = ns.ngrams_id
139 JOIN node_node_ngrams nng ON ns.node_id = nng.node2_id
140 WHERE nng.node1_id = ?
142 AND nng.ngrams_type = ?
143 AND nng.ngrams_id = ng.id
144 AND nng.ngrams_type = ns.ngrams_type_id
-- | Run 'queryNgramsOccurrencesOnlyByContextUser_withSample' and wrap the term
-- column of every returned row in 'NgramsTerm'.
--
-- The terms in @tms@ are sent as a one-column 'Values' parameter whose column
-- type is @text@ (see @fields@); @toDBid NodeDocument@ is also part of the
-- parameter tuple. The remaining tuple entries are not visible here.
149 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
154 -> Cmd err [(NgramsTerm, Int)]
155 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
156 fmap (first NgramsTerm) <$>
157 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
159 , toDBid NodeDocument
161 , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
166 fields = [QualifiedIdentifier Nothing "text"]
-- | SQL counting @context_node_ngrams@ rows per term, limited to contexts
-- that belong to a sample drawn with @TABLESAMPLE SYSTEM_ROWS@ and to terms
-- present in the @input_rows@ parameter.
--
-- NOTE(review): the SELECT aggregates with COUNT but groups by
-- @cng.node_id, ng.terms@, so one term can produce several rows; the caller
-- visible in this file adds them up with @fromListWith (+)@.
168 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
169 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
170 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
171 JOIN nodes_contexts nn ON n.id = nn.context_id
174 input_rows(terms) AS (?)
175 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
176 JOIN ngrams ng ON cng.ngrams_id = ng.id
177 JOIN input_rows ir ON ir.terms = ng.terms
178 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
179 JOIN nodes_sample n ON nn.context_id = n.id
180 WHERE nn.node_id = ? -- CorpusId
181 AND cng.ngrams_type = ? -- NgramsTypeId
183 GROUP BY cng.node_id, ng.terms
-- | Sampled occurrence counts without a term filter.
--
-- Runs the primed query 'queryNgramsOccurrencesOnlyByContextUser_withSample''
-- and wraps the term column of every returned row in 'NgramsTerm'.
--
-- This function receives no term list, so it cannot fill the @input_rows@
-- placeholder of the unprimed query, which it previously ran; the visible
-- text of the primed query contains no such placeholder.
186 selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
190 -> Cmd err [(NgramsTerm, Int)]
191 selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
192 fmap (first NgramsTerm) <$>
193 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample'
195 , toDBid NodeDocument
-- | SQL counting @context_node_ngrams@ rows per term for contexts that belong
-- to a @TABLESAMPLE SYSTEM_ROWS@ sample (@contexts_sample@).
--
-- Unlike the unprimed query it takes no @input_rows@ parameter in its visible
-- text; it joins @node_stories@ on the ngrams id instead.
-- NOTE(review): the grouping clause and the end of the quasi-quote are not
-- visible here (gaps in the embedded numbering).
201 queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
202 queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
203 WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
204 JOIN nodes_contexts nc ON c.id = nc.context_id
207 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
208 JOIN ngrams ng ON cng.ngrams_id = ng.id
209 JOIN node_stories ns ON ns.ngrams_id = ng.id
210 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
211 JOIN contexts_sample c ON nc.context_id = c.id
212 WHERE nc.node_id = ? -- CorpusId
213 AND cng.ngrams_type = ? -- NgramsTypeId
224 ------------------------------------------------------------------------
-- | Look up the ids associated with the given terms, 1000 terms at a time.
--
-- @ngs@ is cut into batches with @splitEvery 1000@; each batch is sent to
-- 'selectNgramsOnlyByContextUser' together with @cId@, @ls@ and @nt@, and
-- each batch result is turned into a map from term to the set of ids
-- returned for it.
-- NOTE(review): the step that combines the per-batch maps (embedded line
-- 232) is not visible here.
225 getContextsByNgramsOnlyUser :: HasDBid NodeType
230 -> Cmd err (HashMap NgramsTerm (Set NodeId))
231 getContextsByNgramsOnlyUser cId ls nt ngs =
233 . map (HM.fromListWith (<>)
234 . map (second Set.singleton))
235 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
236 (splitEvery 1000 ngs)
-- | Per the signature, return a map from node id to the set of terms found
-- for it, querying 'selectNgramsOnlyByContextUser' in batches of 1000 terms
-- (@splitEvery 1000@).
-- NOTE(review): several pipeline steps (embedded lines 245 and 248-249) are
-- not visible here, including whatever re-keys the (term, id) rows by id and
-- merges the per-batch maps -- check the full source before relying on this
-- description.
238 getNgramsByContextOnlyUser :: HasDBid NodeType
243 -> Cmd err (Map NodeId (Set NgramsTerm))
244 getNgramsByContextOnlyUser cId ls nt ngs =
246 . map ( Map.fromListWith (<>)
247 . map (second Set.singleton)
250 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
251 (splitEvery 1000 ngs)
253 ------------------------------------------------------------------------
-- | Run 'queryNgramsOnlyByContextUser' and wrap the term column of every
-- returned row in 'NgramsTerm'.
--
-- @tms@ is sent as a one-column @text@ 'Values' parameter (see @fields@) and
-- the ids in @ls@, unwrapped from 'NodeId', as a one-column @int4@ 'Values'
-- parameter. @toDBid NodeDocument@ is also part of the parameter tuple; the
-- remaining entries are not visible here.
254 selectNgramsOnlyByContextUser :: HasDBid NodeType
259 -> Cmd err [(NgramsTerm, ContextId)]
260 selectNgramsOnlyByContextUser cId ls nt tms =
261 fmap (first NgramsTerm) <$>
262 runPGSQuery queryNgramsOnlyByContextUser
263 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
264 , Values [QualifiedIdentifier Nothing "int4"]
265 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
267 , toDBid NodeDocument
271 fields = [QualifiedIdentifier Nothing "text"]
-- | SQL selecting (terms, context_id) pairs from @context_node_ngrams@,
-- restricted to the terms in @input_rows@ and to rows whose @node_id@ appears
-- in @input_list@.
--
-- The visible scalar placeholders that follow are, in order: @nc.node_id@,
-- @c.typename@ and @cng.ngrams_type@. The result is grouped by term and
-- context id.
273 queryNgramsOnlyByContextUser :: DPS.Query
274 queryNgramsOnlyByContextUser = [sql|
275 WITH input_rows(terms) AS (?),
276 input_list(id) AS (?)
277 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
278 JOIN ngrams ng ON cng.ngrams_id = ng.id
279 JOIN input_rows ir ON ir.terms = ng.terms
280 JOIN input_list il ON il.id = cng.node_id
281 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
282 JOIN contexts c ON nc.context_id = c.id
283 WHERE nc.node_id = ? -- CorpusId
284 AND c.typename = ? -- toDBid (maybe not useful with context table)
285 AND cng.ngrams_type = ? -- NgramsTypeId
287 GROUP BY ng.terms, cng.context_id
-- | Look up the given terms for a single document, 1000 terms at a time.
--
-- @ngs@ is cut into batches with @splitEvery 1000@; each batch goes through
-- 'selectNgramsOnlyByDocUser' with @cId@, @ls@ and @nt@, and each batch
-- result becomes a map from term to the set of ids returned for it.
-- NOTE(review): the step that combines the per-batch maps (embedded line
-- 296) is not visible here.
290 getNgramsByDocOnlyUser :: DocId
294 -> Cmd err (HashMap NgramsTerm (Set NodeId))
295 getNgramsByDocOnlyUser cId ls nt ngs =
297 . map (HM.fromListWith (<>) . map (second Set.singleton))
298 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
-- | Run 'queryNgramsOnlyByDocUser' and wrap the term column of every returned
-- row in 'NgramsTerm'.
--
-- @tms@ is sent as a one-column @text@ 'Values' parameter (see @fields@) and
-- the ids in @ls@, unwrapped from 'NodeId', as a one-column @int4@ 'Values'
-- parameter. The remaining tuple entries are not visible here.
301 selectNgramsOnlyByDocUser :: DocId
305 -> Cmd err [(NgramsTerm, NodeId)]
306 selectNgramsOnlyByDocUser dId ls nt tms =
307 fmap (first NgramsTerm) <$>
308 runPGSQuery queryNgramsOnlyByDocUser
309 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
310 , Values [QualifiedIdentifier Nothing "int4"]
311 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
316 fields = [QualifiedIdentifier Nothing "text"]
-- | SQL selecting (terms, node_id) pairs from @context_node_ngrams@ for the
-- terms in @input_rows@. @input_list@ is matched against @cng.context_id@,
-- and @cng.node_id@ is compared with the next placeholder (commented as
-- DocId), followed by the ngrams type placeholder.
--
-- NOTE(review): 'queryNgramsOnlyByContextUser' matches @input_list@ against
-- @cng.node_id@ instead; here the roles of @node_id@ and @context_id@ are the
-- other way round -- confirm this is intended.
319 queryNgramsOnlyByDocUser :: DPS.Query
320 queryNgramsOnlyByDocUser = [sql|
321 WITH input_rows(terms) AS (?),
322 input_list(id) AS (?)
323 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
324 JOIN ngrams ng ON cng.ngrams_id = ng.id
325 JOIN input_rows ir ON ir.terms = ng.terms
326 JOIN input_list il ON il.id = cng.context_id
327 WHERE cng.node_id = ? -- DocId
328 AND cng.ngrams_type = ? -- NgramsTypeId
329 GROUP BY ng.terms, cng.node_id
332 ------------------------------------------------------------------------
333 -- | TODO filter by language, database, any social field
--
-- Calls 'selectNgramsByContextMaster' once per offset in @[0,500..10000]@,
-- always with 1000 as its first argument. Each result page becomes a map
-- from term text to the set of ids paired with it, and the pages are merged
-- with @unionsWith (<>)@.
334 getContextsByNgramsMaster :: HasDBid NodeType
337 -> Cmd err (HashMap Text (Set NodeId))
338 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
339 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
340 -- . takeWhile (not . List.null)
341 -- . takeWhile (\l -> List.length l > 3)
342 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
-- | Run 'queryNgramsByContextMaster'' for the arguments @n@, @ucId@, @mcId@
-- and @p@, returning (id, term text) rows.
--
-- The tuple entries visible here are constants: @ngramsTypeId NgramsTerms@
-- and @toDBid NodeDocument@, so the ngrams type is fixed to 'NgramsTerms'.
-- NOTE(review): the entries built from the function's arguments are not
-- visible here (gaps in the embedded numbering) -- check their order against
-- the placeholders of the query.
344 selectNgramsByContextMaster :: HasDBid NodeType
349 -> Cmd err [(NodeId, Text)]
350 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
351 queryNgramsByContextMaster'
353 , ngramsTypeId NgramsTerms
354 , toDBid NodeDocument
356 , toDBid NodeDocument
360 , toDBid NodeDocument
361 , ngramsTypeId NgramsTerms
364 -- | TODO fix context_node_ngrams relation
--
-- SQL built from two CTEs. @contextsByNgramsUser@ yields (id, terms) pairs
-- for the contexts attached to a node, filtered by ngrams type and by two
-- @node_pos@ bounds. @contextsByNgramsMaster@ yields the same kind of pairs
-- for a @TABLESAMPLE SYSTEM_ROWS@ sample of contexts with a given parent and
-- typename.
--
-- The final SELECT right-joins the master pairs onto the user pairs by id.
-- It has to name the master CTE exactly as declared above
-- (@contextsByNgramsMaster@); no relation called @nodesByNgramsMaster@ is
-- defined in this query.
365 queryNgramsByContextMaster' :: DPS.Query
366 queryNgramsByContextMaster' = [sql|
367 WITH contextsByNgramsUser AS (
369 SELECT n.id, ng.terms FROM contexts n
370 JOIN nodes_contexts nn ON n.id = nn.context_id
371 JOIN context_node_ngrams cng ON cng.context_id = n.id
372 JOIN ngrams ng ON cng.ngrams_id = ng.id
373 WHERE nn.node_id = ? -- UserCorpusId
374 -- AND n.typename = ? -- toDBid
375 AND cng.ngrams_type = ? -- NgramsTypeId
377 AND node_pos(n.id,?) >= ?
378 AND node_pos(n.id,?) < ?
379 GROUP BY n.id, ng.terms
383 contextsByNgramsMaster AS (
385 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
386 JOIN context_node_ngrams cng ON n.id = cng.context_id
387 JOIN ngrams ng ON ng.id = cng.ngrams_id
389 WHERE n.parent_id = ? -- Master Corpus toDBid
390 AND n.typename = ? -- toDBid
391 AND cng.ngrams_type = ? -- NgramsTypeId
392 GROUP BY n.id, ng.terms
395 SELECT m.id, m.terms FROM contextsByNgramsMaster m
396 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id