2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByContext
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.HashMap.Strict (HashMap)
24 import Data.Text (Text)
25 import Data.Tuple.Extra (first, second, swap)
26 import Database.PostgreSQL.Simple.SqlQQ (sql)
27 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
28 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
29 import Gargantext.Core
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Database.PostgreSQL.Simple as DPS
40 -- | fst is size of Supra Corpus
41 -- snd is Texts and size of Occurrences (different docs)
43 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
44 -> HashMap NgramsTerm (Set ContextId)
45 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
46 countContextsByNgramsWith f m = (total, m')
48 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
49 m' = HM.map ( swap . second (fromIntegral . Set.size))
50 $ groupContextsByNgramsWith f m
53 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
54 -> HashMap NgramsTerm (Set NodeId)
55 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
56 groupContextsByNgramsWith f' m'' =
57 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
60 ------------------------------------------------------------------------
61 getContextsByNgramsUser :: HasDBid NodeType
64 -> Cmd err (HashMap NgramsTerm (Set ContextId))
65 getContextsByNgramsUser cId nt =
66 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
67 <$> selectNgramsByContextUser cId nt
70 selectNgramsByContextUser :: HasDBid NodeType
73 -> Cmd err [(NodeId, Text)]
74 selectNgramsByContextUser cId' nt' =
75 runPGSQuery queryNgramsByContextUser
79 -- , 100 :: Int -- limit
80 -- , 0 :: Int -- offset
83 queryNgramsByContextUser :: DPS.Query
84 queryNgramsByContextUser = [sql|
85 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
86 JOIN ngrams ng ON cng.ngrams_id = ng.id
87 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
88 JOIN contexts c ON nc.context_id = c.id
89 WHERE nc.node_id = ? -- CorpusId
90 AND c.typename = ? -- toDBid
91 AND cng.ngrams_type = ? -- NgramsTypeId
92 AND nc.category > 0 -- is not in Trash
93 GROUP BY cng.context_id, ng.terms
97 ------------------------------------------------------------------------
98 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
103 -> Cmd err (HashMap NgramsTerm Int)
104 getOccByNgramsOnlyFast_withSample cId int nt ngs =
105 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
108 getOccByNgramsOnlyFast :: CorpusId
111 -> Cmd err (HashMap NgramsTerm Int)
112 getOccByNgramsOnlyFast cId lId nt = do
113 HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
119 -> Cmd err [(Text, Double)]
120 run cId' lId' nt' = runPGSQuery query
133 -- , ns.ngrams_type_id
135 JOIN node_stories ns ON ng.id = ns.ngrams_id
136 JOIN node_node_ngrams nng ON ns.node_id = nng.node2_id
137 WHERE nng.node1_id = ?
139 AND nng.ngrams_type = ?
140 AND nng.ngrams_id = ng.id
141 AND nng.ngrams_type = ns.ngrams_type_id
146 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
151 -> Cmd err [(NgramsTerm, Int)]
152 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
153 fmap (first NgramsTerm) <$>
154 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
156 , toDBid NodeDocument
158 , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
163 fields = [QualifiedIdentifier Nothing "text"]
165 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
166 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
167 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
168 JOIN nodes_contexts nn ON n.id = nn.context_id
171 input_rows(terms) AS (?)
172 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
173 JOIN ngrams ng ON cng.ngrams_id = ng.id
174 JOIN input_rows ir ON ir.terms = ng.terms
175 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
176 JOIN nodes_sample n ON nn.context_id = n.id
177 WHERE nn.node_id = ? -- CorpusId
178 AND cng.ngrams_type = ? -- NgramsTypeId
180 GROUP BY cng.node_id, ng.terms
183 selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
187 -> Cmd err [(NgramsTerm, Int)]
188 selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
189 fmap (first NgramsTerm) <$>
190 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
192 , toDBid NodeDocument
198 fields = [QualifiedIdentifier Nothing "text"]
200 queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
201 queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
202 WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
203 JOIN nodes_contexts nc ON c.id = nc.context_id
206 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
207 JOIN ngrams ng ON cng.ngrams_id = ng.id
208 JOIN node_stories ns ON ns.ngrams_id = ng.id
209 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
210 JOIN contexts_sample c ON nc.context_id = c.id
211 WHERE nc.node_id = ? -- CorpusId
212 AND cng.ngrams_type = ? -- NgramsTypeId
223 ------------------------------------------------------------------------
224 getContextsByNgramsOnlyUser :: HasDBid NodeType
229 -> Cmd err (HashMap NgramsTerm (Set NodeId))
230 getContextsByNgramsOnlyUser cId ls nt ngs =
232 . map (HM.fromListWith (<>)
233 . map (second Set.singleton))
234 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
235 (splitEvery 1000 ngs)
237 getNgramsByContextOnlyUser :: HasDBid NodeType
242 -> Cmd err (Map NodeId (Set NgramsTerm))
243 getNgramsByContextOnlyUser cId ls nt ngs =
245 . map ( Map.fromListWith (<>)
246 . map (second Set.singleton)
249 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
250 (splitEvery 1000 ngs)
252 ------------------------------------------------------------------------
253 selectNgramsOnlyByContextUser :: HasDBid NodeType
258 -> Cmd err [(NgramsTerm, ContextId)]
259 selectNgramsOnlyByContextUser cId ls nt tms =
260 fmap (first NgramsTerm) <$>
261 runPGSQuery queryNgramsOnlyByContextUser
262 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
263 , Values [QualifiedIdentifier Nothing "int4"]
264 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
266 , toDBid NodeDocument
270 fields = [QualifiedIdentifier Nothing "text"]
272 queryNgramsOnlyByContextUser :: DPS.Query
273 queryNgramsOnlyByContextUser = [sql|
274 WITH input_rows(terms) AS (?),
275 input_list(id) AS (?)
276 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
277 JOIN ngrams ng ON cng.ngrams_id = ng.id
278 JOIN input_rows ir ON ir.terms = ng.terms
279 JOIN input_list il ON il.id = cng.node_id
280 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
281 JOIN contexts c ON nc.context_id = c.id
282 WHERE nc.node_id = ? -- CorpusId
283 AND c.typename = ? -- toDBid (maybe not useful with context table)
284 AND cng.ngrams_type = ? -- NgramsTypeId
286 GROUP BY ng.terms, cng.context_id
289 getNgramsByDocOnlyUser :: DocId
293 -> Cmd err (HashMap NgramsTerm (Set NodeId))
294 getNgramsByDocOnlyUser cId ls nt ngs =
296 . map (HM.fromListWith (<>) . map (second Set.singleton))
297 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
300 selectNgramsOnlyByDocUser :: DocId
304 -> Cmd err [(NgramsTerm, NodeId)]
305 selectNgramsOnlyByDocUser dId ls nt tms =
306 fmap (first NgramsTerm) <$>
307 runPGSQuery queryNgramsOnlyByDocUser
308 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
309 , Values [QualifiedIdentifier Nothing "int4"]
310 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
315 fields = [QualifiedIdentifier Nothing "text"]
318 queryNgramsOnlyByDocUser :: DPS.Query
319 queryNgramsOnlyByDocUser = [sql|
320 WITH input_rows(terms) AS (?),
321 input_list(id) AS (?)
322 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
323 JOIN ngrams ng ON cng.ngrams_id = ng.id
324 JOIN input_rows ir ON ir.terms = ng.terms
325 JOIN input_list il ON il.id = cng.context_id
326 WHERE cng.node_id = ? -- DocId
327 AND cng.ngrams_type = ? -- NgramsTypeId
328 GROUP BY ng.terms, cng.node_id
331 ------------------------------------------------------------------------
332 -- | TODO filter by language, database, any social field
333 getContextsByNgramsMaster :: HasDBid NodeType
336 -> Cmd err (HashMap Text (Set NodeId))
337 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
338 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
339 -- . takeWhile (not . List.null)
340 -- . takeWhile (\l -> List.length l > 3)
341 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
343 selectNgramsByContextMaster :: HasDBid NodeType
348 -> Cmd err [(NodeId, Text)]
349 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
350 queryNgramsByContextMaster'
352 , ngramsTypeId NgramsTerms
353 , toDBid NodeDocument
355 , toDBid NodeDocument
359 , toDBid NodeDocument
360 , ngramsTypeId NgramsTerms
363 -- | TODO fix context_node_ngrams relation
364 queryNgramsByContextMaster' :: DPS.Query
365 queryNgramsByContextMaster' = [sql|
366 WITH contextsByNgramsUser AS (
368 SELECT n.id, ng.terms FROM contexts n
369 JOIN nodes_contexts nn ON n.id = nn.context_id
370 JOIN context_node_ngrams cng ON cng.context_id = n.id
371 JOIN ngrams ng ON cng.ngrams_id = ng.id
372 WHERE nn.node_id = ? -- UserCorpusId
373 -- AND n.typename = ? -- toDBid
374 AND cng.ngrams_type = ? -- NgramsTypeId
376 AND node_pos(n.id,?) >= ?
377 AND node_pos(n.id,?) < ?
378 GROUP BY n.id, ng.terms
382 contextsByNgramsMaster AS (
384 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
385 JOIN context_node_ngrams cng ON n.id = cng.context_id
386 JOIN ngrams ng ON ng.id = cng.ngrams_id
388 WHERE n.parent_id = ? -- Master Corpus toDBid
389 AND n.typename = ? -- toDBid
390 AND cng.ngrams_type = ? -- NgramsTypeId
391 GROUP BY n.id, ng.terms
394 SELECT m.id, m.terms FROM nodesByNgramsMaster m
395 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id