2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByContext
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.HashMap.Strict (HashMap)
24 import Data.Text (Text)
25 import Data.Tuple.Extra (first, second, swap)
26 import Database.PostgreSQL.Simple.SqlQQ (sql)
27 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
28 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
29 import Gargantext.Core
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Database.PostgreSQL.Simple as DPS
40 -- | fst is size of Supra Corpus
41 -- snd is Texts and size of Occurrences (different docs)
43 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
44 -> HashMap NgramsTerm (Set ContextId)
45 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
46 countContextsByNgramsWith f m = (total, m')
48 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
49 m' = HM.map ( swap . second (fromIntegral . Set.size))
50 $ groupContextsByNgramsWith f m
53 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
54 -> HashMap NgramsTerm (Set NodeId)
55 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
56 groupContextsByNgramsWith f' m'' =
57 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
60 ------------------------------------------------------------------------
61 getContextsByNgramsUser :: HasDBid NodeType
64 -> Cmd err (HashMap NgramsTerm (Set ContextId))
65 getContextsByNgramsUser cId nt =
66 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
67 <$> selectNgramsByContextUser cId nt
70 selectNgramsByContextUser :: HasDBid NodeType
73 -> Cmd err [(NodeId, Text)]
74 selectNgramsByContextUser cId' nt' =
75 runPGSQuery queryNgramsByContextUser
79 -- , 100 :: Int -- limit
80 -- , 0 :: Int -- offset
83 queryNgramsByContextUser :: DPS.Query
84 queryNgramsByContextUser = [sql|
85 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
86 JOIN ngrams ng ON cng.ngrams_id = ng.id
87 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
88 JOIN contexts c ON nc.context_id = c.id
89 WHERE nc.node_id = ? -- CorpusId
90 AND c.typename = ? -- toDBid
91 AND cng.ngrams_type = ? -- NgramsTypeId
92 AND nc.category > 0 -- is not in Trash
93 GROUP BY cng.context_id, ng.terms
97 ------------------------------------------------------------------------
98 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
103 -> Cmd err (HashMap NgramsTerm Int)
104 getOccByNgramsOnlyFast_withSample cId int nt ngs =
105 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
108 getOccByNgramsOnlyFast :: CorpusId
111 -> Cmd err (HashMap NgramsTerm Int)
112 getOccByNgramsOnlyFast cId lId nt = do
113 HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
119 -> Cmd err [(Text, Double)]
120 run cId' lId' nt' = runPGSQuery query
133 -- , ns.ngrams_type_id
135 JOIN node_stories ns ON ng.id = ns.ngrams_id
136 JOIN node_node_ngrams nng ON ns.node_id = nng.node2_id
137 WHERE nng.node1_id = ?
139 AND nng.ngrams_type = ?
140 AND nng.ngrams_id = ng.id
141 AND nng.ngrams_type = ns.ngrams_type_id
146 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
151 -> Cmd err [(NgramsTerm, Int)]
152 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
153 fmap (first NgramsTerm) <$>
154 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
156 , toDBid NodeDocument
158 , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
163 fields = [QualifiedIdentifier Nothing "text"]
165 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
166 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
167 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
168 JOIN nodes_contexts nn ON n.id = nn.context_id
171 input_rows(terms) AS (?)
172 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
173 JOIN ngrams ng ON cng.ngrams_id = ng.id
174 JOIN input_rows ir ON ir.terms = ng.terms
175 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
176 JOIN nodes_sample n ON nn.context_id = n.id
177 WHERE nn.node_id = ? -- CorpusId
178 AND cng.ngrams_type = ? -- NgramsTypeId
180 GROUP BY cng.node_id, ng.terms
183 selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
187 -> Cmd err [(NgramsTerm, Int)]
188 selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
189 fmap (first NgramsTerm) <$>
190 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
192 , toDBid NodeDocument
198 queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
199 queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
200 WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
201 JOIN nodes_contexts nc ON c.id = nc.context_id
204 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
205 JOIN ngrams ng ON cng.ngrams_id = ng.id
206 JOIN node_stories ns ON ns.ngrams_id = ng.id
207 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
208 JOIN contexts_sample c ON nc.context_id = c.id
209 WHERE nc.node_id = ? -- CorpusId
210 AND cng.ngrams_type = ? -- NgramsTypeId
221 ------------------------------------------------------------------------
222 getContextsByNgramsOnlyUser :: HasDBid NodeType
227 -> Cmd err (HashMap NgramsTerm (Set NodeId))
228 getContextsByNgramsOnlyUser cId ls nt ngs =
230 . map (HM.fromListWith (<>)
231 . map (second Set.singleton))
232 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
233 (splitEvery 1000 ngs)
235 getNgramsByContextOnlyUser :: HasDBid NodeType
240 -> Cmd err (Map NodeId (Set NgramsTerm))
241 getNgramsByContextOnlyUser cId ls nt ngs =
243 . map ( Map.fromListWith (<>)
244 . map (second Set.singleton)
247 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
248 (splitEvery 1000 ngs)
250 ------------------------------------------------------------------------
251 selectNgramsOnlyByContextUser :: HasDBid NodeType
256 -> Cmd err [(NgramsTerm, ContextId)]
257 selectNgramsOnlyByContextUser cId ls nt tms =
258 fmap (first NgramsTerm) <$>
259 runPGSQuery queryNgramsOnlyByContextUser
260 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
261 , Values [QualifiedIdentifier Nothing "int4"]
262 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
264 , toDBid NodeDocument
268 fields = [QualifiedIdentifier Nothing "text"]
270 queryNgramsOnlyByContextUser :: DPS.Query
271 queryNgramsOnlyByContextUser = [sql|
272 WITH input_rows(terms) AS (?),
273 input_list(id) AS (?)
274 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
275 JOIN ngrams ng ON cng.ngrams_id = ng.id
276 JOIN input_rows ir ON ir.terms = ng.terms
277 JOIN input_list il ON il.id = cng.node_id
278 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
279 JOIN contexts c ON nc.context_id = c.id
280 WHERE nc.node_id = ? -- CorpusId
281 AND c.typename = ? -- toDBid (maybe not useful with context table)
282 AND cng.ngrams_type = ? -- NgramsTypeId
284 GROUP BY ng.terms, cng.context_id
287 getNgramsByDocOnlyUser :: DocId
291 -> Cmd err (HashMap NgramsTerm (Set NodeId))
292 getNgramsByDocOnlyUser cId ls nt ngs =
294 . map (HM.fromListWith (<>) . map (second Set.singleton))
295 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
298 selectNgramsOnlyByDocUser :: DocId
302 -> Cmd err [(NgramsTerm, NodeId)]
303 selectNgramsOnlyByDocUser dId ls nt tms =
304 fmap (first NgramsTerm) <$>
305 runPGSQuery queryNgramsOnlyByDocUser
306 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
307 , Values [QualifiedIdentifier Nothing "int4"]
308 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
313 fields = [QualifiedIdentifier Nothing "text"]
316 queryNgramsOnlyByDocUser :: DPS.Query
317 queryNgramsOnlyByDocUser = [sql|
318 WITH input_rows(terms) AS (?),
319 input_list(id) AS (?)
320 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
321 JOIN ngrams ng ON cng.ngrams_id = ng.id
322 JOIN input_rows ir ON ir.terms = ng.terms
323 JOIN input_list il ON il.id = cng.context_id
324 WHERE cng.node_id = ? -- DocId
325 AND cng.ngrams_type = ? -- NgramsTypeId
326 GROUP BY ng.terms, cng.node_id
329 ------------------------------------------------------------------------
330 -- | TODO filter by language, database, any social field
331 getContextsByNgramsMaster :: HasDBid NodeType
334 -> Cmd err (HashMap Text (Set NodeId))
335 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
336 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
337 -- . takeWhile (not . List.null)
338 -- . takeWhile (\l -> List.length l > 3)
339 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
341 selectNgramsByContextMaster :: HasDBid NodeType
346 -> Cmd err [(NodeId, Text)]
347 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
348 queryNgramsByContextMaster'
350 , ngramsTypeId NgramsTerms
351 , toDBid NodeDocument
353 , toDBid NodeDocument
357 , toDBid NodeDocument
358 , ngramsTypeId NgramsTerms
361 -- | TODO fix context_node_ngrams relation
362 queryNgramsByContextMaster' :: DPS.Query
363 queryNgramsByContextMaster' = [sql|
364 WITH contextsByNgramsUser AS (
366 SELECT n.id, ng.terms FROM contexts n
367 JOIN nodes_contexts nn ON n.id = nn.context_id
368 JOIN context_node_ngrams cng ON cng.context_id = n.id
369 JOIN ngrams ng ON cng.ngrams_id = ng.id
370 WHERE nn.node_id = ? -- UserCorpusId
371 -- AND n.typename = ? -- toDBid
372 AND cng.ngrams_type = ? -- NgramsTypeId
374 AND node_pos(n.id,?) >= ?
375 AND node_pos(n.id,?) < ?
376 GROUP BY n.id, ng.terms
380 contextsByNgramsMaster AS (
382 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
383 JOIN context_node_ngrams cng ON n.id = cng.context_id
384 JOIN ngrams ng ON ng.id = cng.ngrams_id
386 WHERE n.parent_id = ? -- Master Corpus toDBid
387 AND n.typename = ? -- toDBid
388 AND cng.ngrams_type = ? -- NgramsTypeId
389 GROUP BY n.id, ng.terms
392 SELECT m.id, m.terms FROM nodesByNgramsMaster m
393 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id