2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByContext
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.HashMap.Strict (HashMap)
23 import Data.Maybe (catMaybes)
25 import Data.Text (Text)
26 import Data.Tuple.Extra (first, second, swap)
27 import Database.PostgreSQL.Simple.SqlQQ (sql)
28 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
29 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
30 import Gargantext.Core
31 import Gargantext.Data.HashMap.Strict.Utils as HM
32 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
33 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
34 import Gargantext.Database.Query.Table.Ngrams (selectNgramsId)
35 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..), NgramsId)
36 import Gargantext.Prelude
37 import qualified Data.HashMap.Strict as HM
38 import qualified Data.Map as Map
39 import qualified Data.Set as Set
40 import qualified Database.PostgreSQL.Simple as DPS
42 -- | fst is size of Supra Corpus
43 -- snd is Texts and size of Occurrences (different docs)
45 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
46 -> HashMap NgramsTerm (Set ContextId)
47 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
48 countContextsByNgramsWith f m = (total, m')
50 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
51 m' = HM.map ( swap . second (fromIntegral . Set.size))
52 $ groupContextsByNgramsWith f m
55 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
56 -> HashMap NgramsTerm (Set NodeId)
57 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
58 groupContextsByNgramsWith f' m'' =
59 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
62 ------------------------------------------------------------------------
63 getContextsByNgramsUser :: HasDBid NodeType
66 -> Cmd err (HashMap NgramsTerm (Set ContextId))
67 getContextsByNgramsUser cId nt =
68 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
69 <$> selectNgramsByContextUser cId nt
72 selectNgramsByContextUser :: HasDBid NodeType
75 -> Cmd err [(NodeId, Text)]
76 selectNgramsByContextUser cId' nt' =
77 runPGSQuery queryNgramsByContextUser
81 -- , 100 :: Int -- limit
82 -- , 0 :: Int -- offset
85 queryNgramsByContextUser :: DPS.Query
86 queryNgramsByContextUser = [sql|
87 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
88 JOIN ngrams ng ON cng.ngrams_id = ng.id
89 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
90 JOIN contexts c ON nc.context_id = c.id
91 WHERE nc.node_id = ? -- CorpusId
92 AND c.typename = ? -- toDBid
93 AND cng.ngrams_type = ? -- NgramsTypeId
94 AND nc.category > 0 -- is not in Trash
95 GROUP BY cng.context_id, ng.terms
99 ------------------------------------------------------------------------
100 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
105 -> Cmd err (HashMap NgramsTerm Int)
106 getOccByNgramsOnlyFast_withSample cId int nt ngs =
107 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
110 getOccByNgramsOnlyFast' :: CorpusId
114 -> Cmd err (HashMap NgramsTerm Int)
115 getOccByNgramsOnlyFast' cId lId nt tms = do -- trace (show (cId, lId)) $
116 mapNgramsIds <- selectNgramsId $ map unNgramsTerm tms
117 HM.fromListWith (+) <$> catMaybes
118 <$> map (\(nId, s) -> (,) <$> (NgramsTerm <$> (Map.lookup nId mapNgramsIds)) <*> (Just $ round s) )
119 <$> run cId lId nt (Map.keys mapNgramsIds)
126 -> Cmd err [(NgramsId, Double)]
127 run cId' lId' nt' tms' = runPGSQuery query
128 ( Values fields ((DPS.Only) <$> tms')
133 fields = [QualifiedIdentifier Nothing "int4"]
138 WITH input_ngrams(id) AS (?)
140 SELECT ngi.id, nng.weight FROM nodes_contexts nc
141 JOIN node_node_ngrams nng ON nng.node1_id = nc.node_id
142 JOIN input_ngrams ngi ON nng.ngrams_id = ngi.id
143 WHERE nng.node1_id = ?
145 AND nng.ngrams_type = ?
147 GROUP BY ngi.id, nng.weight
154 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
159 -> Cmd err [(NgramsTerm, Int)]
160 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
161 fmap (first NgramsTerm) <$>
162 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
164 , toDBid NodeDocument
166 , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
171 fields = [QualifiedIdentifier Nothing "text"]
173 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
174 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
175 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
176 JOIN nodes_contexts nn ON n.id = nn.context_id
179 input_rows(terms) AS (?)
180 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
181 JOIN ngrams ng ON cng.ngrams_id = ng.id
182 JOIN input_rows ir ON ir.terms = ng.terms
183 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
184 JOIN nodes_sample n ON nn.context_id = n.id
185 WHERE nn.node_id = ? -- CorpusId
186 AND cng.ngrams_type = ? -- NgramsTypeId
188 GROUP BY cng.node_id, ng.terms
191 queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
192 queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
193 WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
194 JOIN nodes_contexts nc ON c.id = nc.context_id
197 -- input_rows(terms) AS (?)
198 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
199 JOIN ngrams ng ON cng.ngrams_id = ng.id
200 JOIN input_rows ir ON ir.terms = ng.terms
201 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
202 JOIN contexts_sample c ON nc.context_id = c.id
203 WHERE nc.node_id = ? -- CorpusId
204 AND cng.ngrams_type = ? -- NgramsTypeId
206 GROUP BY cng.node_id, ng.terms
215 ------------------------------------------------------------------------
216 getContextsByNgramsOnlyUser :: HasDBid NodeType
221 -> Cmd err (HashMap NgramsTerm (Set NodeId))
222 getContextsByNgramsOnlyUser cId ls nt ngs =
224 . map (HM.fromListWith (<>)
225 . map (second Set.singleton))
226 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
227 (splitEvery 1000 ngs)
229 getNgramsByContextOnlyUser :: HasDBid NodeType
234 -> Cmd err (Map NodeId (Set NgramsTerm))
235 getNgramsByContextOnlyUser cId ls nt ngs =
237 . map ( Map.fromListWith (<>)
238 . map (second Set.singleton)
241 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
242 (splitEvery 1000 ngs)
244 ------------------------------------------------------------------------
245 selectNgramsOnlyByContextUser :: HasDBid NodeType
250 -> Cmd err [(NgramsTerm, ContextId)]
251 selectNgramsOnlyByContextUser cId ls nt tms =
252 fmap (first NgramsTerm) <$>
253 runPGSQuery queryNgramsOnlyByContextUser
254 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
255 , Values [QualifiedIdentifier Nothing "int4"]
256 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
258 , toDBid NodeDocument
262 fields = [QualifiedIdentifier Nothing "text"]
264 queryNgramsOnlyByContextUser :: DPS.Query
265 queryNgramsOnlyByContextUser = [sql|
266 WITH input_rows(terms) AS (?),
267 input_list(id) AS (?)
268 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
269 JOIN ngrams ng ON cng.ngrams_id = ng.id
270 JOIN input_rows ir ON ir.terms = ng.terms
271 JOIN input_list il ON il.id = cng.node_id
272 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
273 JOIN contexts c ON nc.context_id = c.id
274 WHERE nc.node_id = ? -- CorpusId
275 AND c.typename = ? -- toDBid (maybe not useful with context table)
276 AND cng.ngrams_type = ? -- NgramsTypeId
278 GROUP BY ng.terms, cng.context_id
281 getNgramsByDocOnlyUser :: DocId
285 -> Cmd err (HashMap NgramsTerm (Set NodeId))
286 getNgramsByDocOnlyUser cId ls nt ngs =
288 . map (HM.fromListWith (<>) . map (second Set.singleton))
289 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
292 selectNgramsOnlyByDocUser :: DocId
296 -> Cmd err [(NgramsTerm, NodeId)]
297 selectNgramsOnlyByDocUser dId ls nt tms =
298 fmap (first NgramsTerm) <$>
299 runPGSQuery queryNgramsOnlyByDocUser
300 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
301 , Values [QualifiedIdentifier Nothing "int4"]
302 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
307 fields = [QualifiedIdentifier Nothing "text"]
310 queryNgramsOnlyByDocUser :: DPS.Query
311 queryNgramsOnlyByDocUser = [sql|
312 WITH input_rows(terms) AS (?),
313 input_list(id) AS (?)
314 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
315 JOIN ngrams ng ON cng.ngrams_id = ng.id
316 JOIN input_rows ir ON ir.terms = ng.terms
317 JOIN input_list il ON il.id = cng.context_id
318 WHERE cng.node_id = ? -- DocId
319 AND cng.ngrams_type = ? -- NgramsTypeId
320 GROUP BY ng.terms, cng.node_id
323 ------------------------------------------------------------------------
324 -- | TODO filter by language, database, any social field
325 getContextsByNgramsMaster :: HasDBid NodeType
328 -> Cmd err (HashMap Text (Set NodeId))
329 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
330 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
331 -- . takeWhile (not . List.null)
332 -- . takeWhile (\l -> List.length l > 3)
333 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
335 selectNgramsByContextMaster :: HasDBid NodeType
340 -> Cmd err [(NodeId, Text)]
341 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
342 queryNgramsByContextMaster'
344 , ngramsTypeId NgramsTerms
345 , toDBid NodeDocument
347 , toDBid NodeDocument
351 , toDBid NodeDocument
352 , ngramsTypeId NgramsTerms
355 -- | TODO fix context_node_ngrams relation
356 queryNgramsByContextMaster' :: DPS.Query
357 queryNgramsByContextMaster' = [sql|
358 WITH contextsByNgramsUser AS (
360 SELECT n.id, ng.terms FROM contexts n
361 JOIN nodes_contexts nn ON n.id = nn.context_id
362 JOIN context_node_ngrams cng ON cng.context_id = n.id
363 JOIN ngrams ng ON cng.ngrams_id = ng.id
364 WHERE nn.node_id = ? -- UserCorpusId
365 -- AND n.typename = ? -- toDBid
366 AND cng.ngrams_type = ? -- NgramsTypeId
368 AND node_pos(n.id,?) >= ?
369 AND node_pos(n.id,?) < ?
370 GROUP BY n.id, ng.terms
374 contextsByNgramsMaster AS (
376 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
377 JOIN context_node_ngrams cng ON n.id = cng.context_id
378 JOIN ngrams ng ON ng.id = cng.ngrams_id
380 WHERE n.parent_id = ? -- Master Corpus toDBid
381 AND n.typename = ? -- toDBid
382 AND cng.ngrams_type = ? -- NgramsTypeId
383 GROUP BY n.id, ng.terms
386 SELECT m.id, m.terms FROM nodesByNgramsMaster m
387 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id