2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByContext
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.HashMap.Strict (HashMap)
24 import Data.Text (Text)
25 import Data.Tuple.Extra (first, second, swap)
26 import Database.PostgreSQL.Simple.SqlQQ (sql)
27 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
28 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
29 import Gargantext.Core
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Database.PostgreSQL.Simple as DPS
40 -- | fst is size of Supra Corpus
41 -- snd is Texts and size of Occurrences (different docs)
43 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
44 -> HashMap NgramsTerm (Set ContextId)
45 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
46 countContextsByNgramsWith f m = (total, m')
48 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
49 m' = HM.map ( swap . second (fromIntegral . Set.size))
50 $ groupContextsByNgramsWith f m
53 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
54 -> HashMap NgramsTerm (Set NodeId)
55 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
56 groupContextsByNgramsWith f' m'' =
57 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
60 ------------------------------------------------------------------------
61 getContextsByNgramsUser :: HasDBid NodeType
64 -> Cmd err (HashMap NgramsTerm (Set ContextId))
65 getContextsByNgramsUser cId nt =
66 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
67 <$> selectNgramsByContextUser cId nt
70 selectNgramsByContextUser :: HasDBid NodeType
73 -> Cmd err [(NodeId, Text)]
74 selectNgramsByContextUser cId' nt' =
75 runPGSQuery queryNgramsByContextUser
79 -- , 100 :: Int -- limit
80 -- , 0 :: Int -- offset
83 queryNgramsByContextUser :: DPS.Query
84 queryNgramsByContextUser = [sql|
85 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
86 JOIN ngrams ng ON cng.ngrams_id = ng.id
87 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
88 JOIN contexts c ON nc.context_id = c.id
89 WHERE nc.node_id = ? -- CorpusId
90 AND c.typename = ? -- toDBid
91 AND cng.ngrams_type = ? -- NgramsTypeId
92 AND nc.category > 0 -- is not in Trash
93 GROUP BY cng.context_id, ng.terms
97 ------------------------------------------------------------------------
98 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
103 -> Cmd err (HashMap NgramsTerm Int)
104 getOccByNgramsOnlyFast_withSample cId int nt ngs =
105 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
108 getOccByNgramsOnlyFast :: CorpusId
111 -> Cmd err (HashMap NgramsTerm Int)
112 getOccByNgramsOnlyFast cId lId nt = do
113 HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
119 -> Cmd err [(Text, Double)]
120 run cId' lId' nt' = runPGSQuery query
133 -- , ns.ngrams_type_id
135 JOIN node_stories ns ON ng.id = ns.ngrams_id
136 JOIN node_node_ngrams nng ON ns.node_id = nng.node2_id
137 WHERE nng.node1_id = ?
139 AND nng.ngrams_type = ?
140 AND nng.ngrams_id = ng.id
141 AND nng.ngrams_type = ns.ngrams_type_id
152 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
157 -> Cmd err [(NgramsTerm, Int)]
158 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
159 fmap (first NgramsTerm) <$>
160 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
162 , toDBid NodeDocument
164 , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
169 fields = [QualifiedIdentifier Nothing "text"]
171 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
172 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
173 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
174 JOIN nodes_contexts nn ON n.id = nn.context_id
177 input_rows(terms) AS (?)
178 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
179 JOIN ngrams ng ON cng.ngrams_id = ng.id
180 JOIN input_rows ir ON ir.terms = ng.terms
181 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
182 JOIN nodes_sample n ON nn.context_id = n.id
183 WHERE nn.node_id = ? -- CorpusId
184 AND cng.ngrams_type = ? -- NgramsTypeId
186 GROUP BY cng.node_id, ng.terms
190 ------------------------------------------------------------------------
191 getContextsByNgramsOnlyUser :: HasDBid NodeType
196 -> Cmd err (HashMap NgramsTerm (Set NodeId))
197 getContextsByNgramsOnlyUser cId ls nt ngs =
199 . map (HM.fromListWith (<>)
200 . map (second Set.singleton))
201 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
202 (splitEvery 1000 ngs)
204 getNgramsByContextOnlyUser :: HasDBid NodeType
209 -> Cmd err (Map NodeId (Set NgramsTerm))
210 getNgramsByContextOnlyUser cId ls nt ngs =
212 . map ( Map.fromListWith (<>)
213 . map (second Set.singleton)
216 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
217 (splitEvery 1000 ngs)
219 ------------------------------------------------------------------------
220 selectNgramsOnlyByContextUser :: HasDBid NodeType
225 -> Cmd err [(NgramsTerm, ContextId)]
226 selectNgramsOnlyByContextUser cId ls nt tms =
227 fmap (first NgramsTerm) <$>
228 runPGSQuery queryNgramsOnlyByContextUser
229 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
230 , Values [QualifiedIdentifier Nothing "int4"]
231 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
233 , toDBid NodeDocument
237 fields = [QualifiedIdentifier Nothing "text"]
239 queryNgramsOnlyByContextUser :: DPS.Query
240 queryNgramsOnlyByContextUser = [sql|
241 WITH input_rows(terms) AS (?),
242 input_list(id) AS (?)
243 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
244 JOIN ngrams ng ON cng.ngrams_id = ng.id
245 JOIN input_rows ir ON ir.terms = ng.terms
246 JOIN input_list il ON il.id = cng.node_id
247 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
248 JOIN contexts c ON nc.context_id = c.id
249 WHERE nc.node_id = ? -- CorpusId
250 AND c.typename = ? -- toDBid (maybe not useful with context table)
251 AND cng.ngrams_type = ? -- NgramsTypeId
253 GROUP BY ng.terms, cng.context_id
256 getNgramsByDocOnlyUser :: DocId
260 -> Cmd err (HashMap NgramsTerm (Set NodeId))
261 getNgramsByDocOnlyUser cId ls nt ngs =
263 . map (HM.fromListWith (<>) . map (second Set.singleton))
264 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
267 selectNgramsOnlyByDocUser :: DocId
271 -> Cmd err [(NgramsTerm, NodeId)]
272 selectNgramsOnlyByDocUser dId ls nt tms =
273 fmap (first NgramsTerm) <$>
274 runPGSQuery queryNgramsOnlyByDocUser
275 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
276 , Values [QualifiedIdentifier Nothing "int4"]
277 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
282 fields = [QualifiedIdentifier Nothing "text"]
285 queryNgramsOnlyByDocUser :: DPS.Query
286 queryNgramsOnlyByDocUser = [sql|
287 WITH input_rows(terms) AS (?),
288 input_list(id) AS (?)
289 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
290 JOIN ngrams ng ON cng.ngrams_id = ng.id
291 JOIN input_rows ir ON ir.terms = ng.terms
292 JOIN input_list il ON il.id = cng.context_id
293 WHERE cng.node_id = ? -- DocId
294 AND cng.ngrams_type = ? -- NgramsTypeId
295 GROUP BY ng.terms, cng.node_id
298 ------------------------------------------------------------------------
299 -- | TODO filter by language, database, any social field
300 getContextsByNgramsMaster :: HasDBid NodeType
303 -> Cmd err (HashMap Text (Set NodeId))
304 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
305 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
306 -- . takeWhile (not . List.null)
307 -- . takeWhile (\l -> List.length l > 3)
308 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
310 selectNgramsByContextMaster :: HasDBid NodeType
315 -> Cmd err [(NodeId, Text)]
316 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
317 queryNgramsByContextMaster'
319 , ngramsTypeId NgramsTerms
320 , toDBid NodeDocument
322 , toDBid NodeDocument
326 , toDBid NodeDocument
327 , ngramsTypeId NgramsTerms
330 -- | TODO fix context_node_ngrams relation
331 queryNgramsByContextMaster' :: DPS.Query
332 queryNgramsByContextMaster' = [sql|
333 WITH contextsByNgramsUser AS (
335 SELECT n.id, ng.terms FROM contexts n
336 JOIN nodes_contexts nn ON n.id = nn.context_id
337 JOIN context_node_ngrams cng ON cng.context_id = n.id
338 JOIN ngrams ng ON cng.ngrams_id = ng.id
339 WHERE nn.node_id = ? -- UserCorpusId
340 -- AND n.typename = ? -- toDBid
341 AND cng.ngrams_type = ? -- NgramsTypeId
343 AND node_pos(n.id,?) >= ?
344 AND node_pos(n.id,?) < ?
345 GROUP BY n.id, ng.terms
349 contextsByNgramsMaster AS (
351 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
352 JOIN context_node_ngrams cng ON n.id = cng.context_id
353 JOIN ngrams ng ON ng.id = cng.ngrams_id
355 WHERE n.parent_id = ? -- Master Corpus toDBid
356 AND n.typename = ? -- toDBid
357 AND cng.ngrams_type = ? -- NgramsTypeId
358 GROUP BY n.id, ng.terms
361 SELECT m.id, m.terms FROM nodesByNgramsMaster m
362 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id