2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByContext
19 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
20 import Data.HashMap.Strict (HashMap)
23 import Data.Text (Text)
24 import Data.Tuple.Extra (first, second, swap)
25 import Database.PostgreSQL.Simple.SqlQQ (sql)
26 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
27 -- import Debug.Trace (trace)
28 import Gargantext.Core
29 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Database.PostgreSQL.Simple as DPS
40 -- | fst is size of Supra Corpus
41 -- snd is Texts and size of Occurrences (different docs)
43 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
44 -> HashMap NgramsTerm (Set ContextId)
45 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
46 countContextsByNgramsWith f m = (total, m')
48 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
49 m' = HM.map ( swap . second (fromIntegral . Set.size))
50 $ groupContextsByNgramsWith f m
53 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
54 -> HashMap NgramsTerm (Set NodeId)
55 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
56 groupContextsByNgramsWith f' m'' =
57 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
60 ------------------------------------------------------------------------
62 getContextsByNgramsUser :: HasDBid NodeType
65 -> Cmd err (HashMap NgramsTerm (Set ContextId))
66 getContextsByNgramsUser cId nt =
67 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
68 <$> selectNgramsByContextUser cId nt
71 selectNgramsByContextUser :: HasDBid NodeType
74 -> Cmd err [(NodeId, Text)]
75 selectNgramsByContextUser cId' nt' =
76 runPGSQuery queryNgramsByContextUser
80 -- , 100 :: Int -- limit
81 -- , 0 :: Int -- offset
84 queryNgramsByContextUser :: DPS.Query
85 queryNgramsByContextUser = [sql|
86 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
87 JOIN ngrams ng ON cng.ngrams_id = ng.id
88 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
89 JOIN contexts c ON nc.context_id = c.id
90 WHERE nc.node_id = ? -- CorpusId
91 AND c.typename = ? -- toDBid
92 AND cng.ngrams_type = ? -- NgramsTypeId
93 AND nc.category > 0 -- is not in Trash
94 GROUP BY cng.context_id, ng.terms
98 ------------------------------------------------------------------------
99 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
104 -> Cmd err (HashMap NgramsTerm Int)
105 getOccByNgramsOnlyFast_withSample cId int nt ngs =
106 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
109 getOccByNgramsOnlyFast' :: CorpusId
113 -> Cmd err (HashMap NgramsTerm Int)
114 getOccByNgramsOnlyFast' cId lId nt tms = -- trace (show (cId, lId)) $
115 HM.fromListWith (+) <$> map (second round) <$> run cId lId nt tms
118 fields = [QualifiedIdentifier Nothing "text"]
124 -> Cmd err [(NgramsTerm, Double)]
125 run cId' lId' nt' tms' = map (first NgramsTerm) <$> runPGSQuery query
126 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms')
134 WITH input_rows(terms) AS (?)
135 SELECT ng.terms, nng.weight FROM nodes_contexts nc
136 JOIN node_node_ngrams nng ON nng.node1_id = nc.node_id
137 JOIN ngrams ng ON nng.ngrams_id = ng.id
138 JOIN input_rows ir ON ir.terms = ng.terms
139 WHERE nng.node1_id = ? -- CorpusId
140 AND nng.node2_id = ? -- ListId
141 AND nng.ngrams_type = ? -- NgramsTypeId
142 AND nc.category > 0 -- Not trash
143 GROUP BY ng.terms, nng.weight
147 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
152 -> Cmd err [(NgramsTerm, Int)]
153 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
154 fmap (first NgramsTerm) <$>
155 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
157 , toDBid NodeDocument
159 , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
164 fields = [QualifiedIdentifier Nothing "text"]
166 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
167 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
168 WITH nodes_sample AS (SELECT id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
169 JOIN nodes_contexts nn ON n.id = nn.context_id
172 input_rows(terms) AS (?)
173 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
174 JOIN ngrams ng ON cng.ngrams_id = ng.id
175 JOIN input_rows ir ON ir.terms = ng.terms
176 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
177 JOIN nodes_sample n ON nn.context_id = n.id
178 WHERE nn.node_id = ? -- CorpusId
179 AND cng.ngrams_type = ? -- NgramsTypeId
181 GROUP BY cng.node_id, ng.terms
185 ------------------------------------------------------------------------
187 getContextsByNgramsOnlyUser :: HasDBid NodeType
192 -> Cmd err (HashMap NgramsTerm (Set NodeId))
193 getContextsByNgramsOnlyUser cId ls nt ngs =
195 . map (HM.fromListWith (<>)
196 . map (second Set.singleton))
197 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
198 (splitEvery 1000 ngs)
200 getNgramsByContextOnlyUser :: HasDBid NodeType
205 -> Cmd err (Map NodeId (Set NgramsTerm))
206 getNgramsByContextOnlyUser cId ls nt ngs =
208 . map ( Map.fromListWith (<>)
209 . map (second Set.singleton)
212 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
213 (splitEvery 1000 ngs)
215 ------------------------------------------------------------------------
216 -- used in G.Core.Text.List
217 selectNgramsOnlyByContextUser :: HasDBid NodeType
222 -> Cmd err [(NgramsTerm, ContextId)]
223 selectNgramsOnlyByContextUser cId ls nt tms =
224 fmap (first NgramsTerm) <$>
225 runPGSQuery queryNgramsOnlyByContextUser
226 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
227 , Values [QualifiedIdentifier Nothing "int4"]
228 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
230 , toDBid NodeDocument
234 fields = [QualifiedIdentifier Nothing "text"]
236 queryNgramsOnlyByContextUser :: DPS.Query
237 queryNgramsOnlyByContextUser = [sql|
238 WITH input_rows(terms) AS (?),
239 input_list(id) AS (?)
240 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
241 JOIN ngrams ng ON cng.ngrams_id = ng.id
242 JOIN input_rows ir ON ir.terms = ng.terms
243 JOIN input_list il ON il.id = cng.node_id
244 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
245 JOIN contexts c ON nc.context_id = c.id
246 WHERE nc.node_id = ? -- CorpusId
247 AND c.typename = ? -- toDBid (maybe not useful with context table)
248 AND cng.ngrams_type = ? -- NgramsTypeId
250 GROUP BY ng.terms, cng.context_id
253 getNgramsByDocOnlyUser :: DocId
257 -> Cmd err (HashMap NgramsTerm (Set NodeId))
258 getNgramsByDocOnlyUser cId ls nt ngs =
260 . map (HM.fromListWith (<>) . map (second Set.singleton))
261 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
264 selectNgramsOnlyByDocUser :: DocId
268 -> Cmd err [(NgramsTerm, NodeId)]
269 selectNgramsOnlyByDocUser dId ls nt tms =
270 fmap (first NgramsTerm) <$>
271 runPGSQuery queryNgramsOnlyByDocUser
272 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
273 , Values [QualifiedIdentifier Nothing "int4"]
274 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
279 fields = [QualifiedIdentifier Nothing "text"]
282 queryNgramsOnlyByDocUser :: DPS.Query
283 queryNgramsOnlyByDocUser = [sql|
284 WITH input_rows(terms) AS (?),
285 input_list(id) AS (?)
286 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
287 JOIN ngrams ng ON cng.ngrams_id = ng.id
288 JOIN input_rows ir ON ir.terms = ng.terms
289 JOIN input_list il ON il.id = cng.context_id
290 WHERE cng.node_id = ? -- DocId
291 AND cng.ngrams_type = ? -- NgramsTypeId
292 GROUP BY ng.terms, cng.node_id
295 ------------------------------------------------------------------------
296 -- | TODO filter by language, database, any social field
297 getContextsByNgramsMaster :: HasDBid NodeType
300 -> Cmd err (HashMap Text (Set NodeId))
301 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
302 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
303 -- . takeWhile (not . List.null)
304 -- . takeWhile (\l -> List.length l > 3)
305 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
307 selectNgramsByContextMaster :: HasDBid NodeType
312 -> Cmd err [(NodeId, Text)]
313 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
314 queryNgramsByContextMaster'
316 , ngramsTypeId NgramsTerms
317 , toDBid NodeDocument
319 , toDBid NodeDocument
323 , toDBid NodeDocument
324 , ngramsTypeId NgramsTerms
327 -- | TODO fix context_node_ngrams relation
328 queryNgramsByContextMaster' :: DPS.Query
329 queryNgramsByContextMaster' = [sql|
330 WITH contextsByNgramsUser AS (
332 SELECT n.id, ng.terms FROM contexts n
333 JOIN nodes_contexts nn ON n.id = nn.context_id
334 JOIN context_node_ngrams cng ON cng.context_id = n.id
335 JOIN ngrams ng ON cng.ngrams_id = ng.id
336 WHERE nn.node_id = ? -- UserCorpusId
337 -- AND n.typename = ? -- toDBid
338 AND cng.ngrams_type = ? -- NgramsTypeId
340 AND node_pos(n.id,?) >= ?
341 AND node_pos(n.id,?) < ?
342 GROUP BY n.id, ng.terms
346 contextsByNgramsMaster AS (
348 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
349 JOIN context_node_ngrams cng ON n.id = cng.context_id
350 JOIN ngrams ng ON ng.id = cng.ngrams_id
352 WHERE n.parent_id = ? -- Master Corpus toDBid
353 AND n.typename = ? -- toDBid
354 AND cng.ngrams_type = ? -- NgramsTypeId
355 GROUP BY n.id, ng.terms
358 SELECT m.id, m.terms FROM nodesByNgramsMaster m
359 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id