2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByContext
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.HashMap.Strict (HashMap)
23 import Data.Maybe (catMaybes)
25 import Data.Text (Text)
26 import Data.Tuple.Extra (first, second, swap)
27 import Database.PostgreSQL.Simple.SqlQQ (sql)
28 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
29 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
30 import Gargantext.Core
31 import Gargantext.Data.HashMap.Strict.Utils as HM
32 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
33 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
34 import Gargantext.Database.Query.Table.Ngrams (selectNgramsId)
35 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..), NgramsId)
36 import Gargantext.Prelude
37 import qualified Data.HashMap.Strict as HM
38 import qualified Data.Map as Map
39 import qualified Data.Set as Set
40 import qualified Database.PostgreSQL.Simple as DPS
42 -- | fst is size of Supra Corpus
43 -- snd is Texts and size of Occurrences (different docs)
45 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
46 -> HashMap NgramsTerm (Set ContextId)
47 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
48 countContextsByNgramsWith f m = (total, m')
50 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
51 m' = HM.map ( swap . second (fromIntegral . Set.size))
52 $ groupContextsByNgramsWith f m
55 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
56 -> HashMap NgramsTerm (Set NodeId)
57 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
58 groupContextsByNgramsWith f' m'' =
59 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
62 ------------------------------------------------------------------------
63 getContextsByNgramsUser :: HasDBid NodeType
66 -> Cmd err (HashMap NgramsTerm (Set ContextId))
67 getContextsByNgramsUser cId nt =
68 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
69 <$> selectNgramsByContextUser cId nt
72 selectNgramsByContextUser :: HasDBid NodeType
75 -> Cmd err [(NodeId, Text)]
76 selectNgramsByContextUser cId' nt' =
77 runPGSQuery queryNgramsByContextUser
81 -- , 100 :: Int -- limit
82 -- , 0 :: Int -- offset
85 queryNgramsByContextUser :: DPS.Query
86 queryNgramsByContextUser = [sql|
87 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
88 JOIN ngrams ng ON cng.ngrams_id = ng.id
89 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
90 JOIN contexts c ON nc.context_id = c.id
91 WHERE nc.node_id = ? -- CorpusId
92 AND c.typename = ? -- toDBid
93 AND cng.ngrams_type = ? -- NgramsTypeId
94 AND nc.category > 0 -- is not in Trash
95 GROUP BY cng.context_id, ng.terms
99 ------------------------------------------------------------------------
100 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
105 -> Cmd err (HashMap NgramsTerm Int)
106 getOccByNgramsOnlyFast_withSample cId int nt ngs =
107 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
110 getOccByNgramsOnlyFast' :: CorpusId
114 -> Cmd err (HashMap NgramsTerm Int)
115 getOccByNgramsOnlyFast' cId lId nt tms = do -- trace (show (cId, lId)) $
116 mapNgramsIds <- selectNgramsId $ map unNgramsTerm tms
117 HM.fromListWith (+) <$> catMaybes
118 <$> map (\(nId, s) -> (,) <$> (NgramsTerm <$> (Map.lookup nId mapNgramsIds)) <*> (Just $ round s) )
119 <$> run cId lId nt (Map.keys mapNgramsIds)
126 -> Cmd err [(NgramsId, Double)]
127 run cId' lId' nt' tms' = runPGSQuery query
128 ( Values fields ((DPS.Only) <$> tms')
133 fields = [QualifiedIdentifier Nothing "int4"]
138 WITH input_ngrams(id) AS (?)
140 SELECT ngi.id, nng.weight FROM nodes_contexts nc
141 JOIN node_node_ngrams nng ON nng.node1_id = nc.node_id
142 JOIN input_ngrams ngi ON nng.ngrams_id = ngi.id
143 WHERE nng.node1_id = ?
145 AND nng.ngrams_type = ?
147 GROUP BY ngi.id, nng.weight
154 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
159 -> Cmd err [(NgramsTerm, Int)]
160 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
161 fmap (first NgramsTerm) <$>
162 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
164 , toDBid NodeDocument
166 , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
171 fields = [QualifiedIdentifier Nothing "text"]
173 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
174 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
175 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
176 JOIN nodes_contexts nn ON n.id = nn.context_id
179 input_rows(terms) AS (?)
180 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
181 JOIN ngrams ng ON cng.ngrams_id = ng.id
182 JOIN input_rows ir ON ir.terms = ng.terms
183 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
184 JOIN nodes_sample n ON nn.context_id = n.id
185 WHERE nn.node_id = ? -- CorpusId
186 AND cng.ngrams_type = ? -- NgramsTypeId
188 GROUP BY cng.node_id, ng.terms
192 ------------------------------------------------------------------------
193 getContextsByNgramsOnlyUser :: HasDBid NodeType
198 -> Cmd err (HashMap NgramsTerm (Set NodeId))
199 getContextsByNgramsOnlyUser cId ls nt ngs =
201 . map (HM.fromListWith (<>)
202 . map (second Set.singleton))
203 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
204 (splitEvery 1000 ngs)
206 getNgramsByContextOnlyUser :: HasDBid NodeType
211 -> Cmd err (Map NodeId (Set NgramsTerm))
212 getNgramsByContextOnlyUser cId ls nt ngs =
214 . map ( Map.fromListWith (<>)
215 . map (second Set.singleton)
218 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
219 (splitEvery 1000 ngs)
221 ------------------------------------------------------------------------
222 selectNgramsOnlyByContextUser :: HasDBid NodeType
227 -> Cmd err [(NgramsTerm, ContextId)]
228 selectNgramsOnlyByContextUser cId ls nt tms =
229 fmap (first NgramsTerm) <$>
230 runPGSQuery queryNgramsOnlyByContextUser
231 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
232 , Values [QualifiedIdentifier Nothing "int4"]
233 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
235 , toDBid NodeDocument
239 fields = [QualifiedIdentifier Nothing "text"]
241 queryNgramsOnlyByContextUser :: DPS.Query
242 queryNgramsOnlyByContextUser = [sql|
243 WITH input_rows(terms) AS (?),
244 input_list(id) AS (?)
245 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
246 JOIN ngrams ng ON cng.ngrams_id = ng.id
247 JOIN input_rows ir ON ir.terms = ng.terms
248 JOIN input_list il ON il.id = cng.node_id
249 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
250 JOIN contexts c ON nc.context_id = c.id
251 WHERE nc.node_id = ? -- CorpusId
252 AND c.typename = ? -- toDBid (maybe not useful with context table)
253 AND cng.ngrams_type = ? -- NgramsTypeId
255 GROUP BY ng.terms, cng.context_id
258 getNgramsByDocOnlyUser :: DocId
262 -> Cmd err (HashMap NgramsTerm (Set NodeId))
263 getNgramsByDocOnlyUser cId ls nt ngs =
265 . map (HM.fromListWith (<>) . map (second Set.singleton))
266 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
269 selectNgramsOnlyByDocUser :: DocId
273 -> Cmd err [(NgramsTerm, NodeId)]
274 selectNgramsOnlyByDocUser dId ls nt tms =
275 fmap (first NgramsTerm) <$>
276 runPGSQuery queryNgramsOnlyByDocUser
277 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
278 , Values [QualifiedIdentifier Nothing "int4"]
279 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
284 fields = [QualifiedIdentifier Nothing "text"]
287 queryNgramsOnlyByDocUser :: DPS.Query
288 queryNgramsOnlyByDocUser = [sql|
289 WITH input_rows(terms) AS (?),
290 input_list(id) AS (?)
291 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
292 JOIN ngrams ng ON cng.ngrams_id = ng.id
293 JOIN input_rows ir ON ir.terms = ng.terms
294 JOIN input_list il ON il.id = cng.context_id
295 WHERE cng.node_id = ? -- DocId
296 AND cng.ngrams_type = ? -- NgramsTypeId
297 GROUP BY ng.terms, cng.node_id
300 ------------------------------------------------------------------------
301 -- | TODO filter by language, database, any social field
302 getContextsByNgramsMaster :: HasDBid NodeType
305 -> Cmd err (HashMap Text (Set NodeId))
306 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
307 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
308 -- . takeWhile (not . List.null)
309 -- . takeWhile (\l -> List.length l > 3)
310 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
312 selectNgramsByContextMaster :: HasDBid NodeType
317 -> Cmd err [(NodeId, Text)]
318 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
319 queryNgramsByContextMaster'
321 , ngramsTypeId NgramsTerms
322 , toDBid NodeDocument
324 , toDBid NodeDocument
328 , toDBid NodeDocument
329 , ngramsTypeId NgramsTerms
332 -- | TODO fix context_node_ngrams relation
333 queryNgramsByContextMaster' :: DPS.Query
334 queryNgramsByContextMaster' = [sql|
335 WITH contextsByNgramsUser AS (
337 SELECT n.id, ng.terms FROM contexts n
338 JOIN nodes_contexts nn ON n.id = nn.context_id
339 JOIN context_node_ngrams cng ON cng.context_id = n.id
340 JOIN ngrams ng ON cng.ngrams_id = ng.id
341 WHERE nn.node_id = ? -- UserCorpusId
342 -- AND n.typename = ? -- toDBid
343 AND cng.ngrams_type = ? -- NgramsTypeId
345 AND node_pos(n.id,?) >= ?
346 AND node_pos(n.id,?) < ?
347 GROUP BY n.id, ng.terms
351 contextsByNgramsMaster AS (
353 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
354 JOIN context_node_ngrams cng ON n.id = cng.context_id
355 JOIN ngrams ng ON ng.id = cng.ngrams_id
357 WHERE n.parent_id = ? -- Master Corpus toDBid
358 AND n.typename = ? -- toDBid
359 AND cng.ngrams_type = ? -- NgramsTypeId
360 GROUP BY n.id, ng.terms
363 SELECT m.id, m.terms FROM nodesByNgramsMaster m
364 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id