2 Module : Gargantext.Database.Action.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByContext
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.HashMap.Strict (HashMap)
23 import Data.Maybe (catMaybes)
25 import Data.Text (Text)
26 import Data.Tuple.Extra (first, second, swap)
27 import Database.PostgreSQL.Simple.SqlQQ (sql)
28 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
29 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
30 import Gargantext.Core
31 import Gargantext.Data.HashMap.Strict.Utils as HM
32 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
33 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
34 import Gargantext.Database.Query.Table.Ngrams (selectNgramsId)
35 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..), NgramsId)
36 import Gargantext.Prelude
37 import qualified Data.HashMap.Strict as HM
38 import qualified Data.Map as Map
39 import qualified Data.Set as Set
40 import qualified Database.PostgreSQL.Simple as DPS
42 -- | fst is size of Supra Corpus
43 -- snd is Texts and size of Occurrences (different docs)
-- The terms of `m` are first regrouped with `f` (groupContextsByNgramsWith).
--   * total : number of distinct context ids over all values of `m`
--             (size of the union of every set), converted to a Double;
--   * m'    : for each group key, the pair (number of distinct contexts of
--             the group, set of the original terms merged under that key).
-- NOTE(review): the embedded numbering jumps from 48 to 50, so the line that
-- introduces `total` and `m'` (presumably a `where`) is not visible here.
45 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
46 -> HashMap NgramsTerm (Set ContextId)
47 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
48 countContextsByNgramsWith f m = (total, m')
50 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
51 m' = HM.map ( swap . second (fromIntegral . Set.size))
52 $ groupContextsByNgramsWith f m
-- Re-keys a term map with f': every original term t is sent to the key f' t,
-- and entries landing on the same key are merged with (<>), collecting both
-- the original terms (as a set) and their associated id sets.
-- NOTE(review): as visible here, the `map` is never applied to a list and m''
-- is unused; the embedded numbering skips 60-61, so the last argument
-- (presumably the key/value pairs of m'') is not shown. Confirm against the
-- full file.
55 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
56 -> HashMap NgramsTerm (Set NodeId)
57 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
58 groupContextsByNgramsWith f' m'' =
59 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
62 ------------------------------------------------------------------------
-- Builds a map from each term to the set of ids returned with it by
-- selectNgramsByContextUser for the given cId and nt. Every row (n, t) becomes
-- (NgramsTerm t, singleton n); rows sharing a term are merged with (<>).
-- NOTE(review): the signature lines numbered 65-66 (the types of cId and nt)
-- are not visible in this excerpt.
64 getContextsByNgramsUser :: HasDBid NodeType
67 -> Cmd err (HashMap NgramsTerm (Set ContextId))
68 getContextsByNgramsUser cId nt =
69 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
70 <$> selectNgramsByContextUser cId nt
-- Runs queryNgramsByContextUser through runPGSQuery and returns its rows as
-- (NodeId, Text) pairs.
-- NOTE(review): the parameter tuple built from cId' and nt' (embedded lines
-- 79-81 and 84) is not visible here; only the commented-out limit/offset
-- entries remain.
73 selectNgramsByContextUser :: HasDBid NodeType
76 -> Cmd err [(NodeId, Text)]
77 selectNgramsByContextUser cId' nt' =
78 runPGSQuery queryNgramsByContextUser
82 -- , 100 :: Int -- limit
83 -- , 0 :: Int -- offset
-- SQL returning the distinct (context_id, terms) pairs of context_node_ngrams.
-- Three positional parameters restrict, in this order, nodes_contexts.node_id,
-- contexts.typename and context_node_ngrams.ngrams_type; rows are additionally
-- kept only when nodes_contexts.category > 0.
-- NOTE(review): the end of the quasi-quote is not visible in this excerpt.
86 queryNgramsByContextUser :: DPS.Query
87 queryNgramsByContextUser = [sql|
88 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
89 JOIN ngrams ng ON cng.ngrams_id = ng.id
90 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
91 JOIN contexts c ON nc.context_id = c.id
92 WHERE nc.node_id = ? -- CorpusId
93 AND c.typename = ? -- toDBid
94 AND cng.ngrams_type = ? -- NgramsTypeId
95 AND nc.category > 0 -- is not in Trash
96 GROUP BY cng.context_id, ng.terms
100 ------------------------------------------------------------------------
-- Collects the (term, count) rows returned by
-- selectNgramsOccurrencesOnlyByContextUser_withSample into a map; counts of
-- rows carrying the same term are added together with (+).
-- NOTE(review): the argument types of the signature (embedded lines 102-105)
-- are not visible in this excerpt.
101 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
106 -> Cmd err (HashMap NgramsTerm Int)
107 getOccByNgramsOnlyFast_withSample cId int nt ngs =
108 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
-- Resolves the given terms to ngrams ids (selectNgramsId), runs the local
-- query `run` on those ids, then maps every returned id back to its term.
-- Rows whose id is absent from the lookup map are dropped (catMaybes), the
-- Double weight is rounded to an Int, and values of identical terms are summed.
-- The local `run` passes the ids as a `Values` list typed "int4".
-- NOTE(review): the embedded numbering has several gaps below (112-114,
-- 121-126, 130-133, 135-138, 140, 145, 147, 149+): part of the signatures,
-- the rest of the parameter tuple and the opening/closing of the SQL
-- quasi-quote are not visible in this excerpt.
111 getOccByNgramsOnlyFast' :: CorpusId
115 -> Cmd err (HashMap NgramsTerm Int)
116 getOccByNgramsOnlyFast' cId lId nt tms = do -- trace (show (cId, lId)) $
117 mapNgramsIds <- selectNgramsId $ map unNgramsTerm tms
118 HM.fromListWith (+) <$> catMaybes
119 <$> map (\(nId, s) -> (,) <$> (NgramsTerm <$> (Map.lookup nId mapNgramsIds)) <*> (Just $ round s) )
120 <$> run cId lId nt (Map.keys mapNgramsIds)
127 -> Cmd err [(NgramsId, Double)]
128 run cId' lId' nt' tms' = runPGSQuery query
129 ( Values fields ((DPS.Only) <$> tms')
134 fields = [QualifiedIdentifier Nothing "int4"]
139 WITH input_ngrams(id) AS (?)
141 SELECT ngi.id, nng.weight FROM nodes_contexts nc
142 JOIN node_node_ngrams nng ON nng.node1_id = nc.node_id
143 JOIN input_ngrams ngi ON nng.ngrams_id = ngi.id
144 WHERE nng.node1_id = ?
146 AND nng.ngrams_type = ?
148 GROUP BY ngi.id, nng.weight
-- Runs queryNgramsOccurrencesOnlyByContextUser_withSample and wraps the first
-- component of every returned row in NgramsTerm. The terms are handed to the
-- query as a `Values` list whose single column is typed "text"; the document
-- node type is passed as `toDBid NodeDocument`.
-- NOTE(review): the remaining tuple entries (embedded lines 164, 166, 168-170)
-- and the argument types of the signature are not visible in this excerpt.
155 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
160 -> Cmd err [(NgramsTerm, Int)]
161 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
162 fmap (first NgramsTerm) <$>
163 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
165 , toDBid NodeDocument
167 , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
172 fields = [QualifiedIdentifier Nothing "text"]
-- SQL counting context_node_ngrams rows per term, for the terms supplied in
-- the input_rows CTE, restricted to the contexts of a nodes_sample CTE that is
-- drawn with TABLESAMPLE SYSTEM_ROWS.
-- NOTE(review): the result is grouped by cng.node_id as well as ng.terms, so
-- the same term may come back on several rows; confirm that callers expect to
-- aggregate them.
-- NOTE(review): the tail of the nodes_sample CTE (embedded lines 178-179) and
-- the end of the quasi-quote are not visible in this excerpt.
174 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
175 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
176 WITH nodes_sample AS (SELECT id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
177 JOIN nodes_contexts nn ON n.id = nn.context_id
180 input_rows(terms) AS (?)
181 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
182 JOIN ngrams ng ON cng.ngrams_id = ng.id
183 JOIN input_rows ir ON ir.terms = ng.terms
184 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
185 JOIN nodes_sample n ON nn.context_id = n.id
186 WHERE nn.node_id = ? -- CorpusId
187 AND cng.ngrams_type = ? -- NgramsTypeId
189 GROUP BY cng.node_id, ng.terms
193 ------------------------------------------------------------------------
-- Queries the terms `ngs` in batches of 1000 (splitEvery 1000) with
-- selectNgramsOnlyByContextUser; each batch of (term, id) rows is turned into
-- a map from term to the set of ids paired with it.
-- NOTE(review): embedded line 202, which presumably combines the per-batch
-- maps into the single returned HashMap, and the argument types of the
-- signature are not visible in this excerpt.
195 getContextsByNgramsOnlyUser :: HasDBid NodeType
200 -> Cmd err (HashMap NgramsTerm (Set NodeId))
201 getContextsByNgramsOnlyUser cId ls nt ngs =
203 . map (HM.fromListWith (<>)
204 . map (second Set.singleton))
205 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
206 (splitEvery 1000 ngs)
-- Queries the terms `ngs` in batches of 1000 (splitEvery 1000) with
-- selectNgramsOnlyByContextUser and builds, per batch, a Map whose values are
-- singleton sets merged with (<>).
-- NOTE(review): embedded lines 215 and 218-219 are not visible. Since the
-- result is keyed by NodeId while the query rows are (term, id), the pairs are
-- presumably swapped on one of the missing lines; confirm against the full file.
208 getNgramsByContextOnlyUser :: HasDBid NodeType
213 -> Cmd err (Map NodeId (Set NgramsTerm))
214 getNgramsByContextOnlyUser cId ls nt ngs =
216 . map ( Map.fromListWith (<>)
217 . map (second Set.singleton)
220 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
221 (splitEvery 1000 ngs)
223 ------------------------------------------------------------------------
224 -- used in G.Core.Text.List
-- Runs queryNgramsOnlyByContextUser and wraps the first component of every
-- returned row in NgramsTerm. The terms are passed as a `Values` list typed
-- "text" and the ids in `ls` (unwrapped from NodeId) as a `Values` list typed
-- "int4"; the document node type is passed as `toDBid NodeDocument`.
-- NOTE(review): the other tuple entries (embedded lines 237, 239-240) and the
-- argument types of the signature are not visible in this excerpt.
225 selectNgramsOnlyByContextUser :: HasDBid NodeType
230 -> Cmd err [(NgramsTerm, ContextId)]
231 selectNgramsOnlyByContextUser cId ls nt tms =
232 fmap (first NgramsTerm) <$>
233 runPGSQuery queryNgramsOnlyByContextUser
234 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
235 , Values [QualifiedIdentifier Nothing "int4"]
236 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
238 , toDBid NodeDocument
242 fields = [QualifiedIdentifier Nothing "text"]
-- SQL returning the distinct (terms, context_id) pairs of context_node_ngrams
-- for the terms listed in input_rows and the node ids listed in input_list
-- (matched against cng.node_id), further filtered by nodes_contexts.node_id,
-- contexts.typename and cng.ngrams_type.
-- NOTE(review): embedded line 257 and the end of the quasi-quote are not
-- visible in this excerpt.
244 queryNgramsOnlyByContextUser :: DPS.Query
245 queryNgramsOnlyByContextUser = [sql|
246 WITH input_rows(terms) AS (?),
247 input_list(id) AS (?)
248 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
249 JOIN ngrams ng ON cng.ngrams_id = ng.id
250 JOIN input_rows ir ON ir.terms = ng.terms
251 JOIN input_list il ON il.id = cng.node_id
252 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
253 JOIN contexts c ON nc.context_id = c.id
254 WHERE nc.node_id = ? -- CorpusId
255 AND c.typename = ? -- toDBid (maybe not useful with context table)
256 AND cng.ngrams_type = ? -- NgramsTypeId
258 GROUP BY ng.terms, cng.context_id
-- Queries the terms `ngs` in batches of 1000 (splitEvery 1000) with
-- selectNgramsOnlyByDocUser; each batch of (term, id) rows is turned into a
-- map from term to the set of ids paired with it.
-- NOTE(review): embedded line 267, which presumably combines the per-batch
-- maps into the single returned HashMap, and part of the signature are not
-- visible in this excerpt.
261 getNgramsByDocOnlyUser :: DocId
265 -> Cmd err (HashMap NgramsTerm (Set NodeId))
266 getNgramsByDocOnlyUser cId ls nt ngs =
268 . map (HM.fromListWith (<>) . map (second Set.singleton))
269 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
-- Runs queryNgramsOnlyByDocUser and wraps the first component of every
-- returned row in NgramsTerm. The terms are passed as a `Values` list typed
-- "text" and the ids in `ls` (unwrapped from NodeId) as a `Values` list typed
-- "int4".
-- NOTE(review): the remaining tuple entries (embedded lines 283-285) and part
-- of the signature are not visible in this excerpt.
272 selectNgramsOnlyByDocUser :: DocId
276 -> Cmd err [(NgramsTerm, NodeId)]
277 selectNgramsOnlyByDocUser dId ls nt tms =
278 fmap (first NgramsTerm) <$>
279 runPGSQuery queryNgramsOnlyByDocUser
280 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
281 , Values [QualifiedIdentifier Nothing "int4"]
282 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
287 fields = [QualifiedIdentifier Nothing "text"]
-- SQL returning the distinct (terms, node_id) pairs of context_node_ngrams for
-- the terms listed in input_rows, with the ids of input_list matched against
-- cng.context_id and two positional filters on cng.node_id and
-- cng.ngrams_type.
-- NOTE(review): here input_list is joined on cng.context_id and the single id
-- filters cng.node_id, whereas queryNgramsOnlyByContextUser joins input_list
-- on cng.node_id. Confirm the two columns are not swapped in this query.
-- NOTE(review): the end of the quasi-quote is not visible in this excerpt.
290 queryNgramsOnlyByDocUser :: DPS.Query
291 queryNgramsOnlyByDocUser = [sql|
292 WITH input_rows(terms) AS (?),
293 input_list(id) AS (?)
294 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
295 JOIN ngrams ng ON cng.ngrams_id = ng.id
296 JOIN input_rows ir ON ir.terms = ng.terms
297 JOIN input_list il ON il.id = cng.context_id
298 WHERE cng.node_id = ? -- DocId
299 AND cng.ngrams_type = ? -- NgramsTypeId
300 GROUP BY ng.terms, cng.node_id
303 ------------------------------------------------------------------------
304 -- | TODO filter by language, database, any social field
-- Calls selectNgramsByContextMaster once per value of [0,500..10000], always
-- with 1000 as first argument, converts every result into a map from term to
-- the set of ids returned with it, and merges all maps with unionsWith (<>).
-- NOTE(review): the argument types of the signature (embedded lines 306-307)
-- are not visible in this excerpt.
305 getContextsByNgramsMaster :: HasDBid NodeType
308 -> Cmd err (HashMap Text (Set NodeId))
309 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
310 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
311 -- . takeWhile (not . List.null)
312 -- . takeWhile (\l -> List.length l > 3)
313 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
-- Runs queryNgramsByContextMaster' through runPGSQuery and returns its rows as
-- (NodeId, Text) pairs.
-- NOTE(review): only five entries of the parameter tuple are visible here
-- (embedded lines 323, 326 and 328-330 are missing), so how n, ucId, mcId and
-- p are bound to the query placeholders cannot be checked from this excerpt.
315 selectNgramsByContextMaster :: HasDBid NodeType
320 -> Cmd err [(NodeId, Text)]
321 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
322 queryNgramsByContextMaster'
324 , ngramsTypeId NgramsTerms
325 , toDBid NodeDocument
327 , toDBid NodeDocument
331 , toDBid NodeDocument
332 , ngramsTypeId NgramsTerms
335 -- | TODO fix context_node_ngrams relation
-- SQL made of two CTEs, contextsByNgramsUser (contexts attached to a node id,
-- windowed with node_pos) and contextsByNgramsMaster (contexts sampled with
-- TABLESAMPLE SYSTEM_ROWS under a given parent_id), followed by a RIGHT JOIN
-- of the two on the context id.
-- NOTE(review): the final SELECT reads from `nodesByNgramsMaster`, but the CTE
-- declared above is named `contextsByNgramsMaster`. Unless a relation of that
-- name exists in the schema, this query fails at run time; confirm and align
-- the names.
-- NOTE(review): the end of the quasi-quote lies beyond the visible excerpt.
336 queryNgramsByContextMaster' :: DPS.Query
337 queryNgramsByContextMaster' = [sql|
338 WITH contextsByNgramsUser AS (
340 SELECT n.id, ng.terms FROM contexts n
341 JOIN nodes_contexts nn ON n.id = nn.context_id
342 JOIN context_node_ngrams cng ON cng.context_id = n.id
343 JOIN ngrams ng ON cng.ngrams_id = ng.id
344 WHERE nn.node_id = ? -- UserCorpusId
345 -- AND n.typename = ? -- toDBid
346 AND cng.ngrams_type = ? -- NgramsTypeId
348 AND node_pos(n.id,?) >= ?
349 AND node_pos(n.id,?) < ?
350 GROUP BY n.id, ng.terms
354 contextsByNgramsMaster AS (
356 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
357 JOIN context_node_ngrams cng ON n.id = cng.context_id
358 JOIN ngrams ng ON ng.id = cng.ngrams_id
360 WHERE n.parent_id = ? -- Master Corpus toDBid
361 AND n.typename = ? -- toDBid
362 AND cng.ngrams_type = ? -- NgramsTypeId
363 GROUP BY n.id, ng.terms
366 SELECT m.id, m.terms FROM nodesByNgramsMaster m
367 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id