]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
Merge branch 'dev' into 151-dev-pubmed-api-key
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByContext.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByContext
17 where
18
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map (Map)
23 import Data.Set (Set)
24 import Data.Text (Text)
25 import Data.Tuple.Extra (first, second, swap)
26 import Database.PostgreSQL.Simple.SqlQQ (sql)
27 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
28 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
29 import Gargantext.Core
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Database.PostgreSQL.Simple as DPS
39 import qualified Database.PostgreSQL.Simple.Types as DPST
40
41 -- | fst is size of Supra Corpus
42 -- snd is Texts and size of Occurrences (different docs)
43
44 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
45 -> HashMap NgramsTerm (Set ContextId)
46 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
47 countContextsByNgramsWith f m = (total, m')
48 where
49 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
50 m' = HM.map ( swap . second (fromIntegral . Set.size))
51 $ groupContextsByNgramsWith f m
52
53
54 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
55 -> HashMap NgramsTerm (Set NodeId)
56 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
57 groupContextsByNgramsWith f' m'' =
58 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
59 $ HM.toList m''
60
61 ------------------------------------------------------------------------
62 getContextsByNgramsUser :: HasDBid NodeType
63 => CorpusId
64 -> NgramsType
65 -> Cmd err (HashMap NgramsTerm (Set ContextId))
66 getContextsByNgramsUser cId nt =
67 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
68 <$> selectNgramsByContextUser cId nt
69 where
70
71 selectNgramsByContextUser :: HasDBid NodeType
72 => CorpusId
73 -> NgramsType
74 -> Cmd err [(NodeId, Text)]
75 selectNgramsByContextUser cId' nt' =
76 runPGSQuery queryNgramsByContextUser
77 ( cId'
78 , toDBid NodeDocument
79 , ngramsTypeId nt'
80 -- , 100 :: Int -- limit
81 -- , 0 :: Int -- offset
82 )
83
84 queryNgramsByContextUser :: DPS.Query
85 queryNgramsByContextUser = [sql|
86 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
87 JOIN ngrams ng ON cng.ngrams_id = ng.id
88 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
89 JOIN contexts c ON nc.context_id = c.id
90 WHERE nc.node_id = ? -- CorpusId
91 AND c.typename = ? -- toDBid
92 AND cng.ngrams_type = ? -- NgramsTypeId
93 AND nc.category > 0 -- is not in Trash
94 GROUP BY cng.context_id, ng.terms
95 |]
96
97
98 ------------------------------------------------------------------------
99 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
100 => CorpusId
101 -> Int
102 -> NgramsType
103 -> [NgramsTerm]
104 -> Cmd err (HashMap NgramsTerm Int)
105 getOccByNgramsOnlyFast_withSample cId int nt ngs =
106 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
107
108
109 getOccByNgramsOnlyFast :: CorpusId
110 -> ListId
111 -> NgramsType
112 -> Cmd err (HashMap NgramsTerm [ContextId])
113 getOccByNgramsOnlyFast cId lId nt = do
114 --HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
115 HM.fromList <$> map (\(t, ns) -> (NgramsTerm t, NodeId <$> DPST.fromPGArray ns)) <$> run cId lId nt
116 where
117
118 run :: CorpusId
119 -> ListId
120 -> NgramsType
121 -> Cmd err [(Text, DPST.PGArray Int)]
122 run cId' lId' nt' = runPGSQuery query
123 ( cId'
124 , cId'
125 , lId'
126 , ngramsTypeId nt'
127 )
128
129 query :: DPS.Query
130 query = [sql|
131 SELECT ng.terms
132 -- , ng.id
133 --, round(nng.weight)
134 , ARRAY(
135 SELECT DISTINCT context_node_ngrams.context_id
136 FROM context_node_ngrams
137 JOIN nodes_contexts
138 ON context_node_ngrams.context_id = nodes_contexts.context_id
139 WHERE ng.id = context_node_ngrams.ngrams_id
140 AND nodes_contexts.node_id = ?
141 ) AS context_ids
142 -- , ns.version
143 -- , nng.ngrams_type
144 -- , ns.ngrams_type_id
145 FROM ngrams ng
146 JOIN node_stories ns ON ng.id = ns.ngrams_id
147 JOIN node_node_ngrams nng ON ns.node_id = nng.node2_id
148 WHERE nng.node1_id = ?
149 AND nng.node2_id = ?
150 AND nng.ngrams_type = ?
151 AND nng.ngrams_id = ng.id
152 AND nng.ngrams_type = ns.ngrams_type_id
153 ORDER BY ng.id ASC;
154 |]
155
156
157 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
158 => CorpusId
159 -> Int
160 -> NgramsType
161 -> [NgramsTerm]
162 -> Cmd err [(NgramsTerm, Int)]
163 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
164 fmap (first NgramsTerm) <$>
165 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
166 ( int
167 , toDBid NodeDocument
168 , cId
169 , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
170 , cId
171 , ngramsTypeId nt
172 )
173 where
174 fields = [QualifiedIdentifier Nothing "text"]
175
176 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
177 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
178 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
179 JOIN nodes_contexts nn ON n.id = nn.context_id
180 WHERE n.typename = ?
181 AND nn.node_id = ?),
182 input_rows(terms) AS (?)
183 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
184 JOIN ngrams ng ON cng.ngrams_id = ng.id
185 JOIN input_rows ir ON ir.terms = ng.terms
186 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
187 JOIN nodes_sample n ON nn.context_id = n.id
188 WHERE nn.node_id = ? -- CorpusId
189 AND cng.ngrams_type = ? -- NgramsTypeId
190 AND nn.category > 0
191 GROUP BY cng.node_id, ng.terms
192 |]
193
194 selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
195 => CorpusId
196 -> Int
197 -> NgramsType
198 -> Cmd err [(NgramsTerm, Int)]
199 selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
200 fmap (first NgramsTerm) <$>
201 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
202 ( int
203 , toDBid NodeDocument
204 , cId
205 , cId
206 , ngramsTypeId nt
207 )
208
209 queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
210 queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
211 WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
212 JOIN nodes_contexts nc ON c.id = nc.context_id
213 WHERE c.typename = ?
214 AND nc.node_id = ?)
215 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
216 JOIN ngrams ng ON cng.ngrams_id = ng.id
217 JOIN node_stories ns ON ns.ngrams_id = ng.id
218 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
219 JOIN contexts_sample c ON nc.context_id = c.id
220 WHERE nc.node_id = ? -- CorpusId
221 AND cng.ngrams_type = ? -- NgramsTypeId
222 AND nc.category > 0
223 GROUP BY ng.id
224 |]
225
226
227
228
229
230
231
232 ------------------------------------------------------------------------
233 getContextsByNgramsOnlyUser :: HasDBid NodeType
234 => CorpusId
235 -> [ListId]
236 -> NgramsType
237 -> [NgramsTerm]
238 -> Cmd err (HashMap NgramsTerm (Set NodeId))
239 getContextsByNgramsOnlyUser cId ls nt ngs =
240 HM.unionsWith (<>)
241 . map (HM.fromListWith (<>)
242 . map (second Set.singleton))
243 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
244 (splitEvery 1000 ngs)
245
246 getNgramsByContextOnlyUser :: HasDBid NodeType
247 => NodeId
248 -> [ListId]
249 -> NgramsType
250 -> [NgramsTerm]
251 -> Cmd err (Map NodeId (Set NgramsTerm))
252 getNgramsByContextOnlyUser cId ls nt ngs =
253 Map.unionsWith (<>)
254 . map ( Map.fromListWith (<>)
255 . map (second Set.singleton)
256 )
257 . map (map swap)
258 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
259 (splitEvery 1000 ngs)
260
261 ------------------------------------------------------------------------
262 selectNgramsOnlyByContextUser :: HasDBid NodeType
263 => CorpusId
264 -> [ListId]
265 -> NgramsType
266 -> [NgramsTerm]
267 -> Cmd err [(NgramsTerm, ContextId)]
268 selectNgramsOnlyByContextUser cId ls nt tms =
269 fmap (first NgramsTerm) <$>
270 runPGSQuery queryNgramsOnlyByContextUser
271 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
272 , Values [QualifiedIdentifier Nothing "int4"]
273 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
274 , cId
275 , toDBid NodeDocument
276 , ngramsTypeId nt
277 )
278 where
279 fields = [QualifiedIdentifier Nothing "text"]
280
281 queryNgramsOnlyByContextUser :: DPS.Query
282 queryNgramsOnlyByContextUser = [sql|
283 WITH input_rows(terms) AS (?),
284 input_list(id) AS (?)
285 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
286 JOIN ngrams ng ON cng.ngrams_id = ng.id
287 JOIN input_rows ir ON ir.terms = ng.terms
288 JOIN input_list il ON il.id = cng.node_id
289 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
290 JOIN contexts c ON nc.context_id = c.id
291 WHERE nc.node_id = ? -- CorpusId
292 AND c.typename = ? -- toDBid (maybe not useful with context table)
293 AND cng.ngrams_type = ? -- NgramsTypeId
294 AND nc.category > 0
295 GROUP BY ng.terms, cng.context_id
296 |]
297
298 getNgramsByDocOnlyUser :: DocId
299 -> [ListId]
300 -> NgramsType
301 -> [NgramsTerm]
302 -> Cmd err (HashMap NgramsTerm (Set NodeId))
303 getNgramsByDocOnlyUser cId ls nt ngs =
304 HM.unionsWith (<>)
305 . map (HM.fromListWith (<>) . map (second Set.singleton))
306 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
307
308
309 selectNgramsOnlyByDocUser :: DocId
310 -> [ListId]
311 -> NgramsType
312 -> [NgramsTerm]
313 -> Cmd err [(NgramsTerm, NodeId)]
314 selectNgramsOnlyByDocUser dId ls nt tms =
315 fmap (first NgramsTerm) <$>
316 runPGSQuery queryNgramsOnlyByDocUser
317 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
318 , Values [QualifiedIdentifier Nothing "int4"]
319 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
320 , dId
321 , ngramsTypeId nt
322 )
323 where
324 fields = [QualifiedIdentifier Nothing "text"]
325
326
327 queryNgramsOnlyByDocUser :: DPS.Query
328 queryNgramsOnlyByDocUser = [sql|
329 WITH input_rows(terms) AS (?),
330 input_list(id) AS (?)
331 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
332 JOIN ngrams ng ON cng.ngrams_id = ng.id
333 JOIN input_rows ir ON ir.terms = ng.terms
334 JOIN input_list il ON il.id = cng.context_id
335 WHERE cng.node_id = ? -- DocId
336 AND cng.ngrams_type = ? -- NgramsTypeId
337 GROUP BY ng.terms, cng.node_id
338 |]
339
340 ------------------------------------------------------------------------
341 -- | TODO filter by language, database, any social field
342 getContextsByNgramsMaster :: HasDBid NodeType
343 => UserCorpusId
344 -> MasterCorpusId
345 -> Cmd err (HashMap Text (Set NodeId))
346 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
347 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
348 -- . takeWhile (not . List.null)
349 -- . takeWhile (\l -> List.length l > 3)
350 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
351
352 selectNgramsByContextMaster :: HasDBid NodeType
353 => Int
354 -> UserCorpusId
355 -> MasterCorpusId
356 -> Int
357 -> Cmd err [(NodeId, Text)]
358 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
359 queryNgramsByContextMaster'
360 ( ucId
361 , ngramsTypeId NgramsTerms
362 , toDBid NodeDocument
363 , p
364 , toDBid NodeDocument
365 , p
366 , n
367 , mcId
368 , toDBid NodeDocument
369 , ngramsTypeId NgramsTerms
370 )
371
372 -- | TODO fix context_node_ngrams relation
373 queryNgramsByContextMaster' :: DPS.Query
374 queryNgramsByContextMaster' = [sql|
375 WITH contextsByNgramsUser AS (
376
377 SELECT n.id, ng.terms FROM contexts n
378 JOIN nodes_contexts nn ON n.id = nn.context_id
379 JOIN context_node_ngrams cng ON cng.context_id = n.id
380 JOIN ngrams ng ON cng.ngrams_id = ng.id
381 WHERE nn.node_id = ? -- UserCorpusId
382 -- AND n.typename = ? -- toDBid
383 AND cng.ngrams_type = ? -- NgramsTypeId
384 AND nn.category > 0
385 AND node_pos(n.id,?) >= ?
386 AND node_pos(n.id,?) < ?
387 GROUP BY n.id, ng.terms
388
389 ),
390
391 contextsByNgramsMaster AS (
392
393 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
394 JOIN context_node_ngrams cng ON n.id = cng.context_id
395 JOIN ngrams ng ON ng.id = cng.ngrams_id
396
397 WHERE n.parent_id = ? -- Master Corpus toDBid
398 AND n.typename = ? -- toDBid
399 AND cng.ngrams_type = ? -- NgramsTypeId
400 GROUP BY n.id, ng.terms
401 )
402
403 SELECT m.id, m.terms FROM nodesByNgramsMaster m
404 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id
405 |]