]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
Merge remote-tracking branch 'origin/adinapoli/issue-188' into dev
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByContext.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByContext
17 where
18
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Control.Monad (void)
22 import Data.HashMap.Strict (HashMap)
23 import Data.Map.Strict (Map)
24 import Data.Set (Set)
25 import Data.Text (Text)
26 import Data.Tuple.Extra (first, second, swap)
27 import Database.PostgreSQL.Simple.SqlQQ (sql)
28 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
29 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
30 import Gargantext.Core
31 import Gargantext.Data.HashMap.Strict.Utils as HM
32 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
33 import Gargantext.Database.Prelude (Cmd, runPGSQuery, execPGSQuery)
34 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
35 import Gargantext.Prelude
36 import qualified Data.HashMap.Strict as HM
37 import qualified Data.List as List
38 import qualified Data.Map.Strict as Map
39 import qualified Data.Set as Set
40 import qualified Database.PostgreSQL.Simple as DPS
41 import qualified Database.PostgreSQL.Simple.Types as DPST
42
43 -- | fst is size of Supra Corpus
44 -- snd is Texts and size of Occurrences (different docs)
45
46 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
47 -> HashMap NgramsTerm (Set ContextId)
48 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
49 countContextsByNgramsWith f m = (total, m')
50 where
51 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
52 m' = HM.map ( swap . second (fromIntegral . Set.size))
53 $ groupContextsByNgramsWith f m
54
55
56 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
57 -> HashMap NgramsTerm (Set NodeId)
58 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
59 groupContextsByNgramsWith f' m'' =
60 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
61 $ HM.toList m''
62
63 ------------------------------------------------------------------------
64 getContextsByNgramsUser :: HasDBid NodeType
65 => CorpusId
66 -> NgramsType
67 -> Cmd err (HashMap NgramsTerm (Set ContextId))
68 getContextsByNgramsUser cId nt =
69 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
70 <$> selectNgramsByContextUser cId nt
71 where
72
73 selectNgramsByContextUser :: HasDBid NodeType
74 => CorpusId
75 -> NgramsType
76 -> Cmd err [(NodeId, Text)]
77 selectNgramsByContextUser cId' nt' =
78 runPGSQuery queryNgramsByContextUser
79 ( cId'
80 , toDBid NodeDocument
81 , ngramsTypeId nt'
82 -- , 100 :: Int -- limit
83 -- , 0 :: Int -- offset
84 )
85
86 queryNgramsByContextUser :: DPS.Query
87 queryNgramsByContextUser = [sql|
88 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
89 JOIN ngrams ng ON cng.ngrams_id = ng.id
90 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
91 JOIN contexts c ON nc.context_id = c.id
92 WHERE nc.node_id = ? -- CorpusId
93 AND c.typename = ? -- toDBid
94 AND cng.ngrams_type = ? -- NgramsTypeId
95 AND nc.category > 0 -- is not in Trash
96 GROUP BY cng.context_id, ng.terms
97 |]
98
99
100 ------------------------------------------------------------------------
101 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
102 => CorpusId
103 -> Int
104 -> NgramsType
105 -> [NgramsTerm]
106 -> Cmd err (HashMap NgramsTerm Int)
107 getOccByNgramsOnlyFast_withSample cId int nt ngs =
108 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
109
110
111 getOccByNgramsOnlyFast :: CorpusId
112 -> ListId
113 -> NgramsType
114 -> Cmd err (HashMap NgramsTerm [ContextId])
115 getOccByNgramsOnlyFast cId lId nt = do
116 --HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
117 HM.fromList <$> map (\(t, ns) -> (NgramsTerm t, NodeId <$> DPST.fromPGArray ns)) <$> run cId lId nt
118 where
119
120 run :: CorpusId
121 -> ListId
122 -> NgramsType
123 -> Cmd err [(Text, DPST.PGArray Int)]
124 run cId' lId' nt' = runPGSQuery query
125 ( cId'
126 , lId'
127 , ngramsTypeId nt'
128 )
129
130 query :: DPS.Query
131 query = [sql|
132 WITH node_context_ids AS
133 (select context_id, ngrams_id
134 FROM context_node_ngrams_view
135 WHERE node_id = ?
136 ), ns AS
137 (select ngrams_id FROM node_stories
138 WHERE node_id = ? AND ngrams_type_id = ?
139 )
140
141 SELECT ng.terms,
142 ARRAY ( SELECT DISTINCT context_id
143 FROM node_context_ids
144 WHERE ns.ngrams_id = node_context_ids.ngrams_id
145 )
146 AS context_ids
147 FROM ngrams ng
148 JOIN ns ON ng.id = ns.ngrams_id
149 |]
150
151
152 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
153 => CorpusId
154 -> Int
155 -> NgramsType
156 -> [NgramsTerm]
157 -> Cmd err [(NgramsTerm, Int)]
158 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
159 fmap (first NgramsTerm) <$>
160 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
161 ( int
162 , toDBid NodeDocument
163 , cId
164 , Values fields ((DPS.Only . unNgramsTerm) <$> (List.take 10000 tms))
165 , cId
166 , ngramsTypeId nt
167 )
168 where
169 fields = [QualifiedIdentifier Nothing "text"]
170
171 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
172 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
173 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
174 JOIN nodes_contexts nn ON n.id = nn.context_id
175 WHERE n.typename = ?
176 AND nn.node_id = ?),
177 input_rows(terms) AS (?)
178 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
179 JOIN ngrams ng ON cng.ngrams_id = ng.id
180 JOIN input_rows ir ON ir.terms = ng.terms
181 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
182 JOIN nodes_sample n ON nn.context_id = n.id
183 WHERE nn.node_id = ? -- CorpusId
184 AND cng.ngrams_type = ? -- NgramsTypeId
185 AND nn.category > 0
186 GROUP BY cng.node_id, ng.terms
187 |]
188
189 selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
190 => CorpusId
191 -> Int
192 -> NgramsType
193 -> Cmd err [(NgramsTerm, Int)]
194 selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
195 fmap (first NgramsTerm) <$>
196 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
197 ( int
198 , toDBid NodeDocument
199 , cId
200 , cId
201 , ngramsTypeId nt
202 )
203
204 queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
205 queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
206 WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
207 JOIN nodes_contexts nc ON c.id = nc.context_id
208 WHERE c.typename = ?
209 AND nc.node_id = ?)
210 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
211 JOIN ngrams ng ON cng.ngrams_id = ng.id
212 JOIN node_stories ns ON ns.ngrams_id = ng.id
213 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
214 JOIN contexts_sample c ON nc.context_id = c.id
215 WHERE nc.node_id = ? -- CorpusId
216 AND cng.ngrams_type = ? -- NgramsTypeId
217 AND nc.category > 0
218 GROUP BY ng.id
219 |]
220
221 ------------------------------------------------------------------------
222 getContextsByNgramsOnlyUser :: HasDBid NodeType
223 => CorpusId
224 -> [ListId]
225 -> NgramsType
226 -> [NgramsTerm]
227 -> Cmd err (HashMap NgramsTerm (Set NodeId))
228 getContextsByNgramsOnlyUser cId ls nt ngs =
229 HM.unionsWith (<>)
230 . map (HM.fromListWith (<>)
231 . map (second Set.singleton))
232 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
233 (splitEvery 1000 ngs)
234
235 getNgramsByContextOnlyUser :: HasDBid NodeType
236 => NodeId
237 -> [ListId]
238 -> NgramsType
239 -> [NgramsTerm]
240 -> Cmd err (Map NodeId (Set NgramsTerm))
241 getNgramsByContextOnlyUser cId ls nt ngs =
242 Map.unionsWith (<>)
243 . map ( Map.fromListWith (<>)
244 . map (second Set.singleton)
245 )
246 . map (map swap)
247 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
248 (splitEvery 1000 ngs)
249
250 ------------------------------------------------------------------------
251 selectNgramsOnlyByContextUser :: HasDBid NodeType
252 => CorpusId
253 -> [ListId]
254 -> NgramsType
255 -> [NgramsTerm]
256 -> Cmd err [(NgramsTerm, ContextId)]
257 selectNgramsOnlyByContextUser cId ls nt tms =
258 fmap (first NgramsTerm) <$>
259 runPGSQuery queryNgramsOnlyByContextUser
260 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
261 , Values [QualifiedIdentifier Nothing "int4"]
262 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
263 , cId
264 , toDBid NodeDocument
265 , ngramsTypeId nt
266 )
267 where
268 fields = [QualifiedIdentifier Nothing "text"]
269
270 queryNgramsOnlyByContextUser :: DPS.Query
271 queryNgramsOnlyByContextUser = [sql|
272 WITH input_rows(terms) AS (?),
273 input_list(id) AS (?)
274 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
275 JOIN ngrams ng ON cng.ngrams_id = ng.id
276 JOIN input_rows ir ON ir.terms = ng.terms
277 JOIN input_list il ON il.id = cng.node_id
278 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
279 JOIN contexts c ON nc.context_id = c.id
280 WHERE nc.node_id = ? -- CorpusId
281 AND c.typename = ? -- toDBid (maybe not useful with context table)
282 AND cng.ngrams_type = ? -- NgramsTypeId
283 AND nc.category > 0
284 GROUP BY ng.terms, cng.context_id
285 |]
286
287 getNgramsByDocOnlyUser :: DocId
288 -> [ListId]
289 -> NgramsType
290 -> [NgramsTerm]
291 -> Cmd err (HashMap NgramsTerm (Set NodeId))
292 getNgramsByDocOnlyUser cId ls nt ngs =
293 HM.unionsWith (<>)
294 . map (HM.fromListWith (<>) . map (second Set.singleton))
295 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
296
297
298 selectNgramsOnlyByDocUser :: DocId
299 -> [ListId]
300 -> NgramsType
301 -> [NgramsTerm]
302 -> Cmd err [(NgramsTerm, NodeId)]
303 selectNgramsOnlyByDocUser dId ls nt tms =
304 fmap (first NgramsTerm) <$>
305 runPGSQuery queryNgramsOnlyByDocUser
306 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
307 , Values [QualifiedIdentifier Nothing "int4"]
308 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
309 , dId
310 , ngramsTypeId nt
311 )
312 where
313 fields = [QualifiedIdentifier Nothing "text"]
314
315
316 queryNgramsOnlyByDocUser :: DPS.Query
317 queryNgramsOnlyByDocUser = [sql|
318 WITH input_rows(terms) AS (?),
319 input_list(id) AS (?)
320 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
321 JOIN ngrams ng ON cng.ngrams_id = ng.id
322 JOIN input_rows ir ON ir.terms = ng.terms
323 JOIN input_list il ON il.id = cng.context_id
324 WHERE cng.node_id = ? -- DocId
325 AND cng.ngrams_type = ? -- NgramsTypeId
326 GROUP BY ng.terms, cng.node_id
327 |]
328
329 ------------------------------------------------------------------------
330 -- | TODO filter by language, database, any social field
331 getContextsByNgramsMaster :: HasDBid NodeType
332 => UserCorpusId
333 -> MasterCorpusId
334 -> Cmd err (HashMap Text (Set NodeId))
335 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
336 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
337 -- . takeWhile (not . List.null)
338 -- . takeWhile (\l -> List.length l > 3)
339 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
340
341 selectNgramsByContextMaster :: HasDBid NodeType
342 => Int
343 -> UserCorpusId
344 -> MasterCorpusId
345 -> Int
346 -> Cmd err [(NodeId, Text)]
347 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
348 queryNgramsByContextMaster'
349 ( ucId
350 , ngramsTypeId NgramsTerms
351 , toDBid NodeDocument
352 , p
353 , toDBid NodeDocument
354 , p
355 , n
356 , mcId
357 , toDBid NodeDocument
358 , ngramsTypeId NgramsTerms
359 )
360
361 -- | TODO fix context_node_ngrams relation
362 queryNgramsByContextMaster' :: DPS.Query
363 queryNgramsByContextMaster' = [sql|
364 WITH contextsByNgramsUser AS (
365
366 SELECT n.id, ng.terms FROM contexts n
367 JOIN nodes_contexts nn ON n.id = nn.context_id
368 JOIN context_node_ngrams cng ON cng.context_id = n.id
369 JOIN ngrams ng ON cng.ngrams_id = ng.id
370 WHERE nn.node_id = ? -- UserCorpusId
371 -- AND n.typename = ? -- toDBid
372 AND cng.ngrams_type = ? -- NgramsTypeId
373 AND nn.category > 0
374 AND node_pos(n.id,?) >= ?
375 AND node_pos(n.id,?) < ?
376 GROUP BY n.id, ng.terms
377
378 ),
379
380 contextsByNgramsMaster AS (
381
382 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
383 JOIN context_node_ngrams cng ON n.id = cng.context_id
384 JOIN ngrams ng ON ng.id = cng.ngrams_id
385
386 WHERE n.parent_id = ? -- Master Corpus toDBid
387 AND n.typename = ? -- toDBid
388 AND cng.ngrams_type = ? -- NgramsTypeId
389 GROUP BY n.id, ng.terms
390 )
391
392 SELECT m.id, m.terms FROM nodesByNgramsMaster m
393 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id
394 |]
395
396 -- | Refreshes the \"context_node_ngrams_view\" materialized view.
397 -- This function will be run :
398 -- - periodically
399 -- - at reindex stage
400 -- - at the end of each text flow
401
402 refreshNgramsMaterialized :: Cmd err ()
403 refreshNgramsMaterialized = void $ execPGSQuery refreshNgramsMaterializedQuery ()
404 where
405 refreshNgramsMaterializedQuery :: DPS.Query
406 refreshNgramsMaterializedQuery = [sql| refresh materialized view context_node_ngrams_view; |]
407
408