]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
[FIX] #199
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByContext.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByContext
17 where
18
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map.Strict (Map)
23 import Data.Set (Set)
24 import Data.Text (Text)
25 import Data.Tuple.Extra (first, second, swap)
26 import Database.PostgreSQL.Simple.SqlQQ (sql)
27 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
28 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
29 import Gargantext.Core
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.List as List
37 import qualified Data.Map.Strict as Map
38 import qualified Data.Set as Set
39 import qualified Database.PostgreSQL.Simple as DPS
40 import qualified Database.PostgreSQL.Simple.Types as DPST
41
42 -- | fst is size of Supra Corpus
43 -- snd is Texts and size of Occurrences (different docs)
44
45 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
46 -> HashMap NgramsTerm (Set ContextId)
47 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
48 countContextsByNgramsWith f m = (total, m')
49 where
50 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
51 m' = HM.map ( swap . second (fromIntegral . Set.size))
52 $ groupContextsByNgramsWith f m
53
54
55 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
56 -> HashMap NgramsTerm (Set NodeId)
57 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
58 groupContextsByNgramsWith f' m'' =
59 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
60 $ HM.toList m''
61
62 ------------------------------------------------------------------------
63 getContextsByNgramsUser :: HasDBid NodeType
64 => CorpusId
65 -> NgramsType
66 -> Cmd err (HashMap NgramsTerm (Set ContextId))
67 getContextsByNgramsUser cId nt =
68 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
69 <$> selectNgramsByContextUser cId nt
70 where
71
72 selectNgramsByContextUser :: HasDBid NodeType
73 => CorpusId
74 -> NgramsType
75 -> Cmd err [(NodeId, Text)]
76 selectNgramsByContextUser cId' nt' =
77 runPGSQuery queryNgramsByContextUser
78 ( cId'
79 , toDBid NodeDocument
80 , ngramsTypeId nt'
81 -- , 100 :: Int -- limit
82 -- , 0 :: Int -- offset
83 )
84
85 queryNgramsByContextUser :: DPS.Query
86 queryNgramsByContextUser = [sql|
87 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
88 JOIN ngrams ng ON cng.ngrams_id = ng.id
89 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
90 JOIN contexts c ON nc.context_id = c.id
91 WHERE nc.node_id = ? -- CorpusId
92 AND c.typename = ? -- toDBid
93 AND cng.ngrams_type = ? -- NgramsTypeId
94 AND nc.category > 0 -- is not in Trash
95 GROUP BY cng.context_id, ng.terms
96 |]
97
98
99 ------------------------------------------------------------------------
100 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
101 => CorpusId
102 -> Int
103 -> NgramsType
104 -> [NgramsTerm]
105 -> Cmd err (HashMap NgramsTerm Int)
106 getOccByNgramsOnlyFast_withSample cId int nt ngs =
107 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
108
109
110 getOccByNgramsOnlyFast :: CorpusId
111 -> ListId
112 -> NgramsType
113 -> Cmd err (HashMap NgramsTerm [ContextId])
114 getOccByNgramsOnlyFast cId lId nt = do
115 --HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
116 HM.fromList <$> map (\(t, ns) -> (NgramsTerm t, NodeId <$> DPST.fromPGArray ns)) <$> run cId lId nt
117 where
118
119 run :: CorpusId
120 -> ListId
121 -> NgramsType
122 -> Cmd err [(Text, DPST.PGArray Int)]
123 run cId' lId' nt' = runPGSQuery query
124 ( cId'
125 , lId'
126 , ngramsTypeId nt'
127 )
128
129 query :: DPS.Query
130 query = [sql|
131 WITH node_context_ids AS
132 (select context_id, ngrams_id
133 FROM context_node_ngrams_view
134 WHERE node_id = ?
135 ), ns AS
136 (select ngrams_id FROM node_stories
137 WHERE node_id = ? AND ngrams_type_id = ?
138 )
139
140 SELECT ng.terms,
141 ARRAY ( SELECT DISTINCT context_id
142 FROM node_context_ids
143 WHERE ns.ngrams_id = node_context_ids.ngrams_id
144 )
145 AS context_ids
146 FROM ngrams ng
147 JOIN ns ON ng.id = ns.ngrams_id
148 |]
149
150
151 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
152 => CorpusId
153 -> Int
154 -> NgramsType
155 -> [NgramsTerm]
156 -> Cmd err [(NgramsTerm, Int)]
157 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
158 fmap (first NgramsTerm) <$>
159 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
160 ( int
161 , toDBid NodeDocument
162 , cId
163 , Values fields ((DPS.Only . unNgramsTerm) <$> (List.take 10000 tms))
164 , cId
165 , ngramsTypeId nt
166 )
167 where
168 fields = [QualifiedIdentifier Nothing "text"]
169
170 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
171 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
172 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
173 JOIN nodes_contexts nn ON n.id = nn.context_id
174 WHERE n.typename = ?
175 AND nn.node_id = ?),
176 input_rows(terms) AS (?)
177 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
178 JOIN ngrams ng ON cng.ngrams_id = ng.id
179 JOIN input_rows ir ON ir.terms = ng.terms
180 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
181 JOIN nodes_sample n ON nn.context_id = n.id
182 WHERE nn.node_id = ? -- CorpusId
183 AND cng.ngrams_type = ? -- NgramsTypeId
184 AND nn.category > 0
185 GROUP BY cng.node_id, ng.terms
186 |]
187
188 selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
189 => CorpusId
190 -> Int
191 -> NgramsType
192 -> Cmd err [(NgramsTerm, Int)]
193 selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
194 fmap (first NgramsTerm) <$>
195 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
196 ( int
197 , toDBid NodeDocument
198 , cId
199 , cId
200 , ngramsTypeId nt
201 )
202
203 queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
204 queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
205 WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
206 JOIN nodes_contexts nc ON c.id = nc.context_id
207 WHERE c.typename = ?
208 AND nc.node_id = ?)
209 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
210 JOIN ngrams ng ON cng.ngrams_id = ng.id
211 JOIN node_stories ns ON ns.ngrams_id = ng.id
212 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
213 JOIN contexts_sample c ON nc.context_id = c.id
214 WHERE nc.node_id = ? -- CorpusId
215 AND cng.ngrams_type = ? -- NgramsTypeId
216 AND nc.category > 0
217 GROUP BY ng.id
218 |]
219
220
221
222
223
224
225
226 ------------------------------------------------------------------------
227 getContextsByNgramsOnlyUser :: HasDBid NodeType
228 => CorpusId
229 -> [ListId]
230 -> NgramsType
231 -> [NgramsTerm]
232 -> Cmd err (HashMap NgramsTerm (Set NodeId))
233 getContextsByNgramsOnlyUser cId ls nt ngs =
234 HM.unionsWith (<>)
235 . map (HM.fromListWith (<>)
236 . map (second Set.singleton))
237 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
238 (splitEvery 1000 ngs)
239
240 getNgramsByContextOnlyUser :: HasDBid NodeType
241 => NodeId
242 -> [ListId]
243 -> NgramsType
244 -> [NgramsTerm]
245 -> Cmd err (Map NodeId (Set NgramsTerm))
246 getNgramsByContextOnlyUser cId ls nt ngs =
247 Map.unionsWith (<>)
248 . map ( Map.fromListWith (<>)
249 . map (second Set.singleton)
250 )
251 . map (map swap)
252 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
253 (splitEvery 1000 ngs)
254
255 ------------------------------------------------------------------------
256 selectNgramsOnlyByContextUser :: HasDBid NodeType
257 => CorpusId
258 -> [ListId]
259 -> NgramsType
260 -> [NgramsTerm]
261 -> Cmd err [(NgramsTerm, ContextId)]
262 selectNgramsOnlyByContextUser cId ls nt tms =
263 fmap (first NgramsTerm) <$>
264 runPGSQuery queryNgramsOnlyByContextUser
265 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
266 , Values [QualifiedIdentifier Nothing "int4"]
267 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
268 , cId
269 , toDBid NodeDocument
270 , ngramsTypeId nt
271 )
272 where
273 fields = [QualifiedIdentifier Nothing "text"]
274
275 queryNgramsOnlyByContextUser :: DPS.Query
276 queryNgramsOnlyByContextUser = [sql|
277 WITH input_rows(terms) AS (?),
278 input_list(id) AS (?)
279 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
280 JOIN ngrams ng ON cng.ngrams_id = ng.id
281 JOIN input_rows ir ON ir.terms = ng.terms
282 JOIN input_list il ON il.id = cng.node_id
283 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
284 JOIN contexts c ON nc.context_id = c.id
285 WHERE nc.node_id = ? -- CorpusId
286 AND c.typename = ? -- toDBid (maybe not useful with context table)
287 AND cng.ngrams_type = ? -- NgramsTypeId
288 AND nc.category > 0
289 GROUP BY ng.terms, cng.context_id
290 |]
291
292 getNgramsByDocOnlyUser :: DocId
293 -> [ListId]
294 -> NgramsType
295 -> [NgramsTerm]
296 -> Cmd err (HashMap NgramsTerm (Set NodeId))
297 getNgramsByDocOnlyUser cId ls nt ngs =
298 HM.unionsWith (<>)
299 . map (HM.fromListWith (<>) . map (second Set.singleton))
300 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
301
302
303 selectNgramsOnlyByDocUser :: DocId
304 -> [ListId]
305 -> NgramsType
306 -> [NgramsTerm]
307 -> Cmd err [(NgramsTerm, NodeId)]
308 selectNgramsOnlyByDocUser dId ls nt tms =
309 fmap (first NgramsTerm) <$>
310 runPGSQuery queryNgramsOnlyByDocUser
311 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
312 , Values [QualifiedIdentifier Nothing "int4"]
313 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
314 , dId
315 , ngramsTypeId nt
316 )
317 where
318 fields = [QualifiedIdentifier Nothing "text"]
319
320
321 queryNgramsOnlyByDocUser :: DPS.Query
322 queryNgramsOnlyByDocUser = [sql|
323 WITH input_rows(terms) AS (?),
324 input_list(id) AS (?)
325 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
326 JOIN ngrams ng ON cng.ngrams_id = ng.id
327 JOIN input_rows ir ON ir.terms = ng.terms
328 JOIN input_list il ON il.id = cng.context_id
329 WHERE cng.node_id = ? -- DocId
330 AND cng.ngrams_type = ? -- NgramsTypeId
331 GROUP BY ng.terms, cng.node_id
332 |]
333
334 ------------------------------------------------------------------------
335 -- | TODO filter by language, database, any social field
336 getContextsByNgramsMaster :: HasDBid NodeType
337 => UserCorpusId
338 -> MasterCorpusId
339 -> Cmd err (HashMap Text (Set NodeId))
340 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
341 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
342 -- . takeWhile (not . List.null)
343 -- . takeWhile (\l -> List.length l > 3)
344 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
345
346 selectNgramsByContextMaster :: HasDBid NodeType
347 => Int
348 -> UserCorpusId
349 -> MasterCorpusId
350 -> Int
351 -> Cmd err [(NodeId, Text)]
352 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
353 queryNgramsByContextMaster'
354 ( ucId
355 , ngramsTypeId NgramsTerms
356 , toDBid NodeDocument
357 , p
358 , toDBid NodeDocument
359 , p
360 , n
361 , mcId
362 , toDBid NodeDocument
363 , ngramsTypeId NgramsTerms
364 )
365
366 -- | TODO fix context_node_ngrams relation
367 queryNgramsByContextMaster' :: DPS.Query
368 queryNgramsByContextMaster' = [sql|
369 WITH contextsByNgramsUser AS (
370
371 SELECT n.id, ng.terms FROM contexts n
372 JOIN nodes_contexts nn ON n.id = nn.context_id
373 JOIN context_node_ngrams cng ON cng.context_id = n.id
374 JOIN ngrams ng ON cng.ngrams_id = ng.id
375 WHERE nn.node_id = ? -- UserCorpusId
376 -- AND n.typename = ? -- toDBid
377 AND cng.ngrams_type = ? -- NgramsTypeId
378 AND nn.category > 0
379 AND node_pos(n.id,?) >= ?
380 AND node_pos(n.id,?) < ?
381 GROUP BY n.id, ng.terms
382
383 ),
384
385 contextsByNgramsMaster AS (
386
387 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
388 JOIN context_node_ngrams cng ON n.id = cng.context_id
389 JOIN ngrams ng ON ng.id = cng.ngrams_id
390
391 WHERE n.parent_id = ? -- Master Corpus toDBid
392 AND n.typename = ? -- toDBid
393 AND cng.ngrams_type = ? -- NgramsTypeId
394 GROUP BY n.id, ng.terms
395 )
396
397 SELECT m.id, m.terms FROM nodesByNgramsMaster m
398 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id
399 |]