]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
Merge remote-tracking branch 'origin/adinapoli/issue-198' into dev
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByContext.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByContext
17 where
18
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map.Strict (Map)
23 import Data.Set (Set)
24 import Data.Text (Text)
25 import Data.Tuple.Extra (first, second, swap)
26 import Database.PostgreSQL.Simple.SqlQQ (sql)
27 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
28 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
29 import Gargantext.Core
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.List as List
37 import qualified Data.Map.Strict as Map
38 import qualified Data.Set as Set
39 import qualified Database.PostgreSQL.Simple as DPS
40 import qualified Database.PostgreSQL.Simple.Types as DPST
41
42 -- | fst is size of Supra Corpus
43 -- snd is Texts and size of Occurrences (different docs)
44
45 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
46 -> HashMap NgramsTerm (Set ContextId)
47 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
48 countContextsByNgramsWith f m = (total, m')
49 where
50 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
51 m' = HM.map ( swap . second (fromIntegral . Set.size))
52 $ groupContextsByNgramsWith f m
53
54
55 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
56 -> HashMap NgramsTerm (Set NodeId)
57 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
58 groupContextsByNgramsWith f' m'' =
59 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
60 $ HM.toList m''
61
62 ------------------------------------------------------------------------
63 getContextsByNgramsUser :: HasDBid NodeType
64 => CorpusId
65 -> NgramsType
66 -> Cmd err (HashMap NgramsTerm (Set ContextId))
67 getContextsByNgramsUser cId nt =
68 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
69 <$> selectNgramsByContextUser cId nt
70 where
71
72 selectNgramsByContextUser :: HasDBid NodeType
73 => CorpusId
74 -> NgramsType
75 -> Cmd err [(NodeId, Text)]
76 selectNgramsByContextUser cId' nt' =
77 runPGSQuery queryNgramsByContextUser
78 ( cId'
79 , toDBid NodeDocument
80 , ngramsTypeId nt'
81 -- , 100 :: Int -- limit
82 -- , 0 :: Int -- offset
83 )
84
85 queryNgramsByContextUser :: DPS.Query
86 queryNgramsByContextUser = [sql|
87 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
88 JOIN ngrams ng ON cng.ngrams_id = ng.id
89 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
90 JOIN contexts c ON nc.context_id = c.id
91 WHERE nc.node_id = ? -- CorpusId
92 AND c.typename = ? -- toDBid
93 AND cng.ngrams_type = ? -- NgramsTypeId
94 AND nc.category > 0 -- is not in Trash
95 GROUP BY cng.context_id, ng.terms
96 |]
97
98
99 ------------------------------------------------------------------------
100 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
101 => CorpusId
102 -> Int
103 -> NgramsType
104 -> [NgramsTerm]
105 -> Cmd err (HashMap NgramsTerm Int)
106 getOccByNgramsOnlyFast_withSample cId int nt ngs =
107 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
108
109
110 getOccByNgramsOnlyFast :: CorpusId
111 -> ListId
112 -> NgramsType
113 -> Cmd err (HashMap NgramsTerm [ContextId])
114 getOccByNgramsOnlyFast cId lId nt = do
115 --HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
116 HM.fromList <$> map (\(t, ns) -> (NgramsTerm t, NodeId <$> DPST.fromPGArray ns)) <$> run cId lId nt
117 where
118
119 run :: CorpusId
120 -> ListId
121 -> NgramsType
122 -> Cmd err [(Text, DPST.PGArray Int)]
123 run cId' lId' nt' = runPGSQuery query
124 ( cId'
125 , cId'
126 , lId'
127 , ngramsTypeId nt'
128 )
129
130 query :: DPS.Query
131 query = [sql|
132 SELECT ng.terms
133 -- , ng.id
134 --, round(nng.weight)
135 , ARRAY(
136 SELECT DISTINCT context_node_ngrams.context_id
137 FROM context_node_ngrams
138 JOIN nodes_contexts
139 ON context_node_ngrams.context_id = nodes_contexts.context_id
140 WHERE ng.id = context_node_ngrams.ngrams_id
141 AND nodes_contexts.node_id = ?
142 ) AS context_ids
143 -- , ns.version
144 -- , nng.ngrams_type
145 -- , ns.ngrams_type_id
146 FROM ngrams ng
147 JOIN node_stories ns ON ng.id = ns.ngrams_id
148 JOIN node_node_ngrams nng ON ns.node_id = nng.node2_id
149 WHERE nng.node1_id = ?
150 AND nng.node2_id = ?
151 AND nng.ngrams_type = ?
152 AND nng.ngrams_id = ng.id
153 AND nng.ngrams_type = ns.ngrams_type_id
154 ORDER BY ng.id ASC;
155 |]
156
157
158 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
159 => CorpusId
160 -> Int
161 -> NgramsType
162 -> [NgramsTerm]
163 -> Cmd err [(NgramsTerm, Int)]
164 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
165 fmap (first NgramsTerm) <$>
166 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
167 ( int
168 , toDBid NodeDocument
169 , cId
170 , Values fields ((DPS.Only . unNgramsTerm) <$> (List.take 10000 tms))
171 , cId
172 , ngramsTypeId nt
173 )
174 where
175 fields = [QualifiedIdentifier Nothing "text"]
176
177 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
178 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
179 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
180 JOIN nodes_contexts nn ON n.id = nn.context_id
181 WHERE n.typename = ?
182 AND nn.node_id = ?),
183 input_rows(terms) AS (?)
184 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
185 JOIN ngrams ng ON cng.ngrams_id = ng.id
186 JOIN input_rows ir ON ir.terms = ng.terms
187 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
188 JOIN nodes_sample n ON nn.context_id = n.id
189 WHERE nn.node_id = ? -- CorpusId
190 AND cng.ngrams_type = ? -- NgramsTypeId
191 AND nn.category > 0
192 GROUP BY cng.node_id, ng.terms
193 |]
194
195 selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
196 => CorpusId
197 -> Int
198 -> NgramsType
199 -> Cmd err [(NgramsTerm, Int)]
200 selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
201 fmap (first NgramsTerm) <$>
202 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
203 ( int
204 , toDBid NodeDocument
205 , cId
206 , cId
207 , ngramsTypeId nt
208 )
209
210 queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
211 queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
212 WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
213 JOIN nodes_contexts nc ON c.id = nc.context_id
214 WHERE c.typename = ?
215 AND nc.node_id = ?)
216 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
217 JOIN ngrams ng ON cng.ngrams_id = ng.id
218 JOIN node_stories ns ON ns.ngrams_id = ng.id
219 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
220 JOIN contexts_sample c ON nc.context_id = c.id
221 WHERE nc.node_id = ? -- CorpusId
222 AND cng.ngrams_type = ? -- NgramsTypeId
223 AND nc.category > 0
224 GROUP BY ng.id
225 |]
226
227
228
229
230
231
232
233 ------------------------------------------------------------------------
234 getContextsByNgramsOnlyUser :: HasDBid NodeType
235 => CorpusId
236 -> [ListId]
237 -> NgramsType
238 -> [NgramsTerm]
239 -> Cmd err (HashMap NgramsTerm (Set NodeId))
240 getContextsByNgramsOnlyUser cId ls nt ngs =
241 HM.unionsWith (<>)
242 . map (HM.fromListWith (<>)
243 . map (second Set.singleton))
244 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
245 (splitEvery 1000 ngs)
246
247 getNgramsByContextOnlyUser :: HasDBid NodeType
248 => NodeId
249 -> [ListId]
250 -> NgramsType
251 -> [NgramsTerm]
252 -> Cmd err (Map NodeId (Set NgramsTerm))
253 getNgramsByContextOnlyUser cId ls nt ngs =
254 Map.unionsWith (<>)
255 . map ( Map.fromListWith (<>)
256 . map (second Set.singleton)
257 )
258 . map (map swap)
259 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
260 (splitEvery 1000 ngs)
261
262 ------------------------------------------------------------------------
263 selectNgramsOnlyByContextUser :: HasDBid NodeType
264 => CorpusId
265 -> [ListId]
266 -> NgramsType
267 -> [NgramsTerm]
268 -> Cmd err [(NgramsTerm, ContextId)]
269 selectNgramsOnlyByContextUser cId ls nt tms =
270 fmap (first NgramsTerm) <$>
271 runPGSQuery queryNgramsOnlyByContextUser
272 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
273 , Values [QualifiedIdentifier Nothing "int4"]
274 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
275 , cId
276 , toDBid NodeDocument
277 , ngramsTypeId nt
278 )
279 where
280 fields = [QualifiedIdentifier Nothing "text"]
281
282 queryNgramsOnlyByContextUser :: DPS.Query
283 queryNgramsOnlyByContextUser = [sql|
284 WITH input_rows(terms) AS (?),
285 input_list(id) AS (?)
286 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
287 JOIN ngrams ng ON cng.ngrams_id = ng.id
288 JOIN input_rows ir ON ir.terms = ng.terms
289 JOIN input_list il ON il.id = cng.node_id
290 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
291 JOIN contexts c ON nc.context_id = c.id
292 WHERE nc.node_id = ? -- CorpusId
293 AND c.typename = ? -- toDBid (maybe not useful with context table)
294 AND cng.ngrams_type = ? -- NgramsTypeId
295 AND nc.category > 0
296 GROUP BY ng.terms, cng.context_id
297 |]
298
299 getNgramsByDocOnlyUser :: DocId
300 -> [ListId]
301 -> NgramsType
302 -> [NgramsTerm]
303 -> Cmd err (HashMap NgramsTerm (Set NodeId))
304 getNgramsByDocOnlyUser cId ls nt ngs =
305 HM.unionsWith (<>)
306 . map (HM.fromListWith (<>) . map (second Set.singleton))
307 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
308
309
310 selectNgramsOnlyByDocUser :: DocId
311 -> [ListId]
312 -> NgramsType
313 -> [NgramsTerm]
314 -> Cmd err [(NgramsTerm, NodeId)]
315 selectNgramsOnlyByDocUser dId ls nt tms =
316 fmap (first NgramsTerm) <$>
317 runPGSQuery queryNgramsOnlyByDocUser
318 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
319 , Values [QualifiedIdentifier Nothing "int4"]
320 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
321 , dId
322 , ngramsTypeId nt
323 )
324 where
325 fields = [QualifiedIdentifier Nothing "text"]
326
327
328 queryNgramsOnlyByDocUser :: DPS.Query
329 queryNgramsOnlyByDocUser = [sql|
330 WITH input_rows(terms) AS (?),
331 input_list(id) AS (?)
332 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
333 JOIN ngrams ng ON cng.ngrams_id = ng.id
334 JOIN input_rows ir ON ir.terms = ng.terms
335 JOIN input_list il ON il.id = cng.context_id
336 WHERE cng.node_id = ? -- DocId
337 AND cng.ngrams_type = ? -- NgramsTypeId
338 GROUP BY ng.terms, cng.node_id
339 |]
340
341 ------------------------------------------------------------------------
342 -- | TODO filter by language, database, any social field
343 getContextsByNgramsMaster :: HasDBid NodeType
344 => UserCorpusId
345 -> MasterCorpusId
346 -> Cmd err (HashMap Text (Set NodeId))
347 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
348 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
349 -- . takeWhile (not . List.null)
350 -- . takeWhile (\l -> List.length l > 3)
351 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
352
353 selectNgramsByContextMaster :: HasDBid NodeType
354 => Int
355 -> UserCorpusId
356 -> MasterCorpusId
357 -> Int
358 -> Cmd err [(NodeId, Text)]
359 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
360 queryNgramsByContextMaster'
361 ( ucId
362 , ngramsTypeId NgramsTerms
363 , toDBid NodeDocument
364 , p
365 , toDBid NodeDocument
366 , p
367 , n
368 , mcId
369 , toDBid NodeDocument
370 , ngramsTypeId NgramsTerms
371 )
372
373 -- | TODO fix context_node_ngrams relation
374 queryNgramsByContextMaster' :: DPS.Query
375 queryNgramsByContextMaster' = [sql|
376 WITH contextsByNgramsUser AS (
377
378 SELECT n.id, ng.terms FROM contexts n
379 JOIN nodes_contexts nn ON n.id = nn.context_id
380 JOIN context_node_ngrams cng ON cng.context_id = n.id
381 JOIN ngrams ng ON cng.ngrams_id = ng.id
382 WHERE nn.node_id = ? -- UserCorpusId
383 -- AND n.typename = ? -- toDBid
384 AND cng.ngrams_type = ? -- NgramsTypeId
385 AND nn.category > 0
386 AND node_pos(n.id,?) >= ?
387 AND node_pos(n.id,?) < ?
388 GROUP BY n.id, ng.terms
389
390 ),
391
392 contextsByNgramsMaster AS (
393
394 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
395 JOIN context_node_ngrams cng ON n.id = cng.context_id
396 JOIN ngrams ng ON ng.id = cng.ngrams_id
397
398 WHERE n.parent_id = ? -- Master Corpus toDBid
399 AND n.typename = ? -- toDBid
400 AND cng.ngrams_type = ? -- NgramsTypeId
401 GROUP BY n.id, ng.terms
402 )
403
404 SELECT m.id, m.terms FROM nodesByNgramsMaster m
405 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id
406 |]