]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
Merge branch 'dev' into 151-dev-pubmed-api-key
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByContext.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByContext
17 where
18
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map (Map)
23 import Data.Set (Set)
24 import Data.Text (Text)
25 import Data.Tuple.Extra (first, second, swap)
26 import Database.PostgreSQL.Simple.SqlQQ (sql)
27 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
28 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
29 import Gargantext.Core
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Database.PostgreSQL.Simple as DPS
39 import qualified Database.PostgreSQL.Simple.Types as DPST
40
41 -- | fst is size of Supra Corpus
42 -- snd is Texts and size of Occurrences (different docs)
43
44 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
45 -> HashMap NgramsTerm (Set ContextId)
46 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
47 countContextsByNgramsWith f m = (total, m')
48 where
49 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
50 m' = HM.map ( swap . second (fromIntegral . Set.size))
51 $ groupContextsByNgramsWith f m
52
53
54 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
55 -> HashMap NgramsTerm (Set NodeId)
56 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
57 groupContextsByNgramsWith f' m'' =
58 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
59 $ HM.toList m''
60
61 ------------------------------------------------------------------------
62 getContextsByNgramsUser :: HasDBid NodeType
63 => CorpusId
64 -> NgramsType
65 -> Cmd err (HashMap NgramsTerm (Set ContextId))
66 getContextsByNgramsUser cId nt =
67 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
68 <$> selectNgramsByContextUser cId nt
69 where
70
71 selectNgramsByContextUser :: HasDBid NodeType
72 => CorpusId
73 -> NgramsType
74 -> Cmd err [(NodeId, Text)]
75 selectNgramsByContextUser cId' nt' =
76 runPGSQuery queryNgramsByContextUser
77 ( cId'
78 , toDBid NodeDocument
79 , ngramsTypeId nt'
80 -- , 100 :: Int -- limit
81 -- , 0 :: Int -- offset
82 )
83
84 queryNgramsByContextUser :: DPS.Query
85 queryNgramsByContextUser = [sql|
86 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
87 JOIN ngrams ng ON cng.ngrams_id = ng.id
88 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
89 JOIN contexts c ON nc.context_id = c.id
90 WHERE nc.node_id = ? -- CorpusId
91 AND c.typename = ? -- toDBid
92 AND cng.ngrams_type = ? -- NgramsTypeId
93 AND nc.category > 0 -- is not in Trash
94 GROUP BY cng.context_id, ng.terms
95 |]
96
97
98 ------------------------------------------------------------------------
99 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
100 => CorpusId
101 -> Int
102 -> NgramsType
103 -> [NgramsTerm]
104 -> Cmd err (HashMap NgramsTerm Int)
105 getOccByNgramsOnlyFast_withSample cId int nt ngs =
106 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
107
108
109 getOccByNgramsOnlyFast :: CorpusId
110 -> ListId
111 -> NgramsType
112 -> Cmd err (HashMap NgramsTerm [ContextId])
113 getOccByNgramsOnlyFast cId lId nt = do
114 --HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
115 HM.fromList <$> map (\(t, ns) -> (NgramsTerm t, NodeId <$> DPST.fromPGArray ns)) <$> run cId lId nt
116 where
117
118 run :: CorpusId
119 -> ListId
120 -> NgramsType
121 -> Cmd err [(Text, DPST.PGArray Int)]
122 run cId' lId' nt' = runPGSQuery query
123 ( cId'
124 , lId'
125 , ngramsTypeId nt'
126 )
127
128 query :: DPS.Query
129 query = [sql|
130 SELECT ng.terms
131 -- , ng.id
132 --, round(nng.weight)
133 , ARRAY(SELECT DISTINCT context_node_ngrams.context_id FROM context_node_ngrams WHERE ng.id = ngrams_id) AS context_ids
134 -- , ns.version
135 -- , nng.ngrams_type
136 -- , ns.ngrams_type_id
137 FROM ngrams ng
138 JOIN node_stories ns ON ng.id = ns.ngrams_id
139 JOIN node_node_ngrams nng ON ns.node_id = nng.node2_id
140 WHERE nng.node1_id = ?
141 AND nng.node2_id = ?
142 AND nng.ngrams_type = ?
143 AND nng.ngrams_id = ng.id
144 AND nng.ngrams_type = ns.ngrams_type_id
145 ORDER BY ng.id ASC;
146 |]
147
148
149 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
150 => CorpusId
151 -> Int
152 -> NgramsType
153 -> [NgramsTerm]
154 -> Cmd err [(NgramsTerm, Int)]
155 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
156 fmap (first NgramsTerm) <$>
157 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
158 ( int
159 , toDBid NodeDocument
160 , cId
161 , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
162 , cId
163 , ngramsTypeId nt
164 )
165 where
166 fields = [QualifiedIdentifier Nothing "text"]
167
168 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
169 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
170 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
171 JOIN nodes_contexts nn ON n.id = nn.context_id
172 WHERE n.typename = ?
173 AND nn.node_id = ?),
174 input_rows(terms) AS (?)
175 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
176 JOIN ngrams ng ON cng.ngrams_id = ng.id
177 JOIN input_rows ir ON ir.terms = ng.terms
178 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
179 JOIN nodes_sample n ON nn.context_id = n.id
180 WHERE nn.node_id = ? -- CorpusId
181 AND cng.ngrams_type = ? -- NgramsTypeId
182 AND nn.category > 0
183 GROUP BY cng.node_id, ng.terms
184 |]
185
186 selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
187 => CorpusId
188 -> Int
189 -> NgramsType
190 -> Cmd err [(NgramsTerm, Int)]
191 selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
192 fmap (first NgramsTerm) <$>
193 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
194 ( int
195 , toDBid NodeDocument
196 , cId
197 , cId
198 , ngramsTypeId nt
199 )
200
201 queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
202 queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
203 WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
204 JOIN nodes_contexts nc ON c.id = nc.context_id
205 WHERE c.typename = ?
206 AND nc.node_id = ?)
207 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
208 JOIN ngrams ng ON cng.ngrams_id = ng.id
209 JOIN node_stories ns ON ns.ngrams_id = ng.id
210 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
211 JOIN contexts_sample c ON nc.context_id = c.id
212 WHERE nc.node_id = ? -- CorpusId
213 AND cng.ngrams_type = ? -- NgramsTypeId
214 AND nc.category > 0
215 GROUP BY ng.id
216 |]
217
218
219
220
221
222
223
224 ------------------------------------------------------------------------
225 getContextsByNgramsOnlyUser :: HasDBid NodeType
226 => CorpusId
227 -> [ListId]
228 -> NgramsType
229 -> [NgramsTerm]
230 -> Cmd err (HashMap NgramsTerm (Set NodeId))
231 getContextsByNgramsOnlyUser cId ls nt ngs =
232 HM.unionsWith (<>)
233 . map (HM.fromListWith (<>)
234 . map (second Set.singleton))
235 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
236 (splitEvery 1000 ngs)
237
238 getNgramsByContextOnlyUser :: HasDBid NodeType
239 => NodeId
240 -> [ListId]
241 -> NgramsType
242 -> [NgramsTerm]
243 -> Cmd err (Map NodeId (Set NgramsTerm))
244 getNgramsByContextOnlyUser cId ls nt ngs =
245 Map.unionsWith (<>)
246 . map ( Map.fromListWith (<>)
247 . map (second Set.singleton)
248 )
249 . map (map swap)
250 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
251 (splitEvery 1000 ngs)
252
253 ------------------------------------------------------------------------
254 selectNgramsOnlyByContextUser :: HasDBid NodeType
255 => CorpusId
256 -> [ListId]
257 -> NgramsType
258 -> [NgramsTerm]
259 -> Cmd err [(NgramsTerm, ContextId)]
260 selectNgramsOnlyByContextUser cId ls nt tms =
261 fmap (first NgramsTerm) <$>
262 runPGSQuery queryNgramsOnlyByContextUser
263 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
264 , Values [QualifiedIdentifier Nothing "int4"]
265 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
266 , cId
267 , toDBid NodeDocument
268 , ngramsTypeId nt
269 )
270 where
271 fields = [QualifiedIdentifier Nothing "text"]
272
273 queryNgramsOnlyByContextUser :: DPS.Query
274 queryNgramsOnlyByContextUser = [sql|
275 WITH input_rows(terms) AS (?),
276 input_list(id) AS (?)
277 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
278 JOIN ngrams ng ON cng.ngrams_id = ng.id
279 JOIN input_rows ir ON ir.terms = ng.terms
280 JOIN input_list il ON il.id = cng.node_id
281 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
282 JOIN contexts c ON nc.context_id = c.id
283 WHERE nc.node_id = ? -- CorpusId
284 AND c.typename = ? -- toDBid (maybe not useful with context table)
285 AND cng.ngrams_type = ? -- NgramsTypeId
286 AND nc.category > 0
287 GROUP BY ng.terms, cng.context_id
288 |]
289
290 getNgramsByDocOnlyUser :: DocId
291 -> [ListId]
292 -> NgramsType
293 -> [NgramsTerm]
294 -> Cmd err (HashMap NgramsTerm (Set NodeId))
295 getNgramsByDocOnlyUser cId ls nt ngs =
296 HM.unionsWith (<>)
297 . map (HM.fromListWith (<>) . map (second Set.singleton))
298 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
299
300
301 selectNgramsOnlyByDocUser :: DocId
302 -> [ListId]
303 -> NgramsType
304 -> [NgramsTerm]
305 -> Cmd err [(NgramsTerm, NodeId)]
306 selectNgramsOnlyByDocUser dId ls nt tms =
307 fmap (first NgramsTerm) <$>
308 runPGSQuery queryNgramsOnlyByDocUser
309 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
310 , Values [QualifiedIdentifier Nothing "int4"]
311 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
312 , dId
313 , ngramsTypeId nt
314 )
315 where
316 fields = [QualifiedIdentifier Nothing "text"]
317
318
319 queryNgramsOnlyByDocUser :: DPS.Query
320 queryNgramsOnlyByDocUser = [sql|
321 WITH input_rows(terms) AS (?),
322 input_list(id) AS (?)
323 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
324 JOIN ngrams ng ON cng.ngrams_id = ng.id
325 JOIN input_rows ir ON ir.terms = ng.terms
326 JOIN input_list il ON il.id = cng.context_id
327 WHERE cng.node_id = ? -- DocId
328 AND cng.ngrams_type = ? -- NgramsTypeId
329 GROUP BY ng.terms, cng.node_id
330 |]
331
332 ------------------------------------------------------------------------
333 -- | TODO filter by language, database, any social field
334 getContextsByNgramsMaster :: HasDBid NodeType
335 => UserCorpusId
336 -> MasterCorpusId
337 -> Cmd err (HashMap Text (Set NodeId))
338 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
339 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
340 -- . takeWhile (not . List.null)
341 -- . takeWhile (\l -> List.length l > 3)
342 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
343
344 selectNgramsByContextMaster :: HasDBid NodeType
345 => Int
346 -> UserCorpusId
347 -> MasterCorpusId
348 -> Int
349 -> Cmd err [(NodeId, Text)]
350 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
351 queryNgramsByContextMaster'
352 ( ucId
353 , ngramsTypeId NgramsTerms
354 , toDBid NodeDocument
355 , p
356 , toDBid NodeDocument
357 , p
358 , n
359 , mcId
360 , toDBid NodeDocument
361 , ngramsTypeId NgramsTerms
362 )
363
364 -- | TODO fix context_node_ngrams relation
365 queryNgramsByContextMaster' :: DPS.Query
366 queryNgramsByContextMaster' = [sql|
367 WITH contextsByNgramsUser AS (
368
369 SELECT n.id, ng.terms FROM contexts n
370 JOIN nodes_contexts nn ON n.id = nn.context_id
371 JOIN context_node_ngrams cng ON cng.context_id = n.id
372 JOIN ngrams ng ON cng.ngrams_id = ng.id
373 WHERE nn.node_id = ? -- UserCorpusId
374 -- AND n.typename = ? -- toDBid
375 AND cng.ngrams_type = ? -- NgramsTypeId
376 AND nn.category > 0
377 AND node_pos(n.id,?) >= ?
378 AND node_pos(n.id,?) < ?
379 GROUP BY n.id, ng.terms
380
381 ),
382
383 contextsByNgramsMaster AS (
384
385 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
386 JOIN context_node_ngrams cng ON n.id = cng.context_id
387 JOIN ngrams ng ON ng.id = cng.ngrams_id
388
389 WHERE n.parent_id = ? -- Master Corpus toDBid
390 AND n.typename = ? -- toDBid
391 AND cng.ngrams_type = ? -- NgramsTypeId
392 GROUP BY n.id, ng.terms
393 )
394
395 SELECT m.id, m.terms FROM nodesByNgramsMaster m
396 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id
397 |]