]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
Merge remote-tracking branch 'origin/flexible-job-queue' into dev
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByContext.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByContext
17 where
18
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map (Map)
23 import Data.Set (Set)
24 import Data.Text (Text)
25 import Data.Tuple.Extra (first, second, swap)
26 import Database.PostgreSQL.Simple.SqlQQ (sql)
27 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
28 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
29 import Gargantext.Core
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Database.PostgreSQL.Simple as DPS
39
40 -- | fst is size of Supra Corpus
41 -- snd is Texts and size of Occurrences (different docs)
42
43 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
44 -> HashMap NgramsTerm (Set ContextId)
45 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
46 countContextsByNgramsWith f m = (total, m')
47 where
48 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
49 m' = HM.map ( swap . second (fromIntegral . Set.size))
50 $ groupContextsByNgramsWith f m
51
52
53 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
54 -> HashMap NgramsTerm (Set NodeId)
55 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
56 groupContextsByNgramsWith f' m'' =
57 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
58 $ HM.toList m''
59
60 ------------------------------------------------------------------------
61 getContextsByNgramsUser :: HasDBid NodeType
62 => CorpusId
63 -> NgramsType
64 -> Cmd err (HashMap NgramsTerm (Set ContextId))
65 getContextsByNgramsUser cId nt =
66 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
67 <$> selectNgramsByContextUser cId nt
68 where
69
70 selectNgramsByContextUser :: HasDBid NodeType
71 => CorpusId
72 -> NgramsType
73 -> Cmd err [(NodeId, Text)]
74 selectNgramsByContextUser cId' nt' =
75 runPGSQuery queryNgramsByContextUser
76 ( cId'
77 , toDBid NodeDocument
78 , ngramsTypeId nt'
79 -- , 100 :: Int -- limit
80 -- , 0 :: Int -- offset
81 )
82
83 queryNgramsByContextUser :: DPS.Query
84 queryNgramsByContextUser = [sql|
85 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
86 JOIN ngrams ng ON cng.ngrams_id = ng.id
87 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
88 JOIN contexts c ON nc.context_id = c.id
89 WHERE nc.node_id = ? -- CorpusId
90 AND c.typename = ? -- toDBid
91 AND cng.ngrams_type = ? -- NgramsTypeId
92 AND nc.category > 0 -- is not in Trash
93 GROUP BY cng.context_id, ng.terms
94 |]
95
96
97 ------------------------------------------------------------------------
98 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
99 => CorpusId
100 -> Int
101 -> NgramsType
102 -> [NgramsTerm]
103 -> Cmd err (HashMap NgramsTerm Int)
104 getOccByNgramsOnlyFast_withSample cId int nt ngs =
105 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
106
107
108 getOccByNgramsOnlyFast :: CorpusId
109 -> ListId
110 -> NgramsType
111 -> Cmd err (HashMap NgramsTerm Int)
112 getOccByNgramsOnlyFast cId lId nt = do
113 HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
114 where
115
116 run :: CorpusId
117 -> ListId
118 -> NgramsType
119 -> Cmd err [(Text, Double)]
120 run cId' lId' nt' = runPGSQuery query
121 ( cId'
122 , lId'
123 , ngramsTypeId nt'
124 )
125
126 query :: DPS.Query
127 query = [sql|
128 SELECT ng.terms
129 -- , ng.id
130 , round(nng.weight)
131 -- , ns.version
132 -- , nng.ngrams_type
133 -- , ns.ngrams_type_id
134 FROM ngrams ng
135 JOIN node_stories ns ON ng.id = ns.ngrams_id
136 JOIN node_node_ngrams nng ON ns.node_id = nng.node2_id
137 WHERE nng.node1_id = ?
138 AND nng.node2_id = ?
139 AND nng.ngrams_type = ?
140 AND nng.ngrams_id = ng.id
141 AND nng.ngrams_type = ns.ngrams_type_id
142 ORDER BY ng.id ASC;
143 |]
144
145
146 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
147 => CorpusId
148 -> Int
149 -> NgramsType
150 -> [NgramsTerm]
151 -> Cmd err [(NgramsTerm, Int)]
152 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
153 fmap (first NgramsTerm) <$>
154 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
155 ( int
156 , toDBid NodeDocument
157 , cId
158 , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
159 , cId
160 , ngramsTypeId nt
161 )
162 where
163 fields = [QualifiedIdentifier Nothing "text"]
164
165 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
166 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
167 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
168 JOIN nodes_contexts nn ON n.id = nn.context_id
169 WHERE n.typename = ?
170 AND nn.node_id = ?),
171 input_rows(terms) AS (?)
172 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
173 JOIN ngrams ng ON cng.ngrams_id = ng.id
174 JOIN input_rows ir ON ir.terms = ng.terms
175 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
176 JOIN nodes_sample n ON nn.context_id = n.id
177 WHERE nn.node_id = ? -- CorpusId
178 AND cng.ngrams_type = ? -- NgramsTypeId
179 AND nn.category > 0
180 GROUP BY cng.node_id, ng.terms
181 |]
182
183 selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
184 => CorpusId
185 -> Int
186 -> NgramsType
187 -> Cmd err [(NgramsTerm, Int)]
188 selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
189 fmap (first NgramsTerm) <$>
190 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
191 ( int
192 , toDBid NodeDocument
193 , cId
194 , cId
195 , ngramsTypeId nt
196 )
197 where
198 fields = [QualifiedIdentifier Nothing "text"]
199
200 queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
201 queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
202 WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
203 JOIN nodes_contexts nc ON c.id = nc.context_id
204 WHERE c.typename = ?
205 AND nc.node_id = ?)
206 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
207 JOIN ngrams ng ON cng.ngrams_id = ng.id
208 JOIN node_stories ns ON ns.ngrams_id = ng.id
209 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
210 JOIN contexts_sample c ON nc.context_id = c.id
211 WHERE nc.node_id = ? -- CorpusId
212 AND cng.ngrams_type = ? -- NgramsTypeId
213 AND nc.category > 0
214 GROUP BY ng.id
215 |]
216
217
218
219
220
221
222
223 ------------------------------------------------------------------------
224 getContextsByNgramsOnlyUser :: HasDBid NodeType
225 => CorpusId
226 -> [ListId]
227 -> NgramsType
228 -> [NgramsTerm]
229 -> Cmd err (HashMap NgramsTerm (Set NodeId))
230 getContextsByNgramsOnlyUser cId ls nt ngs =
231 HM.unionsWith (<>)
232 . map (HM.fromListWith (<>)
233 . map (second Set.singleton))
234 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
235 (splitEvery 1000 ngs)
236
237 getNgramsByContextOnlyUser :: HasDBid NodeType
238 => NodeId
239 -> [ListId]
240 -> NgramsType
241 -> [NgramsTerm]
242 -> Cmd err (Map NodeId (Set NgramsTerm))
243 getNgramsByContextOnlyUser cId ls nt ngs =
244 Map.unionsWith (<>)
245 . map ( Map.fromListWith (<>)
246 . map (second Set.singleton)
247 )
248 . map (map swap)
249 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
250 (splitEvery 1000 ngs)
251
252 ------------------------------------------------------------------------
253 selectNgramsOnlyByContextUser :: HasDBid NodeType
254 => CorpusId
255 -> [ListId]
256 -> NgramsType
257 -> [NgramsTerm]
258 -> Cmd err [(NgramsTerm, ContextId)]
259 selectNgramsOnlyByContextUser cId ls nt tms =
260 fmap (first NgramsTerm) <$>
261 runPGSQuery queryNgramsOnlyByContextUser
262 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
263 , Values [QualifiedIdentifier Nothing "int4"]
264 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
265 , cId
266 , toDBid NodeDocument
267 , ngramsTypeId nt
268 )
269 where
270 fields = [QualifiedIdentifier Nothing "text"]
271
272 queryNgramsOnlyByContextUser :: DPS.Query
273 queryNgramsOnlyByContextUser = [sql|
274 WITH input_rows(terms) AS (?),
275 input_list(id) AS (?)
276 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
277 JOIN ngrams ng ON cng.ngrams_id = ng.id
278 JOIN input_rows ir ON ir.terms = ng.terms
279 JOIN input_list il ON il.id = cng.node_id
280 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
281 JOIN contexts c ON nc.context_id = c.id
282 WHERE nc.node_id = ? -- CorpusId
283 AND c.typename = ? -- toDBid (maybe not useful with context table)
284 AND cng.ngrams_type = ? -- NgramsTypeId
285 AND nc.category > 0
286 GROUP BY ng.terms, cng.context_id
287 |]
288
289 getNgramsByDocOnlyUser :: DocId
290 -> [ListId]
291 -> NgramsType
292 -> [NgramsTerm]
293 -> Cmd err (HashMap NgramsTerm (Set NodeId))
294 getNgramsByDocOnlyUser cId ls nt ngs =
295 HM.unionsWith (<>)
296 . map (HM.fromListWith (<>) . map (second Set.singleton))
297 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
298
299
300 selectNgramsOnlyByDocUser :: DocId
301 -> [ListId]
302 -> NgramsType
303 -> [NgramsTerm]
304 -> Cmd err [(NgramsTerm, NodeId)]
305 selectNgramsOnlyByDocUser dId ls nt tms =
306 fmap (first NgramsTerm) <$>
307 runPGSQuery queryNgramsOnlyByDocUser
308 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
309 , Values [QualifiedIdentifier Nothing "int4"]
310 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
311 , dId
312 , ngramsTypeId nt
313 )
314 where
315 fields = [QualifiedIdentifier Nothing "text"]
316
317
318 queryNgramsOnlyByDocUser :: DPS.Query
319 queryNgramsOnlyByDocUser = [sql|
320 WITH input_rows(terms) AS (?),
321 input_list(id) AS (?)
322 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
323 JOIN ngrams ng ON cng.ngrams_id = ng.id
324 JOIN input_rows ir ON ir.terms = ng.terms
325 JOIN input_list il ON il.id = cng.context_id
326 WHERE cng.node_id = ? -- DocId
327 AND cng.ngrams_type = ? -- NgramsTypeId
328 GROUP BY ng.terms, cng.node_id
329 |]
330
331 ------------------------------------------------------------------------
332 -- | TODO filter by language, database, any social field
333 getContextsByNgramsMaster :: HasDBid NodeType
334 => UserCorpusId
335 -> MasterCorpusId
336 -> Cmd err (HashMap Text (Set NodeId))
337 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
338 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
339 -- . takeWhile (not . List.null)
340 -- . takeWhile (\l -> List.length l > 3)
341 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
342
343 selectNgramsByContextMaster :: HasDBid NodeType
344 => Int
345 -> UserCorpusId
346 -> MasterCorpusId
347 -> Int
348 -> Cmd err [(NodeId, Text)]
349 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
350 queryNgramsByContextMaster'
351 ( ucId
352 , ngramsTypeId NgramsTerms
353 , toDBid NodeDocument
354 , p
355 , toDBid NodeDocument
356 , p
357 , n
358 , mcId
359 , toDBid NodeDocument
360 , ngramsTypeId NgramsTerms
361 )
362
363 -- | TODO fix context_node_ngrams relation
364 queryNgramsByContextMaster' :: DPS.Query
365 queryNgramsByContextMaster' = [sql|
366 WITH contextsByNgramsUser AS (
367
368 SELECT n.id, ng.terms FROM contexts n
369 JOIN nodes_contexts nn ON n.id = nn.context_id
370 JOIN context_node_ngrams cng ON cng.context_id = n.id
371 JOIN ngrams ng ON cng.ngrams_id = ng.id
372 WHERE nn.node_id = ? -- UserCorpusId
373 -- AND n.typename = ? -- toDBid
374 AND cng.ngrams_type = ? -- NgramsTypeId
375 AND nn.category > 0
376 AND node_pos(n.id,?) >= ?
377 AND node_pos(n.id,?) < ?
378 GROUP BY n.id, ng.terms
379
380 ),
381
382 contextsByNgramsMaster AS (
383
384 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
385 JOIN context_node_ngrams cng ON n.id = cng.context_id
386 JOIN ngrams ng ON ng.id = cng.ngrams_id
387
388 WHERE n.parent_id = ? -- Master Corpus toDBid
389 AND n.typename = ? -- toDBid
390 AND cng.ngrams_type = ? -- NgramsTypeId
391 GROUP BY n.id, ng.terms
392 )
393
394 SELECT m.id, m.terms FROM nodesByNgramsMaster m
395 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id
396 |]