]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
[WIP] specification for optimization
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByContext.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByContext
17 where
18
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map (Map)
23 import Data.Maybe (catMaybes)
24 import Data.Set (Set)
25 import Data.Text (Text)
26 import Data.Tuple.Extra (first, second, swap)
27 import Database.PostgreSQL.Simple.SqlQQ (sql)
28 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
29 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
30 import Gargantext.Core
31 import Gargantext.Data.HashMap.Strict.Utils as HM
32 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
33 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
34 import Gargantext.Database.Query.Table.Ngrams (selectNgramsId)
35 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..), NgramsId)
36 import Gargantext.Prelude
37 import qualified Data.HashMap.Strict as HM
38 import qualified Data.Map as Map
39 import qualified Data.Set as Set
40 import qualified Database.PostgreSQL.Simple as DPS
41
42 -- | fst is size of Supra Corpus
43 -- snd is Texts and size of Occurrences (different docs)
44
45 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
46 -> HashMap NgramsTerm (Set ContextId)
47 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
48 countContextsByNgramsWith f m = (total, m')
49 where
50 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
51 m' = HM.map ( swap . second (fromIntegral . Set.size))
52 $ groupContextsByNgramsWith f m
53
54
55 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
56 -> HashMap NgramsTerm (Set NodeId)
57 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
58 groupContextsByNgramsWith f' m'' =
59 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
60 $ HM.toList m''
61
62 ------------------------------------------------------------------------
63 getContextsByNgramsUser :: HasDBid NodeType
64 => CorpusId
65 -> NgramsType
66 -> Cmd err (HashMap NgramsTerm (Set ContextId))
67 getContextsByNgramsUser cId nt =
68 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
69 <$> selectNgramsByContextUser cId nt
70 where
71
72 selectNgramsByContextUser :: HasDBid NodeType
73 => CorpusId
74 -> NgramsType
75 -> Cmd err [(NodeId, Text)]
76 selectNgramsByContextUser cId' nt' =
77 runPGSQuery queryNgramsByContextUser
78 ( cId'
79 , toDBid NodeDocument
80 , ngramsTypeId nt'
81 -- , 100 :: Int -- limit
82 -- , 0 :: Int -- offset
83 )
84
85 queryNgramsByContextUser :: DPS.Query
86 queryNgramsByContextUser = [sql|
87 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
88 JOIN ngrams ng ON cng.ngrams_id = ng.id
89 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
90 JOIN contexts c ON nc.context_id = c.id
91 WHERE nc.node_id = ? -- CorpusId
92 AND c.typename = ? -- toDBid
93 AND cng.ngrams_type = ? -- NgramsTypeId
94 AND nc.category > 0 -- is not in Trash
95 GROUP BY cng.context_id, ng.terms
96 |]
97
98
99 ------------------------------------------------------------------------
100 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
101 => CorpusId
102 -> Int
103 -> NgramsType
104 -> [NgramsTerm]
105 -> Cmd err (HashMap NgramsTerm Int)
106 getOccByNgramsOnlyFast_withSample cId int nt ngs =
107 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
108
109
110 getOccByNgramsOnlyFast' :: CorpusId
111 -> ListId
112 -> NgramsType
113 -> [NgramsTerm]
114 -> Cmd err (HashMap NgramsTerm Int)
115 getOccByNgramsOnlyFast' cId lId nt tms = do -- trace (show (cId, lId)) $
116 mapNgramsIds <- selectNgramsId $ map unNgramsTerm tms
117 HM.fromListWith (+) <$> catMaybes
118 <$> map (\(nId, s) -> (,) <$> (NgramsTerm <$> (Map.lookup nId mapNgramsIds)) <*> (Just $ round s) )
119 <$> run cId lId nt (Map.keys mapNgramsIds)
120 where
121
122 run :: CorpusId
123 -> ListId
124 -> NgramsType
125 -> [NgramsId]
126 -> Cmd err [(NgramsId, Double)]
127 run cId' lId' nt' tms' = runPGSQuery query
128 ( Values fields ((DPS.Only) <$> tms')
129 , cId'
130 , lId'
131 , ngramsTypeId nt'
132 )
133 fields = [QualifiedIdentifier Nothing "int4"]
134
135
136 query :: DPS.Query
137 query = [sql|
138 WITH input_ngrams(id) AS (?)
139
140 SELECT ngi.id, nng.weight FROM nodes_contexts nc
141 JOIN node_node_ngrams nng ON nng.node1_id = nc.node_id
142 JOIN input_ngrams ngi ON nng.ngrams_id = ngi.id
143 WHERE nng.node1_id = ?
144 AND nng.node2_id = ?
145 AND nng.ngrams_type = ?
146 AND nc.category > 0
147 GROUP BY ngi.id, nng.weight
148
149 |]
150
151
152
153
154 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
155 => CorpusId
156 -> Int
157 -> NgramsType
158 -> [NgramsTerm]
159 -> Cmd err [(NgramsTerm, Int)]
160 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
161 fmap (first NgramsTerm) <$>
162 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
163 ( int
164 , toDBid NodeDocument
165 , cId
166 , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
167 , cId
168 , ngramsTypeId nt
169 )
170 where
171 fields = [QualifiedIdentifier Nothing "text"]
172
173 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
174 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
175 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
176 JOIN nodes_contexts nn ON n.id = nn.context_id
177 WHERE n.typename = ?
178 AND nn.node_id = ?),
179 input_rows(terms) AS (?)
180 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
181 JOIN ngrams ng ON cng.ngrams_id = ng.id
182 JOIN input_rows ir ON ir.terms = ng.terms
183 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
184 JOIN nodes_sample n ON nn.context_id = n.id
185 WHERE nn.node_id = ? -- CorpusId
186 AND cng.ngrams_type = ? -- NgramsTypeId
187 AND nn.category > 0
188 GROUP BY cng.node_id, ng.terms
189 |]
190
191 queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
192 queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
193 WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
194 JOIN nodes_contexts nc ON c.id = nc.context_id
195 WHERE c.typename = ?
196 AND nc.node_id = ?),
197 -- input_rows(terms) AS (?)
198 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
199 JOIN ngrams ng ON cng.ngrams_id = ng.id
200 JOIN input_rows ir ON ir.terms = ng.terms
201 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
202 JOIN contexts_sample c ON nc.context_id = c.id
203 WHERE nc.node_id = ? -- CorpusId
204 AND cng.ngrams_type = ? -- NgramsTypeId
205 AND nc.category > 0
206 GROUP BY cng.node_id, ng.terms
207 |]
208
209
210
211
212
213
214
215 ------------------------------------------------------------------------
216 getContextsByNgramsOnlyUser :: HasDBid NodeType
217 => CorpusId
218 -> [ListId]
219 -> NgramsType
220 -> [NgramsTerm]
221 -> Cmd err (HashMap NgramsTerm (Set NodeId))
222 getContextsByNgramsOnlyUser cId ls nt ngs =
223 HM.unionsWith (<>)
224 . map (HM.fromListWith (<>)
225 . map (second Set.singleton))
226 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
227 (splitEvery 1000 ngs)
228
229 getNgramsByContextOnlyUser :: HasDBid NodeType
230 => NodeId
231 -> [ListId]
232 -> NgramsType
233 -> [NgramsTerm]
234 -> Cmd err (Map NodeId (Set NgramsTerm))
235 getNgramsByContextOnlyUser cId ls nt ngs =
236 Map.unionsWith (<>)
237 . map ( Map.fromListWith (<>)
238 . map (second Set.singleton)
239 )
240 . map (map swap)
241 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
242 (splitEvery 1000 ngs)
243
244 ------------------------------------------------------------------------
245 selectNgramsOnlyByContextUser :: HasDBid NodeType
246 => CorpusId
247 -> [ListId]
248 -> NgramsType
249 -> [NgramsTerm]
250 -> Cmd err [(NgramsTerm, ContextId)]
251 selectNgramsOnlyByContextUser cId ls nt tms =
252 fmap (first NgramsTerm) <$>
253 runPGSQuery queryNgramsOnlyByContextUser
254 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
255 , Values [QualifiedIdentifier Nothing "int4"]
256 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
257 , cId
258 , toDBid NodeDocument
259 , ngramsTypeId nt
260 )
261 where
262 fields = [QualifiedIdentifier Nothing "text"]
263
264 queryNgramsOnlyByContextUser :: DPS.Query
265 queryNgramsOnlyByContextUser = [sql|
266 WITH input_rows(terms) AS (?),
267 input_list(id) AS (?)
268 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
269 JOIN ngrams ng ON cng.ngrams_id = ng.id
270 JOIN input_rows ir ON ir.terms = ng.terms
271 JOIN input_list il ON il.id = cng.node_id
272 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
273 JOIN contexts c ON nc.context_id = c.id
274 WHERE nc.node_id = ? -- CorpusId
275 AND c.typename = ? -- toDBid (maybe not useful with context table)
276 AND cng.ngrams_type = ? -- NgramsTypeId
277 AND nc.category > 0
278 GROUP BY ng.terms, cng.context_id
279 |]
280
281 getNgramsByDocOnlyUser :: DocId
282 -> [ListId]
283 -> NgramsType
284 -> [NgramsTerm]
285 -> Cmd err (HashMap NgramsTerm (Set NodeId))
286 getNgramsByDocOnlyUser cId ls nt ngs =
287 HM.unionsWith (<>)
288 . map (HM.fromListWith (<>) . map (second Set.singleton))
289 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
290
291
292 selectNgramsOnlyByDocUser :: DocId
293 -> [ListId]
294 -> NgramsType
295 -> [NgramsTerm]
296 -> Cmd err [(NgramsTerm, NodeId)]
297 selectNgramsOnlyByDocUser dId ls nt tms =
298 fmap (first NgramsTerm) <$>
299 runPGSQuery queryNgramsOnlyByDocUser
300 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
301 , Values [QualifiedIdentifier Nothing "int4"]
302 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
303 , dId
304 , ngramsTypeId nt
305 )
306 where
307 fields = [QualifiedIdentifier Nothing "text"]
308
309
310 queryNgramsOnlyByDocUser :: DPS.Query
311 queryNgramsOnlyByDocUser = [sql|
312 WITH input_rows(terms) AS (?),
313 input_list(id) AS (?)
314 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
315 JOIN ngrams ng ON cng.ngrams_id = ng.id
316 JOIN input_rows ir ON ir.terms = ng.terms
317 JOIN input_list il ON il.id = cng.context_id
318 WHERE cng.node_id = ? -- DocId
319 AND cng.ngrams_type = ? -- NgramsTypeId
320 GROUP BY ng.terms, cng.node_id
321 |]
322
323 ------------------------------------------------------------------------
324 -- | TODO filter by language, database, any social field
325 getContextsByNgramsMaster :: HasDBid NodeType
326 => UserCorpusId
327 -> MasterCorpusId
328 -> Cmd err (HashMap Text (Set NodeId))
329 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
330 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
331 -- . takeWhile (not . List.null)
332 -- . takeWhile (\l -> List.length l > 3)
333 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
334
335 selectNgramsByContextMaster :: HasDBid NodeType
336 => Int
337 -> UserCorpusId
338 -> MasterCorpusId
339 -> Int
340 -> Cmd err [(NodeId, Text)]
341 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
342 queryNgramsByContextMaster'
343 ( ucId
344 , ngramsTypeId NgramsTerms
345 , toDBid NodeDocument
346 , p
347 , toDBid NodeDocument
348 , p
349 , n
350 , mcId
351 , toDBid NodeDocument
352 , ngramsTypeId NgramsTerms
353 )
354
355 -- | TODO fix context_node_ngrams relation
356 queryNgramsByContextMaster' :: DPS.Query
357 queryNgramsByContextMaster' = [sql|
358 WITH contextsByNgramsUser AS (
359
360 SELECT n.id, ng.terms FROM contexts n
361 JOIN nodes_contexts nn ON n.id = nn.context_id
362 JOIN context_node_ngrams cng ON cng.context_id = n.id
363 JOIN ngrams ng ON cng.ngrams_id = ng.id
364 WHERE nn.node_id = ? -- UserCorpusId
365 -- AND n.typename = ? -- toDBid
366 AND cng.ngrams_type = ? -- NgramsTypeId
367 AND nn.category > 0
368 AND node_pos(n.id,?) >= ?
369 AND node_pos(n.id,?) < ?
370 GROUP BY n.id, ng.terms
371
372 ),
373
374 contextsByNgramsMaster AS (
375
376 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
377 JOIN context_node_ngrams cng ON n.id = cng.context_id
378 JOIN ngrams ng ON ng.id = cng.ngrams_id
379
380 WHERE n.parent_id = ? -- Master Corpus toDBid
381 AND n.typename = ? -- toDBid
382 AND cng.ngrams_type = ? -- NgramsTypeId
383 GROUP BY n.id, ng.terms
384 )
385
386 SELECT m.id, m.terms FROM nodesByNgramsMaster m
387 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id
388 |]