]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
[OPTIM] Ngrams Table query
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByContext.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByContext
17 where
18
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map (Map)
23 import Data.Set (Set)
24 import Data.Text (Text)
25 import Data.Tuple.Extra (first, second, swap)
26 import Database.PostgreSQL.Simple.SqlQQ (sql)
27 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
28 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
29 import Gargantext.Core
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Database.PostgreSQL.Simple as DPS
39
40 -- | fst is size of Supra Corpus
41 -- snd is Texts and size of Occurrences (different docs)
42
43 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
44 -> HashMap NgramsTerm (Set ContextId)
45 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
46 countContextsByNgramsWith f m = (total, m')
47 where
48 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
49 m' = HM.map ( swap . second (fromIntegral . Set.size))
50 $ groupContextsByNgramsWith f m
51
52
53 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
54 -> HashMap NgramsTerm (Set NodeId)
55 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
56 groupContextsByNgramsWith f' m'' =
57 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
58 $ HM.toList m''
59
60 ------------------------------------------------------------------------
61 getContextsByNgramsUser :: HasDBid NodeType
62 => CorpusId
63 -> NgramsType
64 -> Cmd err (HashMap NgramsTerm (Set ContextId))
65 getContextsByNgramsUser cId nt =
66 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
67 <$> selectNgramsByContextUser cId nt
68 where
69
70 selectNgramsByContextUser :: HasDBid NodeType
71 => CorpusId
72 -> NgramsType
73 -> Cmd err [(NodeId, Text)]
74 selectNgramsByContextUser cId' nt' =
75 runPGSQuery queryNgramsByContextUser
76 ( cId'
77 , toDBid NodeDocument
78 , ngramsTypeId nt'
79 -- , 100 :: Int -- limit
80 -- , 0 :: Int -- offset
81 )
82
83 queryNgramsByContextUser :: DPS.Query
84 queryNgramsByContextUser = [sql|
85 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
86 JOIN ngrams ng ON cng.ngrams_id = ng.id
87 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
88 JOIN contexts c ON nc.context_id = c.id
89 WHERE nc.node_id = ? -- CorpusId
90 AND c.typename = ? -- toDBid
91 AND cng.ngrams_type = ? -- NgramsTypeId
92 AND nc.category > 0 -- is not in Trash
93 GROUP BY cng.context_id, ng.terms
94 |]
95
96
97 ------------------------------------------------------------------------
98 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
99 => CorpusId
100 -> Int
101 -> NgramsType
102 -> [NgramsTerm]
103 -> Cmd err (HashMap NgramsTerm Int)
104 getOccByNgramsOnlyFast_withSample cId int nt ngs =
105 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
106
107
108 getOccByNgramsOnlyFast :: CorpusId
109 -> ListId
110 -> NgramsType
111 -> Cmd err (HashMap NgramsTerm Int)
112 getOccByNgramsOnlyFast cId lId nt = do
113 HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
114 where
115
116 run :: CorpusId
117 -> ListId
118 -> NgramsType
119 -> Cmd err [(Text, Double)]
120 run cId' lId' nt' = runPGSQuery query
121 ( cId'
122 , lId'
123 , ngramsTypeId nt'
124 )
125
126 query :: DPS.Query
127 query = [sql|
128 SELECT ng.terms
129 -- , ng.id
130 , round(nng.weight)
131 -- , ns.version
132 -- , nng.ngrams_type
133 -- , ns.ngrams_type_id
134 FROM ngrams ng
135 JOIN node_stories ns ON ng.id = ns.ngrams_id
136 JOIN node_node_ngrams nng ON ns.node_id = nng.node2_id
137 WHERE nng.node1_id = ?
138 AND nng.node2_id = ?
139 AND nng.ngrams_type = ?
140 AND nng.ngrams_id = ng.id
141 AND nng.ngrams_type = ns.ngrams_type_id
142 ORDER BY ng.id ASC;
143 |]
144
145
146
147
148
149
150
151
152 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
153 => CorpusId
154 -> Int
155 -> NgramsType
156 -> [NgramsTerm]
157 -> Cmd err [(NgramsTerm, Int)]
158 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
159 fmap (first NgramsTerm) <$>
160 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
161 ( int
162 , toDBid NodeDocument
163 , cId
164 , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
165 , cId
166 , ngramsTypeId nt
167 )
168 where
169 fields = [QualifiedIdentifier Nothing "text"]
170
171 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
172 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
173 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
174 JOIN nodes_contexts nn ON n.id = nn.context_id
175 WHERE n.typename = ?
176 AND nn.node_id = ?),
177 input_rows(terms) AS (?)
178 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
179 JOIN ngrams ng ON cng.ngrams_id = ng.id
180 JOIN input_rows ir ON ir.terms = ng.terms
181 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
182 JOIN nodes_sample n ON nn.context_id = n.id
183 WHERE nn.node_id = ? -- CorpusId
184 AND cng.ngrams_type = ? -- NgramsTypeId
185 AND nn.category > 0
186 GROUP BY cng.node_id, ng.terms
187 |]
188
189
190 ------------------------------------------------------------------------
191 getContextsByNgramsOnlyUser :: HasDBid NodeType
192 => CorpusId
193 -> [ListId]
194 -> NgramsType
195 -> [NgramsTerm]
196 -> Cmd err (HashMap NgramsTerm (Set NodeId))
197 getContextsByNgramsOnlyUser cId ls nt ngs =
198 HM.unionsWith (<>)
199 . map (HM.fromListWith (<>)
200 . map (second Set.singleton))
201 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
202 (splitEvery 1000 ngs)
203
204 getNgramsByContextOnlyUser :: HasDBid NodeType
205 => NodeId
206 -> [ListId]
207 -> NgramsType
208 -> [NgramsTerm]
209 -> Cmd err (Map NodeId (Set NgramsTerm))
210 getNgramsByContextOnlyUser cId ls nt ngs =
211 Map.unionsWith (<>)
212 . map ( Map.fromListWith (<>)
213 . map (second Set.singleton)
214 )
215 . map (map swap)
216 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
217 (splitEvery 1000 ngs)
218
219 ------------------------------------------------------------------------
220 selectNgramsOnlyByContextUser :: HasDBid NodeType
221 => CorpusId
222 -> [ListId]
223 -> NgramsType
224 -> [NgramsTerm]
225 -> Cmd err [(NgramsTerm, ContextId)]
226 selectNgramsOnlyByContextUser cId ls nt tms =
227 fmap (first NgramsTerm) <$>
228 runPGSQuery queryNgramsOnlyByContextUser
229 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
230 , Values [QualifiedIdentifier Nothing "int4"]
231 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
232 , cId
233 , toDBid NodeDocument
234 , ngramsTypeId nt
235 )
236 where
237 fields = [QualifiedIdentifier Nothing "text"]
238
239 queryNgramsOnlyByContextUser :: DPS.Query
240 queryNgramsOnlyByContextUser = [sql|
241 WITH input_rows(terms) AS (?),
242 input_list(id) AS (?)
243 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
244 JOIN ngrams ng ON cng.ngrams_id = ng.id
245 JOIN input_rows ir ON ir.terms = ng.terms
246 JOIN input_list il ON il.id = cng.node_id
247 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
248 JOIN contexts c ON nc.context_id = c.id
249 WHERE nc.node_id = ? -- CorpusId
250 AND c.typename = ? -- toDBid (maybe not useful with context table)
251 AND cng.ngrams_type = ? -- NgramsTypeId
252 AND nc.category > 0
253 GROUP BY ng.terms, cng.context_id
254 |]
255
256 getNgramsByDocOnlyUser :: DocId
257 -> [ListId]
258 -> NgramsType
259 -> [NgramsTerm]
260 -> Cmd err (HashMap NgramsTerm (Set NodeId))
261 getNgramsByDocOnlyUser cId ls nt ngs =
262 HM.unionsWith (<>)
263 . map (HM.fromListWith (<>) . map (second Set.singleton))
264 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
265
266
267 selectNgramsOnlyByDocUser :: DocId
268 -> [ListId]
269 -> NgramsType
270 -> [NgramsTerm]
271 -> Cmd err [(NgramsTerm, NodeId)]
272 selectNgramsOnlyByDocUser dId ls nt tms =
273 fmap (first NgramsTerm) <$>
274 runPGSQuery queryNgramsOnlyByDocUser
275 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
276 , Values [QualifiedIdentifier Nothing "int4"]
277 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
278 , dId
279 , ngramsTypeId nt
280 )
281 where
282 fields = [QualifiedIdentifier Nothing "text"]
283
284
285 queryNgramsOnlyByDocUser :: DPS.Query
286 queryNgramsOnlyByDocUser = [sql|
287 WITH input_rows(terms) AS (?),
288 input_list(id) AS (?)
289 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
290 JOIN ngrams ng ON cng.ngrams_id = ng.id
291 JOIN input_rows ir ON ir.terms = ng.terms
292 JOIN input_list il ON il.id = cng.context_id
293 WHERE cng.node_id = ? -- DocId
294 AND cng.ngrams_type = ? -- NgramsTypeId
295 GROUP BY ng.terms, cng.node_id
296 |]
297
298 ------------------------------------------------------------------------
299 -- | TODO filter by language, database, any social field
300 getContextsByNgramsMaster :: HasDBid NodeType
301 => UserCorpusId
302 -> MasterCorpusId
303 -> Cmd err (HashMap Text (Set NodeId))
304 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
305 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
306 -- . takeWhile (not . List.null)
307 -- . takeWhile (\l -> List.length l > 3)
308 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
309
310 selectNgramsByContextMaster :: HasDBid NodeType
311 => Int
312 -> UserCorpusId
313 -> MasterCorpusId
314 -> Int
315 -> Cmd err [(NodeId, Text)]
316 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
317 queryNgramsByContextMaster'
318 ( ucId
319 , ngramsTypeId NgramsTerms
320 , toDBid NodeDocument
321 , p
322 , toDBid NodeDocument
323 , p
324 , n
325 , mcId
326 , toDBid NodeDocument
327 , ngramsTypeId NgramsTerms
328 )
329
330 -- | TODO fix context_node_ngrams relation
331 queryNgramsByContextMaster' :: DPS.Query
332 queryNgramsByContextMaster' = [sql|
333 WITH contextsByNgramsUser AS (
334
335 SELECT n.id, ng.terms FROM contexts n
336 JOIN nodes_contexts nn ON n.id = nn.context_id
337 JOIN context_node_ngrams cng ON cng.context_id = n.id
338 JOIN ngrams ng ON cng.ngrams_id = ng.id
339 WHERE nn.node_id = ? -- UserCorpusId
340 -- AND n.typename = ? -- toDBid
341 AND cng.ngrams_type = ? -- NgramsTypeId
342 AND nn.category > 0
343 AND node_pos(n.id,?) >= ?
344 AND node_pos(n.id,?) < ?
345 GROUP BY n.id, ng.terms
346
347 ),
348
349 contextsByNgramsMaster AS (
350
351 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
352 JOIN context_node_ngrams cng ON n.id = cng.context_id
353 JOIN ngrams ng ON ng.id = cng.ngrams_id
354
355 WHERE n.parent_id = ? -- Master Corpus toDBid
356 AND n.typename = ? -- toDBid
357 AND cng.ngrams_type = ? -- NgramsTypeId
358 GROUP BY n.id, ng.terms
359 )
360
361 SELECT m.id, m.terms FROM nodesByNgramsMaster m
362 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id
363 |]