]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
[FEAT] Iramuteq parser (WIP)
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByContext.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByContext
17 where
18
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map.Strict (Map)
23 import Data.Set (Set)
24 import Data.Text (Text)
25 import Data.Tuple.Extra (first, second, swap)
26 import Database.PostgreSQL.Simple.SqlQQ (sql)
27 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
28 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
29 import Gargantext.Core
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.List as List
37 import qualified Data.Map.Strict as Map
38 import qualified Data.Set as Set
39 import qualified Database.PostgreSQL.Simple as DPS
40 import qualified Database.PostgreSQL.Simple.Types as DPST
41
42 -- | fst is size of Supra Corpus
43 -- snd is Texts and size of Occurrences (different docs)
44
45 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
46 -> HashMap NgramsTerm (Set ContextId)
47 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
48 countContextsByNgramsWith f m = (total, m')
49 where
50 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
51 m' = HM.map ( swap . second (fromIntegral . Set.size))
52 $ groupContextsByNgramsWith f m
53
54
55 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
56 -> HashMap NgramsTerm (Set NodeId)
57 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
58 groupContextsByNgramsWith f' m'' =
59 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
60 $ HM.toList m''
61
62 ------------------------------------------------------------------------
63 getContextsByNgramsUser :: HasDBid NodeType
64 => CorpusId
65 -> NgramsType
66 -> Cmd err (HashMap NgramsTerm (Set ContextId))
67 getContextsByNgramsUser cId nt =
68 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
69 <$> selectNgramsByContextUser cId nt
70 where
71
72 selectNgramsByContextUser :: HasDBid NodeType
73 => CorpusId
74 -> NgramsType
75 -> Cmd err [(NodeId, Text)]
76 selectNgramsByContextUser cId' nt' =
77 runPGSQuery queryNgramsByContextUser
78 ( cId'
79 , toDBid NodeDocument
80 , ngramsTypeId nt'
81 -- , 100 :: Int -- limit
82 -- , 0 :: Int -- offset
83 )
84
85 queryNgramsByContextUser :: DPS.Query
86 queryNgramsByContextUser = [sql|
87 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
88 JOIN ngrams ng ON cng.ngrams_id = ng.id
89 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
90 JOIN contexts c ON nc.context_id = c.id
91 WHERE nc.node_id = ? -- CorpusId
92 AND c.typename = ? -- toDBid
93 AND cng.ngrams_type = ? -- NgramsTypeId
94 AND nc.category > 0 -- is not in Trash
95 GROUP BY cng.context_id, ng.terms
96 |]
97
98
99 ------------------------------------------------------------------------
100 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
101 => CorpusId
102 -> Int
103 -> NgramsType
104 -> [NgramsTerm]
105 -> Cmd err (HashMap NgramsTerm Int)
106 getOccByNgramsOnlyFast_withSample cId int nt ngs =
107 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
108
109
110 getOccByNgramsOnlyFast :: CorpusId
111 -> ListId
112 -> NgramsType
113 -> Cmd err (HashMap NgramsTerm [ContextId])
114 getOccByNgramsOnlyFast cId lId nt = do
115 --HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
116 HM.fromList <$> map (\(t, ns) -> (NgramsTerm t, NodeId <$> DPST.fromPGArray ns)) <$> run cId lId nt
117 where
118
119 run :: CorpusId
120 -> ListId
121 -> NgramsType
122 -> Cmd err [(Text, DPST.PGArray Int)]
123 run cId' lId' nt' = runPGSQuery query
124 ( cId'
125 , cId'
126 , lId'
127 , ngramsTypeId nt'
128 )
129
130 query :: DPS.Query
131 query = [sql|
132 SELECT ng.terms
133 , ARRAY(
134 SELECT DISTINCT context_node_ngrams.context_id
135 FROM context_node_ngrams
136 JOIN nodes_contexts
137 ON context_node_ngrams.context_id = nodes_contexts.context_id
138 WHERE ng.id = context_node_ngrams.ngrams_id
139 AND nodes_contexts.node_id = ?
140 ) AS context_ids
141 FROM ngrams ng
142 JOIN node_stories ns ON ng.id = ns.ngrams_id
143 JOIN node_node_ngrams nng ON ns.node_id = nng.node2_id
144 WHERE nng.node1_id = ?
145 AND nng.node2_id = ?
146 AND nng.ngrams_type = ?
147 AND nng.ngrams_id = ng.id
148 AND nng.ngrams_type = ns.ngrams_type_id
149 ORDER BY ng.id ASC;
150 |]
151
152
153 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
154 => CorpusId
155 -> Int
156 -> NgramsType
157 -> [NgramsTerm]
158 -> Cmd err [(NgramsTerm, Int)]
159 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
160 fmap (first NgramsTerm) <$>
161 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
162 ( int
163 , toDBid NodeDocument
164 , cId
165 , Values fields ((DPS.Only . unNgramsTerm) <$> (List.take 10000 tms))
166 , cId
167 , ngramsTypeId nt
168 )
169 where
170 fields = [QualifiedIdentifier Nothing "text"]
171
172 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
173 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
174 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
175 JOIN nodes_contexts nn ON n.id = nn.context_id
176 WHERE n.typename = ?
177 AND nn.node_id = ?),
178 input_rows(terms) AS (?)
179 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
180 JOIN ngrams ng ON cng.ngrams_id = ng.id
181 JOIN input_rows ir ON ir.terms = ng.terms
182 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
183 JOIN nodes_sample n ON nn.context_id = n.id
184 WHERE nn.node_id = ? -- CorpusId
185 AND cng.ngrams_type = ? -- NgramsTypeId
186 AND nn.category > 0
187 GROUP BY cng.node_id, ng.terms
188 |]
189
190 selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
191 => CorpusId
192 -> Int
193 -> NgramsType
194 -> Cmd err [(NgramsTerm, Int)]
195 selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
196 fmap (first NgramsTerm) <$>
197 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
198 ( int
199 , toDBid NodeDocument
200 , cId
201 , cId
202 , ngramsTypeId nt
203 )
204
205 queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
206 queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
207 WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
208 JOIN nodes_contexts nc ON c.id = nc.context_id
209 WHERE c.typename = ?
210 AND nc.node_id = ?)
211 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
212 JOIN ngrams ng ON cng.ngrams_id = ng.id
213 JOIN node_stories ns ON ns.ngrams_id = ng.id
214 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
215 JOIN contexts_sample c ON nc.context_id = c.id
216 WHERE nc.node_id = ? -- CorpusId
217 AND cng.ngrams_type = ? -- NgramsTypeId
218 AND nc.category > 0
219 GROUP BY ng.id
220 |]
221
222
223
224
225
226
227
228 ------------------------------------------------------------------------
229 getContextsByNgramsOnlyUser :: HasDBid NodeType
230 => CorpusId
231 -> [ListId]
232 -> NgramsType
233 -> [NgramsTerm]
234 -> Cmd err (HashMap NgramsTerm (Set NodeId))
235 getContextsByNgramsOnlyUser cId ls nt ngs =
236 HM.unionsWith (<>)
237 . map (HM.fromListWith (<>)
238 . map (second Set.singleton))
239 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
240 (splitEvery 1000 ngs)
241
242 getNgramsByContextOnlyUser :: HasDBid NodeType
243 => NodeId
244 -> [ListId]
245 -> NgramsType
246 -> [NgramsTerm]
247 -> Cmd err (Map NodeId (Set NgramsTerm))
248 getNgramsByContextOnlyUser cId ls nt ngs =
249 Map.unionsWith (<>)
250 . map ( Map.fromListWith (<>)
251 . map (second Set.singleton)
252 )
253 . map (map swap)
254 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
255 (splitEvery 1000 ngs)
256
257 ------------------------------------------------------------------------
258 selectNgramsOnlyByContextUser :: HasDBid NodeType
259 => CorpusId
260 -> [ListId]
261 -> NgramsType
262 -> [NgramsTerm]
263 -> Cmd err [(NgramsTerm, ContextId)]
264 selectNgramsOnlyByContextUser cId ls nt tms =
265 fmap (first NgramsTerm) <$>
266 runPGSQuery queryNgramsOnlyByContextUser
267 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
268 , Values [QualifiedIdentifier Nothing "int4"]
269 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
270 , cId
271 , toDBid NodeDocument
272 , ngramsTypeId nt
273 )
274 where
275 fields = [QualifiedIdentifier Nothing "text"]
276
277 queryNgramsOnlyByContextUser :: DPS.Query
278 queryNgramsOnlyByContextUser = [sql|
279 WITH input_rows(terms) AS (?),
280 input_list(id) AS (?)
281 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
282 JOIN ngrams ng ON cng.ngrams_id = ng.id
283 JOIN input_rows ir ON ir.terms = ng.terms
284 JOIN input_list il ON il.id = cng.node_id
285 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
286 JOIN contexts c ON nc.context_id = c.id
287 WHERE nc.node_id = ? -- CorpusId
288 AND c.typename = ? -- toDBid (maybe not useful with context table)
289 AND cng.ngrams_type = ? -- NgramsTypeId
290 AND nc.category > 0
291 GROUP BY ng.terms, cng.context_id
292 |]
293
294 getNgramsByDocOnlyUser :: DocId
295 -> [ListId]
296 -> NgramsType
297 -> [NgramsTerm]
298 -> Cmd err (HashMap NgramsTerm (Set NodeId))
299 getNgramsByDocOnlyUser cId ls nt ngs =
300 HM.unionsWith (<>)
301 . map (HM.fromListWith (<>) . map (second Set.singleton))
302 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
303
304
305 selectNgramsOnlyByDocUser :: DocId
306 -> [ListId]
307 -> NgramsType
308 -> [NgramsTerm]
309 -> Cmd err [(NgramsTerm, NodeId)]
310 selectNgramsOnlyByDocUser dId ls nt tms =
311 fmap (first NgramsTerm) <$>
312 runPGSQuery queryNgramsOnlyByDocUser
313 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
314 , Values [QualifiedIdentifier Nothing "int4"]
315 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
316 , dId
317 , ngramsTypeId nt
318 )
319 where
320 fields = [QualifiedIdentifier Nothing "text"]
321
322
323 queryNgramsOnlyByDocUser :: DPS.Query
324 queryNgramsOnlyByDocUser = [sql|
325 WITH input_rows(terms) AS (?),
326 input_list(id) AS (?)
327 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
328 JOIN ngrams ng ON cng.ngrams_id = ng.id
329 JOIN input_rows ir ON ir.terms = ng.terms
330 JOIN input_list il ON il.id = cng.context_id
331 WHERE cng.node_id = ? -- DocId
332 AND cng.ngrams_type = ? -- NgramsTypeId
333 GROUP BY ng.terms, cng.node_id
334 |]
335
336 ------------------------------------------------------------------------
337 -- | TODO filter by language, database, any social field
338 getContextsByNgramsMaster :: HasDBid NodeType
339 => UserCorpusId
340 -> MasterCorpusId
341 -> Cmd err (HashMap Text (Set NodeId))
342 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
343 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
344 -- . takeWhile (not . List.null)
345 -- . takeWhile (\l -> List.length l > 3)
346 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
347
348 selectNgramsByContextMaster :: HasDBid NodeType
349 => Int
350 -> UserCorpusId
351 -> MasterCorpusId
352 -> Int
353 -> Cmd err [(NodeId, Text)]
354 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
355 queryNgramsByContextMaster'
356 ( ucId
357 , ngramsTypeId NgramsTerms
358 , toDBid NodeDocument
359 , p
360 , toDBid NodeDocument
361 , p
362 , n
363 , mcId
364 , toDBid NodeDocument
365 , ngramsTypeId NgramsTerms
366 )
367
368 -- | TODO fix context_node_ngrams relation
369 queryNgramsByContextMaster' :: DPS.Query
370 queryNgramsByContextMaster' = [sql|
371 WITH contextsByNgramsUser AS (
372
373 SELECT n.id, ng.terms FROM contexts n
374 JOIN nodes_contexts nn ON n.id = nn.context_id
375 JOIN context_node_ngrams cng ON cng.context_id = n.id
376 JOIN ngrams ng ON cng.ngrams_id = ng.id
377 WHERE nn.node_id = ? -- UserCorpusId
378 -- AND n.typename = ? -- toDBid
379 AND cng.ngrams_type = ? -- NgramsTypeId
380 AND nn.category > 0
381 AND node_pos(n.id,?) >= ?
382 AND node_pos(n.id,?) < ?
383 GROUP BY n.id, ng.terms
384
385 ),
386
387 contextsByNgramsMaster AS (
388
389 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
390 JOIN context_node_ngrams cng ON n.id = cng.context_id
391 JOIN ngrams ng ON ng.id = cng.ngrams_id
392
393 WHERE n.parent_id = ? -- Master Corpus toDBid
394 AND n.typename = ? -- toDBid
395 AND cng.ngrams_type = ? -- NgramsTypeId
396 GROUP BY n.id, ng.terms
397 )
398
399 SELECT m.id, m.terms FROM nodesByNgramsMaster m
400 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id
401 |]