]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
Merge branch '145-graphExplorerSearch' into 145-dev-graph-explorer-search-tfidf
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByContext.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByContext
17 where
18
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map (Map)
23 import Data.Set (Set)
24 import Data.Text (Text)
25 import Data.Tuple.Extra (first, second, swap)
26 import Database.PostgreSQL.Simple.SqlQQ (sql)
27 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
28 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
29 import Gargantext.Core
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Database.PostgreSQL.Simple as DPS
39
40 -- | fst is size of Supra Corpus
41 -- snd is Texts and size of Occurrences (different docs)
42
43 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
44 -> HashMap NgramsTerm (Set ContextId)
45 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
46 countContextsByNgramsWith f m = (total, m')
47 where
48 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
49 m' = HM.map ( swap . second (fromIntegral . Set.size))
50 $ groupContextsByNgramsWith f m
51
52
53 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
54 -> HashMap NgramsTerm (Set NodeId)
55 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
56 groupContextsByNgramsWith f' m'' =
57 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
58 $ HM.toList m''
59
60 ------------------------------------------------------------------------
61 getContextsByNgramsUser :: HasDBid NodeType
62 => CorpusId
63 -> NgramsType
64 -> Cmd err (HashMap NgramsTerm (Set ContextId))
65 getContextsByNgramsUser cId nt =
66 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
67 <$> selectNgramsByContextUser cId nt
68 where
69
70 selectNgramsByContextUser :: HasDBid NodeType
71 => CorpusId
72 -> NgramsType
73 -> Cmd err [(NodeId, Text)]
74 selectNgramsByContextUser cId' nt' =
75 runPGSQuery queryNgramsByContextUser
76 ( cId'
77 , toDBid NodeDocument
78 , ngramsTypeId nt'
79 -- , 100 :: Int -- limit
80 -- , 0 :: Int -- offset
81 )
82
83 queryNgramsByContextUser :: DPS.Query
84 queryNgramsByContextUser = [sql|
85 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
86 JOIN ngrams ng ON cng.ngrams_id = ng.id
87 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
88 JOIN contexts c ON nc.context_id = c.id
89 WHERE nc.node_id = ? -- CorpusId
90 AND c.typename = ? -- toDBid
91 AND cng.ngrams_type = ? -- NgramsTypeId
92 AND nc.category > 0 -- is not in Trash
93 GROUP BY cng.context_id, ng.terms
94 |]
95
96
97 ------------------------------------------------------------------------
98 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
99 => CorpusId
100 -> Int
101 -> NgramsType
102 -> [NgramsTerm]
103 -> Cmd err (HashMap NgramsTerm Int)
104 getOccByNgramsOnlyFast_withSample cId int nt ngs =
105 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
106
107
108 getOccByNgramsOnlyFast :: CorpusId
109 -> ListId
110 -> NgramsType
111 -> Cmd err (HashMap NgramsTerm Int)
112 getOccByNgramsOnlyFast cId lId nt = do
113 HM.fromList <$> map (\(t,n) -> (NgramsTerm t, round n)) <$> run cId lId nt
114 where
115
116 run :: CorpusId
117 -> ListId
118 -> NgramsType
119 -> Cmd err [(Text, Double)]
120 run cId' lId' nt' = runPGSQuery query
121 ( cId'
122 , lId'
123 , ngramsTypeId nt'
124 )
125
126 query :: DPS.Query
127 query = [sql|
128 SELECT ng.terms
129 -- , ng.id
130 , round(nng.weight)
131 -- , ns.version
132 -- , nng.ngrams_type
133 -- , ns.ngrams_type_id
134 FROM ngrams ng
135 JOIN node_stories ns ON ng.id = ns.ngrams_id
136 JOIN node_node_ngrams nng ON ns.node_id = nng.node2_id
137 WHERE nng.node1_id = ?
138 AND nng.node2_id = ?
139 AND nng.ngrams_type = ?
140 AND nng.ngrams_id = ng.id
141 AND nng.ngrams_type = ns.ngrams_type_id
142 ORDER BY ng.id ASC;
143 |]
144
145
146 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
147 => CorpusId
148 -> Int
149 -> NgramsType
150 -> [NgramsTerm]
151 -> Cmd err [(NgramsTerm, Int)]
152 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
153 fmap (first NgramsTerm) <$>
154 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
155 ( int
156 , toDBid NodeDocument
157 , cId
158 , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
159 , cId
160 , ngramsTypeId nt
161 )
162 where
163 fields = [QualifiedIdentifier Nothing "text"]
164
165 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
166 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
167 WITH nodes_sample AS (SELECT n.id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
168 JOIN nodes_contexts nn ON n.id = nn.context_id
169 WHERE n.typename = ?
170 AND nn.node_id = ?),
171 input_rows(terms) AS (?)
172 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
173 JOIN ngrams ng ON cng.ngrams_id = ng.id
174 JOIN input_rows ir ON ir.terms = ng.terms
175 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
176 JOIN nodes_sample n ON nn.context_id = n.id
177 WHERE nn.node_id = ? -- CorpusId
178 AND cng.ngrams_type = ? -- NgramsTypeId
179 AND nn.category > 0
180 GROUP BY cng.node_id, ng.terms
181 |]
182
183 selectNgramsOccurrencesOnlyByContextUser_withSample' :: HasDBid NodeType
184 => CorpusId
185 -> Int
186 -> NgramsType
187 -> Cmd err [(NgramsTerm, Int)]
188 selectNgramsOccurrencesOnlyByContextUser_withSample' cId int nt =
189 fmap (first NgramsTerm) <$>
190 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
191 ( int
192 , toDBid NodeDocument
193 , cId
194 , cId
195 , ngramsTypeId nt
196 )
197
198 queryNgramsOccurrencesOnlyByContextUser_withSample' :: DPS.Query
199 queryNgramsOccurrencesOnlyByContextUser_withSample' = [sql|
200 WITH contexts_sample AS (SELECT c.id FROM contexts c TABLESAMPLE SYSTEM_ROWS (?)
201 JOIN nodes_contexts nc ON c.id = nc.context_id
202 WHERE c.typename = ?
203 AND nc.node_id = ?)
204 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
205 JOIN ngrams ng ON cng.ngrams_id = ng.id
206 JOIN node_stories ns ON ns.ngrams_id = ng.id
207 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
208 JOIN contexts_sample c ON nc.context_id = c.id
209 WHERE nc.node_id = ? -- CorpusId
210 AND cng.ngrams_type = ? -- NgramsTypeId
211 AND nc.category > 0
212 GROUP BY ng.id
213 |]
214
215
216
217
218
219
220
221 ------------------------------------------------------------------------
222 getContextsByNgramsOnlyUser :: HasDBid NodeType
223 => CorpusId
224 -> [ListId]
225 -> NgramsType
226 -> [NgramsTerm]
227 -> Cmd err (HashMap NgramsTerm (Set NodeId))
228 getContextsByNgramsOnlyUser cId ls nt ngs =
229 HM.unionsWith (<>)
230 . map (HM.fromListWith (<>)
231 . map (second Set.singleton))
232 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
233 (splitEvery 1000 ngs)
234
235 getNgramsByContextOnlyUser :: HasDBid NodeType
236 => NodeId
237 -> [ListId]
238 -> NgramsType
239 -> [NgramsTerm]
240 -> Cmd err (Map NodeId (Set NgramsTerm))
241 getNgramsByContextOnlyUser cId ls nt ngs =
242 Map.unionsWith (<>)
243 . map ( Map.fromListWith (<>)
244 . map (second Set.singleton)
245 )
246 . map (map swap)
247 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
248 (splitEvery 1000 ngs)
249
250 ------------------------------------------------------------------------
251 selectNgramsOnlyByContextUser :: HasDBid NodeType
252 => CorpusId
253 -> [ListId]
254 -> NgramsType
255 -> [NgramsTerm]
256 -> Cmd err [(NgramsTerm, ContextId)]
257 selectNgramsOnlyByContextUser cId ls nt tms =
258 fmap (first NgramsTerm) <$>
259 runPGSQuery queryNgramsOnlyByContextUser
260 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
261 , Values [QualifiedIdentifier Nothing "int4"]
262 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
263 , cId
264 , toDBid NodeDocument
265 , ngramsTypeId nt
266 )
267 where
268 fields = [QualifiedIdentifier Nothing "text"]
269
270 queryNgramsOnlyByContextUser :: DPS.Query
271 queryNgramsOnlyByContextUser = [sql|
272 WITH input_rows(terms) AS (?),
273 input_list(id) AS (?)
274 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
275 JOIN ngrams ng ON cng.ngrams_id = ng.id
276 JOIN input_rows ir ON ir.terms = ng.terms
277 JOIN input_list il ON il.id = cng.node_id
278 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
279 JOIN contexts c ON nc.context_id = c.id
280 WHERE nc.node_id = ? -- CorpusId
281 AND c.typename = ? -- toDBid (maybe not useful with context table)
282 AND cng.ngrams_type = ? -- NgramsTypeId
283 AND nc.category > 0
284 GROUP BY ng.terms, cng.context_id
285 |]
286
287 getNgramsByDocOnlyUser :: DocId
288 -> [ListId]
289 -> NgramsType
290 -> [NgramsTerm]
291 -> Cmd err (HashMap NgramsTerm (Set NodeId))
292 getNgramsByDocOnlyUser cId ls nt ngs =
293 HM.unionsWith (<>)
294 . map (HM.fromListWith (<>) . map (second Set.singleton))
295 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
296
297
298 selectNgramsOnlyByDocUser :: DocId
299 -> [ListId]
300 -> NgramsType
301 -> [NgramsTerm]
302 -> Cmd err [(NgramsTerm, NodeId)]
303 selectNgramsOnlyByDocUser dId ls nt tms =
304 fmap (first NgramsTerm) <$>
305 runPGSQuery queryNgramsOnlyByDocUser
306 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
307 , Values [QualifiedIdentifier Nothing "int4"]
308 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
309 , dId
310 , ngramsTypeId nt
311 )
312 where
313 fields = [QualifiedIdentifier Nothing "text"]
314
315
316 queryNgramsOnlyByDocUser :: DPS.Query
317 queryNgramsOnlyByDocUser = [sql|
318 WITH input_rows(terms) AS (?),
319 input_list(id) AS (?)
320 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
321 JOIN ngrams ng ON cng.ngrams_id = ng.id
322 JOIN input_rows ir ON ir.terms = ng.terms
323 JOIN input_list il ON il.id = cng.context_id
324 WHERE cng.node_id = ? -- DocId
325 AND cng.ngrams_type = ? -- NgramsTypeId
326 GROUP BY ng.terms, cng.node_id
327 |]
328
329 ------------------------------------------------------------------------
330 -- | TODO filter by language, database, any social field
331 getContextsByNgramsMaster :: HasDBid NodeType
332 => UserCorpusId
333 -> MasterCorpusId
334 -> Cmd err (HashMap Text (Set NodeId))
335 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
336 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
337 -- . takeWhile (not . List.null)
338 -- . takeWhile (\l -> List.length l > 3)
339 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
340
341 selectNgramsByContextMaster :: HasDBid NodeType
342 => Int
343 -> UserCorpusId
344 -> MasterCorpusId
345 -> Int
346 -> Cmd err [(NodeId, Text)]
347 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
348 queryNgramsByContextMaster'
349 ( ucId
350 , ngramsTypeId NgramsTerms
351 , toDBid NodeDocument
352 , p
353 , toDBid NodeDocument
354 , p
355 , n
356 , mcId
357 , toDBid NodeDocument
358 , ngramsTypeId NgramsTerms
359 )
360
361 -- | TODO fix context_node_ngrams relation
362 queryNgramsByContextMaster' :: DPS.Query
363 queryNgramsByContextMaster' = [sql|
364 WITH contextsByNgramsUser AS (
365
366 SELECT n.id, ng.terms FROM contexts n
367 JOIN nodes_contexts nn ON n.id = nn.context_id
368 JOIN context_node_ngrams cng ON cng.context_id = n.id
369 JOIN ngrams ng ON cng.ngrams_id = ng.id
370 WHERE nn.node_id = ? -- UserCorpusId
371 -- AND n.typename = ? -- toDBid
372 AND cng.ngrams_type = ? -- NgramsTypeId
373 AND nn.category > 0
374 AND node_pos(n.id,?) >= ?
375 AND node_pos(n.id,?) < ?
376 GROUP BY n.id, ng.terms
377
378 ),
379
380 contextsByNgramsMaster AS (
381
382 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
383 JOIN context_node_ngrams cng ON n.id = cng.context_id
384 JOIN ngrams ng ON ng.id = cng.ngrams_id
385
386 WHERE n.parent_id = ? -- Master Corpus toDBid
387 AND n.typename = ? -- toDBid
388 AND cng.ngrams_type = ? -- NgramsTypeId
389 GROUP BY n.id, ng.terms
390 )
391
392 SELECT m.id, m.terms FROM nodesByNgramsMaster m
393 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id
394 |]