]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
[conduit] some work towards flow migration to conduit
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByContext.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByContext
17 where
18
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map (Map)
23 import Data.Maybe (catMaybes)
24 import Data.Set (Set)
25 import Data.Text (Text)
26 import Data.Tuple.Extra (first, second, swap)
27 import Database.PostgreSQL.Simple.SqlQQ (sql)
28 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
29 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
30 import Gargantext.Core
31 import Gargantext.Data.HashMap.Strict.Utils as HM
32 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
33 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
34 import Gargantext.Database.Query.Table.Ngrams (selectNgramsId)
35 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..), NgramsId)
36 import Gargantext.Prelude
37 import qualified Data.HashMap.Strict as HM
38 import qualified Data.Map as Map
39 import qualified Data.Set as Set
40 import qualified Database.PostgreSQL.Simple as DPS
41
42 -- | fst is size of Supra Corpus
43 -- snd is Texts and size of Occurrences (different docs)
44
45 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
46 -> HashMap NgramsTerm (Set ContextId)
47 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
48 countContextsByNgramsWith f m = (total, m')
49 where
50 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
51 m' = HM.map ( swap . second (fromIntegral . Set.size))
52 $ groupContextsByNgramsWith f m
53
54
55 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
56 -> HashMap NgramsTerm (Set NodeId)
57 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
58 groupContextsByNgramsWith f' m'' =
59 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
60 $ HM.toList m''
61
62 ------------------------------------------------------------------------
63 getContextsByNgramsUser :: HasDBid NodeType
64 => CorpusId
65 -> NgramsType
66 -> Cmd err (HashMap NgramsTerm (Set ContextId))
67 getContextsByNgramsUser cId nt =
68 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
69 <$> selectNgramsByContextUser cId nt
70 where
71
72 selectNgramsByContextUser :: HasDBid NodeType
73 => CorpusId
74 -> NgramsType
75 -> Cmd err [(NodeId, Text)]
76 selectNgramsByContextUser cId' nt' =
77 runPGSQuery queryNgramsByContextUser
78 ( cId'
79 , toDBid NodeDocument
80 , ngramsTypeId nt'
81 -- , 100 :: Int -- limit
82 -- , 0 :: Int -- offset
83 )
84
85 queryNgramsByContextUser :: DPS.Query
86 queryNgramsByContextUser = [sql|
87 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
88 JOIN ngrams ng ON cng.ngrams_id = ng.id
89 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
90 JOIN contexts c ON nc.context_id = c.id
91 WHERE nc.node_id = ? -- CorpusId
92 AND c.typename = ? -- toDBid
93 AND cng.ngrams_type = ? -- NgramsTypeId
94 AND nc.category > 0 -- is not in Trash
95 GROUP BY cng.context_id, ng.terms
96 |]
97
98
99 ------------------------------------------------------------------------
100 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
101 => CorpusId
102 -> Int
103 -> NgramsType
104 -> [NgramsTerm]
105 -> Cmd err (HashMap NgramsTerm Int)
106 getOccByNgramsOnlyFast_withSample cId int nt ngs =
107 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
108
109
110 getOccByNgramsOnlyFast' :: CorpusId
111 -> ListId
112 -> NgramsType
113 -> [NgramsTerm]
114 -> Cmd err (HashMap NgramsTerm Int)
115 getOccByNgramsOnlyFast' cId lId nt tms = do -- trace (show (cId, lId)) $
116 mapNgramsIds <- selectNgramsId $ map unNgramsTerm tms
117 HM.fromListWith (+) <$> catMaybes
118 <$> map (\(nId, s) -> (,) <$> (NgramsTerm <$> (Map.lookup nId mapNgramsIds)) <*> (Just $ round s) )
119 <$> run cId lId nt (Map.keys mapNgramsIds)
120 where
121
122 run :: CorpusId
123 -> ListId
124 -> NgramsType
125 -> [NgramsId]
126 -> Cmd err [(NgramsId, Double)]
127 run cId' lId' nt' tms' = runPGSQuery query
128 ( Values fields ((DPS.Only) <$> tms')
129 , cId'
130 , lId'
131 , ngramsTypeId nt'
132 )
133 fields = [QualifiedIdentifier Nothing "int4"]
134
135
136 query :: DPS.Query
137 query = [sql|
138 WITH input_ngrams(id) AS (?)
139
140 SELECT ngi.id, nng.weight FROM nodes_contexts nc
141 JOIN node_node_ngrams nng ON nng.node1_id = nc.node_id
142 JOIN input_ngrams ngi ON nng.ngrams_id = ngi.id
143 WHERE nng.node1_id = ?
144 AND nng.node2_id = ?
145 AND nng.ngrams_type = ?
146 AND nc.category > 0
147 GROUP BY ngi.id, nng.weight
148
149 |]
150
151
152
153
154 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
155 => CorpusId
156 -> Int
157 -> NgramsType
158 -> [NgramsTerm]
159 -> Cmd err [(NgramsTerm, Int)]
160 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
161 fmap (first NgramsTerm) <$>
162 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
163 ( int
164 , toDBid NodeDocument
165 , cId
166 , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
167 , cId
168 , ngramsTypeId nt
169 )
170 where
171 fields = [QualifiedIdentifier Nothing "text"]
172
173 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
174 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
175 WITH nodes_sample AS (SELECT id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
176 JOIN nodes_contexts nn ON n.id = nn.context_id
177 WHERE n.typename = ?
178 AND nn.node_id = ?),
179 input_rows(terms) AS (?)
180 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
181 JOIN ngrams ng ON cng.ngrams_id = ng.id
182 JOIN input_rows ir ON ir.terms = ng.terms
183 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
184 JOIN nodes_sample n ON nn.context_id = n.id
185 WHERE nn.node_id = ? -- CorpusId
186 AND cng.ngrams_type = ? -- NgramsTypeId
187 AND nn.category > 0
188 GROUP BY cng.node_id, ng.terms
189 |]
190
191
192 ------------------------------------------------------------------------
193 getContextsByNgramsOnlyUser :: HasDBid NodeType
194 => CorpusId
195 -> [ListId]
196 -> NgramsType
197 -> [NgramsTerm]
198 -> Cmd err (HashMap NgramsTerm (Set NodeId))
199 getContextsByNgramsOnlyUser cId ls nt ngs =
200 HM.unionsWith (<>)
201 . map (HM.fromListWith (<>)
202 . map (second Set.singleton))
203 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
204 (splitEvery 1000 ngs)
205
206 getNgramsByContextOnlyUser :: HasDBid NodeType
207 => NodeId
208 -> [ListId]
209 -> NgramsType
210 -> [NgramsTerm]
211 -> Cmd err (Map NodeId (Set NgramsTerm))
212 getNgramsByContextOnlyUser cId ls nt ngs =
213 Map.unionsWith (<>)
214 . map ( Map.fromListWith (<>)
215 . map (second Set.singleton)
216 )
217 . map (map swap)
218 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
219 (splitEvery 1000 ngs)
220
221 ------------------------------------------------------------------------
222 selectNgramsOnlyByContextUser :: HasDBid NodeType
223 => CorpusId
224 -> [ListId]
225 -> NgramsType
226 -> [NgramsTerm]
227 -> Cmd err [(NgramsTerm, ContextId)]
228 selectNgramsOnlyByContextUser cId ls nt tms =
229 fmap (first NgramsTerm) <$>
230 runPGSQuery queryNgramsOnlyByContextUser
231 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
232 , Values [QualifiedIdentifier Nothing "int4"]
233 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
234 , cId
235 , toDBid NodeDocument
236 , ngramsTypeId nt
237 )
238 where
239 fields = [QualifiedIdentifier Nothing "text"]
240
241 queryNgramsOnlyByContextUser :: DPS.Query
242 queryNgramsOnlyByContextUser = [sql|
243 WITH input_rows(terms) AS (?),
244 input_list(id) AS (?)
245 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
246 JOIN ngrams ng ON cng.ngrams_id = ng.id
247 JOIN input_rows ir ON ir.terms = ng.terms
248 JOIN input_list il ON il.id = cng.node_id
249 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
250 JOIN contexts c ON nc.context_id = c.id
251 WHERE nc.node_id = ? -- CorpusId
252 AND c.typename = ? -- toDBid (maybe not useful with context table)
253 AND cng.ngrams_type = ? -- NgramsTypeId
254 AND nc.category > 0
255 GROUP BY ng.terms, cng.context_id
256 |]
257
258 getNgramsByDocOnlyUser :: DocId
259 -> [ListId]
260 -> NgramsType
261 -> [NgramsTerm]
262 -> Cmd err (HashMap NgramsTerm (Set NodeId))
263 getNgramsByDocOnlyUser cId ls nt ngs =
264 HM.unionsWith (<>)
265 . map (HM.fromListWith (<>) . map (second Set.singleton))
266 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
267
268
269 selectNgramsOnlyByDocUser :: DocId
270 -> [ListId]
271 -> NgramsType
272 -> [NgramsTerm]
273 -> Cmd err [(NgramsTerm, NodeId)]
274 selectNgramsOnlyByDocUser dId ls nt tms =
275 fmap (first NgramsTerm) <$>
276 runPGSQuery queryNgramsOnlyByDocUser
277 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
278 , Values [QualifiedIdentifier Nothing "int4"]
279 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
280 , dId
281 , ngramsTypeId nt
282 )
283 where
284 fields = [QualifiedIdentifier Nothing "text"]
285
286
287 queryNgramsOnlyByDocUser :: DPS.Query
288 queryNgramsOnlyByDocUser = [sql|
289 WITH input_rows(terms) AS (?),
290 input_list(id) AS (?)
291 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
292 JOIN ngrams ng ON cng.ngrams_id = ng.id
293 JOIN input_rows ir ON ir.terms = ng.terms
294 JOIN input_list il ON il.id = cng.context_id
295 WHERE cng.node_id = ? -- DocId
296 AND cng.ngrams_type = ? -- NgramsTypeId
297 GROUP BY ng.terms, cng.node_id
298 |]
299
300 ------------------------------------------------------------------------
301 -- | TODO filter by language, database, any social field
302 getContextsByNgramsMaster :: HasDBid NodeType
303 => UserCorpusId
304 -> MasterCorpusId
305 -> Cmd err (HashMap Text (Set NodeId))
306 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
307 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
308 -- . takeWhile (not . List.null)
309 -- . takeWhile (\l -> List.length l > 3)
310 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
311
312 selectNgramsByContextMaster :: HasDBid NodeType
313 => Int
314 -> UserCorpusId
315 -> MasterCorpusId
316 -> Int
317 -> Cmd err [(NodeId, Text)]
318 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
319 queryNgramsByContextMaster'
320 ( ucId
321 , ngramsTypeId NgramsTerms
322 , toDBid NodeDocument
323 , p
324 , toDBid NodeDocument
325 , p
326 , n
327 , mcId
328 , toDBid NodeDocument
329 , ngramsTypeId NgramsTerms
330 )
331
332 -- | TODO fix context_node_ngrams relation
333 queryNgramsByContextMaster' :: DPS.Query
334 queryNgramsByContextMaster' = [sql|
335 WITH contextsByNgramsUser AS (
336
337 SELECT n.id, ng.terms FROM contexts n
338 JOIN nodes_contexts nn ON n.id = nn.context_id
339 JOIN context_node_ngrams cng ON cng.context_id = n.id
340 JOIN ngrams ng ON cng.ngrams_id = ng.id
341 WHERE nn.node_id = ? -- UserCorpusId
342 -- AND n.typename = ? -- toDBid
343 AND cng.ngrams_type = ? -- NgramsTypeId
344 AND nn.category > 0
345 AND node_pos(n.id,?) >= ?
346 AND node_pos(n.id,?) < ?
347 GROUP BY n.id, ng.terms
348
349 ),
350
351 contextsByNgramsMaster AS (
352
353 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
354 JOIN context_node_ngrams cng ON n.id = cng.context_id
355 JOIN ngrams ng ON ng.id = cng.ngrams_id
356
357 WHERE n.parent_id = ? -- Master Corpus toDBid
358 AND n.typename = ? -- toDBid
359 AND cng.ngrams_type = ? -- NgramsTypeId
360 GROUP BY n.id, ng.terms
361 )
362
363 SELECT m.id, m.terms FROM nodesByNgramsMaster m
364 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id
365 |]