]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
Merge branch 'dev' into 104-dev-john-snow-nlp
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByContext.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByContext
17 where
18
19 -- import Debug.Trace (trace)
20 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.HashMap.Strict (HashMap)
22 import Data.Map (Map)
23 import Data.Maybe (catMaybes)
24 import Data.Set (Set)
25 import Data.Text (Text)
26 import Data.Tuple.Extra (first, second, swap)
27 import Database.PostgreSQL.Simple.SqlQQ (sql)
28 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
29 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
30 import Gargantext.Core
31 import Gargantext.Data.HashMap.Strict.Utils as HM
32 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
33 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
34 import Gargantext.Database.Query.Table.Ngrams (selectNgramsId)
35 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..), NgramsId)
36 import Gargantext.Prelude
37 import qualified Data.HashMap.Strict as HM
38 import qualified Data.Map as Map
39 import qualified Data.Set as Set
40 import qualified Database.PostgreSQL.Simple as DPS
41
42 -- | fst is size of Supra Corpus
43 -- snd is Texts and size of Occurrences (different docs)
44
45 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
46 -> HashMap NgramsTerm (Set ContextId)
47 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
48 countContextsByNgramsWith f m = (total, m')
49 where
50 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
51 m' = HM.map ( swap . second (fromIntegral . Set.size))
52 $ groupContextsByNgramsWith f m
53
54
55 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
56 -> HashMap NgramsTerm (Set NodeId)
57 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
58 groupContextsByNgramsWith f' m'' =
59 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
60 $ HM.toList m''
61
62 ------------------------------------------------------------------------
63
64 getContextsByNgramsUser :: HasDBid NodeType
65 => CorpusId
66 -> NgramsType
67 -> Cmd err (HashMap NgramsTerm (Set ContextId))
68 getContextsByNgramsUser cId nt =
69 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
70 <$> selectNgramsByContextUser cId nt
71 where
72
73 selectNgramsByContextUser :: HasDBid NodeType
74 => CorpusId
75 -> NgramsType
76 -> Cmd err [(NodeId, Text)]
77 selectNgramsByContextUser cId' nt' =
78 runPGSQuery queryNgramsByContextUser
79 ( cId'
80 , toDBid NodeDocument
81 , ngramsTypeId nt'
82 -- , 100 :: Int -- limit
83 -- , 0 :: Int -- offset
84 )
85
86 queryNgramsByContextUser :: DPS.Query
87 queryNgramsByContextUser = [sql|
88 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
89 JOIN ngrams ng ON cng.ngrams_id = ng.id
90 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
91 JOIN contexts c ON nc.context_id = c.id
92 WHERE nc.node_id = ? -- CorpusId
93 AND c.typename = ? -- toDBid
94 AND cng.ngrams_type = ? -- NgramsTypeId
95 AND nc.category > 0 -- is not in Trash
96 GROUP BY cng.context_id, ng.terms
97 |]
98
99
100 ------------------------------------------------------------------------
101 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
102 => CorpusId
103 -> Int
104 -> NgramsType
105 -> [NgramsTerm]
106 -> Cmd err (HashMap NgramsTerm Int)
107 getOccByNgramsOnlyFast_withSample cId int nt ngs =
108 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
109
110
111 getOccByNgramsOnlyFast' :: CorpusId
112 -> ListId
113 -> NgramsType
114 -> [NgramsTerm]
115 -> Cmd err (HashMap NgramsTerm Int)
116 getOccByNgramsOnlyFast' cId lId nt tms = do -- trace (show (cId, lId)) $
117 mapNgramsIds <- selectNgramsId $ map unNgramsTerm tms
118 HM.fromListWith (+) <$> catMaybes
119 <$> map (\(nId, s) -> (,) <$> (NgramsTerm <$> (Map.lookup nId mapNgramsIds)) <*> (Just $ round s) )
120 <$> run cId lId nt (Map.keys mapNgramsIds)
121 where
122
123 run :: CorpusId
124 -> ListId
125 -> NgramsType
126 -> [NgramsId]
127 -> Cmd err [(NgramsId, Double)]
128 run cId' lId' nt' tms' = runPGSQuery query
129 ( Values fields ((DPS.Only) <$> tms')
130 , cId'
131 , lId'
132 , ngramsTypeId nt'
133 )
134 fields = [QualifiedIdentifier Nothing "int4"]
135
136
137 query :: DPS.Query
138 query = [sql|
139 WITH input_ngrams(id) AS (?)
140
141 SELECT ngi.id, nng.weight FROM nodes_contexts nc
142 JOIN node_node_ngrams nng ON nng.node1_id = nc.node_id
143 JOIN input_ngrams ngi ON nng.ngrams_id = ngi.id
144 WHERE nng.node1_id = ?
145 AND nng.node2_id = ?
146 AND nng.ngrams_type = ?
147 AND nc.category > 0
148 GROUP BY ngi.id, nng.weight
149
150 |]
151
152
153
154
155 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
156 => CorpusId
157 -> Int
158 -> NgramsType
159 -> [NgramsTerm]
160 -> Cmd err [(NgramsTerm, Int)]
161 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
162 fmap (first NgramsTerm) <$>
163 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
164 ( int
165 , toDBid NodeDocument
166 , cId
167 , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
168 , cId
169 , ngramsTypeId nt
170 )
171 where
172 fields = [QualifiedIdentifier Nothing "text"]
173
174 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
175 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
176 WITH nodes_sample AS (SELECT id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
177 JOIN nodes_contexts nn ON n.id = nn.context_id
178 WHERE n.typename = ?
179 AND nn.node_id = ?),
180 input_rows(terms) AS (?)
181 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
182 JOIN ngrams ng ON cng.ngrams_id = ng.id
183 JOIN input_rows ir ON ir.terms = ng.terms
184 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
185 JOIN nodes_sample n ON nn.context_id = n.id
186 WHERE nn.node_id = ? -- CorpusId
187 AND cng.ngrams_type = ? -- NgramsTypeId
188 AND nn.category > 0
189 GROUP BY cng.node_id, ng.terms
190 |]
191
192
193 ------------------------------------------------------------------------
194
195 getContextsByNgramsOnlyUser :: HasDBid NodeType
196 => CorpusId
197 -> [ListId]
198 -> NgramsType
199 -> [NgramsTerm]
200 -> Cmd err (HashMap NgramsTerm (Set NodeId))
201 getContextsByNgramsOnlyUser cId ls nt ngs =
202 HM.unionsWith (<>)
203 . map (HM.fromListWith (<>)
204 . map (second Set.singleton))
205 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
206 (splitEvery 1000 ngs)
207
208 getNgramsByContextOnlyUser :: HasDBid NodeType
209 => NodeId
210 -> [ListId]
211 -> NgramsType
212 -> [NgramsTerm]
213 -> Cmd err (Map NodeId (Set NgramsTerm))
214 getNgramsByContextOnlyUser cId ls nt ngs =
215 Map.unionsWith (<>)
216 . map ( Map.fromListWith (<>)
217 . map (second Set.singleton)
218 )
219 . map (map swap)
220 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
221 (splitEvery 1000 ngs)
222
223 ------------------------------------------------------------------------
224 -- used in G.Core.Text.List
225 selectNgramsOnlyByContextUser :: HasDBid NodeType
226 => CorpusId
227 -> [ListId]
228 -> NgramsType
229 -> [NgramsTerm]
230 -> Cmd err [(NgramsTerm, ContextId)]
231 selectNgramsOnlyByContextUser cId ls nt tms =
232 fmap (first NgramsTerm) <$>
233 runPGSQuery queryNgramsOnlyByContextUser
234 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
235 , Values [QualifiedIdentifier Nothing "int4"]
236 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
237 , cId
238 , toDBid NodeDocument
239 , ngramsTypeId nt
240 )
241 where
242 fields = [QualifiedIdentifier Nothing "text"]
243
244 queryNgramsOnlyByContextUser :: DPS.Query
245 queryNgramsOnlyByContextUser = [sql|
246 WITH input_rows(terms) AS (?),
247 input_list(id) AS (?)
248 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
249 JOIN ngrams ng ON cng.ngrams_id = ng.id
250 JOIN input_rows ir ON ir.terms = ng.terms
251 JOIN input_list il ON il.id = cng.node_id
252 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
253 JOIN contexts c ON nc.context_id = c.id
254 WHERE nc.node_id = ? -- CorpusId
255 AND c.typename = ? -- toDBid (maybe not useful with context table)
256 AND cng.ngrams_type = ? -- NgramsTypeId
257 AND nc.category > 0
258 GROUP BY ng.terms, cng.context_id
259 |]
260
261 getNgramsByDocOnlyUser :: DocId
262 -> [ListId]
263 -> NgramsType
264 -> [NgramsTerm]
265 -> Cmd err (HashMap NgramsTerm (Set NodeId))
266 getNgramsByDocOnlyUser cId ls nt ngs =
267 HM.unionsWith (<>)
268 . map (HM.fromListWith (<>) . map (second Set.singleton))
269 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
270
271
272 selectNgramsOnlyByDocUser :: DocId
273 -> [ListId]
274 -> NgramsType
275 -> [NgramsTerm]
276 -> Cmd err [(NgramsTerm, NodeId)]
277 selectNgramsOnlyByDocUser dId ls nt tms =
278 fmap (first NgramsTerm) <$>
279 runPGSQuery queryNgramsOnlyByDocUser
280 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
281 , Values [QualifiedIdentifier Nothing "int4"]
282 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
283 , dId
284 , ngramsTypeId nt
285 )
286 where
287 fields = [QualifiedIdentifier Nothing "text"]
288
289
290 queryNgramsOnlyByDocUser :: DPS.Query
291 queryNgramsOnlyByDocUser = [sql|
292 WITH input_rows(terms) AS (?),
293 input_list(id) AS (?)
294 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
295 JOIN ngrams ng ON cng.ngrams_id = ng.id
296 JOIN input_rows ir ON ir.terms = ng.terms
297 JOIN input_list il ON il.id = cng.context_id
298 WHERE cng.node_id = ? -- DocId
299 AND cng.ngrams_type = ? -- NgramsTypeId
300 GROUP BY ng.terms, cng.node_id
301 |]
302
303 ------------------------------------------------------------------------
304 -- | TODO filter by language, database, any social field
305 getContextsByNgramsMaster :: HasDBid NodeType
306 => UserCorpusId
307 -> MasterCorpusId
308 -> Cmd err (HashMap Text (Set NodeId))
309 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
310 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
311 -- . takeWhile (not . List.null)
312 -- . takeWhile (\l -> List.length l > 3)
313 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
314
315 selectNgramsByContextMaster :: HasDBid NodeType
316 => Int
317 -> UserCorpusId
318 -> MasterCorpusId
319 -> Int
320 -> Cmd err [(NodeId, Text)]
321 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
322 queryNgramsByContextMaster'
323 ( ucId
324 , ngramsTypeId NgramsTerms
325 , toDBid NodeDocument
326 , p
327 , toDBid NodeDocument
328 , p
329 , n
330 , mcId
331 , toDBid NodeDocument
332 , ngramsTypeId NgramsTerms
333 )
334
335 -- | TODO fix context_node_ngrams relation
336 queryNgramsByContextMaster' :: DPS.Query
337 queryNgramsByContextMaster' = [sql|
338 WITH contextsByNgramsUser AS (
339
340 SELECT n.id, ng.terms FROM contexts n
341 JOIN nodes_contexts nn ON n.id = nn.context_id
342 JOIN context_node_ngrams cng ON cng.context_id = n.id
343 JOIN ngrams ng ON cng.ngrams_id = ng.id
344 WHERE nn.node_id = ? -- UserCorpusId
345 -- AND n.typename = ? -- toDBid
346 AND cng.ngrams_type = ? -- NgramsTypeId
347 AND nn.category > 0
348 AND node_pos(n.id,?) >= ?
349 AND node_pos(n.id,?) < ?
350 GROUP BY n.id, ng.terms
351
352 ),
353
354 contextsByNgramsMaster AS (
355
356 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
357 JOIN context_node_ngrams cng ON n.id = cng.context_id
358 JOIN ngrams ng ON ng.id = cng.ngrams_id
359
360 WHERE n.parent_id = ? -- Master Corpus toDBid
361 AND n.typename = ? -- toDBid
362 AND cng.ngrams_type = ? -- NgramsTypeId
363 GROUP BY n.id, ng.terms
364 )
365
366 SELECT m.id, m.terms FROM nodesByNgramsMaster m
367 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id
368 |]