]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByContext.hs
[john-snow] implement pos/lemma language
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByContext.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByContext
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByContext
17 where
18
19 --import Data.Map.Strict.Patch (PatchMap, Replace, diff)
20 import Data.HashMap.Strict (HashMap)
21 import Data.Map (Map)
22 import Data.Set (Set)
23 import Data.Text (Text)
24 import Data.Tuple.Extra (first, second, swap)
25 import Database.PostgreSQL.Simple.SqlQQ (sql)
26 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
27 -- import Debug.Trace (trace)
28 import Gargantext.Core
29 import Gargantext.API.Ngrams.Types (NgramsTerm(..))
30 import Gargantext.Data.HashMap.Strict.Utils as HM
31 import Gargantext.Database.Admin.Types.Node (ListId, CorpusId, NodeId(..), ContextId, MasterCorpusId, NodeType(NodeDocument), UserCorpusId, DocId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import qualified Data.HashMap.Strict as HM
36 import qualified Data.Map as Map
37 import qualified Data.Set as Set
38 import qualified Database.PostgreSQL.Simple as DPS
39
40 -- | fst is size of Supra Corpus
41 -- snd is Texts and size of Occurrences (different docs)
42
43 countContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
44 -> HashMap NgramsTerm (Set ContextId)
45 -> (Double, HashMap NgramsTerm (Double, Set NgramsTerm))
46 countContextsByNgramsWith f m = (total, m')
47 where
48 total = fromIntegral $ Set.size $ Set.unions $ HM.elems m
49 m' = HM.map ( swap . second (fromIntegral . Set.size))
50 $ groupContextsByNgramsWith f m
51
52
53 groupContextsByNgramsWith :: (NgramsTerm -> NgramsTerm)
54 -> HashMap NgramsTerm (Set NodeId)
55 -> HashMap NgramsTerm (Set NgramsTerm, Set ContextId)
56 groupContextsByNgramsWith f' m'' =
57 HM.fromListWith (<>) $ map (\(t,ns) -> (f' t, (Set.singleton t, ns)))
58 $ HM.toList m''
59
60 ------------------------------------------------------------------------
61
62 getContextsByNgramsUser :: HasDBid NodeType
63 => CorpusId
64 -> NgramsType
65 -> Cmd err (HashMap NgramsTerm (Set ContextId))
66 getContextsByNgramsUser cId nt =
67 HM.fromListWith (<>) <$> map (\(n,t) -> (NgramsTerm t, Set.singleton n))
68 <$> selectNgramsByContextUser cId nt
69 where
70
71 selectNgramsByContextUser :: HasDBid NodeType
72 => CorpusId
73 -> NgramsType
74 -> Cmd err [(NodeId, Text)]
75 selectNgramsByContextUser cId' nt' =
76 runPGSQuery queryNgramsByContextUser
77 ( cId'
78 , toDBid NodeDocument
79 , ngramsTypeId nt'
80 -- , 100 :: Int -- limit
81 -- , 0 :: Int -- offset
82 )
83
84 queryNgramsByContextUser :: DPS.Query
85 queryNgramsByContextUser = [sql|
86 SELECT cng.context_id, ng.terms FROM context_node_ngrams cng
87 JOIN ngrams ng ON cng.ngrams_id = ng.id
88 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
89 JOIN contexts c ON nc.context_id = c.id
90 WHERE nc.node_id = ? -- CorpusId
91 AND c.typename = ? -- toDBid
92 AND cng.ngrams_type = ? -- NgramsTypeId
93 AND nc.category > 0 -- is not in Trash
94 GROUP BY cng.context_id, ng.terms
95 |]
96
97
98 ------------------------------------------------------------------------
99 getOccByNgramsOnlyFast_withSample :: HasDBid NodeType
100 => CorpusId
101 -> Int
102 -> NgramsType
103 -> [NgramsTerm]
104 -> Cmd err (HashMap NgramsTerm Int)
105 getOccByNgramsOnlyFast_withSample cId int nt ngs =
106 HM.fromListWith (+) <$> selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt ngs
107
108
109 getOccByNgramsOnlyFast' :: CorpusId
110 -> ListId
111 -> NgramsType
112 -> [NgramsTerm]
113 -> Cmd err (HashMap NgramsTerm Int)
114 getOccByNgramsOnlyFast' cId lId nt tms = -- trace (show (cId, lId)) $
115 HM.fromListWith (+) <$> map (second round) <$> run cId lId nt tms
116
117 where
118 fields = [QualifiedIdentifier Nothing "text"]
119
120 run :: CorpusId
121 -> ListId
122 -> NgramsType
123 -> [NgramsTerm]
124 -> Cmd err [(NgramsTerm, Double)]
125 run cId' lId' nt' tms' = map (first NgramsTerm) <$> runPGSQuery query
126 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms')
127 , cId'
128 , lId'
129 , ngramsTypeId nt'
130 )
131
132 query :: DPS.Query
133 query = [sql|
134 WITH input_rows(terms) AS (?)
135 SELECT ng.terms, nng.weight FROM nodes_contexts nc
136 JOIN node_node_ngrams nng ON nng.node1_id = nc.node_id
137 JOIN ngrams ng ON nng.ngrams_id = ng.id
138 JOIN input_rows ir ON ir.terms = ng.terms
139 WHERE nng.node1_id = ? -- CorpusId
140 AND nng.node2_id = ? -- ListId
141 AND nng.ngrams_type = ? -- NgramsTypeId
142 AND nc.category > 0 -- Not trash
143 GROUP BY ng.terms, nng.weight
144 |]
145
146
147 selectNgramsOccurrencesOnlyByContextUser_withSample :: HasDBid NodeType
148 => CorpusId
149 -> Int
150 -> NgramsType
151 -> [NgramsTerm]
152 -> Cmd err [(NgramsTerm, Int)]
153 selectNgramsOccurrencesOnlyByContextUser_withSample cId int nt tms =
154 fmap (first NgramsTerm) <$>
155 runPGSQuery queryNgramsOccurrencesOnlyByContextUser_withSample
156 ( int
157 , toDBid NodeDocument
158 , cId
159 , Values fields ((DPS.Only . unNgramsTerm) <$> tms)
160 , cId
161 , ngramsTypeId nt
162 )
163 where
164 fields = [QualifiedIdentifier Nothing "text"]
165
166 queryNgramsOccurrencesOnlyByContextUser_withSample :: DPS.Query
167 queryNgramsOccurrencesOnlyByContextUser_withSample = [sql|
168 WITH nodes_sample AS (SELECT id FROM contexts n TABLESAMPLE SYSTEM_ROWS (?)
169 JOIN nodes_contexts nn ON n.id = nn.context_id
170 WHERE n.typename = ?
171 AND nn.node_id = ?),
172 input_rows(terms) AS (?)
173 SELECT ng.terms, COUNT(cng.context_id) FROM context_node_ngrams cng
174 JOIN ngrams ng ON cng.ngrams_id = ng.id
175 JOIN input_rows ir ON ir.terms = ng.terms
176 JOIN nodes_contexts nn ON nn.context_id = cng.context_id
177 JOIN nodes_sample n ON nn.context_id = n.id
178 WHERE nn.node_id = ? -- CorpusId
179 AND cng.ngrams_type = ? -- NgramsTypeId
180 AND nn.category > 0
181 GROUP BY cng.node_id, ng.terms
182 |]
183
184
185 ------------------------------------------------------------------------
186
187 getContextsByNgramsOnlyUser :: HasDBid NodeType
188 => CorpusId
189 -> [ListId]
190 -> NgramsType
191 -> [NgramsTerm]
192 -> Cmd err (HashMap NgramsTerm (Set NodeId))
193 getContextsByNgramsOnlyUser cId ls nt ngs =
194 HM.unionsWith (<>)
195 . map (HM.fromListWith (<>)
196 . map (second Set.singleton))
197 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
198 (splitEvery 1000 ngs)
199
200 getNgramsByContextOnlyUser :: HasDBid NodeType
201 => NodeId
202 -> [ListId]
203 -> NgramsType
204 -> [NgramsTerm]
205 -> Cmd err (Map NodeId (Set NgramsTerm))
206 getNgramsByContextOnlyUser cId ls nt ngs =
207 Map.unionsWith (<>)
208 . map ( Map.fromListWith (<>)
209 . map (second Set.singleton)
210 )
211 . map (map swap)
212 <$> mapM (selectNgramsOnlyByContextUser cId ls nt)
213 (splitEvery 1000 ngs)
214
215 ------------------------------------------------------------------------
216 -- used in G.Core.Text.List
217 selectNgramsOnlyByContextUser :: HasDBid NodeType
218 => CorpusId
219 -> [ListId]
220 -> NgramsType
221 -> [NgramsTerm]
222 -> Cmd err [(NgramsTerm, ContextId)]
223 selectNgramsOnlyByContextUser cId ls nt tms =
224 fmap (first NgramsTerm) <$>
225 runPGSQuery queryNgramsOnlyByContextUser
226 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
227 , Values [QualifiedIdentifier Nothing "int4"]
228 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
229 , cId
230 , toDBid NodeDocument
231 , ngramsTypeId nt
232 )
233 where
234 fields = [QualifiedIdentifier Nothing "text"]
235
236 queryNgramsOnlyByContextUser :: DPS.Query
237 queryNgramsOnlyByContextUser = [sql|
238 WITH input_rows(terms) AS (?),
239 input_list(id) AS (?)
240 SELECT ng.terms, cng.context_id FROM context_node_ngrams cng
241 JOIN ngrams ng ON cng.ngrams_id = ng.id
242 JOIN input_rows ir ON ir.terms = ng.terms
243 JOIN input_list il ON il.id = cng.node_id
244 JOIN nodes_contexts nc ON nc.context_id = cng.context_id
245 JOIN contexts c ON nc.context_id = c.id
246 WHERE nc.node_id = ? -- CorpusId
247 AND c.typename = ? -- toDBid (maybe not useful with context table)
248 AND cng.ngrams_type = ? -- NgramsTypeId
249 AND nc.category > 0
250 GROUP BY ng.terms, cng.context_id
251 |]
252
253 getNgramsByDocOnlyUser :: DocId
254 -> [ListId]
255 -> NgramsType
256 -> [NgramsTerm]
257 -> Cmd err (HashMap NgramsTerm (Set NodeId))
258 getNgramsByDocOnlyUser cId ls nt ngs =
259 HM.unionsWith (<>)
260 . map (HM.fromListWith (<>) . map (second Set.singleton))
261 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
262
263
264 selectNgramsOnlyByDocUser :: DocId
265 -> [ListId]
266 -> NgramsType
267 -> [NgramsTerm]
268 -> Cmd err [(NgramsTerm, NodeId)]
269 selectNgramsOnlyByDocUser dId ls nt tms =
270 fmap (first NgramsTerm) <$>
271 runPGSQuery queryNgramsOnlyByDocUser
272 ( Values fields ((DPS.Only . unNgramsTerm) <$> tms)
273 , Values [QualifiedIdentifier Nothing "int4"]
274 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
275 , dId
276 , ngramsTypeId nt
277 )
278 where
279 fields = [QualifiedIdentifier Nothing "text"]
280
281
282 queryNgramsOnlyByDocUser :: DPS.Query
283 queryNgramsOnlyByDocUser = [sql|
284 WITH input_rows(terms) AS (?),
285 input_list(id) AS (?)
286 SELECT ng.terms, cng.node_id FROM context_node_ngrams cng
287 JOIN ngrams ng ON cng.ngrams_id = ng.id
288 JOIN input_rows ir ON ir.terms = ng.terms
289 JOIN input_list il ON il.id = cng.context_id
290 WHERE cng.node_id = ? -- DocId
291 AND cng.ngrams_type = ? -- NgramsTypeId
292 GROUP BY ng.terms, cng.node_id
293 |]
294
295 ------------------------------------------------------------------------
296 -- | TODO filter by language, database, any social field
297 getContextsByNgramsMaster :: HasDBid NodeType
298 => UserCorpusId
299 -> MasterCorpusId
300 -> Cmd err (HashMap Text (Set NodeId))
301 getContextsByNgramsMaster ucId mcId = unionsWith (<>)
302 . map (HM.fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
303 -- . takeWhile (not . List.null)
304 -- . takeWhile (\l -> List.length l > 3)
305 <$> mapM (selectNgramsByContextMaster 1000 ucId mcId) [0,500..10000]
306
307 selectNgramsByContextMaster :: HasDBid NodeType
308 => Int
309 -> UserCorpusId
310 -> MasterCorpusId
311 -> Int
312 -> Cmd err [(NodeId, Text)]
313 selectNgramsByContextMaster n ucId mcId p = runPGSQuery
314 queryNgramsByContextMaster'
315 ( ucId
316 , ngramsTypeId NgramsTerms
317 , toDBid NodeDocument
318 , p
319 , toDBid NodeDocument
320 , p
321 , n
322 , mcId
323 , toDBid NodeDocument
324 , ngramsTypeId NgramsTerms
325 )
326
327 -- | TODO fix context_node_ngrams relation
328 queryNgramsByContextMaster' :: DPS.Query
329 queryNgramsByContextMaster' = [sql|
330 WITH contextsByNgramsUser AS (
331
332 SELECT n.id, ng.terms FROM contexts n
333 JOIN nodes_contexts nn ON n.id = nn.context_id
334 JOIN context_node_ngrams cng ON cng.context_id = n.id
335 JOIN ngrams ng ON cng.ngrams_id = ng.id
336 WHERE nn.node_id = ? -- UserCorpusId
337 -- AND n.typename = ? -- toDBid
338 AND cng.ngrams_type = ? -- NgramsTypeId
339 AND nn.category > 0
340 AND node_pos(n.id,?) >= ?
341 AND node_pos(n.id,?) < ?
342 GROUP BY n.id, ng.terms
343
344 ),
345
346 contextsByNgramsMaster AS (
347
348 SELECT n.id, ng.terms FROM contexts n TABLESAMPLE SYSTEM_ROWS(?)
349 JOIN context_node_ngrams cng ON n.id = cng.context_id
350 JOIN ngrams ng ON ng.id = cng.ngrams_id
351
352 WHERE n.parent_id = ? -- Master Corpus toDBid
353 AND n.typename = ? -- toDBid
354 AND cng.ngrams_type = ? -- NgramsTypeId
355 GROUP BY n.id, ng.terms
356 )
357
358 SELECT m.id, m.terms FROM nodesByNgramsMaster m
359 RIGHT JOIN contextsByNgramsUser u ON u.id = m.id
360 |]