]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByNode.hs
[TextFlow] grouping fun
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByNode.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByNode
17 where
18
19 import Data.Map.Strict (Map, fromListWith, elems, toList)
20 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.Set (Set)
22 import Data.Text (Text)
23 import Data.Tuple.Extra (second, swap)
24 import Database.PostgreSQL.Simple.SqlQQ (sql)
25 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
26 import Debug.Trace (trace)
27 import qualified Data.List as List
28 import qualified Data.Map.Strict as Map
29 import qualified Data.Set as Set
30 import qualified Data.Text as Text
31 import qualified Database.PostgreSQL.Simple as DPS
32
33 import Gargantext.Core (Lang(..))
34 import Gargantext.Database.Admin.Config (nodeTypeId)
35 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
36 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
37 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
38 import Gargantext.Prelude
39 import Gargantext.Core.Text.Terms.Mono.Stem (stem)
40
41 -- | TODO: group with 2 terms only can be
42 -- discussed. Main purpose of this is offering
43 -- a first grouping option to user and get some
44 -- enriched data to better learn and improve that algo
45 ngramsGroup :: Lang
46 -> Int
47 -> Int
48 -> Text
49 -> Text
50 ngramsGroup l _m _n = Text.intercalate " "
51 . map (stem l)
52 -- . take n
53 . List.sort
54 -- . (List.filter (\t -> Text.length t > m))
55 . Text.splitOn " "
56 . Text.replace "-" " "
57
58
59
60 -- | fst is size of Supra Corpus
61 -- snd is Texts and size of Occurrences (different docs)
62 countNodesByNgramsWith :: (Text -> Text)
63 -> Map Text (Set NodeId)
64 -> (Double, Map Text (Double, Set Text))
65 countNodesByNgramsWith f m = (total, m')
66 where
67 total = fromIntegral $ Set.size $ Set.unions $ elems m
68 m' = Map.map ( swap . second (fromIntegral . Set.size))
69 $ groupNodesByNgramsWith f m
70
71
72 groupNodesByNgramsWith :: (Text -> Text)
73 -> Map Text (Set NodeId)
74 -> Map Text (Set Text, Set NodeId)
75 groupNodesByNgramsWith f m =
76 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
77 $ toList m
78
79 ------------------------------------------------------------------------
80 getNodesByNgramsUser :: CorpusId
81 -> NgramsType
82 -> Cmd err (Map Text (Set NodeId))
83 getNodesByNgramsUser cId nt =
84 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
85 <$> selectNgramsByNodeUser cId nt
86 where
87
88 selectNgramsByNodeUser :: CorpusId
89 -> NgramsType
90 -> Cmd err [(NodeId, Text)]
91 selectNgramsByNodeUser cId' nt' =
92 runPGSQuery queryNgramsByNodeUser
93 ( cId'
94 , nodeTypeId NodeDocument
95 , ngramsTypeId nt'
96 -- , 100 :: Int -- limit
97 -- , 0 :: Int -- offset
98 )
99
100 queryNgramsByNodeUser :: DPS.Query
101 queryNgramsByNodeUser = [sql|
102 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
103 JOIN ngrams ng ON nng.ngrams_id = ng.id
104 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
105 JOIN nodes n ON nn.node2_id = n.id
106 WHERE nn.node1_id = ? -- CorpusId
107 AND n.typename = ? -- NodeTypeId
108 AND nng.ngrams_type = ? -- NgramsTypeId
109 AND nn.category > 0
110 GROUP BY nng.node2_id, ng.terms
111 ORDER BY (nng.node2_id, ng.terms) DESC
112 -- LIMIT ?
113 -- OFFSET ?
114 |]
115 ------------------------------------------------------------------------
116 -- TODO add groups
117 getOccByNgramsOnlyFast :: CorpusId
118 -> NgramsType
119 -> [Text]
120 -> Cmd err (Map Text Int)
121 getOccByNgramsOnlyFast cId nt ngs =
122 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
123
124
125 getOccByNgramsOnlyFast' :: CorpusId
126 -> ListId
127 -> NgramsType
128 -> [Text]
129 -> Cmd err (Map Text Int)
130 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
131 fromListWith (+) <$> map (second round) <$> run cId lId nt tms
132
133 where
134 fields = [QualifiedIdentifier Nothing "text"]
135
136 run :: CorpusId
137 -> ListId
138 -> NgramsType
139 -> [Text]
140 -> Cmd err [(Text, Double)]
141 run cId' lId' nt' tms' = runPGSQuery query
142 ( Values fields (DPS.Only <$> tms')
143 , cId'
144 , lId'
145 , ngramsTypeId nt'
146 )
147
148 query :: DPS.Query
149 query = [sql|
150 WITH input_rows(terms) AS (?)
151 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
152 JOIN ngrams ng ON nng.ngrams_id = ng.id
153 JOIN input_rows ir ON ir.terms = ng.terms
154 WHERE nng.node1_id = ? -- CorpusId
155 AND nng.node2_id = ? -- ListId
156 AND nng.ngrams_type = ? -- NgramsTypeId
157 -- AND nn.category > 0 -- TODO
158 GROUP BY ng.terms, nng.weight
159 |]
160
161
162 -- just slower than getOccByNgramsOnlyFast
163 getOccByNgramsOnlySlow :: NodeType
164 -> CorpusId
165 -> [ListId]
166 -> NgramsType
167 -> [Text]
168 -> Cmd err (Map Text Int)
169 getOccByNgramsOnlySlow t cId ls nt ngs =
170 Map.map Set.size <$> getScore' t cId ls nt ngs
171 where
172 getScore' NodeCorpus = getNodesByNgramsOnlyUser
173 getScore' NodeDocument = getNgramsByDocOnlyUser
174 getScore' _ = getNodesByNgramsOnlyUser
175
176 getOccByNgramsOnlySafe :: CorpusId
177 -> [ListId]
178 -> NgramsType
179 -> [Text]
180 -> Cmd err (Map Text Int)
181 getOccByNgramsOnlySafe cId ls nt ngs = do
182 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
183 fast <- getOccByNgramsOnlyFast cId nt ngs
184 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
185 when (fast /= slow) $
186 printDebug "getOccByNgramsOnlySafe: difference"
187 (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
188 pure slow
189
190
191 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
192 -> NgramsType
193 -> [Text]
194 -> Cmd err [(Text, Int)]
195 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
196 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
197 ( Values fields (DPS.Only <$> tms)
198 , cId
199 , nodeTypeId NodeDocument
200 , ngramsTypeId nt
201 )
202 where
203 fields = [QualifiedIdentifier Nothing "text"]
204
205 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
206 -- Question: with the grouping is the result exactly the same (since Set NodeId for
207 -- equivalent ngrams intersections are not empty)
208 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
209 queryNgramsOccurrencesOnlyByNodeUser = [sql|
210 WITH input_rows(terms) AS (?)
211 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
212 JOIN ngrams ng ON nng.ngrams_id = ng.id
213 JOIN input_rows ir ON ir.terms = ng.terms
214 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
215 JOIN nodes n ON nn.node2_id = n.id
216 WHERE nn.node1_id = ? -- CorpusId
217 AND n.typename = ? -- NodeTypeId
218 AND nng.ngrams_type = ? -- NgramsTypeId
219 AND nn.category > 0
220 GROUP BY nng.node2_id, ng.terms
221 |]
222
223 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
224 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
225 WITH input_rows(terms) AS (?)
226 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
227 JOIN ngrams ng ON nng.ngrams_id = ng.id
228 JOIN input_rows ir ON ir.terms = ng.terms
229 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
230 JOIN nodes n ON nn.node2_id = n.id
231 WHERE nn.node1_id = ? -- CorpusId
232 AND n.typename = ? -- NodeTypeId
233 AND nng.ngrams_type = ? -- NgramsTypeId
234 AND nn.category > 0
235 GROUP BY nng.node2_id, ng.terms
236 |]
237
238 ------------------------------------------------------------------------
239 getNodesByNgramsOnlyUser :: NodeId
240 -> [ListId]
241 -> NgramsType
242 -> [Text]
243 -> Cmd err (Map Text (Set NodeId))
244 getNodesByNgramsOnlyUser cId ls nt ngs =
245 Map.unionsWith (<>)
246 . map (fromListWith (<>)
247 . map (second Set.singleton))
248 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
249 (splitEvery 1000 ngs)
250
251
252 getNgramsByNodeOnlyUser :: NodeId
253 -> [ListId]
254 -> NgramsType
255 -> [Text]
256 -> Cmd err (Map NodeId (Set Text))
257 getNgramsByNodeOnlyUser cId ls nt ngs =
258 Map.unionsWith (<>)
259 . map (fromListWith (<>)
260 . map (second Set.singleton))
261 . map (map swap)
262 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
263 (splitEvery 1000 ngs)
264
265 ------------------------------------------------------------------------
266 selectNgramsOnlyByNodeUser :: CorpusId
267 -> [ListId]
268 -> NgramsType
269 -> [Text]
270 -> Cmd err [(Text, NodeId)]
271 selectNgramsOnlyByNodeUser cId ls nt tms =
272 runPGSQuery queryNgramsOnlyByNodeUser
273 ( Values fields (DPS.Only <$> tms)
274 , Values [QualifiedIdentifier Nothing "int4"]
275 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
276 , cId
277 , nodeTypeId NodeDocument
278 , ngramsTypeId nt
279 )
280 where
281 fields = [QualifiedIdentifier Nothing "text"]
282
283 queryNgramsOnlyByNodeUser :: DPS.Query
284 queryNgramsOnlyByNodeUser = [sql|
285 WITH input_rows(terms) AS (?),
286 input_list(id) AS (?)
287 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
288 JOIN ngrams ng ON nng.ngrams_id = ng.id
289 JOIN input_rows ir ON ir.terms = ng.terms
290 JOIN input_list il ON il.id = nng.node1_id
291 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
292 JOIN nodes n ON nn.node2_id = n.id
293 WHERE nn.node1_id = ? -- CorpusId
294 AND n.typename = ? -- NodeTypeId
295 AND nng.ngrams_type = ? -- NgramsTypeId
296 AND nn.category > 0
297 GROUP BY ng.terms, nng.node2_id
298 |]
299
300
301 selectNgramsOnlyByNodeUser' :: CorpusId
302 -> [ListId]
303 -> NgramsType
304 -> [Text]
305 -> Cmd err [(Text, Int)]
306 selectNgramsOnlyByNodeUser' cId ls nt tms =
307 runPGSQuery queryNgramsOnlyByNodeUser
308 ( Values fields (DPS.Only <$> tms)
309 , Values [QualifiedIdentifier Nothing "int4"]
310 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
311 , cId
312 , nodeTypeId NodeDocument
313 , ngramsTypeId nt
314 )
315 where
316 fields = [QualifiedIdentifier Nothing "text"]
317
318 queryNgramsOnlyByNodeUser' :: DPS.Query
319 queryNgramsOnlyByNodeUser' = [sql|
320 WITH input_rows(terms) AS (?),
321 input_list(id) AS (?)
322 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
323 JOIN ngrams ng ON nng.ngrams_id = ng.id
324 JOIN input_rows ir ON ir.terms = ng.terms
325 JOIN input_list il ON il.id = nng.node2_id
326 WHERE nng.node1_id = ? -- CorpusId
327 AND nng.ngrams_type = ? -- NgramsTypeId
328 -- AND nn.category > 0
329 GROUP BY ng.terms, nng.weight
330 |]
331
332
333 getNgramsByDocOnlyUser :: NodeId
334 -> [ListId]
335 -> NgramsType
336 -> [Text]
337 -> Cmd err (Map Text (Set NodeId))
338 getNgramsByDocOnlyUser cId ls nt ngs =
339 Map.unionsWith (<>)
340 . map (fromListWith (<>) . map (second Set.singleton))
341 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
342
343
344 selectNgramsOnlyByDocUser :: DocId
345 -> [ListId]
346 -> NgramsType
347 -> [Text]
348 -> Cmd err [(Text, NodeId)]
349 selectNgramsOnlyByDocUser dId ls nt tms =
350 runPGSQuery queryNgramsOnlyByDocUser
351 ( Values fields (DPS.Only <$> tms)
352 , Values [QualifiedIdentifier Nothing "int4"]
353 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
354 , dId
355 , ngramsTypeId nt
356 )
357 where
358 fields = [QualifiedIdentifier Nothing "text"]
359
360
361 queryNgramsOnlyByDocUser :: DPS.Query
362 queryNgramsOnlyByDocUser = [sql|
363 WITH input_rows(terms) AS (?),
364 input_list(id) AS (?)
365 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
366 JOIN ngrams ng ON nng.ngrams_id = ng.id
367 JOIN input_rows ir ON ir.terms = ng.terms
368 JOIN input_list il ON il.id = nng.node1_id
369 WHERE nng.node2_id = ? -- DocId
370 AND nng.ngrams_type = ? -- NgramsTypeId
371 GROUP BY ng.terms, nng.node2_id
372 |]
373
374 ------------------------------------------------------------------------
375 -- | TODO filter by language, database, any social field
376 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
377 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
378 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
379 -- . takeWhile (not . List.null)
380 -- . takeWhile (\l -> List.length l > 3)
381 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
382
383 selectNgramsByNodeMaster :: Int
384 -> UserCorpusId
385 -> MasterCorpusId
386 -> Int
387 -> Cmd err [(NodeId, Text)]
388 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
389 queryNgramsByNodeMaster'
390 ( ucId
391 , ngramsTypeId NgramsTerms
392 , nodeTypeId NodeDocument
393 , p
394 , nodeTypeId NodeDocument
395 , p
396 , n
397 , mcId
398 , nodeTypeId NodeDocument
399 , ngramsTypeId NgramsTerms
400 )
401
402 -- | TODO fix node_node_ngrams relation
403 queryNgramsByNodeMaster' :: DPS.Query
404 queryNgramsByNodeMaster' = [sql|
405 WITH nodesByNgramsUser AS (
406
407 SELECT n.id, ng.terms FROM nodes n
408 JOIN nodes_nodes nn ON n.id = nn.node2_id
409 JOIN node_node_ngrams nng ON nng.node2_id = n.id
410 JOIN ngrams ng ON nng.ngrams_id = ng.id
411 WHERE nn.node1_id = ? -- UserCorpusId
412 -- AND n.typename = ? -- NodeTypeId
413 AND nng.ngrams_type = ? -- NgramsTypeId
414 AND nn.category > 0
415 AND node_pos(n.id,?) >= ?
416 AND node_pos(n.id,?) < ?
417 GROUP BY n.id, ng.terms
418
419 ),
420
421 nodesByNgramsMaster AS (
422
423 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
424 JOIN node_node_ngrams nng ON n.id = nng.node2_id
425 JOIN ngrams ng ON ng.id = nng.ngrams_id
426
427 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
428 AND n.typename = ? -- NodeTypeId
429 AND nng.ngrams_type = ? -- NgramsTypeId
430 GROUP BY n.id, ng.terms
431 )
432
433 SELECT m.id, m.terms FROM nodesByNgramsMaster m
434 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id
435 |]