]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByNode.hs
[Merge]
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByNode.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByNode
17 where
18
19 import Data.Map.Strict (Map, fromListWith, elems, toList, fromList)
20 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.Set (Set)
22 import Data.Text (Text)
23 import Data.Tuple.Extra (second, swap)
24 import Database.PostgreSQL.Simple.SqlQQ (sql)
25 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
26 import Debug.Trace (trace)
27 import qualified Data.List as List
28 import qualified Data.Map.Strict as Map
29 import qualified Data.Set as Set
30 import qualified Data.Text as Text
31 import qualified Database.PostgreSQL.Simple as DPS
32
33 import Gargantext.Core (Lang(..))
34 import Gargantext.Database.Admin.Config (nodeTypeId)
35 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
36 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
37 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
38 import Gargantext.Prelude
39 import Gargantext.Core.Text.Metrics.TFICF
40 import Gargantext.Core.Text.Terms.Mono.Stem (stem)
41
42 -- | TODO: group with 2 terms only can be
43 -- discussed. Main purpose of this is offering
44 -- a first grouping option to user and get some
45 -- enriched data to better learn and improve that algo
46 ngramsGroup :: Lang
47 -> Int
48 -> Int
49 -> Text
50 -> Text
51 ngramsGroup l _m _n = Text.intercalate " "
52 . map (stem l)
53 -- . take n
54 . List.sort
55 -- . (List.filter (\t -> Text.length t > m))
56 . Text.splitOn " "
57 . Text.replace "-" " "
58
59
60 getTficf :: UserCorpusId
61 -> MasterCorpusId
62 -> NgramsType
63 -> (Text -> Text)
64 -> Cmd err (Map Text (Double, Set Text))
65 getTficf u m nt f = do
66 u' <- Map.filter (\s -> Set.size s > 1) <$> getNodesByNgramsUser u nt
67 m' <- Map.filter (\s -> Set.size s > 1) <$> getNodesByNgramsMaster u m
68
69 pure $ toTficfData (countNodesByNgramsWith f u')
70 (countNodesByNgramsWith f m')
71
72 {-
73 getTficfWith :: UserCorpusId
74 -> MasterCorpusId
75 -> [ListId]
76 -> NgramsType
77 -> Map Text (Maybe Text)
78 -> Cmd err (Map Text (Double, Set Text))
79 getTficfWith u m ls nt mtxt = do
80 u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
81 m' <- getNodesByNgramsMaster u m
82
83 let f x = case Map.lookup x mtxt of
84 Nothing -> x
85 Just x' -> maybe x identity x'
86
87 pure $ toTficfData (countNodesByNgramsWith f u') (countNodesByNgramsWith f m')
88 -}
89
90 type Context = (Double, Map Text (Double, Set Text))
91 type Supra = Context
92 type Infra = Context
93
94 toTficfData :: Infra
95 -> Supra
96 -> Map Text (Double, Set Text)
97 toTficfData (ti, mi) (ts, ms) =
98 fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
99 (TficfSupra (Count $ maybe 0 fst $ Map.lookup t ms)(Total ts))
100 , ns
101 )
102 )
103 | (t, (n,ns)) <- toList mi
104 ]
105
106
107 -- | fst is size of Supra Corpus
108 -- snd is Texts and size of Occurrences (different docs)
109 countNodesByNgramsWith :: (Text -> Text)
110 -> Map Text (Set NodeId)
111 -> (Double, Map Text (Double, Set Text))
112 countNodesByNgramsWith f m = (total, m')
113 where
114 total = fromIntegral $ Set.size $ Set.unions $ elems m
115 m' = Map.map ( swap . second (fromIntegral . Set.size))
116 $ groupNodesByNgramsWith f m
117
118
119 groupNodesByNgramsWith :: (Text -> Text)
120 -> Map Text (Set NodeId)
121 -> Map Text (Set Text, Set NodeId)
122 groupNodesByNgramsWith f m =
123 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
124 $ toList m
125
126 ------------------------------------------------------------------------
127 getNodesByNgramsUser :: CorpusId
128 -> NgramsType
129 -> Cmd err (Map Text (Set NodeId))
130 getNodesByNgramsUser cId nt =
131 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
132 <$> selectNgramsByNodeUser cId nt
133 where
134
135 selectNgramsByNodeUser :: CorpusId
136 -> NgramsType
137 -> Cmd err [(NodeId, Text)]
138 selectNgramsByNodeUser cId' nt' =
139 runPGSQuery queryNgramsByNodeUser
140 ( cId'
141 , nodeTypeId NodeDocument
142 , ngramsTypeId nt'
143 -- , 100 :: Int -- limit
144 -- , 0 :: Int -- offset
145 )
146
147 queryNgramsByNodeUser :: DPS.Query
148 queryNgramsByNodeUser = [sql|
149 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
150 JOIN ngrams ng ON nng.ngrams_id = ng.id
151 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
152 JOIN nodes n ON nn.node2_id = n.id
153 WHERE nn.node1_id = ? -- CorpusId
154 AND n.typename = ? -- NodeTypeId
155 AND nng.ngrams_type = ? -- NgramsTypeId
156 AND nn.category > 0
157 GROUP BY nng.node2_id, ng.terms
158 ORDER BY (nng.node2_id, ng.terms) DESC
159 -- LIMIT ?
160 -- OFFSET ?
161 |]
162 ------------------------------------------------------------------------
163 -- TODO add groups
164 getOccByNgramsOnlyFast :: CorpusId
165 -> NgramsType
166 -> [Text]
167 -> Cmd err (Map Text Int)
168 getOccByNgramsOnlyFast cId nt ngs =
169 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
170
171
172 getOccByNgramsOnlyFast' :: CorpusId
173 -> ListId
174 -> NgramsType
175 -> [Text]
176 -> Cmd err (Map Text Int)
177 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
178 fromListWith (+) <$> map (second round) <$> run cId lId nt tms
179
180 where
181 fields = [QualifiedIdentifier Nothing "text"]
182
183 run :: CorpusId
184 -> ListId
185 -> NgramsType
186 -> [Text]
187 -> Cmd err [(Text, Double)]
188 run cId' lId' nt' tms' = runPGSQuery query
189 ( Values fields (DPS.Only <$> tms')
190 , cId'
191 , lId'
192 , ngramsTypeId nt'
193 )
194
195 query :: DPS.Query
196 query = [sql|
197 WITH input_rows(terms) AS (?)
198 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
199 JOIN ngrams ng ON nng.ngrams_id = ng.id
200 JOIN input_rows ir ON ir.terms = ng.terms
201 WHERE nng.node1_id = ? -- CorpusId
202 AND nng.node2_id = ? -- ListId
203 AND nng.ngrams_type = ? -- NgramsTypeId
204 -- AND nn.category > 0 -- TODO
205 GROUP BY ng.terms, nng.weight
206 |]
207
208
209 -- just slower than getOccByNgramsOnlyFast
210 getOccByNgramsOnlySlow :: NodeType
211 -> CorpusId
212 -> [ListId]
213 -> NgramsType
214 -> [Text]
215 -> Cmd err (Map Text Int)
216 getOccByNgramsOnlySlow t cId ls nt ngs =
217 Map.map Set.size <$> getScore' t cId ls nt ngs
218 where
219 getScore' NodeCorpus = getNodesByNgramsOnlyUser
220 getScore' NodeDocument = getNgramsByDocOnlyUser
221 getScore' _ = getNodesByNgramsOnlyUser
222
223 getOccByNgramsOnlySafe :: CorpusId
224 -> [ListId]
225 -> NgramsType
226 -> [Text]
227 -> Cmd err (Map Text Int)
228 getOccByNgramsOnlySafe cId ls nt ngs = do
229 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
230 fast <- getOccByNgramsOnlyFast cId nt ngs
231 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
232 when (fast /= slow) $
233 printDebug "getOccByNgramsOnlySafe: difference"
234 (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
235 pure slow
236
237
238 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
239 -> NgramsType
240 -> [Text]
241 -> Cmd err [(Text, Int)]
242 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
243 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
244 ( Values fields (DPS.Only <$> tms)
245 , cId
246 , nodeTypeId NodeDocument
247 , ngramsTypeId nt
248 )
249 where
250 fields = [QualifiedIdentifier Nothing "text"]
251
252 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
253 -- Question: with the grouping is the result exactly the same (since Set NodeId for
254 -- equivalent ngrams intersections are not empty)
255 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
256 queryNgramsOccurrencesOnlyByNodeUser = [sql|
257 WITH input_rows(terms) AS (?)
258 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
259 JOIN ngrams ng ON nng.ngrams_id = ng.id
260 JOIN input_rows ir ON ir.terms = ng.terms
261 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
262 JOIN nodes n ON nn.node2_id = n.id
263 WHERE nn.node1_id = ? -- CorpusId
264 AND n.typename = ? -- NodeTypeId
265 AND nng.ngrams_type = ? -- NgramsTypeId
266 AND nn.category > 0
267 GROUP BY nng.node2_id, ng.terms
268 |]
269
270 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
271 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
272 WITH input_rows(terms) AS (?)
273 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
274 JOIN ngrams ng ON nng.ngrams_id = ng.id
275 JOIN input_rows ir ON ir.terms = ng.terms
276 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
277 JOIN nodes n ON nn.node2_id = n.id
278 WHERE nn.node1_id = ? -- CorpusId
279 AND n.typename = ? -- NodeTypeId
280 AND nng.ngrams_type = ? -- NgramsTypeId
281 AND nn.category > 0
282 GROUP BY nng.node2_id, ng.terms
283 |]
284
285 ------------------------------------------------------------------------
286 getNodesByNgramsOnlyUser :: NodeId
287 -> [ListId]
288 -> NgramsType
289 -> [Text]
290 -> Cmd err (Map Text (Set NodeId))
291 getNodesByNgramsOnlyUser cId ls nt ngs =
292 Map.unionsWith (<>)
293 . map (fromListWith (<>)
294 . map (second Set.singleton))
295 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
296 (splitEvery 1000 ngs)
297
298
299 getNgramsByNodeOnlyUser :: NodeId
300 -> [ListId]
301 -> NgramsType
302 -> [Text]
303 -> Cmd err (Map NodeId (Set Text))
304 getNgramsByNodeOnlyUser cId ls nt ngs =
305 Map.unionsWith (<>)
306 . map (fromListWith (<>)
307 . map (second Set.singleton))
308 . map (map swap)
309 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
310 (splitEvery 1000 ngs)
311
312 ------------------------------------------------------------------------
313 selectNgramsOnlyByNodeUser :: CorpusId
314 -> [ListId]
315 -> NgramsType
316 -> [Text]
317 -> Cmd err [(Text, NodeId)]
318 selectNgramsOnlyByNodeUser cId ls nt tms =
319 runPGSQuery queryNgramsOnlyByNodeUser
320 ( Values fields (DPS.Only <$> tms)
321 , Values [QualifiedIdentifier Nothing "int4"]
322 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
323 , cId
324 , nodeTypeId NodeDocument
325 , ngramsTypeId nt
326 )
327 where
328 fields = [QualifiedIdentifier Nothing "text"]
329
330 queryNgramsOnlyByNodeUser :: DPS.Query
331 queryNgramsOnlyByNodeUser = [sql|
332 WITH input_rows(terms) AS (?),
333 input_list(id) AS (?)
334 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
335 JOIN ngrams ng ON nng.ngrams_id = ng.id
336 JOIN input_rows ir ON ir.terms = ng.terms
337 JOIN input_list il ON il.id = nng.node1_id
338 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
339 JOIN nodes n ON nn.node2_id = n.id
340 WHERE nn.node1_id = ? -- CorpusId
341 AND n.typename = ? -- NodeTypeId
342 AND nng.ngrams_type = ? -- NgramsTypeId
343 AND nn.category > 0
344 GROUP BY ng.terms, nng.node2_id
345 |]
346
347
348 selectNgramsOnlyByNodeUser' :: CorpusId
349 -> [ListId]
350 -> NgramsType
351 -> [Text]
352 -> Cmd err [(Text, Int)]
353 selectNgramsOnlyByNodeUser' cId ls nt tms =
354 runPGSQuery queryNgramsOnlyByNodeUser
355 ( Values fields (DPS.Only <$> tms)
356 , Values [QualifiedIdentifier Nothing "int4"]
357 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
358 , cId
359 , nodeTypeId NodeDocument
360 , ngramsTypeId nt
361 )
362 where
363 fields = [QualifiedIdentifier Nothing "text"]
364
365 queryNgramsOnlyByNodeUser' :: DPS.Query
366 queryNgramsOnlyByNodeUser' = [sql|
367 WITH input_rows(terms) AS (?),
368 input_list(id) AS (?)
369 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
370 JOIN ngrams ng ON nng.ngrams_id = ng.id
371 JOIN input_rows ir ON ir.terms = ng.terms
372 JOIN input_list il ON il.id = nng.node2_id
373 WHERE nng.node1_id = ? -- CorpusId
374 AND nng.ngrams_type = ? -- NgramsTypeId
375 -- AND nn.category > 0
376 GROUP BY ng.terms, nng.weight
377 |]
378
379
380 getNgramsByDocOnlyUser :: NodeId
381 -> [ListId]
382 -> NgramsType
383 -> [Text]
384 -> Cmd err (Map Text (Set NodeId))
385 getNgramsByDocOnlyUser cId ls nt ngs =
386 Map.unionsWith (<>)
387 . map (fromListWith (<>) . map (second Set.singleton))
388 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
389
390
391 selectNgramsOnlyByDocUser :: DocId
392 -> [ListId]
393 -> NgramsType
394 -> [Text]
395 -> Cmd err [(Text, NodeId)]
396 selectNgramsOnlyByDocUser dId ls nt tms =
397 runPGSQuery queryNgramsOnlyByDocUser
398 ( Values fields (DPS.Only <$> tms)
399 , Values [QualifiedIdentifier Nothing "int4"]
400 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
401 , dId
402 , ngramsTypeId nt
403 )
404 where
405 fields = [QualifiedIdentifier Nothing "text"]
406
407
408 queryNgramsOnlyByDocUser :: DPS.Query
409 queryNgramsOnlyByDocUser = [sql|
410 WITH input_rows(terms) AS (?),
411 input_list(id) AS (?)
412 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
413 JOIN ngrams ng ON nng.ngrams_id = ng.id
414 JOIN input_rows ir ON ir.terms = ng.terms
415 JOIN input_list il ON il.id = nng.node1_id
416 WHERE nng.node2_id = ? -- DocId
417 AND nng.ngrams_type = ? -- NgramsTypeId
418 GROUP BY ng.terms, nng.node2_id
419 |]
420
421 ------------------------------------------------------------------------
422 -- | TODO filter by language, database, any social field
423 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
424 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
425 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
426 -- . takeWhile (not . List.null)
427 -- . takeWhile (\l -> List.length l > 3)
428 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
429
430 selectNgramsByNodeMaster :: Int
431 -> UserCorpusId
432 -> MasterCorpusId
433 -> Int
434 -> Cmd err [(NodeId, Text)]
435 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
436 queryNgramsByNodeMaster'
437 ( ucId
438 , ngramsTypeId NgramsTerms
439 , nodeTypeId NodeDocument
440 , p
441 , nodeTypeId NodeDocument
442 , p
443 , n
444 , mcId
445 , nodeTypeId NodeDocument
446 , ngramsTypeId NgramsTerms
447 )
448
449 -- | TODO fix node_node_ngrams relation
450 queryNgramsByNodeMaster' :: DPS.Query
451 queryNgramsByNodeMaster' = [sql|
452 WITH nodesByNgramsUser AS (
453
454 SELECT n.id, ng.terms FROM nodes n
455 JOIN nodes_nodes nn ON n.id = nn.node2_id
456 JOIN node_node_ngrams nng ON nng.node2_id = n.id
457 JOIN ngrams ng ON nng.ngrams_id = ng.id
458 WHERE nn.node1_id = ? -- UserCorpusId
459 -- AND n.typename = ? -- NodeTypeId
460 AND nng.ngrams_type = ? -- NgramsTypeId
461 AND nn.category > 0
462 AND node_pos(n.id,?) >= ?
463 AND node_pos(n.id,?) < ?
464 GROUP BY n.id, ng.terms
465
466 ),
467
468 nodesByNgramsMaster AS (
469
470 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
471 JOIN node_node_ngrams nng ON n.id = nng.node2_id
472 JOIN ngrams ng ON ng.id = nng.ngrams_id
473
474 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
475 AND n.typename = ? -- NodeTypeId
476 AND nng.ngrams_type = ? -- NgramsTypeId
477 GROUP BY n.id, ng.terms
478 )
479
480 SELECT m.id, m.terms FROM nodesByNgramsMaster m
481 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id
482 |]