]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByNode.hs
[DB/FACT] Gargantext.Database.Prelude
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByNode.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE FlexibleContexts #-}
15 {-# LANGUAGE QuasiQuotes #-}
16 {-# LANGUAGE NoImplicitPrelude #-}
17 {-# LANGUAGE OverloadedStrings #-}
18 {-# LANGUAGE RankNTypes #-}
19
20 module Gargantext.Database.Action.Metrics.NgramsByNode
21 where
22
23 import Data.Map.Strict (Map, fromListWith, elems, toList, fromList)
24 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
25 import Data.Set (Set)
26 import qualified Data.Ord as DO (Down(..))
27 import Data.Text (Text)
28 import Data.Tuple.Extra (second, swap)
29 import Database.PostgreSQL.Simple.SqlQQ (sql)
30 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
31 import Debug.Trace (trace)
32 import Gargantext.Core (Lang(..))
33 import Gargantext.Core.Types (Ordering(..))
34 import Gargantext.Database.Admin.Config (nodeTypeId)
35 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
36 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
37 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
38 import Gargantext.Prelude
39 import Gargantext.Text.Metrics.TFICF
40 import Gargantext.Text.Terms.Mono.Stem (stem)
41 import qualified Data.List as List
42 import qualified Data.Map.Strict as Map
43 import qualified Data.Set as Set
44 import qualified Data.Text as Text
45 import qualified Database.PostgreSQL.Simple as DPS
46
47 -- | TODO: group with 2 terms only can be
48 -- discussed. Main purpose of this is offering
49 -- a first grouping option to user and get some
50 -- enriched data to better learn and improve that algo
51 ngramsGroup :: Lang
52 -> Int
53 -> Int
54 -> Text
55 -> Text
56 ngramsGroup l _m _n = Text.intercalate " "
57 . map (stem l)
58 -- . take n
59 . List.sort
60 -- . (List.filter (\t -> Text.length t > m))
61 . Text.splitOn " "
62 . Text.replace "-" " "
63
64
65 sortTficf :: Ordering
66 -> (Map Text (Double, Set Text))
67 -> [ (Text,(Double, Set Text))]
68 sortTficf Down = List.sortOn (DO.Down . fst . snd) . toList
69 sortTficf Up = List.sortOn (fst . snd) . toList
70
71
72 getTficf :: UserCorpusId
73 -> MasterCorpusId
74 -> NgramsType
75 -> (Text -> Text)
76 -> Cmd err (Map Text (Double, Set Text))
77 getTficf u m nt f = do
78 u' <- Map.filter (\s -> Set.size s > 1) <$> getNodesByNgramsUser u nt
79 m' <- Map.filter (\s -> Set.size s > 1) <$> getNodesByNgramsMaster u m
80
81 pure $ toTficfData (countNodesByNgramsWith f u')
82 (countNodesByNgramsWith f m')
83
84 {-
85 getTficfWith :: UserCorpusId
86 -> MasterCorpusId
87 -> [ListId]
88 -> NgramsType
89 -> Map Text (Maybe Text)
90 -> Cmd err (Map Text (Double, Set Text))
91 getTficfWith u m ls nt mtxt = do
92 u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
93 m' <- getNodesByNgramsMaster u m
94
95 let f x = case Map.lookup x mtxt of
96 Nothing -> x
97 Just x' -> maybe x identity x'
98
99 pure $ toTficfData (countNodesByNgramsWith f u') (countNodesByNgramsWith f m')
100 -}
101
102 type Context = (Double, Map Text (Double, Set Text))
103 type Supra = Context
104 type Infra = Context
105
106 toTficfData :: Infra
107 -> Supra
108 -> Map Text (Double, Set Text)
109 toTficfData (ti, mi) (ts, ms) =
110 fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
111 (TficfSupra (Count $ maybe 0 fst $ Map.lookup t ms)(Total ts))
112 , ns
113 )
114 )
115 | (t, (n,ns)) <- toList mi
116 ]
117
118
119 -- | fst is size of Supra Corpus
120 -- snd is Texts and size of Occurrences (different docs)
121 countNodesByNgramsWith :: (Text -> Text)
122 -> Map Text (Set NodeId)
123 -> (Double, Map Text (Double, Set Text))
124 countNodesByNgramsWith f m = (total, m')
125 where
126 total = fromIntegral $ Set.size $ Set.unions $ elems m
127 m' = Map.map ( swap . second (fromIntegral . Set.size))
128 $ groupNodesByNgramsWith f m
129
130
131 groupNodesByNgramsWith :: (Text -> Text)
132 -> Map Text (Set NodeId)
133 -> Map Text (Set Text, Set NodeId)
134 groupNodesByNgramsWith f m =
135 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
136 $ toList m
137
138 ------------------------------------------------------------------------
139 getNodesByNgramsUser :: CorpusId
140 -> NgramsType
141 -> Cmd err (Map Text (Set NodeId))
142 getNodesByNgramsUser cId nt =
143 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
144 <$> selectNgramsByNodeUser cId nt
145 where
146
147 selectNgramsByNodeUser :: CorpusId
148 -> NgramsType
149 -> Cmd err [(NodeId, Text)]
150 selectNgramsByNodeUser cId' nt' =
151 runPGSQuery queryNgramsByNodeUser
152 ( cId'
153 , nodeTypeId NodeDocument
154 , ngramsTypeId nt'
155 -- , 100 :: Int -- limit
156 -- , 0 :: Int -- offset
157 )
158
159 queryNgramsByNodeUser :: DPS.Query
160 queryNgramsByNodeUser = [sql|
161 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
162 JOIN ngrams ng ON nng.ngrams_id = ng.id
163 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
164 JOIN nodes n ON nn.node2_id = n.id
165 WHERE nn.node1_id = ? -- CorpusId
166 AND n.typename = ? -- NodeTypeId
167 AND nng.ngrams_type = ? -- NgramsTypeId
168 AND nn.category > 0
169 GROUP BY nng.node2_id, ng.terms
170 ORDER BY (nng.node2_id, ng.terms) DESC
171 -- LIMIT ?
172 -- OFFSET ?
173 |]
174 ------------------------------------------------------------------------
175 -- TODO add groups
176 getOccByNgramsOnlyFast :: CorpusId
177 -> NgramsType
178 -> [Text]
179 -> Cmd err (Map Text Int)
180 getOccByNgramsOnlyFast cId nt ngs =
181 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
182
183
184 getOccByNgramsOnlyFast' :: CorpusId
185 -> ListId
186 -> NgramsType
187 -> [Text]
188 -> Cmd err (Map Text Int)
189 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
190 fromListWith (+) <$> map (second round) <$> run cId lId nt tms
191
192 where
193 fields = [QualifiedIdentifier Nothing "text"]
194
195 run :: CorpusId
196 -> ListId
197 -> NgramsType
198 -> [Text]
199 -> Cmd err [(Text, Double)]
200 run cId' lId' nt' tms' = runPGSQuery query
201 ( Values fields (DPS.Only <$> tms')
202 , cId'
203 , lId'
204 , ngramsTypeId nt'
205 )
206
207 query :: DPS.Query
208 query = [sql|
209 WITH input_rows(terms) AS (?)
210 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
211 JOIN ngrams ng ON nng.ngrams_id = ng.id
212 JOIN input_rows ir ON ir.terms = ng.terms
213 WHERE nng.node1_id = ? -- CorpusId
214 AND nng.node2_id = ? -- ListId
215 AND nng.ngrams_type = ? -- NgramsTypeId
216 -- AND nn.category > 0 -- TODO
217 GROUP BY ng.terms, nng.weight
218 |]
219
220
221 -- just slower than getOccByNgramsOnlyFast
222 getOccByNgramsOnlySlow :: NodeType
223 -> CorpusId
224 -> [ListId]
225 -> NgramsType
226 -> [Text]
227 -> Cmd err (Map Text Int)
228 getOccByNgramsOnlySlow t cId ls nt ngs =
229 Map.map Set.size <$> getScore' t cId ls nt ngs
230 where
231 getScore' NodeCorpus = getNodesByNgramsOnlyUser
232 getScore' NodeDocument = getNgramsByDocOnlyUser
233 getScore' _ = getNodesByNgramsOnlyUser
234
235 getOccByNgramsOnlySafe :: CorpusId
236 -> [ListId]
237 -> NgramsType
238 -> [Text]
239 -> Cmd err (Map Text Int)
240 getOccByNgramsOnlySafe cId ls nt ngs = do
241 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
242 fast <- getOccByNgramsOnlyFast cId nt ngs
243 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
244 when (fast /= slow) $
245 printDebug "getOccByNgramsOnlySafe: difference"
246 (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
247 pure slow
248
249
250 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
251 -> NgramsType
252 -> [Text]
253 -> Cmd err [(Text, Int)]
254 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
255 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
256 ( Values fields (DPS.Only <$> tms)
257 , cId
258 , nodeTypeId NodeDocument
259 , ngramsTypeId nt
260 )
261 where
262 fields = [QualifiedIdentifier Nothing "text"]
263
264 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
265 -- Question: with the grouping is the result exactly the same (since Set NodeId for
266 -- equivalent ngrams intersections are not empty)
267 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
268 queryNgramsOccurrencesOnlyByNodeUser = [sql|
269 WITH input_rows(terms) AS (?)
270 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
271 JOIN ngrams ng ON nng.ngrams_id = ng.id
272 JOIN input_rows ir ON ir.terms = ng.terms
273 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
274 JOIN nodes n ON nn.node2_id = n.id
275 WHERE nn.node1_id = ? -- CorpusId
276 AND n.typename = ? -- NodeTypeId
277 AND nng.ngrams_type = ? -- NgramsTypeId
278 AND nn.category > 0
279 GROUP BY nng.node2_id, ng.terms
280 |]
281
282 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
283 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
284 WITH input_rows(terms) AS (?)
285 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
286 JOIN ngrams ng ON nng.ngrams_id = ng.id
287 JOIN input_rows ir ON ir.terms = ng.terms
288 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
289 JOIN nodes n ON nn.node2_id = n.id
290 WHERE nn.node1_id = ? -- CorpusId
291 AND n.typename = ? -- NodeTypeId
292 AND nng.ngrams_type = ? -- NgramsTypeId
293 AND nn.category > 0
294 GROUP BY nng.node2_id, ng.terms
295 |]
296
297 ------------------------------------------------------------------------
298 getNodesByNgramsOnlyUser :: NodeId
299 -> [ListId]
300 -> NgramsType
301 -> [Text]
302 -> Cmd err (Map Text (Set NodeId))
303 getNodesByNgramsOnlyUser cId ls nt ngs =
304 Map.unionsWith (<>)
305 . map (fromListWith (<>)
306 . map (second Set.singleton))
307 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
308 (splitEvery 1000 ngs)
309
310
311 getNgramsByNodeOnlyUser :: NodeId
312 -> [ListId]
313 -> NgramsType
314 -> [Text]
315 -> Cmd err (Map NodeId (Set Text))
316 getNgramsByNodeOnlyUser cId ls nt ngs =
317 Map.unionsWith (<>)
318 . map (fromListWith (<>)
319 . map (second Set.singleton))
320 . map (map swap)
321 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
322 (splitEvery 1000 ngs)
323
324 ------------------------------------------------------------------------
325 selectNgramsOnlyByNodeUser :: CorpusId
326 -> [ListId]
327 -> NgramsType
328 -> [Text]
329 -> Cmd err [(Text, NodeId)]
330 selectNgramsOnlyByNodeUser cId ls nt tms =
331 runPGSQuery queryNgramsOnlyByNodeUser
332 ( Values fields (DPS.Only <$> tms)
333 , Values [QualifiedIdentifier Nothing "int4"]
334 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
335 , cId
336 , nodeTypeId NodeDocument
337 , ngramsTypeId nt
338 )
339 where
340 fields = [QualifiedIdentifier Nothing "text"]
341
342 queryNgramsOnlyByNodeUser :: DPS.Query
343 queryNgramsOnlyByNodeUser = [sql|
344 WITH input_rows(terms) AS (?),
345 input_list(id) AS (?)
346 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
347 JOIN ngrams ng ON nng.ngrams_id = ng.id
348 JOIN input_rows ir ON ir.terms = ng.terms
349 JOIN input_list il ON il.id = nng.node1_id
350 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
351 JOIN nodes n ON nn.node2_id = n.id
352 WHERE nn.node1_id = ? -- CorpusId
353 AND n.typename = ? -- NodeTypeId
354 AND nng.ngrams_type = ? -- NgramsTypeId
355 AND nn.category > 0
356 GROUP BY ng.terms, nng.node2_id
357 |]
358
359
360 selectNgramsOnlyByNodeUser' :: CorpusId
361 -> [ListId]
362 -> NgramsType
363 -> [Text]
364 -> Cmd err [(Text, Int)]
365 selectNgramsOnlyByNodeUser' cId ls nt tms =
366 runPGSQuery queryNgramsOnlyByNodeUser
367 ( Values fields (DPS.Only <$> tms)
368 , Values [QualifiedIdentifier Nothing "int4"]
369 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
370 , cId
371 , nodeTypeId NodeDocument
372 , ngramsTypeId nt
373 )
374 where
375 fields = [QualifiedIdentifier Nothing "text"]
376
377 queryNgramsOnlyByNodeUser' :: DPS.Query
378 queryNgramsOnlyByNodeUser' = [sql|
379 WITH input_rows(terms) AS (?),
380 input_list(id) AS (?)
381 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
382 JOIN ngrams ng ON nng.ngrams_id = ng.id
383 JOIN input_rows ir ON ir.terms = ng.terms
384 JOIN input_list il ON il.id = nng.node2_id
385 WHERE nng.node1_id = ? -- CorpusId
386 AND nng.ngrams_type = ? -- NgramsTypeId
387 -- AND nn.category > 0
388 GROUP BY ng.terms, nng.weight
389 |]
390
391
392 getNgramsByDocOnlyUser :: NodeId
393 -> [ListId]
394 -> NgramsType
395 -> [Text]
396 -> Cmd err (Map Text (Set NodeId))
397 getNgramsByDocOnlyUser cId ls nt ngs =
398 Map.unionsWith (<>)
399 . map (fromListWith (<>) . map (second Set.singleton))
400 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
401
402
403 selectNgramsOnlyByDocUser :: DocId
404 -> [ListId]
405 -> NgramsType
406 -> [Text]
407 -> Cmd err [(Text, NodeId)]
408 selectNgramsOnlyByDocUser dId ls nt tms =
409 runPGSQuery queryNgramsOnlyByDocUser
410 ( Values fields (DPS.Only <$> tms)
411 , Values [QualifiedIdentifier Nothing "int4"]
412 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
413 , dId
414 , ngramsTypeId nt
415 )
416 where
417 fields = [QualifiedIdentifier Nothing "text"]
418
419
420 queryNgramsOnlyByDocUser :: DPS.Query
421 queryNgramsOnlyByDocUser = [sql|
422 WITH input_rows(terms) AS (?),
423 input_list(id) AS (?)
424 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
425 JOIN ngrams ng ON nng.ngrams_id = ng.id
426 JOIN input_rows ir ON ir.terms = ng.terms
427 JOIN input_list il ON il.id = nng.node1_id
428 WHERE nng.node2_id = ? -- DocId
429 AND nng.ngrams_type = ? -- NgramsTypeId
430 GROUP BY ng.terms, nng.node2_id
431 |]
432
433 ------------------------------------------------------------------------
434 -- | TODO filter by language, database, any social field
435 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
436 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
437 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
438 -- . takeWhile (not . List.null)
439 -- . takeWhile (\l -> List.length l > 3)
440 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
441
442 selectNgramsByNodeMaster :: Int
443 -> UserCorpusId
444 -> MasterCorpusId
445 -> Int
446 -> Cmd err [(NodeId, Text)]
447 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
448 queryNgramsByNodeMaster'
449 ( ucId
450 , ngramsTypeId NgramsTerms
451 , nodeTypeId NodeDocument
452 , p
453 , nodeTypeId NodeDocument
454 , p
455 , n
456 , mcId
457 , nodeTypeId NodeDocument
458 , ngramsTypeId NgramsTerms
459 )
460
461 -- | TODO fix node_node_ngrams relation
462 queryNgramsByNodeMaster' :: DPS.Query
463 queryNgramsByNodeMaster' = [sql|
464 WITH nodesByNgramsUser AS (
465
466 SELECT n.id, ng.terms FROM nodes n
467 JOIN nodes_nodes nn ON n.id = nn.node2_id
468 JOIN node_node_ngrams nng ON nng.node2_id = n.id
469 JOIN ngrams ng ON nng.ngrams_id = ng.id
470 WHERE nn.node1_id = ? -- UserCorpusId
471 -- AND n.typename = ? -- NodeTypeId
472 AND nng.ngrams_type = ? -- NgramsTypeId
473 AND nn.category > 0
474 AND node_pos(n.id,?) >= ?
475 AND node_pos(n.id,?) < ?
476 GROUP BY n.id, ng.terms
477
478 ),
479
480 nodesByNgramsMaster AS (
481
482 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
483 JOIN node_node_ngrams nng ON n.id = nng.node2_id
484 JOIN ngrams ng ON ng.id = nng.ngrams_id
485
486 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
487 AND n.typename = ? -- NodeTypeId
488 AND nng.ngrams_type = ? -- NgramsTypeId
489 GROUP BY n.id, ng.terms
490 )
491
492 SELECT m.id, m.terms FROM nodesByNgramsMaster m
493 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id
494 |]