]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByNode.hs
[CLEAN] metrics
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByNode.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE FlexibleContexts #-}
15 {-# LANGUAGE QuasiQuotes #-}
16 {-# LANGUAGE NoImplicitPrelude #-}
17 {-# LANGUAGE OverloadedStrings #-}
18 {-# LANGUAGE RankNTypes #-}
19
20 module Gargantext.Database.Action.Metrics.NgramsByNode
21 where
22
23 import Data.Map.Strict (Map, fromListWith, elems, toList, fromList)
24 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
25 import Data.Set (Set)
26 import Data.Text (Text)
27 import Data.Tuple.Extra (second, swap)
28 import Database.PostgreSQL.Simple.SqlQQ (sql)
29 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
30 import Debug.Trace (trace)
31 import Gargantext.Core (Lang(..))
32 import Gargantext.Database.Admin.Config (nodeTypeId)
33 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
34 import Gargantext.Database.Admin.Utils (Cmd, runPGSQuery)
35 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
36 import Gargantext.Prelude
37 import Gargantext.Text.Metrics.TFICF
38 import Gargantext.Text.Terms.Mono.Stem (stem)
39 import qualified Data.List as List
40 import qualified Data.Map.Strict as Map
41 import qualified Data.Set as Set
42 import qualified Data.Text as Text
43 import qualified Database.PostgreSQL.Simple as DPS
44
45 -- | TODO: group with 2 terms only can be
46 -- discussed. Main purpose of this is offering
47 -- a first grouping option to user and get some
48 -- enriched data to better learn and improve that algo
49 ngramsGroup :: Lang
50 -> Int
51 -> Int
52 -> Text
53 -> Text
54 ngramsGroup l _m _n = Text.intercalate " "
55 . map (stem l)
56 -- . take n
57 . List.sort
58 -- . (List.filter (\t -> Text.length t > m))
59 . Text.splitOn " "
60 . Text.replace "-" " "
61
62
63 sortTficf :: (Map Text (Double, Set Text))
64 -> [ (Text,(Double, Set Text))]
65 sortTficf = List.sortOn (fst . snd) . toList
66
67
68 getTficf :: UserCorpusId
69 -> MasterCorpusId
70 -> NgramsType
71 -> (Text -> Text)
72 -> Cmd err (Map Text (Double, Set Text))
73 getTficf u m nt f = do
74 u' <- getNodesByNgramsUser u nt
75 m' <- getNodesByNgramsMaster u m
76
77 pure $ toTficfData (countNodesByNgramsWith f u')
78 (countNodesByNgramsWith f m')
79
80 {-
81 getTficfWith :: UserCorpusId
82 -> MasterCorpusId
83 -> [ListId]
84 -> NgramsType
85 -> Map Text (Maybe Text)
86 -> Cmd err (Map Text (Double, Set Text))
87 getTficfWith u m ls nt mtxt = do
88 u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
89 m' <- getNodesByNgramsMaster u m
90
91 let f x = case Map.lookup x mtxt of
92 Nothing -> x
93 Just x' -> maybe x identity x'
94
95 pure $ toTficfData (countNodesByNgramsWith f u')
96 (countNodesByNgramsWith f m')
97 -}
98
99 type Context = (Double, Map Text (Double, Set Text))
100 type Supra = Context
101 type Infra = Context
102
103 toTficfData :: Infra
104 -> Supra
105 -> Map Text (Double, Set Text)
106 toTficfData (ti, mi) (ts, ms) =
107 fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
108 (TficfSupra (Count $ maybe 0 fst $ Map.lookup t ms)(Total ts))
109 , ns
110 )
111 )
112 | (t, (n,ns)) <- toList mi
113 ]
114
115
116 -- | fst is size of Supra Corpus
117 -- snd is Texts and size of Occurrences (different docs)
118 countNodesByNgramsWith :: (Text -> Text)
119 -> Map Text (Set NodeId)
120 -> (Double, Map Text (Double, Set Text))
121 countNodesByNgramsWith f m = (total, m')
122 where
123 total = fromIntegral $ Set.size $ Set.unions $ elems m
124 m' = Map.map ( swap . second (fromIntegral . Set.size))
125 $ groupNodesByNgramsWith f m
126
127
128 groupNodesByNgramsWith :: (Text -> Text)
129 -> Map Text (Set NodeId)
130 -> Map Text (Set Text, Set NodeId)
131 groupNodesByNgramsWith f m =
132 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
133 $ toList m
134
135 ------------------------------------------------------------------------
136 getNodesByNgramsUser :: CorpusId
137 -> NgramsType
138 -> Cmd err (Map Text (Set NodeId))
139 getNodesByNgramsUser cId nt =
140 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
141 <$> selectNgramsByNodeUser cId nt
142 where
143
144 selectNgramsByNodeUser :: CorpusId
145 -> NgramsType
146 -> Cmd err [(NodeId, Text)]
147 selectNgramsByNodeUser cId' nt' =
148 runPGSQuery queryNgramsByNodeUser
149 ( cId'
150 , nodeTypeId NodeDocument
151 , ngramsTypeId nt'
152 -- , 100 :: Int -- limit
153 -- , 0 :: Int -- offset
154 )
155
156 queryNgramsByNodeUser :: DPS.Query
157 queryNgramsByNodeUser = [sql|
158 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
159 JOIN ngrams ng ON nng.ngrams_id = ng.id
160 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
161 JOIN nodes n ON nn.node2_id = n.id
162 WHERE nn.node1_id = ? -- CorpusId
163 AND n.typename = ? -- NodeTypeId
164 AND nng.ngrams_type = ? -- NgramsTypeId
165 AND nn.category > 0
166 GROUP BY nng.node2_id, ng.terms
167 ORDER BY (nng.node2_id, ng.terms) DESC
168 -- LIMIT ?
169 -- OFFSET ?
170 |]
171 ------------------------------------------------------------------------
172 -- TODO add groups
173 getOccByNgramsOnlyFast :: CorpusId
174 -> NgramsType
175 -> [Text]
176 -> Cmd err (Map Text Int)
177 getOccByNgramsOnlyFast cId nt ngs =
178 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
179
180
181 getOccByNgramsOnlyFast' :: CorpusId
182 -> ListId
183 -> NgramsType
184 -> [Text]
185 -> Cmd err (Map Text Int)
186 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
187 fromListWith (+) <$> map (second round) <$> run cId lId nt tms
188
189 where
190 fields = [QualifiedIdentifier Nothing "text"]
191
192 run :: CorpusId
193 -> ListId
194 -> NgramsType
195 -> [Text]
196 -> Cmd err [(Text, Double)]
197 run cId' lId' nt' tms' = runPGSQuery query
198 ( Values fields (DPS.Only <$> tms')
199 , cId'
200 , lId'
201 , ngramsTypeId nt'
202 )
203
204 query :: DPS.Query
205 query = [sql|
206 WITH input_rows(terms) AS (?)
207 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
208 JOIN ngrams ng ON nng.ngrams_id = ng.id
209 JOIN input_rows ir ON ir.terms = ng.terms
210 WHERE nng.node1_id = ? -- CorpusId
211 AND nng.node2_id = ? -- ListId
212 AND nng.ngrams_type = ? -- NgramsTypeId
213 -- AND nn.category > 0 -- TODO
214 GROUP BY ng.terms, nng.weight
215 |]
216
217
218 -- just slower than getOccByNgramsOnlyFast
219 getOccByNgramsOnlySlow :: NodeType
220 -> CorpusId
221 -> [ListId]
222 -> NgramsType
223 -> [Text]
224 -> Cmd err (Map Text Int)
225 getOccByNgramsOnlySlow t cId ls nt ngs =
226 Map.map Set.size <$> getScore' t cId ls nt ngs
227 where
228 getScore' NodeCorpus = getNodesByNgramsOnlyUser
229 getScore' NodeDocument = getNgramsByDocOnlyUser
230 getScore' _ = getNodesByNgramsOnlyUser
231
232 getOccByNgramsOnlySafe :: CorpusId
233 -> [ListId]
234 -> NgramsType
235 -> [Text]
236 -> Cmd err (Map Text Int)
237 getOccByNgramsOnlySafe cId ls nt ngs = do
238 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
239 fast <- getOccByNgramsOnlyFast cId nt ngs
240 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
241 when (fast /= slow) $
242 printDebug "getOccByNgramsOnlySafe: difference"
243 (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
244 pure slow
245
246
247 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
248 -> NgramsType
249 -> [Text]
250 -> Cmd err [(Text, Int)]
251 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
252 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
253 ( Values fields (DPS.Only <$> tms)
254 , cId
255 , nodeTypeId NodeDocument
256 , ngramsTypeId nt
257 )
258 where
259 fields = [QualifiedIdentifier Nothing "text"]
260
261 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
262 -- Question: with the grouping is the result exactly the same (since Set NodeId for
263 -- equivalent ngrams intersections are not empty)
264 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
265 queryNgramsOccurrencesOnlyByNodeUser = [sql|
266 WITH input_rows(terms) AS (?)
267 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
268 JOIN ngrams ng ON nng.ngrams_id = ng.id
269 JOIN input_rows ir ON ir.terms = ng.terms
270 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
271 JOIN nodes n ON nn.node2_id = n.id
272 WHERE nn.node1_id = ? -- CorpusId
273 AND n.typename = ? -- NodeTypeId
274 AND nng.ngrams_type = ? -- NgramsTypeId
275 AND nn.category > 0
276 GROUP BY nng.node2_id, ng.terms
277 |]
278
279 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
280 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
281 WITH input_rows(terms) AS (?)
282 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
283 JOIN ngrams ng ON nng.ngrams_id = ng.id
284 JOIN input_rows ir ON ir.terms = ng.terms
285 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
286 JOIN nodes n ON nn.node2_id = n.id
287 WHERE nn.node1_id = ? -- CorpusId
288 AND n.typename = ? -- NodeTypeId
289 AND nng.ngrams_type = ? -- NgramsTypeId
290 AND nn.category > 0
291 GROUP BY nng.node2_id, ng.terms
292 |]
293
294 ------------------------------------------------------------------------
295 getNodesByNgramsOnlyUser :: NodeId
296 -> [ListId]
297 -> NgramsType
298 -> [Text]
299 -> Cmd err (Map Text (Set NodeId))
300 getNodesByNgramsOnlyUser cId ls nt ngs =
301 Map.unionsWith (<>)
302 . map (fromListWith (<>)
303 . map (second Set.singleton))
304 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
305 (splitEvery 1000 ngs)
306
307
308 getNgramsByNodeOnlyUser :: NodeId
309 -> [ListId]
310 -> NgramsType
311 -> [Text]
312 -> Cmd err (Map NodeId (Set Text))
313 getNgramsByNodeOnlyUser cId ls nt ngs =
314 Map.unionsWith (<>)
315 . map (fromListWith (<>)
316 . map (second Set.singleton))
317 . map (map swap)
318 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
319 (splitEvery 1000 ngs)
320
321 ------------------------------------------------------------------------
322 selectNgramsOnlyByNodeUser :: CorpusId
323 -> [ListId]
324 -> NgramsType
325 -> [Text]
326 -> Cmd err [(Text, NodeId)]
327 selectNgramsOnlyByNodeUser cId ls nt tms =
328 runPGSQuery queryNgramsOnlyByNodeUser
329 ( Values fields (DPS.Only <$> tms)
330 , Values [QualifiedIdentifier Nothing "int4"]
331 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
332 , cId
333 , nodeTypeId NodeDocument
334 , ngramsTypeId nt
335 )
336 where
337 fields = [QualifiedIdentifier Nothing "text"]
338
339 queryNgramsOnlyByNodeUser :: DPS.Query
340 queryNgramsOnlyByNodeUser = [sql|
341 WITH input_rows(terms) AS (?),
342 input_list(id) AS (?)
343 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
344 JOIN ngrams ng ON nng.ngrams_id = ng.id
345 JOIN input_rows ir ON ir.terms = ng.terms
346 JOIN input_list il ON il.id = nng.node1_id
347 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
348 JOIN nodes n ON nn.node2_id = n.id
349 WHERE nn.node1_id = ? -- CorpusId
350 AND n.typename = ? -- NodeTypeId
351 AND nng.ngrams_type = ? -- NgramsTypeId
352 AND nn.category > 0
353 GROUP BY ng.terms, nng.node2_id
354 |]
355
356
357 selectNgramsOnlyByNodeUser' :: CorpusId
358 -> [ListId]
359 -> NgramsType
360 -> [Text]
361 -> Cmd err [(Text, Int)]
362 selectNgramsOnlyByNodeUser' cId ls nt tms =
363 runPGSQuery queryNgramsOnlyByNodeUser
364 ( Values fields (DPS.Only <$> tms)
365 , Values [QualifiedIdentifier Nothing "int4"]
366 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
367 , cId
368 , nodeTypeId NodeDocument
369 , ngramsTypeId nt
370 )
371 where
372 fields = [QualifiedIdentifier Nothing "text"]
373
374 queryNgramsOnlyByNodeUser' :: DPS.Query
375 queryNgramsOnlyByNodeUser' = [sql|
376 WITH input_rows(terms) AS (?),
377 input_list(id) AS (?)
378 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
379 JOIN ngrams ng ON nng.ngrams_id = ng.id
380 JOIN input_rows ir ON ir.terms = ng.terms
381 JOIN input_list il ON il.id = nng.node2_id
382 WHERE nng.node1_id = ? -- CorpusId
383 AND nng.ngrams_type = ? -- NgramsTypeId
384 -- AND nn.category > 0
385 GROUP BY ng.terms, nng.weight
386 |]
387
388
389 getNgramsByDocOnlyUser :: NodeId
390 -> [ListId]
391 -> NgramsType
392 -> [Text]
393 -> Cmd err (Map Text (Set NodeId))
394 getNgramsByDocOnlyUser cId ls nt ngs =
395 Map.unionsWith (<>)
396 . map (fromListWith (<>) . map (second Set.singleton))
397 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
398
399
400 selectNgramsOnlyByDocUser :: DocId
401 -> [ListId]
402 -> NgramsType
403 -> [Text]
404 -> Cmd err [(Text, NodeId)]
405 selectNgramsOnlyByDocUser dId ls nt tms =
406 runPGSQuery queryNgramsOnlyByDocUser
407 ( Values fields (DPS.Only <$> tms)
408 , Values [QualifiedIdentifier Nothing "int4"]
409 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
410 , dId
411 , ngramsTypeId nt
412 )
413 where
414 fields = [QualifiedIdentifier Nothing "text"]
415
416
417 queryNgramsOnlyByDocUser :: DPS.Query
418 queryNgramsOnlyByDocUser = [sql|
419 WITH input_rows(terms) AS (?),
420 input_list(id) AS (?)
421 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
422 JOIN ngrams ng ON nng.ngrams_id = ng.id
423 JOIN input_rows ir ON ir.terms = ng.terms
424 JOIN input_list il ON il.id = nng.node1_id
425 WHERE nng.node2_id = ? -- DocId
426 AND nng.ngrams_type = ? -- NgramsTypeId
427 GROUP BY ng.terms, nng.node2_id
428 |]
429
430 ------------------------------------------------------------------------
431 -- | TODO filter by language, database, any social field
432 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
433 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
434 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
435 -- . takeWhile (not . List.null)
436 -- . takeWhile (\l -> List.length l > 3)
437 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
438
439 selectNgramsByNodeMaster :: Int
440 -> UserCorpusId
441 -> MasterCorpusId
442 -> Int
443 -> Cmd err [(NodeId, Text)]
444 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
445 queryNgramsByNodeMaster'
446 ( ucId
447 , ngramsTypeId NgramsTerms
448 , nodeTypeId NodeDocument
449 , p
450 , nodeTypeId NodeDocument
451 , p
452 , n
453 , mcId
454 , nodeTypeId NodeDocument
455 , ngramsTypeId NgramsTerms
456 )
457
458 -- | TODO fix node_node_ngrams relation
459 queryNgramsByNodeMaster' :: DPS.Query
460 queryNgramsByNodeMaster' = [sql|
461 WITH nodesByNgramsUser AS (
462
463 SELECT n.id, ng.terms FROM nodes n
464 JOIN nodes_nodes nn ON n.id = nn.node2_id
465 JOIN node_node_ngrams nng ON nng.node2_id = n.id
466 JOIN ngrams ng ON nng.ngrams_id = ng.id
467 WHERE nn.node1_id = ? -- UserCorpusId
468 -- AND n.typename = ? -- NodeTypeId
469 AND nng.ngrams_type = ? -- NgramsTypeId
470 AND nn.category > 0
471 AND node_pos(n.id,?) >= ?
472 AND node_pos(n.id,?) < ?
473 GROUP BY n.id, ng.terms
474
475 ),
476
477 nodesByNgramsMaster AS (
478
479 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
480 JOIN node_node_ngrams nng ON n.id = nng.node2_id
481 JOIN ngrams ng ON ng.id = nng.ngrams_id
482
483 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
484 AND n.typename = ? -- NodeTypeId
485 AND nng.ngrams_type = ? -- NgramsTypeId
486 GROUP BY n.id, ng.terms
487 )
488
489 SELECT m.id, m.terms FROM nodesByNgramsMaster m
490 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id
491 |]