2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
16 module Gargantext.Database.Action.Metrics.NgramsByNode
19 import Data.Map.Strict (Map, fromListWith, elems, toList, fromList)
20 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
22 import qualified Data.Ord as DO (Down(..))
23 import Data.Text (Text)
24 import Data.Tuple.Extra (second, swap)
25 import Database.PostgreSQL.Simple.SqlQQ (sql)
26 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
27 import Debug.Trace (trace)
28 import Gargantext.Core (Lang(..))
29 import Gargantext.Core.Types (Ordering(..))
30 import Gargantext.Database.Admin.Config (nodeTypeId)
31 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import Gargantext.Text.Metrics.TFICF
36 import Gargantext.Text.Terms.Mono.Stem (stem)
37 import qualified Data.List as List
38 import qualified Data.Map.Strict as Map
39 import qualified Data.Set as Set
40 import qualified Data.Text as Text
41 import qualified Database.PostgreSQL.Simple as DPS
43 -- | TODO: group with 2 terms only can be
44 -- discussed. Main purpose of this is offering
45 -- a first grouping option to user and get some
46 -- enriched data to better learn and improve that algo
52 ngramsGroup l _m _n = Text.intercalate " "
56 -- . (List.filter (\t -> Text.length t > m))
58 . Text.replace "-" " "
62 -> (Map Text (Double, Set Text))
63 -> [ (Text,(Double, Set Text))]
64 sortTficf Down = List.sortOn (DO.Down . fst . snd) . toList
65 sortTficf Up = List.sortOn (fst . snd) . toList
68 getTficf :: UserCorpusId
72 -> Cmd err (Map Text (Double, Set Text))
73 getTficf u m nt f = do
74 u' <- Map.filter (\s -> Set.size s > 1) <$> getNodesByNgramsUser u nt
75 m' <- Map.filter (\s -> Set.size s > 1) <$> getNodesByNgramsMaster u m
77 pure $ toTficfData (countNodesByNgramsWith f u')
78 (countNodesByNgramsWith f m')
81 getTficfWith :: UserCorpusId
85 -> Map Text (Maybe Text)
86 -> Cmd err (Map Text (Double, Set Text))
87 getTficfWith u m ls nt mtxt = do
88 u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
89 m' <- getNodesByNgramsMaster u m
91 let f x = case Map.lookup x mtxt of
93 Just x' -> maybe x identity x'
95 pure $ toTficfData (countNodesByNgramsWith f u') (countNodesByNgramsWith f m')
98 type Context = (Double, Map Text (Double, Set Text))
104 -> Map Text (Double, Set Text)
105 toTficfData (ti, mi) (ts, ms) =
106 fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
107 (TficfSupra (Count $ maybe 0 fst $ Map.lookup t ms)(Total ts))
111 | (t, (n,ns)) <- toList mi
115 -- | fst is size of Supra Corpus
116 -- snd is Texts and size of Occurrences (different docs)
117 countNodesByNgramsWith :: (Text -> Text)
118 -> Map Text (Set NodeId)
119 -> (Double, Map Text (Double, Set Text))
120 countNodesByNgramsWith f m = (total, m')
122 total = fromIntegral $ Set.size $ Set.unions $ elems m
123 m' = Map.map ( swap . second (fromIntegral . Set.size))
124 $ groupNodesByNgramsWith f m
127 groupNodesByNgramsWith :: (Text -> Text)
128 -> Map Text (Set NodeId)
129 -> Map Text (Set Text, Set NodeId)
130 groupNodesByNgramsWith f m =
131 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
134 ------------------------------------------------------------------------
135 getNodesByNgramsUser :: CorpusId
137 -> Cmd err (Map Text (Set NodeId))
138 getNodesByNgramsUser cId nt =
139 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
140 <$> selectNgramsByNodeUser cId nt
143 selectNgramsByNodeUser :: CorpusId
145 -> Cmd err [(NodeId, Text)]
146 selectNgramsByNodeUser cId' nt' =
147 runPGSQuery queryNgramsByNodeUser
149 , nodeTypeId NodeDocument
151 -- , 100 :: Int -- limit
152 -- , 0 :: Int -- offset
155 queryNgramsByNodeUser :: DPS.Query
156 queryNgramsByNodeUser = [sql|
157 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
158 JOIN ngrams ng ON nng.ngrams_id = ng.id
159 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
160 JOIN nodes n ON nn.node2_id = n.id
161 WHERE nn.node1_id = ? -- CorpusId
162 AND n.typename = ? -- NodeTypeId
163 AND nng.ngrams_type = ? -- NgramsTypeId
165 GROUP BY nng.node2_id, ng.terms
166 ORDER BY (nng.node2_id, ng.terms) DESC
170 ------------------------------------------------------------------------
172 getOccByNgramsOnlyFast :: CorpusId
175 -> Cmd err (Map Text Int)
176 getOccByNgramsOnlyFast cId nt ngs =
177 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
180 getOccByNgramsOnlyFast' :: CorpusId
184 -> Cmd err (Map Text Int)
185 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
186 fromListWith (+) <$> map (second round) <$> run cId lId nt tms
189 fields = [QualifiedIdentifier Nothing "text"]
195 -> Cmd err [(Text, Double)]
196 run cId' lId' nt' tms' = runPGSQuery query
197 ( Values fields (DPS.Only <$> tms')
205 WITH input_rows(terms) AS (?)
206 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
207 JOIN ngrams ng ON nng.ngrams_id = ng.id
208 JOIN input_rows ir ON ir.terms = ng.terms
209 WHERE nng.node1_id = ? -- CorpusId
210 AND nng.node2_id = ? -- ListId
211 AND nng.ngrams_type = ? -- NgramsTypeId
212 -- AND nn.category > 0 -- TODO
213 GROUP BY ng.terms, nng.weight
217 -- just slower than getOccByNgramsOnlyFast
218 getOccByNgramsOnlySlow :: NodeType
223 -> Cmd err (Map Text Int)
224 getOccByNgramsOnlySlow t cId ls nt ngs =
225 Map.map Set.size <$> getScore' t cId ls nt ngs
227 getScore' NodeCorpus = getNodesByNgramsOnlyUser
228 getScore' NodeDocument = getNgramsByDocOnlyUser
229 getScore' _ = getNodesByNgramsOnlyUser
231 getOccByNgramsOnlySafe :: CorpusId
235 -> Cmd err (Map Text Int)
236 getOccByNgramsOnlySafe cId ls nt ngs = do
237 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
238 fast <- getOccByNgramsOnlyFast cId nt ngs
239 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
240 when (fast /= slow) $
241 printDebug "getOccByNgramsOnlySafe: difference"
242 (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
246 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
249 -> Cmd err [(Text, Int)]
250 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
251 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
252 ( Values fields (DPS.Only <$> tms)
254 , nodeTypeId NodeDocument
258 fields = [QualifiedIdentifier Nothing "text"]
260 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
261 -- Question: with the grouping is the result exactly the same (since Set NodeId for
262 -- equivalent ngrams intersections are not empty)
263 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
264 queryNgramsOccurrencesOnlyByNodeUser = [sql|
265 WITH input_rows(terms) AS (?)
266 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
267 JOIN ngrams ng ON nng.ngrams_id = ng.id
268 JOIN input_rows ir ON ir.terms = ng.terms
269 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
270 JOIN nodes n ON nn.node2_id = n.id
271 WHERE nn.node1_id = ? -- CorpusId
272 AND n.typename = ? -- NodeTypeId
273 AND nng.ngrams_type = ? -- NgramsTypeId
275 GROUP BY nng.node2_id, ng.terms
278 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
279 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
280 WITH input_rows(terms) AS (?)
281 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
282 JOIN ngrams ng ON nng.ngrams_id = ng.id
283 JOIN input_rows ir ON ir.terms = ng.terms
284 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
285 JOIN nodes n ON nn.node2_id = n.id
286 WHERE nn.node1_id = ? -- CorpusId
287 AND n.typename = ? -- NodeTypeId
288 AND nng.ngrams_type = ? -- NgramsTypeId
290 GROUP BY nng.node2_id, ng.terms
293 ------------------------------------------------------------------------
294 getNodesByNgramsOnlyUser :: NodeId
298 -> Cmd err (Map Text (Set NodeId))
299 getNodesByNgramsOnlyUser cId ls nt ngs =
301 . map (fromListWith (<>)
302 . map (second Set.singleton))
303 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
304 (splitEvery 1000 ngs)
307 getNgramsByNodeOnlyUser :: NodeId
311 -> Cmd err (Map NodeId (Set Text))
312 getNgramsByNodeOnlyUser cId ls nt ngs =
314 . map (fromListWith (<>)
315 . map (second Set.singleton))
317 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
318 (splitEvery 1000 ngs)
320 ------------------------------------------------------------------------
321 selectNgramsOnlyByNodeUser :: CorpusId
325 -> Cmd err [(Text, NodeId)]
326 selectNgramsOnlyByNodeUser cId ls nt tms =
327 runPGSQuery queryNgramsOnlyByNodeUser
328 ( Values fields (DPS.Only <$> tms)
329 , Values [QualifiedIdentifier Nothing "int4"]
330 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
332 , nodeTypeId NodeDocument
336 fields = [QualifiedIdentifier Nothing "text"]
338 queryNgramsOnlyByNodeUser :: DPS.Query
339 queryNgramsOnlyByNodeUser = [sql|
340 WITH input_rows(terms) AS (?),
341 input_list(id) AS (?)
342 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
343 JOIN ngrams ng ON nng.ngrams_id = ng.id
344 JOIN input_rows ir ON ir.terms = ng.terms
345 JOIN input_list il ON il.id = nng.node1_id
346 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
347 JOIN nodes n ON nn.node2_id = n.id
348 WHERE nn.node1_id = ? -- CorpusId
349 AND n.typename = ? -- NodeTypeId
350 AND nng.ngrams_type = ? -- NgramsTypeId
352 GROUP BY ng.terms, nng.node2_id
356 selectNgramsOnlyByNodeUser' :: CorpusId
360 -> Cmd err [(Text, Int)]
361 selectNgramsOnlyByNodeUser' cId ls nt tms =
362 runPGSQuery queryNgramsOnlyByNodeUser
363 ( Values fields (DPS.Only <$> tms)
364 , Values [QualifiedIdentifier Nothing "int4"]
365 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
367 , nodeTypeId NodeDocument
371 fields = [QualifiedIdentifier Nothing "text"]
373 queryNgramsOnlyByNodeUser' :: DPS.Query
374 queryNgramsOnlyByNodeUser' = [sql|
375 WITH input_rows(terms) AS (?),
376 input_list(id) AS (?)
377 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
378 JOIN ngrams ng ON nng.ngrams_id = ng.id
379 JOIN input_rows ir ON ir.terms = ng.terms
380 JOIN input_list il ON il.id = nng.node2_id
381 WHERE nng.node1_id = ? -- CorpusId
382 AND nng.ngrams_type = ? -- NgramsTypeId
383 -- AND nn.category > 0
384 GROUP BY ng.terms, nng.weight
388 getNgramsByDocOnlyUser :: NodeId
392 -> Cmd err (Map Text (Set NodeId))
393 getNgramsByDocOnlyUser cId ls nt ngs =
395 . map (fromListWith (<>) . map (second Set.singleton))
396 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
399 selectNgramsOnlyByDocUser :: DocId
403 -> Cmd err [(Text, NodeId)]
404 selectNgramsOnlyByDocUser dId ls nt tms =
405 runPGSQuery queryNgramsOnlyByDocUser
406 ( Values fields (DPS.Only <$> tms)
407 , Values [QualifiedIdentifier Nothing "int4"]
408 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
413 fields = [QualifiedIdentifier Nothing "text"]
416 queryNgramsOnlyByDocUser :: DPS.Query
417 queryNgramsOnlyByDocUser = [sql|
418 WITH input_rows(terms) AS (?),
419 input_list(id) AS (?)
420 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
421 JOIN ngrams ng ON nng.ngrams_id = ng.id
422 JOIN input_rows ir ON ir.terms = ng.terms
423 JOIN input_list il ON il.id = nng.node1_id
424 WHERE nng.node2_id = ? -- DocId
425 AND nng.ngrams_type = ? -- NgramsTypeId
426 GROUP BY ng.terms, nng.node2_id
429 ------------------------------------------------------------------------
430 -- | TODO filter by language, database, any social field
431 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
432 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
433 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
434 -- . takeWhile (not . List.null)
435 -- . takeWhile (\l -> List.length l > 3)
436 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
438 selectNgramsByNodeMaster :: Int
442 -> Cmd err [(NodeId, Text)]
443 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
444 queryNgramsByNodeMaster'
446 , ngramsTypeId NgramsTerms
447 , nodeTypeId NodeDocument
449 , nodeTypeId NodeDocument
453 , nodeTypeId NodeDocument
454 , ngramsTypeId NgramsTerms
457 -- | TODO fix node_node_ngrams relation
458 queryNgramsByNodeMaster' :: DPS.Query
459 queryNgramsByNodeMaster' = [sql|
460 WITH nodesByNgramsUser AS (
462 SELECT n.id, ng.terms FROM nodes n
463 JOIN nodes_nodes nn ON n.id = nn.node2_id
464 JOIN node_node_ngrams nng ON nng.node2_id = n.id
465 JOIN ngrams ng ON nng.ngrams_id = ng.id
466 WHERE nn.node1_id = ? -- UserCorpusId
467 -- AND n.typename = ? -- NodeTypeId
468 AND nng.ngrams_type = ? -- NgramsTypeId
470 AND node_pos(n.id,?) >= ?
471 AND node_pos(n.id,?) < ?
472 GROUP BY n.id, ng.terms
476 nodesByNgramsMaster AS (
478 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
479 JOIN node_node_ngrams nng ON n.id = nng.node2_id
480 JOIN ngrams ng ON ng.id = nng.ngrams_id
482 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
483 AND n.typename = ? -- NodeTypeId
484 AND nng.ngrams_type = ? -- NgramsTypeId
485 GROUP BY n.id, ng.terms
488 SELECT m.id, m.terms FROM nodesByNgramsMaster m
489 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id