2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE OverloadedStrings #-}
17 {-# LANGUAGE RankNTypes #-}
19 module Gargantext.Database.Metrics.NgramsByNode
22 import Data.Map.Strict (Map, fromListWith, elems, toList, fromList)
23 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
25 import Data.Text (Text)
26 import Data.Tuple.Extra (second, swap)
27 import Database.PostgreSQL.Simple.SqlQQ (sql)
28 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
29 import Gargantext.Core (Lang(..))
30 import Gargantext.Database.Config (nodeTypeId)
31 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
32 import Gargantext.Database.Types.Node -- (ListId, CorpusId, NodeId)
33 import Gargantext.Database.Utils (Cmd, runPGSQuery)
34 import Gargantext.Prelude
35 import Gargantext.Text.Metrics.TFICF
36 import Gargantext.Text.Terms.Mono.Stem (stem)
37 import qualified Data.List as List
38 import qualified Data.Map.Strict as Map
39 import qualified Data.Set as Set
40 import qualified Data.Text as Text
41 import qualified Database.PostgreSQL.Simple as DPS
43 -- | TODO: group with 2 terms only can be
44 -- discussed. Main purpose of this is offering
45 -- a first grouping option to user and get some
46 -- enriched data to better learn and improve that algo
47 ngramsGroup :: Lang -> Int -> Int -> Text -> Text
48 ngramsGroup l _m _n = Text.intercalate " "
52 -- . (List.filter (\t -> Text.length t > m))
54 . Text.replace "-" " "
57 sortTficf :: (Map Text (Double, Set Text))
58 -> [ (Text,(Double, Set Text))]
59 sortTficf = List.sortOn (fst . snd) . toList
62 getTficf' :: UserCorpusId -> MasterCorpusId -> NgramsType -> (Text -> Text)
63 -> Cmd err (Map Text (Double, Set Text))
64 getTficf' u m nt f = do
65 u' <- getNodesByNgramsUser u nt
66 m' <- getNodesByNgramsMaster u m
68 pure $ toTficfData (countNodesByNgramsWith f u')
69 (countNodesByNgramsWith f m')
71 getTficfWith :: UserCorpusId -> MasterCorpusId -> [ListId]
72 -> NgramsType -> Map Text (Maybe Text)
73 -> Cmd err (Map Text (Double, Set Text))
74 getTficfWith u m ls nt mtxt = do
75 u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
76 m' <- getNodesByNgramsMaster u m
78 let f x = case Map.lookup x mtxt of
80 Just x' -> maybe x identity x'
82 pure $ toTficfData (countNodesByNgramsWith f u')
83 (countNodesByNgramsWith f m')
86 type Context = (Double, Map Text (Double, Set Text))
90 toTficfData :: Infra -> Supra
91 -> Map Text (Double, Set Text)
92 toTficfData (ti, mi) (ts, ms) =
93 fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
94 (TficfSupra (Count $ maybe 0 fst $ Map.lookup t ms)(Total ts))
98 | (t, (n,ns)) <- toList mi
102 -- | fst is size of Supra Corpus
103 -- snd is Texts and size of Occurrences (different docs)
104 countNodesByNgramsWith :: (Text -> Text)
105 -> Map Text (Set NodeId)
106 -> (Double, Map Text (Double, Set Text))
107 countNodesByNgramsWith f m = (total, m')
109 total = fromIntegral $ Set.size $ Set.unions $ elems m
110 m' = Map.map ( swap . second (fromIntegral . Set.size))
111 $ groupNodesByNgramsWith f m
114 groupNodesByNgramsWith :: (Text -> Text)
115 -> Map Text (Set NodeId)
116 -> Map Text (Set Text, Set NodeId)
117 groupNodesByNgramsWith f m =
118 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
121 ------------------------------------------------------------------------
122 getNodesByNgramsUser :: CorpusId
124 -> Cmd err (Map Text (Set NodeId))
125 getNodesByNgramsUser cId nt =
126 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
127 <$> selectNgramsByNodeUser cId nt
130 selectNgramsByNodeUser :: CorpusId -> NgramsType
131 -> Cmd err [(NodeId, Text)]
132 selectNgramsByNodeUser cId' nt' =
133 runPGSQuery queryNgramsByNodeUser
135 , nodeTypeId NodeDocument
137 -- , 100 :: Int -- limit
138 -- , 0 :: Int -- offset
141 queryNgramsByNodeUser :: DPS.Query
142 queryNgramsByNodeUser = [sql|
143 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
144 JOIN ngrams ng ON nng.ngrams_id = ng.id
145 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
146 JOIN nodes n ON nn.node2_id = n.id
147 WHERE nn.node1_id = ? -- CorpusId
148 AND n.typename = ? -- NodeTypeId
149 AND nng.ngrams_type = ? -- NgramsTypeId
151 GROUP BY nng.node2_id, ng.terms
152 ORDER BY (nng.node2_id, ng.terms) DESC
156 ------------------------------------------------------------------------
158 getOccByNgramsOnlyFast :: CorpusId
161 -> Cmd err (Map Text Int)
162 getOccByNgramsOnlyFast cId nt ngs =
163 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
165 -- just slower than getOccByNgramsOnlyFast
166 getOccByNgramsOnlySlow :: NodeType
171 -> Cmd err (Map Text Int)
172 getOccByNgramsOnlySlow t cId ls nt ngs =
173 Map.map Set.size <$> getScore' t cId ls nt ngs
175 getScore' NodeCorpus = getNodesByNgramsOnlyUser
176 getScore' NodeDocument = getNgramsByDocOnlyUser
177 getScore' _ = getNodesByNgramsOnlyUser
179 getOccByNgramsOnlySafe :: CorpusId
183 -> Cmd err (Map Text Int)
184 getOccByNgramsOnlySafe cId ls nt ngs = do
185 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
186 fast <- getOccByNgramsOnlyFast cId nt ngs
187 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
188 when (fast /= slow) $
189 printDebug "getOccByNgramsOnlySafe: difference" (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
193 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId -> NgramsType -> [Text]
194 -> Cmd err [(Text, Int)]
195 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
196 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
197 ( Values fields (DPS.Only <$> tms)
199 , nodeTypeId NodeDocument
203 fields = [QualifiedIdentifier Nothing "text"]
205 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
206 -- Question: with the grouping is the result exactly the same (since Set NodeId for
207 -- equivalent ngrams intersections are not empty)
208 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
209 queryNgramsOccurrencesOnlyByNodeUser = [sql|
210 WITH input_rows(terms) AS (?)
211 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
212 JOIN ngrams ng ON nng.ngrams_id = ng.id
213 JOIN input_rows ir ON ir.terms = ng.terms
214 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
215 JOIN nodes n ON nn.node2_id = n.id
216 WHERE nn.node1_id = ? -- CorpusId
217 AND n.typename = ? -- NodeTypeId
218 AND nng.ngrams_type = ? -- NgramsTypeId
220 GROUP BY nng.node2_id, ng.terms
225 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
226 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
227 WITH input_rows(terms) AS (?)
228 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
229 JOIN ngrams ng ON nng.ngrams_id = ng.id
230 JOIN input_rows ir ON ir.terms = ng.terms
231 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
232 JOIN nodes n ON nn.node2_id = n.id
233 WHERE nn.node1_id = ? -- CorpusId
234 AND n.typename = ? -- NodeTypeId
235 AND nng.ngrams_type = ? -- NgramsTypeId
237 GROUP BY nng.node2_id, ng.terms
243 getNodesByNgramsOnlyUser :: NodeId -> [ListId] -> NgramsType -> [Text]
244 -> Cmd err (Map Text (Set NodeId))
245 getNodesByNgramsOnlyUser cId ls nt ngs =
247 . map (fromListWith (<>) . map (second Set.singleton))
248 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
249 (splitEvery 1000 ngs)
251 selectNgramsOnlyByNodeUser :: CorpusId -> [ListId] -> NgramsType -> [Text]
252 -> Cmd err [(Text, NodeId)]
253 selectNgramsOnlyByNodeUser cId ls nt tms =
254 runPGSQuery queryNgramsOnlyByNodeUser
255 ( Values fields (DPS.Only <$> tms)
256 , Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
258 , nodeTypeId NodeDocument
262 fields = [QualifiedIdentifier Nothing "text"]
264 queryNgramsOnlyByNodeUser :: DPS.Query
265 queryNgramsOnlyByNodeUser = [sql|
266 WITH input_rows(terms) AS (?),
267 input_list(id) AS (?)
268 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
269 JOIN ngrams ng ON nng.ngrams_id = ng.id
270 JOIN input_rows ir ON ir.terms = ng.terms
271 JOIN input_list il ON il.id = nng.node1_id
272 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
273 JOIN nodes n ON nn.node2_id = n.id
274 WHERE nn.node1_id = ? -- CorpusId
275 AND n.typename = ? -- NodeTypeId
276 AND nng.ngrams_type = ? -- NgramsTypeId
278 GROUP BY ng.terms, nng.node2_id
282 getNgramsByDocOnlyUser :: NodeId -> [ListId] -> NgramsType -> [Text]
283 -> Cmd err (Map Text (Set NodeId))
284 getNgramsByDocOnlyUser cId ls nt ngs =
286 . map (fromListWith (<>) . map (second Set.singleton))
287 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
290 selectNgramsOnlyByDocUser :: DocId -> [ListId] -> NgramsType -> [Text]
291 -> Cmd err [(Text, NodeId)]
292 selectNgramsOnlyByDocUser dId ls nt tms =
293 runPGSQuery queryNgramsOnlyByDocUser
294 ( Values fields (DPS.Only <$> tms)
295 , Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
300 fields = [QualifiedIdentifier Nothing "text"]
302 queryNgramsOnlyByDocUser :: DPS.Query
303 queryNgramsOnlyByDocUser = [sql|
304 WITH input_rows(terms) AS (?),
305 input_list(id) AS (?)
306 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
307 JOIN ngrams ng ON nng.ngrams_id = ng.id
308 JOIN input_rows ir ON ir.terms = ng.terms
309 JOIN input_list il ON il.id = nng.node1_id
310 WHERE nng.node2_id = ? -- DocId
311 AND nng.ngrams_type = ? -- NgramsTypeId
312 GROUP BY ng.terms, nng.node2_id
315 ------------------------------------------------------------------------
316 -- | TODO filter by language, database, any social field
317 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
318 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
319 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
320 -- . takeWhile (not . List.null)
321 -- . takeWhile (\l -> List.length l > 3)
322 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
325 selectNgramsByNodeMaster :: Int -> UserCorpusId -> MasterCorpusId -> Int -> Cmd err [(NodeId, Text)]
326 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
327 queryNgramsByNodeMaster'
329 , ngramsTypeId NgramsTerms
330 , nodeTypeId NodeDocument
332 , nodeTypeId NodeDocument
336 , nodeTypeId NodeDocument
337 , ngramsTypeId NgramsTerms
340 -- | TODO fix node_node_ngrams relation
341 queryNgramsByNodeMaster' :: DPS.Query
342 queryNgramsByNodeMaster' = [sql|
343 WITH nodesByNgramsUser AS (
345 SELECT n.id, ng.terms FROM nodes n
346 JOIN nodes_nodes nn ON n.id = nn.node2_id
347 JOIN node_node_ngrams nng ON nng.node2_id = n.id
348 JOIN ngrams ng ON nng.ngrams_id = ng.id
349 WHERE nn.node1_id = ? -- UserCorpusId
350 -- AND n.typename = ? -- NodeTypeId
351 AND nng.ngrams_type = ? -- NgramsTypeId
353 AND node_pos(n.id,?) >= ?
354 AND node_pos(n.id,?) < ?
355 GROUP BY n.id, ng.terms
359 nodesByNgramsMaster AS (
361 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
362 JOIN node_node_ngrams nng ON n.id = nng.node2_id
363 JOIN ngrams ng ON ng.id = nng.ngrams_id
365 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
366 AND n.typename = ? -- NodeTypeId
367 AND nng.ngrams_type = ? -- NgramsTypeId
368 GROUP BY n.id, ng.terms
371 SELECT m.id, m.terms FROM nodesByNgramsMaster m
372 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id