2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams by node enable contextual metrics.
14 {-# LANGUAGE QuasiQuotes #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE OverloadedStrings #-}
17 {-# LANGUAGE RankNTypes #-}
19 module Gargantext.Database.Metrics.NgramsByNode
22 import Data.Map.Strict (Map, fromListWith, elems, toList, fromList)
23 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
25 import Data.Text (Text)
26 import Data.Tuple.Extra (second, swap)
27 import Database.PostgreSQL.Simple.SqlQQ (sql)
28 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
29 import Gargantext.Core (Lang(..))
30 import Gargantext.Database.Config (nodeTypeId)
31 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
32 import Gargantext.Database.Types.Node -- (ListId, CorpusId, NodeId)
33 import Gargantext.Database.Utils (Cmd, runPGSQuery)
34 import Gargantext.Prelude
35 import Gargantext.Text.Metrics.TFICF
36 import Gargantext.Text.Terms.Mono.Stem (stem)
37 import qualified Data.List as List
38 import qualified Data.Map.Strict as Map
39 import qualified Data.Set as Set
40 import qualified Data.Text as Text
41 import qualified Database.PostgreSQL.Simple as DPS
43 -- | TODO: group with 2 terms only can be
44 -- discussed. Main purpose of this is offering
45 -- a first grouping option to user and get some
46 -- enriched data to better learn and improve that algo
47 ngramsGroup :: Lang -> Int -> Int -> Text -> Text
48 ngramsGroup l _m _n = Text.intercalate " "
52 -- . (List.filter (\t -> Text.length t > m))
54 . Text.replace "-" " "
57 sortTficf :: (Map Text (Double, Set Text))
58 -> [ (Text,(Double, Set Text))]
59 sortTficf = List.sortOn (fst . snd) . toList
62 getTficf' :: UserCorpusId -> MasterCorpusId -> NgramsType -> (Text -> Text)
63 -> Cmd err (Map Text (Double, Set Text))
64 getTficf' u m nt f = do
65 u' <- getNodesByNgramsUser u nt
66 m' <- getNodesByNgramsMaster u m
68 pure $ toTficfData (countNodesByNgramsWith f u')
69 (countNodesByNgramsWith f m')
72 getTficfWith :: UserCorpusId -> MasterCorpusId -> [ListId]
73 -> NgramsType -> Map Text (Maybe Text)
74 -> Cmd err (Map Text (Double, Set Text))
75 getTficfWith u m ls nt mtxt = do
76 u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
77 m' <- getNodesByNgramsMaster u m
79 let f x = case Map.lookup x mtxt of
81 Just x' -> maybe x identity x'
83 pure $ toTficfData (countNodesByNgramsWith f u')
84 (countNodesByNgramsWith f m')
88 type Context = (Double, Map Text (Double, Set Text))
92 toTficfData :: Infra -> Supra
93 -> Map Text (Double, Set Text)
94 toTficfData (ti, mi) (ts, ms) =
95 fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
96 (TficfSupra (Count $ maybe 0 fst $ Map.lookup t ms)(Total ts))
100 | (t, (n,ns)) <- toList mi
104 -- | fst is size of Supra Corpus
105 -- snd is Texts and size of Occurrences (different docs)
106 countNodesByNgramsWith :: (Text -> Text)
107 -> Map Text (Set NodeId)
108 -> (Double, Map Text (Double, Set Text))
109 countNodesByNgramsWith f m = (total, m')
111 total = fromIntegral $ Set.size $ Set.unions $ elems m
112 m' = Map.map ( swap . second (fromIntegral . Set.size))
113 $ groupNodesByNgramsWith f m
116 groupNodesByNgramsWith :: (Text -> Text)
117 -> Map Text (Set NodeId)
118 -> Map Text (Set Text, Set NodeId)
119 groupNodesByNgramsWith f m =
120 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
123 ------------------------------------------------------------------------
124 getNodesByNgramsUser :: CorpusId -> NgramsType
125 -> Cmd err (Map Text (Set NodeId))
126 getNodesByNgramsUser cId nt =
127 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
128 <$> selectNgramsByNodeUser cId nt
130 selectNgramsByNodeUser :: CorpusId -> NgramsType
131 -> Cmd err [(NodeId, Text)]
132 selectNgramsByNodeUser cId nt =
133 runPGSQuery queryNgramsByNodeUser
135 , nodeTypeId NodeDocument
137 , 1000 :: Int -- limit
141 queryNgramsByNodeUser :: DPS.Query
142 queryNgramsByNodeUser = [sql|
144 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
145 JOIN ngrams ng ON nng.ngrams_id = ng.id
146 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
147 JOIN nodes n ON nn.node2_id = n.id
148 WHERE nn.node1_id = ? -- CorpusId
149 AND n.typename = ? -- NodeTypeId
150 AND nng.ngrams_type = ? -- NgramsTypeId
151 AND nn.delete = False
152 GROUP BY nng.node2_id, ng.terms
153 ORDER BY (nng.node2_id, ng.terms) DESC
157 ------------------------------------------------------------------------
159 getOccByNgramsOnlyFast :: CorpusId -> NgramsType -> [Text]
160 -> Cmd err (Map Text Int)
161 getOccByNgramsOnlyFast cId nt ngs =
162 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
164 -- just slower than getOccByNgramsOnlyFast
165 getOccByNgramsOnlySlow :: CorpusId -> [ListId] -> NgramsType -> [Text]
166 -> Cmd err (Map Text Int)
167 getOccByNgramsOnlySlow cId ls nt ngs =
168 Map.map Set.size <$> getNodesByNgramsOnlyUser cId ls nt ngs
170 getOccByNgramsOnlySafe :: CorpusId -> [ListId] -> NgramsType -> [Text]
171 -> Cmd err (Map Text Int)
172 getOccByNgramsOnlySafe cId ls nt ngs = do
173 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
174 fast <- getOccByNgramsOnlyFast cId nt ngs
175 slow <- getOccByNgramsOnlySlow cId ls nt ngs
176 when (fast /= slow) $
177 printDebug "getOccByNgramsOnlySafe: difference" (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
181 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId -> NgramsType -> [Text]
182 -> Cmd err [(Text, Int)]
183 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
184 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
185 ( Values fields (DPS.Only <$> tms)
187 , nodeTypeId NodeDocument
191 fields = [QualifiedIdentifier Nothing "text"]
193 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
194 -- Question: with the grouping is the result exactly the same (since Set NodeId for
195 -- equivalent ngrams intersections are not empty)
196 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
197 queryNgramsOccurrencesOnlyByNodeUser = [sql|
199 WITH input_rows(terms) AS (?)
200 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
201 JOIN ngrams ng ON nng.ngrams_id = ng.id
202 JOIN input_rows ir ON ir.terms = ng.terms
203 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
204 JOIN nodes n ON nn.node2_id = n.id
205 WHERE nn.node1_id = ? -- CorpusId
206 AND n.typename = ? -- NodeTypeId
207 AND nng.ngrams_type = ? -- NgramsTypeId
208 AND nn.delete = False
209 GROUP BY nng.node2_id, ng.terms
212 getNodesByNgramsOnlyUser :: CorpusId -> [ListId] -> NgramsType -> [Text]
213 -> Cmd err (Map Text (Set NodeId))
214 getNodesByNgramsOnlyUser cId ls nt ngs = Map.unionsWith (<>)
215 . map (fromListWith (<>) . map (second Set.singleton))
216 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt) (splitEvery 1000 ngs)
218 selectNgramsOnlyByNodeUser :: CorpusId -> [ListId] -> NgramsType -> [Text]
219 -> Cmd err [(Text, NodeId)]
220 selectNgramsOnlyByNodeUser cId ls nt tms =
221 runPGSQuery queryNgramsOnlyByNodeUser
222 ( Values fields (DPS.Only <$> tms)
223 , Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
225 , nodeTypeId NodeDocument
229 fields = [QualifiedIdentifier Nothing "text"]
231 queryNgramsOnlyByNodeUser :: DPS.Query
232 queryNgramsOnlyByNodeUser = [sql|
234 WITH input_rows(terms) AS (?),
235 input_list(id) AS (?)
236 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
237 JOIN ngrams ng ON nng.ngrams_id = ng.id
238 JOIN input_rows ir ON ir.terms = ng.terms
239 JOIN input_list il ON il.id = nng.node1_id
240 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
241 JOIN nodes n ON nn.node2_id = n.id
242 WHERE nn.node1_id = ? -- CorpusId
243 AND n.typename = ? -- NodeTypeId
244 AND nng.ngrams_type = ? -- NgramsTypeId
245 AND nn.delete = False
246 GROUP BY ng.terms, nng.node2_id
253 ------------------------------------------------------------------------
254 -- | TODO filter by language, database, any social field
255 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
256 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
257 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
258 -- . takeWhile (not . List.null)
259 -- . takeWhile (\l -> List.length l > 3)
260 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
267 selectNgramsByNodeMaster :: Int -> UserCorpusId -> MasterCorpusId -> Int -> Cmd err [(NodeId, Text)]
268 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
269 queryNgramsByNodeMaster'
271 , ngramsTypeId NgramsTerms
272 , nodeTypeId NodeDocument
274 , nodeTypeId NodeDocument
278 , nodeTypeId NodeDocument
279 , ngramsTypeId NgramsTerms
282 -- | TODO fix node_node_ngrams relation
283 queryNgramsByNodeMaster' :: DPS.Query
284 queryNgramsByNodeMaster' = [sql|
286 WITH nodesByNgramsUser AS (
288 SELECT n.id, ng.terms FROM nodes n
289 JOIN nodes_nodes nn ON n.id = nn.node2_id
290 JOIN node_node_ngrams nng ON nng.node2_id = n.id
291 JOIN ngrams ng ON nng.ngrams_id = ng.id
292 WHERE nn.node1_id = ? -- UserCorpusId
293 -- AND n.typename = ? -- NodeTypeId
294 AND nng.ngrams_type = ? -- NgramsTypeId
295 AND nn.delete = False
296 AND node_pos(n.id,?) >= ?
297 AND node_pos(n.id,?) < ?
298 GROUP BY n.id, ng.terms
302 nodesByNgramsMaster AS (
304 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
305 JOIN node_node_ngrams nng ON n.id = nng.node2_id
306 JOIN ngrams ng ON ng.id = nng.ngrams_id
308 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
309 AND n.typename = ? -- NodeTypeId
310 AND nng.ngrams_type = ? -- NgramsTypeId
311 GROUP BY n.id, ng.terms
314 SELECT m.id, m.terms FROM nodesByNgramsMaster m
315 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id