]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Metrics/NgramsByNode.hs
[GarGraph] Missing File. GarGraph = Garage à graphs.
[gargantext.git] / src / Gargantext / Database / Metrics / NgramsByNode.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE OverloadedStrings #-}
17 {-# LANGUAGE RankNTypes #-}
18
19 module Gargantext.Database.Metrics.NgramsByNode
20 where
21
22 import Data.Map.Strict (Map, fromListWith, elems, toList, fromList)
23 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
24 import Data.Set (Set)
25 import Data.Text (Text)
26 import Data.Tuple.Extra (second, swap)
27 import Database.PostgreSQL.Simple.SqlQQ (sql)
28 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
29 import Gargantext.Core (Lang(..))
30 import Gargantext.Database.Config (nodeTypeId)
31 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
32 import Gargantext.Database.Types.Node -- (ListId, CorpusId, NodeId)
33 import Gargantext.Database.Utils (Cmd, runPGSQuery)
34 import Gargantext.Prelude
35 import Gargantext.Text.Metrics.TFICF
36 import Gargantext.Text.Terms.Mono.Stem (stem)
37 import qualified Data.List as List
38 import qualified Data.Map.Strict as Map
39 import qualified Data.Set as Set
40 import qualified Data.Text as Text
41 import qualified Database.PostgreSQL.Simple as DPS
42
43 -- | TODO: group with 2 terms only can be
44 -- discussed. Main purpose of this is offering
45 -- a first grouping option to user and get some
46 -- enriched data to better learn and improve that algo
47 ngramsGroup :: Lang -> Int -> Int -> Text -> Text
48 ngramsGroup l _m _n = Text.intercalate " "
49 . map (stem l)
50 -- . take n
51 . List.sort
52 -- . (List.filter (\t -> Text.length t > m))
53 . Text.splitOn " "
54 . Text.replace "-" " "
55
56
57 sortTficf :: (Map Text (Double, Set Text))
58 -> [ (Text,(Double, Set Text))]
59 sortTficf = List.sortOn (fst . snd) . toList
60
61
62 getTficf' :: UserCorpusId -> MasterCorpusId -> NgramsType -> (Text -> Text)
63 -> Cmd err (Map Text (Double, Set Text))
64 getTficf' u m nt f = do
65 u' <- getNodesByNgramsUser u nt
66 m' <- getNodesByNgramsMaster u m
67
68 pure $ toTficfData (countNodesByNgramsWith f u')
69 (countNodesByNgramsWith f m')
70
71 getTficfWith :: UserCorpusId -> MasterCorpusId -> [ListId]
72 -> NgramsType -> Map Text (Maybe Text)
73 -> Cmd err (Map Text (Double, Set Text))
74 getTficfWith u m ls nt mtxt = do
75 u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
76 m' <- getNodesByNgramsMaster u m
77
78 let f x = case Map.lookup x mtxt of
79 Nothing -> x
80 Just x' -> maybe x identity x'
81
82 pure $ toTficfData (countNodesByNgramsWith f u')
83 (countNodesByNgramsWith f m')
84
85
86 type Context = (Double, Map Text (Double, Set Text))
87 type Supra = Context
88 type Infra = Context
89
90 toTficfData :: Infra -> Supra
91 -> Map Text (Double, Set Text)
92 toTficfData (ti, mi) (ts, ms) =
93 fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
94 (TficfSupra (Count $ maybe 0 fst $ Map.lookup t ms)(Total ts))
95 , ns
96 )
97 )
98 | (t, (n,ns)) <- toList mi
99 ]
100
101
102 -- | fst is size of Supra Corpus
103 -- snd is Texts and size of Occurrences (different docs)
104 countNodesByNgramsWith :: (Text -> Text)
105 -> Map Text (Set NodeId)
106 -> (Double, Map Text (Double, Set Text))
107 countNodesByNgramsWith f m = (total, m')
108 where
109 total = fromIntegral $ Set.size $ Set.unions $ elems m
110 m' = Map.map ( swap . second (fromIntegral . Set.size))
111 $ groupNodesByNgramsWith f m
112
113
114 groupNodesByNgramsWith :: (Text -> Text)
115 -> Map Text (Set NodeId)
116 -> Map Text (Set Text, Set NodeId)
117 groupNodesByNgramsWith f m =
118 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
119 $ toList m
120
121 ------------------------------------------------------------------------
122 getNodesByNgramsUser :: CorpusId
123 -> NgramsType
124 -> Cmd err (Map Text (Set NodeId))
125 getNodesByNgramsUser cId nt =
126 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
127 <$> selectNgramsByNodeUser cId nt
128 where
129
130 selectNgramsByNodeUser :: CorpusId -> NgramsType
131 -> Cmd err [(NodeId, Text)]
132 selectNgramsByNodeUser cId' nt' =
133 runPGSQuery queryNgramsByNodeUser
134 ( cId'
135 , nodeTypeId NodeDocument
136 , ngramsTypeId nt'
137 -- , 100 :: Int -- limit
138 -- , 0 :: Int -- offset
139 )
140
141 queryNgramsByNodeUser :: DPS.Query
142 queryNgramsByNodeUser = [sql|
143 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
144 JOIN ngrams ng ON nng.ngrams_id = ng.id
145 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
146 JOIN nodes n ON nn.node2_id = n.id
147 WHERE nn.node1_id = ? -- CorpusId
148 AND n.typename = ? -- NodeTypeId
149 AND nng.ngrams_type = ? -- NgramsTypeId
150 AND nn.category > 0
151 GROUP BY nng.node2_id, ng.terms
152 ORDER BY (nng.node2_id, ng.terms) DESC
153 -- LIMIT ?
154 -- OFFSET ?
155 |]
156 ------------------------------------------------------------------------
157 -- TODO add groups
158 getOccByNgramsOnlyFast :: CorpusId
159 -> NgramsType
160 -> [Text]
161 -> Cmd err (Map Text Int)
162 getOccByNgramsOnlyFast cId nt ngs =
163 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
164
165 -- just slower than getOccByNgramsOnlyFast
166 getOccByNgramsOnlySlow :: NodeType
167 -> CorpusId
168 -> [ListId]
169 -> NgramsType
170 -> [Text]
171 -> Cmd err (Map Text Int)
172 getOccByNgramsOnlySlow t cId ls nt ngs =
173 Map.map Set.size <$> getScore' t cId ls nt ngs
174 where
175 getScore' NodeCorpus = getNodesByNgramsOnlyUser
176 getScore' NodeDocument = getNgramsByDocOnlyUser
177 getScore' _ = getNodesByNgramsOnlyUser
178
179 getOccByNgramsOnlySafe :: CorpusId
180 -> [ListId]
181 -> NgramsType
182 -> [Text]
183 -> Cmd err (Map Text Int)
184 getOccByNgramsOnlySafe cId ls nt ngs = do
185 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
186 fast <- getOccByNgramsOnlyFast cId nt ngs
187 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
188 when (fast /= slow) $
189 printDebug "getOccByNgramsOnlySafe: difference" (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
190 pure slow
191
192
193 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId -> NgramsType -> [Text]
194 -> Cmd err [(Text, Int)]
195 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
196 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
197 ( Values fields (DPS.Only <$> tms)
198 , cId
199 , nodeTypeId NodeDocument
200 , ngramsTypeId nt
201 )
202 where
203 fields = [QualifiedIdentifier Nothing "text"]
204
205 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
206 -- Question: with the grouping is the result exactly the same (since Set NodeId for
207 -- equivalent ngrams intersections are not empty)
208 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
209 queryNgramsOccurrencesOnlyByNodeUser = [sql|
210 WITH input_rows(terms) AS (?)
211 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
212 JOIN ngrams ng ON nng.ngrams_id = ng.id
213 JOIN input_rows ir ON ir.terms = ng.terms
214 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
215 JOIN nodes n ON nn.node2_id = n.id
216 WHERE nn.node1_id = ? -- CorpusId
217 AND n.typename = ? -- NodeTypeId
218 AND nng.ngrams_type = ? -- NgramsTypeId
219 AND nn.category > 0
220 GROUP BY nng.node2_id, ng.terms
221 |]
222
223
224
225 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
226 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
227 WITH input_rows(terms) AS (?)
228 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
229 JOIN ngrams ng ON nng.ngrams_id = ng.id
230 JOIN input_rows ir ON ir.terms = ng.terms
231 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
232 JOIN nodes n ON nn.node2_id = n.id
233 WHERE nn.node1_id = ? -- CorpusId
234 AND n.typename = ? -- NodeTypeId
235 AND nng.ngrams_type = ? -- NgramsTypeId
236 AND nn.category > 0
237 GROUP BY nng.node2_id, ng.terms
238 |]
239
240
241
242
243 getNodesByNgramsOnlyUser :: NodeId -> [ListId] -> NgramsType -> [Text]
244 -> Cmd err (Map Text (Set NodeId))
245 getNodesByNgramsOnlyUser cId ls nt ngs =
246 Map.unionsWith (<>)
247 . map (fromListWith (<>) . map (second Set.singleton))
248 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
249 (splitEvery 1000 ngs)
250
251 selectNgramsOnlyByNodeUser :: CorpusId -> [ListId] -> NgramsType -> [Text]
252 -> Cmd err [(Text, NodeId)]
253 selectNgramsOnlyByNodeUser cId ls nt tms =
254 runPGSQuery queryNgramsOnlyByNodeUser
255 ( Values fields (DPS.Only <$> tms)
256 , Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
257 , cId
258 , nodeTypeId NodeDocument
259 , ngramsTypeId nt
260 )
261 where
262 fields = [QualifiedIdentifier Nothing "text"]
263
264 queryNgramsOnlyByNodeUser :: DPS.Query
265 queryNgramsOnlyByNodeUser = [sql|
266 WITH input_rows(terms) AS (?),
267 input_list(id) AS (?)
268 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
269 JOIN ngrams ng ON nng.ngrams_id = ng.id
270 JOIN input_rows ir ON ir.terms = ng.terms
271 JOIN input_list il ON il.id = nng.node1_id
272 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
273 JOIN nodes n ON nn.node2_id = n.id
274 WHERE nn.node1_id = ? -- CorpusId
275 AND n.typename = ? -- NodeTypeId
276 AND nng.ngrams_type = ? -- NgramsTypeId
277 AND nn.category > 0
278 GROUP BY ng.terms, nng.node2_id
279 |]
280
281
282 getNgramsByDocOnlyUser :: NodeId -> [ListId] -> NgramsType -> [Text]
283 -> Cmd err (Map Text (Set NodeId))
284 getNgramsByDocOnlyUser cId ls nt ngs =
285 Map.unionsWith (<>)
286 . map (fromListWith (<>) . map (second Set.singleton))
287 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
288
289
290 selectNgramsOnlyByDocUser :: DocId -> [ListId] -> NgramsType -> [Text]
291 -> Cmd err [(Text, NodeId)]
292 selectNgramsOnlyByDocUser dId ls nt tms =
293 runPGSQuery queryNgramsOnlyByDocUser
294 ( Values fields (DPS.Only <$> tms)
295 , Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
296 , dId
297 , ngramsTypeId nt
298 )
299 where
300 fields = [QualifiedIdentifier Nothing "text"]
301
302 queryNgramsOnlyByDocUser :: DPS.Query
303 queryNgramsOnlyByDocUser = [sql|
304 WITH input_rows(terms) AS (?),
305 input_list(id) AS (?)
306 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
307 JOIN ngrams ng ON nng.ngrams_id = ng.id
308 JOIN input_rows ir ON ir.terms = ng.terms
309 JOIN input_list il ON il.id = nng.node1_id
310 WHERE nng.node2_id = ? -- DocId
311 AND nng.ngrams_type = ? -- NgramsTypeId
312 GROUP BY ng.terms, nng.node2_id
313 |]
314
315 ------------------------------------------------------------------------
316 -- | TODO filter by language, database, any social field
317 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
318 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
319 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
320 -- . takeWhile (not . List.null)
321 -- . takeWhile (\l -> List.length l > 3)
322 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
323
324
325 selectNgramsByNodeMaster :: Int -> UserCorpusId -> MasterCorpusId -> Int -> Cmd err [(NodeId, Text)]
326 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
327 queryNgramsByNodeMaster'
328 ( ucId
329 , ngramsTypeId NgramsTerms
330 , nodeTypeId NodeDocument
331 , p
332 , nodeTypeId NodeDocument
333 , p
334 , n
335 , mcId
336 , nodeTypeId NodeDocument
337 , ngramsTypeId NgramsTerms
338 )
339
340 -- | TODO fix node_node_ngrams relation
341 queryNgramsByNodeMaster' :: DPS.Query
342 queryNgramsByNodeMaster' = [sql|
343 WITH nodesByNgramsUser AS (
344
345 SELECT n.id, ng.terms FROM nodes n
346 JOIN nodes_nodes nn ON n.id = nn.node2_id
347 JOIN node_node_ngrams nng ON nng.node2_id = n.id
348 JOIN ngrams ng ON nng.ngrams_id = ng.id
349 WHERE nn.node1_id = ? -- UserCorpusId
350 -- AND n.typename = ? -- NodeTypeId
351 AND nng.ngrams_type = ? -- NgramsTypeId
352 AND nn.category > 0
353 AND node_pos(n.id,?) >= ?
354 AND node_pos(n.id,?) < ?
355 GROUP BY n.id, ng.terms
356
357 ),
358
359 nodesByNgramsMaster AS (
360
361 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
362 JOIN node_node_ngrams nng ON n.id = nng.node2_id
363 JOIN ngrams ng ON ng.id = nng.ngrams_id
364
365 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
366 AND n.typename = ? -- NodeTypeId
367 AND nng.ngrams_type = ? -- NgramsTypeId
368 GROUP BY n.id, ng.terms
369 )
370
371 SELECT m.id, m.terms FROM nodesByNgramsMaster m
372 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id
373 |]
374
375