]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByNode.hs
[OPTIM + FIX] TFICF
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByNode.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByNode
17 where
18
19 import Data.Map.Strict (Map, fromListWith, elems, toList)
20 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.Set (Set)
22 import Data.Text (Text)
23 import Data.Tuple.Extra (second, swap)
24 import Database.PostgreSQL.Simple.SqlQQ (sql)
25 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
26 import Debug.Trace (trace)
27 import Gargantext.Core (Lang(..))
28 import Gargantext.Database.Admin.Config (nodeTypeId)
29 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
30 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
31 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
32 import Gargantext.Prelude
33 import Gargantext.Core.Text.Terms.Mono.Stem (stem)
34 import qualified Data.List as List
35 import qualified Data.Map.Strict as Map
36 import qualified Data.Set as Set
37 import qualified Data.Text as Text
38 import qualified Database.PostgreSQL.Simple as DPS
39
40 -- | TODO: group with 2 terms only can be
41 -- discussed. Main purpose of this is offering
42 -- a first grouping option to user and get some
43 -- enriched data to better learn and improve that algo
44 ngramsGroup :: Lang
45 -> Int
46 -> Int
47 -> Text
48 -> Text
49 ngramsGroup l _m _n = Text.intercalate " "
50 . map (stem l)
51 -- . take n
52 . List.sort
53 -- . (List.filter (\t -> Text.length t > m))
54 . Text.splitOn " "
55 . Text.replace "-" " "
56
57
58
59 -- | fst is size of Supra Corpus
60 -- snd is Texts and size of Occurrences (different docs)
61 countNodesByNgramsWith :: (Text -> Text)
62 -> Map Text (Set NodeId)
63 -> (Double, Map Text (Double, Set Text))
64 countNodesByNgramsWith f m = (total, m')
65 where
66 total = fromIntegral $ Set.size $ Set.unions $ elems m
67 m' = Map.map ( swap . second (fromIntegral . Set.size))
68 $ groupNodesByNgramsWith f m
69
70
71 groupNodesByNgramsWith :: (Text -> Text)
72 -> Map Text (Set NodeId)
73 -> Map Text (Set Text, Set NodeId)
74 groupNodesByNgramsWith f m =
75 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
76 $ toList m
77
78 ------------------------------------------------------------------------
79 getNodesByNgramsUser :: CorpusId
80 -> NgramsType
81 -> Cmd err (Map Text (Set NodeId))
82 getNodesByNgramsUser cId nt =
83 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
84 <$> selectNgramsByNodeUser cId nt
85 where
86
87 selectNgramsByNodeUser :: CorpusId
88 -> NgramsType
89 -> Cmd err [(NodeId, Text)]
90 selectNgramsByNodeUser cId' nt' =
91 runPGSQuery queryNgramsByNodeUser
92 ( cId'
93 , nodeTypeId NodeDocument
94 , ngramsTypeId nt'
95 -- , 100 :: Int -- limit
96 -- , 0 :: Int -- offset
97 )
98
99 queryNgramsByNodeUser :: DPS.Query
100 queryNgramsByNodeUser = [sql|
101 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
102 JOIN ngrams ng ON nng.ngrams_id = ng.id
103 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
104 JOIN nodes n ON nn.node2_id = n.id
105 WHERE nn.node1_id = ? -- CorpusId
106 AND n.typename = ? -- NodeTypeId
107 AND nng.ngrams_type = ? -- NgramsTypeId
108 AND nn.category > 0
109 GROUP BY nng.node2_id, ng.terms
110 ORDER BY (nng.node2_id, ng.terms) DESC
111 -- LIMIT ?
112 -- OFFSET ?
113 |]
114 ------------------------------------------------------------------------
115 -- TODO add groups
116 getOccByNgramsOnlyFast :: CorpusId
117 -> NgramsType
118 -> [Text]
119 -> Cmd err (Map Text Int)
120 getOccByNgramsOnlyFast cId nt ngs =
121 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
122
123
124 getOccByNgramsOnlyFast' :: CorpusId
125 -> ListId
126 -> NgramsType
127 -> [Text]
128 -> Cmd err (Map Text Int)
129 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
130 fromListWith (+) <$> map (second round) <$> run cId lId nt tms
131
132 where
133 fields = [QualifiedIdentifier Nothing "text"]
134
135 run :: CorpusId
136 -> ListId
137 -> NgramsType
138 -> [Text]
139 -> Cmd err [(Text, Double)]
140 run cId' lId' nt' tms' = runPGSQuery query
141 ( Values fields (DPS.Only <$> tms')
142 , cId'
143 , lId'
144 , ngramsTypeId nt'
145 )
146
147 query :: DPS.Query
148 query = [sql|
149 WITH input_rows(terms) AS (?)
150 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
151 JOIN ngrams ng ON nng.ngrams_id = ng.id
152 JOIN input_rows ir ON ir.terms = ng.terms
153 WHERE nng.node1_id = ? -- CorpusId
154 AND nng.node2_id = ? -- ListId
155 AND nng.ngrams_type = ? -- NgramsTypeId
156 -- AND nn.category > 0 -- TODO
157 GROUP BY ng.terms, nng.weight
158 |]
159
160
161 -- just slower than getOccByNgramsOnlyFast
162 getOccByNgramsOnlySlow :: NodeType
163 -> CorpusId
164 -> [ListId]
165 -> NgramsType
166 -> [Text]
167 -> Cmd err (Map Text Int)
168 getOccByNgramsOnlySlow t cId ls nt ngs =
169 Map.map Set.size <$> getScore' t cId ls nt ngs
170 where
171 getScore' NodeCorpus = getNodesByNgramsOnlyUser
172 getScore' NodeDocument = getNgramsByDocOnlyUser
173 getScore' _ = getNodesByNgramsOnlyUser
174
175 getOccByNgramsOnlySafe :: CorpusId
176 -> [ListId]
177 -> NgramsType
178 -> [Text]
179 -> Cmd err (Map Text Int)
180 getOccByNgramsOnlySafe cId ls nt ngs = do
181 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
182 fast <- getOccByNgramsOnlyFast cId nt ngs
183 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
184 when (fast /= slow) $
185 printDebug "getOccByNgramsOnlySafe: difference"
186 (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
187 pure slow
188
189
190 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
191 -> NgramsType
192 -> [Text]
193 -> Cmd err [(Text, Int)]
194 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
195 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
196 ( Values fields (DPS.Only <$> tms)
197 , cId
198 , nodeTypeId NodeDocument
199 , ngramsTypeId nt
200 )
201 where
202 fields = [QualifiedIdentifier Nothing "text"]
203
204 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
205 -- Question: with the grouping is the result exactly the same (since Set NodeId for
206 -- equivalent ngrams intersections are not empty)
207 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
208 queryNgramsOccurrencesOnlyByNodeUser = [sql|
209 WITH input_rows(terms) AS (?)
210 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
211 JOIN ngrams ng ON nng.ngrams_id = ng.id
212 JOIN input_rows ir ON ir.terms = ng.terms
213 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
214 JOIN nodes n ON nn.node2_id = n.id
215 WHERE nn.node1_id = ? -- CorpusId
216 AND n.typename = ? -- NodeTypeId
217 AND nng.ngrams_type = ? -- NgramsTypeId
218 AND nn.category > 0
219 GROUP BY nng.node2_id, ng.terms
220 |]
221
222 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
223 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
224 WITH input_rows(terms) AS (?)
225 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
226 JOIN ngrams ng ON nng.ngrams_id = ng.id
227 JOIN input_rows ir ON ir.terms = ng.terms
228 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
229 JOIN nodes n ON nn.node2_id = n.id
230 WHERE nn.node1_id = ? -- CorpusId
231 AND n.typename = ? -- NodeTypeId
232 AND nng.ngrams_type = ? -- NgramsTypeId
233 AND nn.category > 0
234 GROUP BY nng.node2_id, ng.terms
235 |]
236
237 ------------------------------------------------------------------------
238 getNodesByNgramsOnlyUser :: NodeId
239 -> [ListId]
240 -> NgramsType
241 -> [Text]
242 -> Cmd err (Map Text (Set NodeId))
243 getNodesByNgramsOnlyUser cId ls nt ngs =
244 Map.unionsWith (<>)
245 . map (fromListWith (<>)
246 . map (second Set.singleton))
247 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
248 (splitEvery 1000 ngs)
249
250
251 getNgramsByNodeOnlyUser :: NodeId
252 -> [ListId]
253 -> NgramsType
254 -> [Text]
255 -> Cmd err (Map NodeId (Set Text))
256 getNgramsByNodeOnlyUser cId ls nt ngs =
257 Map.unionsWith (<>)
258 . map (fromListWith (<>)
259 . map (second Set.singleton))
260 . map (map swap)
261 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
262 (splitEvery 1000 ngs)
263
264 ------------------------------------------------------------------------
265 selectNgramsOnlyByNodeUser :: CorpusId
266 -> [ListId]
267 -> NgramsType
268 -> [Text]
269 -> Cmd err [(Text, NodeId)]
270 selectNgramsOnlyByNodeUser cId ls nt tms =
271 runPGSQuery queryNgramsOnlyByNodeUser
272 ( Values fields (DPS.Only <$> tms)
273 , Values [QualifiedIdentifier Nothing "int4"]
274 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
275 , cId
276 , nodeTypeId NodeDocument
277 , ngramsTypeId nt
278 )
279 where
280 fields = [QualifiedIdentifier Nothing "text"]
281
282 queryNgramsOnlyByNodeUser :: DPS.Query
283 queryNgramsOnlyByNodeUser = [sql|
284 WITH input_rows(terms) AS (?),
285 input_list(id) AS (?)
286 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
287 JOIN ngrams ng ON nng.ngrams_id = ng.id
288 JOIN input_rows ir ON ir.terms = ng.terms
289 JOIN input_list il ON il.id = nng.node1_id
290 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
291 JOIN nodes n ON nn.node2_id = n.id
292 WHERE nn.node1_id = ? -- CorpusId
293 AND n.typename = ? -- NodeTypeId
294 AND nng.ngrams_type = ? -- NgramsTypeId
295 AND nn.category > 0
296 GROUP BY ng.terms, nng.node2_id
297 |]
298
299
300 selectNgramsOnlyByNodeUser' :: CorpusId
301 -> [ListId]
302 -> NgramsType
303 -> [Text]
304 -> Cmd err [(Text, Int)]
305 selectNgramsOnlyByNodeUser' cId ls nt tms =
306 runPGSQuery queryNgramsOnlyByNodeUser
307 ( Values fields (DPS.Only <$> tms)
308 , Values [QualifiedIdentifier Nothing "int4"]
309 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
310 , cId
311 , nodeTypeId NodeDocument
312 , ngramsTypeId nt
313 )
314 where
315 fields = [QualifiedIdentifier Nothing "text"]
316
317 queryNgramsOnlyByNodeUser' :: DPS.Query
318 queryNgramsOnlyByNodeUser' = [sql|
319 WITH input_rows(terms) AS (?),
320 input_list(id) AS (?)
321 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
322 JOIN ngrams ng ON nng.ngrams_id = ng.id
323 JOIN input_rows ir ON ir.terms = ng.terms
324 JOIN input_list il ON il.id = nng.node2_id
325 WHERE nng.node1_id = ? -- CorpusId
326 AND nng.ngrams_type = ? -- NgramsTypeId
327 -- AND nn.category > 0
328 GROUP BY ng.terms, nng.weight
329 |]
330
331
332 getNgramsByDocOnlyUser :: NodeId
333 -> [ListId]
334 -> NgramsType
335 -> [Text]
336 -> Cmd err (Map Text (Set NodeId))
337 getNgramsByDocOnlyUser cId ls nt ngs =
338 Map.unionsWith (<>)
339 . map (fromListWith (<>) . map (second Set.singleton))
340 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
341
342
343 selectNgramsOnlyByDocUser :: DocId
344 -> [ListId]
345 -> NgramsType
346 -> [Text]
347 -> Cmd err [(Text, NodeId)]
348 selectNgramsOnlyByDocUser dId ls nt tms =
349 runPGSQuery queryNgramsOnlyByDocUser
350 ( Values fields (DPS.Only <$> tms)
351 , Values [QualifiedIdentifier Nothing "int4"]
352 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
353 , dId
354 , ngramsTypeId nt
355 )
356 where
357 fields = [QualifiedIdentifier Nothing "text"]
358
359
360 queryNgramsOnlyByDocUser :: DPS.Query
361 queryNgramsOnlyByDocUser = [sql|
362 WITH input_rows(terms) AS (?),
363 input_list(id) AS (?)
364 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
365 JOIN ngrams ng ON nng.ngrams_id = ng.id
366 JOIN input_rows ir ON ir.terms = ng.terms
367 JOIN input_list il ON il.id = nng.node1_id
368 WHERE nng.node2_id = ? -- DocId
369 AND nng.ngrams_type = ? -- NgramsTypeId
370 GROUP BY ng.terms, nng.node2_id
371 |]
372
373 ------------------------------------------------------------------------
374 -- | TODO filter by language, database, any social field
375 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
376 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
377 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
378 -- . takeWhile (not . List.null)
379 -- . takeWhile (\l -> List.length l > 3)
380 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
381
382 selectNgramsByNodeMaster :: Int
383 -> UserCorpusId
384 -> MasterCorpusId
385 -> Int
386 -> Cmd err [(NodeId, Text)]
387 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
388 queryNgramsByNodeMaster'
389 ( ucId
390 , ngramsTypeId NgramsTerms
391 , nodeTypeId NodeDocument
392 , p
393 , nodeTypeId NodeDocument
394 , p
395 , n
396 , mcId
397 , nodeTypeId NodeDocument
398 , ngramsTypeId NgramsTerms
399 )
400
401 -- | TODO fix node_node_ngrams relation
402 queryNgramsByNodeMaster' :: DPS.Query
403 queryNgramsByNodeMaster' = [sql|
404 WITH nodesByNgramsUser AS (
405
406 SELECT n.id, ng.terms FROM nodes n
407 JOIN nodes_nodes nn ON n.id = nn.node2_id
408 JOIN node_node_ngrams nng ON nng.node2_id = n.id
409 JOIN ngrams ng ON nng.ngrams_id = ng.id
410 WHERE nn.node1_id = ? -- UserCorpusId
411 -- AND n.typename = ? -- NodeTypeId
412 AND nng.ngrams_type = ? -- NgramsTypeId
413 AND nn.category > 0
414 AND node_pos(n.id,?) >= ?
415 AND node_pos(n.id,?) < ?
416 GROUP BY n.id, ng.terms
417
418 ),
419
420 nodesByNgramsMaster AS (
421
422 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
423 JOIN node_node_ngrams nng ON n.id = nng.node2_id
424 JOIN ngrams ng ON ng.id = nng.ngrams_id
425
426 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
427 AND n.typename = ? -- NodeTypeId
428 AND nng.ngrams_type = ? -- NgramsTypeId
429 GROUP BY n.id, ng.terms
430 )
431
432 SELECT m.id, m.terms FROM nodesByNgramsMaster m
433 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id
434 |]