]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByNode.hs
Merge branch 'dev-cbor' into dev
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByNode.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByNode
17 where
18
19 import Data.Map.Strict (Map, fromListWith, elems, toList, fromList)
20 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.Set (Set)
22 import Data.Text (Text)
23 import Data.Tuple.Extra (second, swap)
24 import Database.PostgreSQL.Simple.SqlQQ (sql)
25 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
26 import Debug.Trace (trace)
27 import Gargantext.Core (Lang(..))
28 import Gargantext.Database.Admin.Config (nodeTypeId)
29 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
30 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
31 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
32 import Gargantext.Prelude
33 import Gargantext.Core.Text.Metrics.TFICF
34 import Gargantext.Core.Text.Terms.Mono.Stem (stem)
35 import qualified Data.List as List
36 import qualified Data.Map.Strict as Map
37 import qualified Data.Set as Set
38 import qualified Data.Text as Text
39 import qualified Database.PostgreSQL.Simple as DPS
40
41 -- | TODO: group with 2 terms only can be
42 -- discussed. Main purpose of this is offering
43 -- a first grouping option to user and get some
44 -- enriched data to better learn and improve that algo
45 ngramsGroup :: Lang
46 -> Int
47 -> Int
48 -> Text
49 -> Text
50 ngramsGroup l _m _n = Text.intercalate " "
51 . map (stem l)
52 -- . take n
53 . List.sort
54 -- . (List.filter (\t -> Text.length t > m))
55 . Text.splitOn " "
56 . Text.replace "-" " "
57
58
59 getTficf :: UserCorpusId
60 -> MasterCorpusId
61 -> NgramsType
62 -> (Text -> Text)
63 -> Cmd err (Map Text (Double, Set Text))
64 getTficf u m nt f = do
65 u' <- Map.filter (\s -> Set.size s > 1) <$> getNodesByNgramsUser u nt
66 m' <- Map.filter (\s -> Set.size s > 1) <$> getNodesByNgramsMaster u m
67
68 pure $ toTficfData (countNodesByNgramsWith f u')
69 (countNodesByNgramsWith f m')
70
71 {-
72 getTficfWith :: UserCorpusId
73 -> MasterCorpusId
74 -> [ListId]
75 -> NgramsType
76 -> Map Text (Maybe Text)
77 -> Cmd err (Map Text (Double, Set Text))
78 getTficfWith u m ls nt mtxt = do
79 u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
80 m' <- getNodesByNgramsMaster u m
81
82 let f x = case Map.lookup x mtxt of
83 Nothing -> x
84 Just x' -> maybe x identity x'
85
86 pure $ toTficfData (countNodesByNgramsWith f u') (countNodesByNgramsWith f m')
87 -}
88
89 type Context = (Double, Map Text (Double, Set Text))
90 type Supra = Context
91 type Infra = Context
92
93 toTficfData :: Infra
94 -> Supra
95 -> Map Text (Double, Set Text)
96 toTficfData (ti, mi) (ts, ms) =
97 fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
98 (TficfSupra (Count $ maybe 0 fst $ Map.lookup t ms)(Total ts))
99 , ns
100 )
101 )
102 | (t, (n,ns)) <- toList mi
103 ]
104
105
106 -- | fst is size of Supra Corpus
107 -- snd is Texts and size of Occurrences (different docs)
108 countNodesByNgramsWith :: (Text -> Text)
109 -> Map Text (Set NodeId)
110 -> (Double, Map Text (Double, Set Text))
111 countNodesByNgramsWith f m = (total, m')
112 where
113 total = fromIntegral $ Set.size $ Set.unions $ elems m
114 m' = Map.map ( swap . second (fromIntegral . Set.size))
115 $ groupNodesByNgramsWith f m
116
117
118 groupNodesByNgramsWith :: (Text -> Text)
119 -> Map Text (Set NodeId)
120 -> Map Text (Set Text, Set NodeId)
121 groupNodesByNgramsWith f m =
122 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
123 $ toList m
124
125 ------------------------------------------------------------------------
126 getNodesByNgramsUser :: CorpusId
127 -> NgramsType
128 -> Cmd err (Map Text (Set NodeId))
129 getNodesByNgramsUser cId nt =
130 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
131 <$> selectNgramsByNodeUser cId nt
132 where
133
134 selectNgramsByNodeUser :: CorpusId
135 -> NgramsType
136 -> Cmd err [(NodeId, Text)]
137 selectNgramsByNodeUser cId' nt' =
138 runPGSQuery queryNgramsByNodeUser
139 ( cId'
140 , nodeTypeId NodeDocument
141 , ngramsTypeId nt'
142 -- , 100 :: Int -- limit
143 -- , 0 :: Int -- offset
144 )
145
146 queryNgramsByNodeUser :: DPS.Query
147 queryNgramsByNodeUser = [sql|
148 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
149 JOIN ngrams ng ON nng.ngrams_id = ng.id
150 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
151 JOIN nodes n ON nn.node2_id = n.id
152 WHERE nn.node1_id = ? -- CorpusId
153 AND n.typename = ? -- NodeTypeId
154 AND nng.ngrams_type = ? -- NgramsTypeId
155 AND nn.category > 0
156 GROUP BY nng.node2_id, ng.terms
157 ORDER BY (nng.node2_id, ng.terms) DESC
158 -- LIMIT ?
159 -- OFFSET ?
160 |]
161 ------------------------------------------------------------------------
162 -- TODO add groups
163 getOccByNgramsOnlyFast :: CorpusId
164 -> NgramsType
165 -> [Text]
166 -> Cmd err (Map Text Int)
167 getOccByNgramsOnlyFast cId nt ngs =
168 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
169
170
171 getOccByNgramsOnlyFast' :: CorpusId
172 -> ListId
173 -> NgramsType
174 -> [Text]
175 -> Cmd err (Map Text Int)
176 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
177 fromListWith (+) <$> map (second round) <$> run cId lId nt tms
178
179 where
180 fields = [QualifiedIdentifier Nothing "text"]
181
182 run :: CorpusId
183 -> ListId
184 -> NgramsType
185 -> [Text]
186 -> Cmd err [(Text, Double)]
187 run cId' lId' nt' tms' = runPGSQuery query
188 ( Values fields (DPS.Only <$> tms')
189 , cId'
190 , lId'
191 , ngramsTypeId nt'
192 )
193
194 query :: DPS.Query
195 query = [sql|
196 WITH input_rows(terms) AS (?)
197 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
198 JOIN ngrams ng ON nng.ngrams_id = ng.id
199 JOIN input_rows ir ON ir.terms = ng.terms
200 WHERE nng.node1_id = ? -- CorpusId
201 AND nng.node2_id = ? -- ListId
202 AND nng.ngrams_type = ? -- NgramsTypeId
203 -- AND nn.category > 0 -- TODO
204 GROUP BY ng.terms, nng.weight
205 |]
206
207
208 -- just slower than getOccByNgramsOnlyFast
209 getOccByNgramsOnlySlow :: NodeType
210 -> CorpusId
211 -> [ListId]
212 -> NgramsType
213 -> [Text]
214 -> Cmd err (Map Text Int)
215 getOccByNgramsOnlySlow t cId ls nt ngs =
216 Map.map Set.size <$> getScore' t cId ls nt ngs
217 where
218 getScore' NodeCorpus = getNodesByNgramsOnlyUser
219 getScore' NodeDocument = getNgramsByDocOnlyUser
220 getScore' _ = getNodesByNgramsOnlyUser
221
222 getOccByNgramsOnlySafe :: CorpusId
223 -> [ListId]
224 -> NgramsType
225 -> [Text]
226 -> Cmd err (Map Text Int)
227 getOccByNgramsOnlySafe cId ls nt ngs = do
228 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
229 fast <- getOccByNgramsOnlyFast cId nt ngs
230 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
231 when (fast /= slow) $
232 printDebug "getOccByNgramsOnlySafe: difference"
233 (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
234 pure slow
235
236
237 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
238 -> NgramsType
239 -> [Text]
240 -> Cmd err [(Text, Int)]
241 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
242 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
243 ( Values fields (DPS.Only <$> tms)
244 , cId
245 , nodeTypeId NodeDocument
246 , ngramsTypeId nt
247 )
248 where
249 fields = [QualifiedIdentifier Nothing "text"]
250
251 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
252 -- Question: with the grouping is the result exactly the same (since Set NodeId for
253 -- equivalent ngrams intersections are not empty)
254 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
255 queryNgramsOccurrencesOnlyByNodeUser = [sql|
256 WITH input_rows(terms) AS (?)
257 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
258 JOIN ngrams ng ON nng.ngrams_id = ng.id
259 JOIN input_rows ir ON ir.terms = ng.terms
260 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
261 JOIN nodes n ON nn.node2_id = n.id
262 WHERE nn.node1_id = ? -- CorpusId
263 AND n.typename = ? -- NodeTypeId
264 AND nng.ngrams_type = ? -- NgramsTypeId
265 AND nn.category > 0
266 GROUP BY nng.node2_id, ng.terms
267 |]
268
269 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
270 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
271 WITH input_rows(terms) AS (?)
272 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
273 JOIN ngrams ng ON nng.ngrams_id = ng.id
274 JOIN input_rows ir ON ir.terms = ng.terms
275 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
276 JOIN nodes n ON nn.node2_id = n.id
277 WHERE nn.node1_id = ? -- CorpusId
278 AND n.typename = ? -- NodeTypeId
279 AND nng.ngrams_type = ? -- NgramsTypeId
280 AND nn.category > 0
281 GROUP BY nng.node2_id, ng.terms
282 |]
283
284 ------------------------------------------------------------------------
285 getNodesByNgramsOnlyUser :: NodeId
286 -> [ListId]
287 -> NgramsType
288 -> [Text]
289 -> Cmd err (Map Text (Set NodeId))
290 getNodesByNgramsOnlyUser cId ls nt ngs =
291 Map.unionsWith (<>)
292 . map (fromListWith (<>)
293 . map (second Set.singleton))
294 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
295 (splitEvery 1000 ngs)
296
297
298 getNgramsByNodeOnlyUser :: NodeId
299 -> [ListId]
300 -> NgramsType
301 -> [Text]
302 -> Cmd err (Map NodeId (Set Text))
303 getNgramsByNodeOnlyUser cId ls nt ngs =
304 Map.unionsWith (<>)
305 . map (fromListWith (<>)
306 . map (second Set.singleton))
307 . map (map swap)
308 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
309 (splitEvery 1000 ngs)
310
311 ------------------------------------------------------------------------
312 selectNgramsOnlyByNodeUser :: CorpusId
313 -> [ListId]
314 -> NgramsType
315 -> [Text]
316 -> Cmd err [(Text, NodeId)]
317 selectNgramsOnlyByNodeUser cId ls nt tms =
318 runPGSQuery queryNgramsOnlyByNodeUser
319 ( Values fields (DPS.Only <$> tms)
320 , Values [QualifiedIdentifier Nothing "int4"]
321 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
322 , cId
323 , nodeTypeId NodeDocument
324 , ngramsTypeId nt
325 )
326 where
327 fields = [QualifiedIdentifier Nothing "text"]
328
329 queryNgramsOnlyByNodeUser :: DPS.Query
330 queryNgramsOnlyByNodeUser = [sql|
331 WITH input_rows(terms) AS (?),
332 input_list(id) AS (?)
333 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
334 JOIN ngrams ng ON nng.ngrams_id = ng.id
335 JOIN input_rows ir ON ir.terms = ng.terms
336 JOIN input_list il ON il.id = nng.node1_id
337 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
338 JOIN nodes n ON nn.node2_id = n.id
339 WHERE nn.node1_id = ? -- CorpusId
340 AND n.typename = ? -- NodeTypeId
341 AND nng.ngrams_type = ? -- NgramsTypeId
342 AND nn.category > 0
343 GROUP BY ng.terms, nng.node2_id
344 |]
345
346
347 selectNgramsOnlyByNodeUser' :: CorpusId
348 -> [ListId]
349 -> NgramsType
350 -> [Text]
351 -> Cmd err [(Text, Int)]
352 selectNgramsOnlyByNodeUser' cId ls nt tms =
353 runPGSQuery queryNgramsOnlyByNodeUser
354 ( Values fields (DPS.Only <$> tms)
355 , Values [QualifiedIdentifier Nothing "int4"]
356 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
357 , cId
358 , nodeTypeId NodeDocument
359 , ngramsTypeId nt
360 )
361 where
362 fields = [QualifiedIdentifier Nothing "text"]
363
364 queryNgramsOnlyByNodeUser' :: DPS.Query
365 queryNgramsOnlyByNodeUser' = [sql|
366 WITH input_rows(terms) AS (?),
367 input_list(id) AS (?)
368 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
369 JOIN ngrams ng ON nng.ngrams_id = ng.id
370 JOIN input_rows ir ON ir.terms = ng.terms
371 JOIN input_list il ON il.id = nng.node2_id
372 WHERE nng.node1_id = ? -- CorpusId
373 AND nng.ngrams_type = ? -- NgramsTypeId
374 -- AND nn.category > 0
375 GROUP BY ng.terms, nng.weight
376 |]
377
378
379 getNgramsByDocOnlyUser :: NodeId
380 -> [ListId]
381 -> NgramsType
382 -> [Text]
383 -> Cmd err (Map Text (Set NodeId))
384 getNgramsByDocOnlyUser cId ls nt ngs =
385 Map.unionsWith (<>)
386 . map (fromListWith (<>) . map (second Set.singleton))
387 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
388
389
390 selectNgramsOnlyByDocUser :: DocId
391 -> [ListId]
392 -> NgramsType
393 -> [Text]
394 -> Cmd err [(Text, NodeId)]
395 selectNgramsOnlyByDocUser dId ls nt tms =
396 runPGSQuery queryNgramsOnlyByDocUser
397 ( Values fields (DPS.Only <$> tms)
398 , Values [QualifiedIdentifier Nothing "int4"]
399 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
400 , dId
401 , ngramsTypeId nt
402 )
403 where
404 fields = [QualifiedIdentifier Nothing "text"]
405
406
407 queryNgramsOnlyByDocUser :: DPS.Query
408 queryNgramsOnlyByDocUser = [sql|
409 WITH input_rows(terms) AS (?),
410 input_list(id) AS (?)
411 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
412 JOIN ngrams ng ON nng.ngrams_id = ng.id
413 JOIN input_rows ir ON ir.terms = ng.terms
414 JOIN input_list il ON il.id = nng.node1_id
415 WHERE nng.node2_id = ? -- DocId
416 AND nng.ngrams_type = ? -- NgramsTypeId
417 GROUP BY ng.terms, nng.node2_id
418 |]
419
420 ------------------------------------------------------------------------
421 -- | TODO filter by language, database, any social field
422 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
423 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
424 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
425 -- . takeWhile (not . List.null)
426 -- . takeWhile (\l -> List.length l > 3)
427 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
428
429 selectNgramsByNodeMaster :: Int
430 -> UserCorpusId
431 -> MasterCorpusId
432 -> Int
433 -> Cmd err [(NodeId, Text)]
434 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
435 queryNgramsByNodeMaster'
436 ( ucId
437 , ngramsTypeId NgramsTerms
438 , nodeTypeId NodeDocument
439 , p
440 , nodeTypeId NodeDocument
441 , p
442 , n
443 , mcId
444 , nodeTypeId NodeDocument
445 , ngramsTypeId NgramsTerms
446 )
447
448 -- | TODO fix node_node_ngrams relation
449 queryNgramsByNodeMaster' :: DPS.Query
450 queryNgramsByNodeMaster' = [sql|
451 WITH nodesByNgramsUser AS (
452
453 SELECT n.id, ng.terms FROM nodes n
454 JOIN nodes_nodes nn ON n.id = nn.node2_id
455 JOIN node_node_ngrams nng ON nng.node2_id = n.id
456 JOIN ngrams ng ON nng.ngrams_id = ng.id
457 WHERE nn.node1_id = ? -- UserCorpusId
458 -- AND n.typename = ? -- NodeTypeId
459 AND nng.ngrams_type = ? -- NgramsTypeId
460 AND nn.category > 0
461 AND node_pos(n.id,?) >= ?
462 AND node_pos(n.id,?) < ?
463 GROUP BY n.id, ng.terms
464
465 ),
466
467 nodesByNgramsMaster AS (
468
469 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
470 JOIN node_node_ngrams nng ON n.id = nng.node2_id
471 JOIN ngrams ng ON ng.id = nng.ngrams_id
472
473 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
474 AND n.typename = ? -- NodeTypeId
475 AND nng.ngrams_type = ? -- NgramsTypeId
476 GROUP BY n.id, ng.terms
477 )
478
479 SELECT m.id, m.terms FROM nodesByNgramsMaster m
480 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id
481 |]