]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByNode.hs
[WIP/DB] Refactoring (start).
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByNode.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE FlexibleContexts #-}
15 {-# LANGUAGE QuasiQuotes #-}
16 {-# LANGUAGE NoImplicitPrelude #-}
17 {-# LANGUAGE OverloadedStrings #-}
18 {-# LANGUAGE RankNTypes #-}
19
20 module Gargantext.Database.Action.Metrics.NgramsByNode
21 where
22
23 import Debug.Trace (trace)
24 import Data.Map.Strict (Map, fromListWith, elems, toList, fromList)
25 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
26 import Data.Set (Set)
27 import Data.Text (Text)
28 import Data.Tuple.Extra (second, swap)
29 import Database.PostgreSQL.Simple.SqlQQ (sql)
30 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
31 import Gargantext.Core (Lang(..))
32 import Gargantext.Database.Config (nodeTypeId)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Database.Types.Node -- (ListId, CorpusId, NodeId)
35 import Gargantext.Database.Utils (Cmd, runPGSQuery)
36 import Gargantext.Prelude
37 import Gargantext.Text.Metrics.TFICF
38 import Gargantext.Text.Terms.Mono.Stem (stem)
39 import qualified Data.List as List
40 import qualified Data.Map.Strict as Map
41 import qualified Data.Set as Set
42 import qualified Data.Text as Text
43 import qualified Database.PostgreSQL.Simple as DPS
44
45 -- | TODO: group with 2 terms only can be
46 -- discussed. Main purpose of this is offering
47 -- a first grouping option to user and get some
48 -- enriched data to better learn and improve that algo
49 ngramsGroup :: Lang -> Int -> Int -> Text -> Text
50 ngramsGroup l _m _n = Text.intercalate " "
51 . map (stem l)
52 -- . take n
53 . List.sort
54 -- . (List.filter (\t -> Text.length t > m))
55 . Text.splitOn " "
56 . Text.replace "-" " "
57
58
59 sortTficf :: (Map Text (Double, Set Text))
60 -> [ (Text,(Double, Set Text))]
61 sortTficf = List.sortOn (fst . snd) . toList
62
63
64 getTficf' :: UserCorpusId -> MasterCorpusId -> NgramsType -> (Text -> Text)
65 -> Cmd err (Map Text (Double, Set Text))
66 getTficf' u m nt f = do
67 u' <- getNodesByNgramsUser u nt
68 m' <- getNodesByNgramsMaster u m
69
70 pure $ toTficfData (countNodesByNgramsWith f u')
71 (countNodesByNgramsWith f m')
72
73 getTficfWith :: UserCorpusId -> MasterCorpusId -> [ListId]
74 -> NgramsType -> Map Text (Maybe Text)
75 -> Cmd err (Map Text (Double, Set Text))
76 getTficfWith u m ls nt mtxt = do
77 u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
78 m' <- getNodesByNgramsMaster u m
79
80 let f x = case Map.lookup x mtxt of
81 Nothing -> x
82 Just x' -> maybe x identity x'
83
84 pure $ toTficfData (countNodesByNgramsWith f u')
85 (countNodesByNgramsWith f m')
86
87
88 type Context = (Double, Map Text (Double, Set Text))
89 type Supra = Context
90 type Infra = Context
91
92 toTficfData :: Infra -> Supra
93 -> Map Text (Double, Set Text)
94 toTficfData (ti, mi) (ts, ms) =
95 fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
96 (TficfSupra (Count $ maybe 0 fst $ Map.lookup t ms)(Total ts))
97 , ns
98 )
99 )
100 | (t, (n,ns)) <- toList mi
101 ]
102
103
104 -- | fst is size of Supra Corpus
105 -- snd is Texts and size of Occurrences (different docs)
106 countNodesByNgramsWith :: (Text -> Text)
107 -> Map Text (Set NodeId)
108 -> (Double, Map Text (Double, Set Text))
109 countNodesByNgramsWith f m = (total, m')
110 where
111 total = fromIntegral $ Set.size $ Set.unions $ elems m
112 m' = Map.map ( swap . second (fromIntegral . Set.size))
113 $ groupNodesByNgramsWith f m
114
115
116 groupNodesByNgramsWith :: (Text -> Text)
117 -> Map Text (Set NodeId)
118 -> Map Text (Set Text, Set NodeId)
119 groupNodesByNgramsWith f m =
120 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
121 $ toList m
122
123 ------------------------------------------------------------------------
124 getNodesByNgramsUser :: CorpusId
125 -> NgramsType
126 -> Cmd err (Map Text (Set NodeId))
127 getNodesByNgramsUser cId nt =
128 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
129 <$> selectNgramsByNodeUser cId nt
130 where
131
132 selectNgramsByNodeUser :: CorpusId -> NgramsType
133 -> Cmd err [(NodeId, Text)]
134 selectNgramsByNodeUser cId' nt' =
135 runPGSQuery queryNgramsByNodeUser
136 ( cId'
137 , nodeTypeId NodeDocument
138 , ngramsTypeId nt'
139 -- , 100 :: Int -- limit
140 -- , 0 :: Int -- offset
141 )
142
143 queryNgramsByNodeUser :: DPS.Query
144 queryNgramsByNodeUser = [sql|
145 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
146 JOIN ngrams ng ON nng.ngrams_id = ng.id
147 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
148 JOIN nodes n ON nn.node2_id = n.id
149 WHERE nn.node1_id = ? -- CorpusId
150 AND n.typename = ? -- NodeTypeId
151 AND nng.ngrams_type = ? -- NgramsTypeId
152 AND nn.category > 0
153 GROUP BY nng.node2_id, ng.terms
154 ORDER BY (nng.node2_id, ng.terms) DESC
155 -- LIMIT ?
156 -- OFFSET ?
157 |]
158 ------------------------------------------------------------------------
159 -- TODO add groups
160 getOccByNgramsOnlyFast :: CorpusId
161 -> NgramsType
162 -> [Text]
163 -> Cmd err (Map Text Int)
164 getOccByNgramsOnlyFast cId nt ngs =
165 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
166
167
168 getOccByNgramsOnlyFast' :: CorpusId
169 -> ListId
170 -> NgramsType
171 -> [Text]
172 -> Cmd err (Map Text Int)
173 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
174 fromListWith (+) <$> map (second round) <$> run cId lId nt tms
175
176 where
177 fields = [QualifiedIdentifier Nothing "text"]
178
179 run :: CorpusId
180 -> ListId
181 -> NgramsType
182 -> [Text]
183 -> Cmd err [(Text, Double)]
184 run cId' lId' nt' tms' = runPGSQuery query
185 ( Values fields (DPS.Only <$> tms')
186 , cId'
187 , lId'
188 , ngramsTypeId nt'
189 )
190
191 query :: DPS.Query
192 query = [sql|
193 WITH input_rows(terms) AS (?)
194 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
195 JOIN ngrams ng ON nng.ngrams_id = ng.id
196 JOIN input_rows ir ON ir.terms = ng.terms
197 WHERE nng.node1_id = ? -- CorpusId
198 AND nng.node2_id = ? -- ListId
199 AND nng.ngrams_type = ? -- NgramsTypeId
200 -- AND nn.category > 0 -- TODO
201 GROUP BY ng.terms, nng.weight
202 |]
203
204
205
206
207 -- just slower than getOccByNgramsOnlyFast
208 getOccByNgramsOnlySlow :: NodeType
209 -> CorpusId
210 -> [ListId]
211 -> NgramsType
212 -> [Text]
213 -> Cmd err (Map Text Int)
214 getOccByNgramsOnlySlow t cId ls nt ngs =
215 Map.map Set.size <$> getScore' t cId ls nt ngs
216 where
217 getScore' NodeCorpus = getNodesByNgramsOnlyUser
218 getScore' NodeDocument = getNgramsByDocOnlyUser
219 getScore' _ = getNodesByNgramsOnlyUser
220
221 getOccByNgramsOnlySafe :: CorpusId
222 -> [ListId]
223 -> NgramsType
224 -> [Text]
225 -> Cmd err (Map Text Int)
226 getOccByNgramsOnlySafe cId ls nt ngs = do
227 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
228 fast <- getOccByNgramsOnlyFast cId nt ngs
229 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
230 when (fast /= slow) $
231 printDebug "getOccByNgramsOnlySafe: difference" (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
232 pure slow
233
234
235 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId -> NgramsType -> [Text]
236 -> Cmd err [(Text, Int)]
237 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
238 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
239 ( Values fields (DPS.Only <$> tms)
240 , cId
241 , nodeTypeId NodeDocument
242 , ngramsTypeId nt
243 )
244 where
245 fields = [QualifiedIdentifier Nothing "text"]
246
247 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
248 -- Question: with the grouping is the result exactly the same (since Set NodeId for
249 -- equivalent ngrams intersections are not empty)
250 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
251 queryNgramsOccurrencesOnlyByNodeUser = [sql|
252 WITH input_rows(terms) AS (?)
253 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
254 JOIN ngrams ng ON nng.ngrams_id = ng.id
255 JOIN input_rows ir ON ir.terms = ng.terms
256 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
257 JOIN nodes n ON nn.node2_id = n.id
258 WHERE nn.node1_id = ? -- CorpusId
259 AND n.typename = ? -- NodeTypeId
260 AND nng.ngrams_type = ? -- NgramsTypeId
261 AND nn.category > 0
262 GROUP BY nng.node2_id, ng.terms
263 |]
264
265
266
267 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
268 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
269 WITH input_rows(terms) AS (?)
270 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
271 JOIN ngrams ng ON nng.ngrams_id = ng.id
272 JOIN input_rows ir ON ir.terms = ng.terms
273 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
274 JOIN nodes n ON nn.node2_id = n.id
275 WHERE nn.node1_id = ? -- CorpusId
276 AND n.typename = ? -- NodeTypeId
277 AND nng.ngrams_type = ? -- NgramsTypeId
278 AND nn.category > 0
279 GROUP BY nng.node2_id, ng.terms
280 |]
281
282 ------------------------------------------------------------------------
283
284 getNodesByNgramsOnlyUser :: NodeId -> [ListId] -> NgramsType -> [Text]
285 -> Cmd err (Map Text (Set NodeId))
286 getNodesByNgramsOnlyUser cId ls nt ngs =
287 Map.unionsWith (<>)
288 . map (fromListWith (<>)
289 . map (second Set.singleton))
290 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
291 (splitEvery 1000 ngs)
292
293
294 getNgramsByNodeOnlyUser :: NodeId
295 -> [ListId]
296 -> NgramsType
297 -> [Text]
298 -> Cmd err (Map NodeId (Set Text))
299 getNgramsByNodeOnlyUser cId ls nt ngs =
300 Map.unionsWith (<>)
301 . map (fromListWith (<>)
302 . map (second Set.singleton))
303 . map (map swap)
304 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
305 (splitEvery 1000 ngs)
306
307 ------------------------------------------------------------------------
308 selectNgramsOnlyByNodeUser :: CorpusId -> [ListId] -> NgramsType -> [Text]
309 -> Cmd err [(Text, NodeId)]
310 selectNgramsOnlyByNodeUser cId ls nt tms =
311 runPGSQuery queryNgramsOnlyByNodeUser
312 ( Values fields (DPS.Only <$> tms)
313 , Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
314 , cId
315 , nodeTypeId NodeDocument
316 , ngramsTypeId nt
317 )
318 where
319 fields = [QualifiedIdentifier Nothing "text"]
320
321 queryNgramsOnlyByNodeUser :: DPS.Query
322 queryNgramsOnlyByNodeUser = [sql|
323 WITH input_rows(terms) AS (?),
324 input_list(id) AS (?)
325 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
326 JOIN ngrams ng ON nng.ngrams_id = ng.id
327 JOIN input_rows ir ON ir.terms = ng.terms
328 JOIN input_list il ON il.id = nng.node1_id
329 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
330 JOIN nodes n ON nn.node2_id = n.id
331 WHERE nn.node1_id = ? -- CorpusId
332 AND n.typename = ? -- NodeTypeId
333 AND nng.ngrams_type = ? -- NgramsTypeId
334 AND nn.category > 0
335 GROUP BY ng.terms, nng.node2_id
336 |]
337
338
339
340 selectNgramsOnlyByNodeUser' :: CorpusId -> [ListId] -> NgramsType -> [Text]
341 -> Cmd err [(Text, Int)]
342 selectNgramsOnlyByNodeUser' cId ls nt tms =
343 runPGSQuery queryNgramsOnlyByNodeUser
344 ( Values fields (DPS.Only <$> tms)
345 , Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
346 , cId
347 , nodeTypeId NodeDocument
348 , ngramsTypeId nt
349 )
350 where
351 fields = [QualifiedIdentifier Nothing "text"]
352
353 queryNgramsOnlyByNodeUser' :: DPS.Query
354 queryNgramsOnlyByNodeUser' = [sql|
355 WITH input_rows(terms) AS (?),
356 input_list(id) AS (?)
357 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
358 JOIN ngrams ng ON nng.ngrams_id = ng.id
359 JOIN input_rows ir ON ir.terms = ng.terms
360 JOIN input_list il ON il.id = nng.node2_id
361 WHERE nng.node1_id = ? -- CorpusId
362 AND nng.ngrams_type = ? -- NgramsTypeId
363 -- AND nn.category > 0
364 GROUP BY ng.terms, nng.weight
365 |]
366
367
368
369 getNgramsByDocOnlyUser :: NodeId -> [ListId] -> NgramsType -> [Text]
370 -> Cmd err (Map Text (Set NodeId))
371 getNgramsByDocOnlyUser cId ls nt ngs =
372 Map.unionsWith (<>)
373 . map (fromListWith (<>) . map (second Set.singleton))
374 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
375
376
377 selectNgramsOnlyByDocUser :: DocId -> [ListId] -> NgramsType -> [Text]
378 -> Cmd err [(Text, NodeId)]
379 selectNgramsOnlyByDocUser dId ls nt tms =
380 runPGSQuery queryNgramsOnlyByDocUser
381 ( Values fields (DPS.Only <$> tms)
382 , Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
383 , dId
384 , ngramsTypeId nt
385 )
386 where
387 fields = [QualifiedIdentifier Nothing "text"]
388
389 queryNgramsOnlyByDocUser :: DPS.Query
390 queryNgramsOnlyByDocUser = [sql|
391 WITH input_rows(terms) AS (?),
392 input_list(id) AS (?)
393 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
394 JOIN ngrams ng ON nng.ngrams_id = ng.id
395 JOIN input_rows ir ON ir.terms = ng.terms
396 JOIN input_list il ON il.id = nng.node1_id
397 WHERE nng.node2_id = ? -- DocId
398 AND nng.ngrams_type = ? -- NgramsTypeId
399 GROUP BY ng.terms, nng.node2_id
400 |]
401
402 ------------------------------------------------------------------------
403 -- | TODO filter by language, database, any social field
404 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
405 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
406 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
407 -- . takeWhile (not . List.null)
408 -- . takeWhile (\l -> List.length l > 3)
409 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
410
411
412 selectNgramsByNodeMaster :: Int -> UserCorpusId -> MasterCorpusId -> Int -> Cmd err [(NodeId, Text)]
413 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
414 queryNgramsByNodeMaster'
415 ( ucId
416 , ngramsTypeId NgramsTerms
417 , nodeTypeId NodeDocument
418 , p
419 , nodeTypeId NodeDocument
420 , p
421 , n
422 , mcId
423 , nodeTypeId NodeDocument
424 , ngramsTypeId NgramsTerms
425 )
426
427 -- | TODO fix node_node_ngrams relation
428 queryNgramsByNodeMaster' :: DPS.Query
429 queryNgramsByNodeMaster' = [sql|
430 WITH nodesByNgramsUser AS (
431
432 SELECT n.id, ng.terms FROM nodes n
433 JOIN nodes_nodes nn ON n.id = nn.node2_id
434 JOIN node_node_ngrams nng ON nng.node2_id = n.id
435 JOIN ngrams ng ON nng.ngrams_id = ng.id
436 WHERE nn.node1_id = ? -- UserCorpusId
437 -- AND n.typename = ? -- NodeTypeId
438 AND nng.ngrams_type = ? -- NgramsTypeId
439 AND nn.category > 0
440 AND node_pos(n.id,?) >= ?
441 AND node_pos(n.id,?) < ?
442 GROUP BY n.id, ng.terms
443
444 ),
445
446 nodesByNgramsMaster AS (
447
448 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
449 JOIN node_node_ngrams nng ON n.id = nng.node2_id
450 JOIN ngrams ng ON ng.id = nng.ngrams_id
451
452 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
453 AND n.typename = ? -- NodeTypeId
454 AND nng.ngrams_type = ? -- NgramsTypeId
455 GROUP BY n.id, ng.terms
456 )
457
458 SELECT m.id, m.terms FROM nodesByNgramsMaster m
459 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id
460 |]
461
462