]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Action/Metrics/NgramsByNode.hs
merge done
[gargantext.git] / src / Gargantext / Database / Action / Metrics / NgramsByNode.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15
16 module Gargantext.Database.Action.Metrics.NgramsByNode
17 where
18
19 import Data.Map.Strict (Map, fromListWith, elems, toList, fromList)
20 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
21 import Data.Set (Set)
22 import qualified Data.Ord as DO (Down(..))
23 import Data.Text (Text)
24 import Data.Tuple.Extra (second, swap)
25 import Database.PostgreSQL.Simple.SqlQQ (sql)
26 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
27 import Debug.Trace (trace)
28 import Gargantext.Core (Lang(..))
29 import Gargantext.Core.Types (Ordering(..))
30 import Gargantext.Database.Admin.Config (nodeTypeId)
31 import Gargantext.Database.Admin.Types.Node -- (ListId, CorpusId, NodeId)
32 import Gargantext.Database.Prelude (Cmd, runPGSQuery)
33 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
34 import Gargantext.Prelude
35 import Gargantext.Text.Metrics.TFICF
36 import Gargantext.Text.Terms.Mono.Stem (stem)
37 import qualified Data.List as List
38 import qualified Data.Map.Strict as Map
39 import qualified Data.Set as Set
40 import qualified Data.Text as Text
41 import qualified Database.PostgreSQL.Simple as DPS
42
43 -- | TODO: group with 2 terms only can be
44 -- discussed. Main purpose of this is offering
45 -- a first grouping option to user and get some
46 -- enriched data to better learn and improve that algo
47 ngramsGroup :: Lang
48 -> Int
49 -> Int
50 -> Text
51 -> Text
52 ngramsGroup l _m _n = Text.intercalate " "
53 . map (stem l)
54 -- . take n
55 . List.sort
56 -- . (List.filter (\t -> Text.length t > m))
57 . Text.splitOn " "
58 . Text.replace "-" " "
59
60
61 sortTficf :: Ordering
62 -> (Map Text (Double, Set Text))
63 -> [ (Text,(Double, Set Text))]
64 sortTficf Down = List.sortOn (DO.Down . fst . snd) . toList
65 sortTficf Up = List.sortOn (fst . snd) . toList
66
67
68 getTficf :: UserCorpusId
69 -> MasterCorpusId
70 -> NgramsType
71 -> (Text -> Text)
72 -> Cmd err (Map Text (Double, Set Text))
73 getTficf u m nt f = do
74 u' <- Map.filter (\s -> Set.size s > 1) <$> getNodesByNgramsUser u nt
75 m' <- Map.filter (\s -> Set.size s > 1) <$> getNodesByNgramsMaster u m
76
77 pure $ toTficfData (countNodesByNgramsWith f u')
78 (countNodesByNgramsWith f m')
79
80 {-
81 getTficfWith :: UserCorpusId
82 -> MasterCorpusId
83 -> [ListId]
84 -> NgramsType
85 -> Map Text (Maybe Text)
86 -> Cmd err (Map Text (Double, Set Text))
87 getTficfWith u m ls nt mtxt = do
88 u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
89 m' <- getNodesByNgramsMaster u m
90
91 let f x = case Map.lookup x mtxt of
92 Nothing -> x
93 Just x' -> maybe x identity x'
94
95 pure $ toTficfData (countNodesByNgramsWith f u') (countNodesByNgramsWith f m')
96 -}
97
98 type Context = (Double, Map Text (Double, Set Text))
99 type Supra = Context
100 type Infra = Context
101
102 toTficfData :: Infra
103 -> Supra
104 -> Map Text (Double, Set Text)
105 toTficfData (ti, mi) (ts, ms) =
106 fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
107 (TficfSupra (Count $ maybe 0 fst $ Map.lookup t ms)(Total ts))
108 , ns
109 )
110 )
111 | (t, (n,ns)) <- toList mi
112 ]
113
114
115 -- | fst is size of Supra Corpus
116 -- snd is Texts and size of Occurrences (different docs)
117 countNodesByNgramsWith :: (Text -> Text)
118 -> Map Text (Set NodeId)
119 -> (Double, Map Text (Double, Set Text))
120 countNodesByNgramsWith f m = (total, m')
121 where
122 total = fromIntegral $ Set.size $ Set.unions $ elems m
123 m' = Map.map ( swap . second (fromIntegral . Set.size))
124 $ groupNodesByNgramsWith f m
125
126
127 groupNodesByNgramsWith :: (Text -> Text)
128 -> Map Text (Set NodeId)
129 -> Map Text (Set Text, Set NodeId)
130 groupNodesByNgramsWith f m =
131 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
132 $ toList m
133
134 ------------------------------------------------------------------------
135 getNodesByNgramsUser :: CorpusId
136 -> NgramsType
137 -> Cmd err (Map Text (Set NodeId))
138 getNodesByNgramsUser cId nt =
139 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
140 <$> selectNgramsByNodeUser cId nt
141 where
142
143 selectNgramsByNodeUser :: CorpusId
144 -> NgramsType
145 -> Cmd err [(NodeId, Text)]
146 selectNgramsByNodeUser cId' nt' =
147 runPGSQuery queryNgramsByNodeUser
148 ( cId'
149 , nodeTypeId NodeDocument
150 , ngramsTypeId nt'
151 -- , 100 :: Int -- limit
152 -- , 0 :: Int -- offset
153 )
154
155 queryNgramsByNodeUser :: DPS.Query
156 queryNgramsByNodeUser = [sql|
157 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
158 JOIN ngrams ng ON nng.ngrams_id = ng.id
159 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
160 JOIN nodes n ON nn.node2_id = n.id
161 WHERE nn.node1_id = ? -- CorpusId
162 AND n.typename = ? -- NodeTypeId
163 AND nng.ngrams_type = ? -- NgramsTypeId
164 AND nn.category > 0
165 GROUP BY nng.node2_id, ng.terms
166 ORDER BY (nng.node2_id, ng.terms) DESC
167 -- LIMIT ?
168 -- OFFSET ?
169 |]
170 ------------------------------------------------------------------------
171 -- TODO add groups
172 getOccByNgramsOnlyFast :: CorpusId
173 -> NgramsType
174 -> [Text]
175 -> Cmd err (Map Text Int)
176 getOccByNgramsOnlyFast cId nt ngs =
177 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
178
179
180 getOccByNgramsOnlyFast' :: CorpusId
181 -> ListId
182 -> NgramsType
183 -> [Text]
184 -> Cmd err (Map Text Int)
185 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
186 fromListWith (+) <$> map (second round) <$> run cId lId nt tms
187
188 where
189 fields = [QualifiedIdentifier Nothing "text"]
190
191 run :: CorpusId
192 -> ListId
193 -> NgramsType
194 -> [Text]
195 -> Cmd err [(Text, Double)]
196 run cId' lId' nt' tms' = runPGSQuery query
197 ( Values fields (DPS.Only <$> tms')
198 , cId'
199 , lId'
200 , ngramsTypeId nt'
201 )
202
203 query :: DPS.Query
204 query = [sql|
205 WITH input_rows(terms) AS (?)
206 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
207 JOIN ngrams ng ON nng.ngrams_id = ng.id
208 JOIN input_rows ir ON ir.terms = ng.terms
209 WHERE nng.node1_id = ? -- CorpusId
210 AND nng.node2_id = ? -- ListId
211 AND nng.ngrams_type = ? -- NgramsTypeId
212 -- AND nn.category > 0 -- TODO
213 GROUP BY ng.terms, nng.weight
214 |]
215
216
217 -- just slower than getOccByNgramsOnlyFast
218 getOccByNgramsOnlySlow :: NodeType
219 -> CorpusId
220 -> [ListId]
221 -> NgramsType
222 -> [Text]
223 -> Cmd err (Map Text Int)
224 getOccByNgramsOnlySlow t cId ls nt ngs =
225 Map.map Set.size <$> getScore' t cId ls nt ngs
226 where
227 getScore' NodeCorpus = getNodesByNgramsOnlyUser
228 getScore' NodeDocument = getNgramsByDocOnlyUser
229 getScore' _ = getNodesByNgramsOnlyUser
230
231 getOccByNgramsOnlySafe :: CorpusId
232 -> [ListId]
233 -> NgramsType
234 -> [Text]
235 -> Cmd err (Map Text Int)
236 getOccByNgramsOnlySafe cId ls nt ngs = do
237 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
238 fast <- getOccByNgramsOnlyFast cId nt ngs
239 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
240 when (fast /= slow) $
241 printDebug "getOccByNgramsOnlySafe: difference"
242 (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
243 pure slow
244
245
246 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId
247 -> NgramsType
248 -> [Text]
249 -> Cmd err [(Text, Int)]
250 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
251 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
252 ( Values fields (DPS.Only <$> tms)
253 , cId
254 , nodeTypeId NodeDocument
255 , ngramsTypeId nt
256 )
257 where
258 fields = [QualifiedIdentifier Nothing "text"]
259
260 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
261 -- Question: with the grouping is the result exactly the same (since Set NodeId for
262 -- equivalent ngrams intersections are not empty)
263 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
264 queryNgramsOccurrencesOnlyByNodeUser = [sql|
265 WITH input_rows(terms) AS (?)
266 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
267 JOIN ngrams ng ON nng.ngrams_id = ng.id
268 JOIN input_rows ir ON ir.terms = ng.terms
269 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
270 JOIN nodes n ON nn.node2_id = n.id
271 WHERE nn.node1_id = ? -- CorpusId
272 AND n.typename = ? -- NodeTypeId
273 AND nng.ngrams_type = ? -- NgramsTypeId
274 AND nn.category > 0
275 GROUP BY nng.node2_id, ng.terms
276 |]
277
278 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
279 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
280 WITH input_rows(terms) AS (?)
281 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
282 JOIN ngrams ng ON nng.ngrams_id = ng.id
283 JOIN input_rows ir ON ir.terms = ng.terms
284 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
285 JOIN nodes n ON nn.node2_id = n.id
286 WHERE nn.node1_id = ? -- CorpusId
287 AND n.typename = ? -- NodeTypeId
288 AND nng.ngrams_type = ? -- NgramsTypeId
289 AND nn.category > 0
290 GROUP BY nng.node2_id, ng.terms
291 |]
292
293 ------------------------------------------------------------------------
294 getNodesByNgramsOnlyUser :: NodeId
295 -> [ListId]
296 -> NgramsType
297 -> [Text]
298 -> Cmd err (Map Text (Set NodeId))
299 getNodesByNgramsOnlyUser cId ls nt ngs =
300 Map.unionsWith (<>)
301 . map (fromListWith (<>)
302 . map (second Set.singleton))
303 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
304 (splitEvery 1000 ngs)
305
306
307 getNgramsByNodeOnlyUser :: NodeId
308 -> [ListId]
309 -> NgramsType
310 -> [Text]
311 -> Cmd err (Map NodeId (Set Text))
312 getNgramsByNodeOnlyUser cId ls nt ngs =
313 Map.unionsWith (<>)
314 . map (fromListWith (<>)
315 . map (second Set.singleton))
316 . map (map swap)
317 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
318 (splitEvery 1000 ngs)
319
320 ------------------------------------------------------------------------
321 selectNgramsOnlyByNodeUser :: CorpusId
322 -> [ListId]
323 -> NgramsType
324 -> [Text]
325 -> Cmd err [(Text, NodeId)]
326 selectNgramsOnlyByNodeUser cId ls nt tms =
327 runPGSQuery queryNgramsOnlyByNodeUser
328 ( Values fields (DPS.Only <$> tms)
329 , Values [QualifiedIdentifier Nothing "int4"]
330 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
331 , cId
332 , nodeTypeId NodeDocument
333 , ngramsTypeId nt
334 )
335 where
336 fields = [QualifiedIdentifier Nothing "text"]
337
338 queryNgramsOnlyByNodeUser :: DPS.Query
339 queryNgramsOnlyByNodeUser = [sql|
340 WITH input_rows(terms) AS (?),
341 input_list(id) AS (?)
342 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
343 JOIN ngrams ng ON nng.ngrams_id = ng.id
344 JOIN input_rows ir ON ir.terms = ng.terms
345 JOIN input_list il ON il.id = nng.node1_id
346 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
347 JOIN nodes n ON nn.node2_id = n.id
348 WHERE nn.node1_id = ? -- CorpusId
349 AND n.typename = ? -- NodeTypeId
350 AND nng.ngrams_type = ? -- NgramsTypeId
351 AND nn.category > 0
352 GROUP BY ng.terms, nng.node2_id
353 |]
354
355
356 selectNgramsOnlyByNodeUser' :: CorpusId
357 -> [ListId]
358 -> NgramsType
359 -> [Text]
360 -> Cmd err [(Text, Int)]
361 selectNgramsOnlyByNodeUser' cId ls nt tms =
362 runPGSQuery queryNgramsOnlyByNodeUser
363 ( Values fields (DPS.Only <$> tms)
364 , Values [QualifiedIdentifier Nothing "int4"]
365 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
366 , cId
367 , nodeTypeId NodeDocument
368 , ngramsTypeId nt
369 )
370 where
371 fields = [QualifiedIdentifier Nothing "text"]
372
373 queryNgramsOnlyByNodeUser' :: DPS.Query
374 queryNgramsOnlyByNodeUser' = [sql|
375 WITH input_rows(terms) AS (?),
376 input_list(id) AS (?)
377 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
378 JOIN ngrams ng ON nng.ngrams_id = ng.id
379 JOIN input_rows ir ON ir.terms = ng.terms
380 JOIN input_list il ON il.id = nng.node2_id
381 WHERE nng.node1_id = ? -- CorpusId
382 AND nng.ngrams_type = ? -- NgramsTypeId
383 -- AND nn.category > 0
384 GROUP BY ng.terms, nng.weight
385 |]
386
387
388 getNgramsByDocOnlyUser :: NodeId
389 -> [ListId]
390 -> NgramsType
391 -> [Text]
392 -> Cmd err (Map Text (Set NodeId))
393 getNgramsByDocOnlyUser cId ls nt ngs =
394 Map.unionsWith (<>)
395 . map (fromListWith (<>) . map (second Set.singleton))
396 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
397
398
399 selectNgramsOnlyByDocUser :: DocId
400 -> [ListId]
401 -> NgramsType
402 -> [Text]
403 -> Cmd err [(Text, NodeId)]
404 selectNgramsOnlyByDocUser dId ls nt tms =
405 runPGSQuery queryNgramsOnlyByDocUser
406 ( Values fields (DPS.Only <$> tms)
407 , Values [QualifiedIdentifier Nothing "int4"]
408 (DPS.Only <$> (map (\(NodeId n) -> n) ls))
409 , dId
410 , ngramsTypeId nt
411 )
412 where
413 fields = [QualifiedIdentifier Nothing "text"]
414
415
416 queryNgramsOnlyByDocUser :: DPS.Query
417 queryNgramsOnlyByDocUser = [sql|
418 WITH input_rows(terms) AS (?),
419 input_list(id) AS (?)
420 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
421 JOIN ngrams ng ON nng.ngrams_id = ng.id
422 JOIN input_rows ir ON ir.terms = ng.terms
423 JOIN input_list il ON il.id = nng.node1_id
424 WHERE nng.node2_id = ? -- DocId
425 AND nng.ngrams_type = ? -- NgramsTypeId
426 GROUP BY ng.terms, nng.node2_id
427 |]
428
429 ------------------------------------------------------------------------
430 -- | TODO filter by language, database, any social field
431 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
432 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
433 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
434 -- . takeWhile (not . List.null)
435 -- . takeWhile (\l -> List.length l > 3)
436 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
437
438 selectNgramsByNodeMaster :: Int
439 -> UserCorpusId
440 -> MasterCorpusId
441 -> Int
442 -> Cmd err [(NodeId, Text)]
443 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
444 queryNgramsByNodeMaster'
445 ( ucId
446 , ngramsTypeId NgramsTerms
447 , nodeTypeId NodeDocument
448 , p
449 , nodeTypeId NodeDocument
450 , p
451 , n
452 , mcId
453 , nodeTypeId NodeDocument
454 , ngramsTypeId NgramsTerms
455 )
456
457 -- | TODO fix node_node_ngrams relation
458 queryNgramsByNodeMaster' :: DPS.Query
459 queryNgramsByNodeMaster' = [sql|
460 WITH nodesByNgramsUser AS (
461
462 SELECT n.id, ng.terms FROM nodes n
463 JOIN nodes_nodes nn ON n.id = nn.node2_id
464 JOIN node_node_ngrams nng ON nng.node2_id = n.id
465 JOIN ngrams ng ON nng.ngrams_id = ng.id
466 WHERE nn.node1_id = ? -- UserCorpusId
467 -- AND n.typename = ? -- NodeTypeId
468 AND nng.ngrams_type = ? -- NgramsTypeId
469 AND nn.category > 0
470 AND node_pos(n.id,?) >= ?
471 AND node_pos(n.id,?) < ?
472 GROUP BY n.id, ng.terms
473
474 ),
475
476 nodesByNgramsMaster AS (
477
478 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
479 JOIN node_node_ngrams nng ON n.id = nng.node2_id
480 JOIN ngrams ng ON ng.id = nng.ngrams_id
481
482 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
483 AND n.typename = ? -- NodeTypeId
484 AND nng.ngrams_type = ? -- NgramsTypeId
485 GROUP BY n.id, ng.terms
486 )
487
488 SELECT m.id, m.terms FROM nodesByNgramsMaster m
489 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id
490 |]