]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Metrics/NgramsByNode.hs
Merge branch 'dev-phylo' into dev
[gargantext.git] / src / Gargantext / Database / Metrics / NgramsByNode.hs
1 {-|
2 Module : Gargantext.Database.Metrics.NgramsByNode
3 Description : Ngrams by Node user and master
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams by node enable contextual metrics.
11
12 -}
13
14 {-# LANGUAGE QuasiQuotes #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE OverloadedStrings #-}
17 {-# LANGUAGE RankNTypes #-}
18
19 module Gargantext.Database.Metrics.NgramsByNode
20 where
21
22 import Debug.Trace (trace)
23 import Data.Map.Strict (Map, fromListWith, elems, toList, fromList)
24 import Data.Map.Strict.Patch (PatchMap, Replace, diff)
25 import Data.Set (Set)
26 import Data.Text (Text)
27 import Data.Tuple.Extra (second, swap)
28 import Database.PostgreSQL.Simple.SqlQQ (sql)
29 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
30 import Gargantext.Core (Lang(..))
31 import Gargantext.Database.Config (nodeTypeId)
32 import Gargantext.Database.Schema.Ngrams (ngramsTypeId, NgramsType(..))
33 import Gargantext.Database.Types.Node -- (ListId, CorpusId, NodeId)
34 import Gargantext.Database.Utils (Cmd, runPGSQuery)
35 import Gargantext.Prelude
36 import Gargantext.Text.Metrics.TFICF
37 import Gargantext.Text.Terms.Mono.Stem (stem)
38 import qualified Data.List as List
39 import qualified Data.Map.Strict as Map
40 import qualified Data.Set as Set
41 import qualified Data.Text as Text
42 import qualified Database.PostgreSQL.Simple as DPS
43
44 -- | TODO: group with 2 terms only can be
45 -- discussed. Main purpose of this is offering
46 -- a first grouping option to user and get some
47 -- enriched data to better learn and improve that algo
48 ngramsGroup :: Lang -> Int -> Int -> Text -> Text
49 ngramsGroup l _m _n = Text.intercalate " "
50 . map (stem l)
51 -- . take n
52 . List.sort
53 -- . (List.filter (\t -> Text.length t > m))
54 . Text.splitOn " "
55 . Text.replace "-" " "
56
57
58 sortTficf :: (Map Text (Double, Set Text))
59 -> [ (Text,(Double, Set Text))]
60 sortTficf = List.sortOn (fst . snd) . toList
61
62
63 getTficf' :: UserCorpusId -> MasterCorpusId -> NgramsType -> (Text -> Text)
64 -> Cmd err (Map Text (Double, Set Text))
65 getTficf' u m nt f = do
66 u' <- getNodesByNgramsUser u nt
67 m' <- getNodesByNgramsMaster u m
68
69 pure $ toTficfData (countNodesByNgramsWith f u')
70 (countNodesByNgramsWith f m')
71
72 getTficfWith :: UserCorpusId -> MasterCorpusId -> [ListId]
73 -> NgramsType -> Map Text (Maybe Text)
74 -> Cmd err (Map Text (Double, Set Text))
75 getTficfWith u m ls nt mtxt = do
76 u' <- getNodesByNgramsOnlyUser u ls nt (Map.keys mtxt)
77 m' <- getNodesByNgramsMaster u m
78
79 let f x = case Map.lookup x mtxt of
80 Nothing -> x
81 Just x' -> maybe x identity x'
82
83 pure $ toTficfData (countNodesByNgramsWith f u')
84 (countNodesByNgramsWith f m')
85
86
87 type Context = (Double, Map Text (Double, Set Text))
88 type Supra = Context
89 type Infra = Context
90
91 toTficfData :: Infra -> Supra
92 -> Map Text (Double, Set Text)
93 toTficfData (ti, mi) (ts, ms) =
94 fromList [ (t, ( tficf (TficfInfra (Count n )(Total ti))
95 (TficfSupra (Count $ maybe 0 fst $ Map.lookup t ms)(Total ts))
96 , ns
97 )
98 )
99 | (t, (n,ns)) <- toList mi
100 ]
101
102
103 -- | fst is size of Supra Corpus
104 -- snd is Texts and size of Occurrences (different docs)
105 countNodesByNgramsWith :: (Text -> Text)
106 -> Map Text (Set NodeId)
107 -> (Double, Map Text (Double, Set Text))
108 countNodesByNgramsWith f m = (total, m')
109 where
110 total = fromIntegral $ Set.size $ Set.unions $ elems m
111 m' = Map.map ( swap . second (fromIntegral . Set.size))
112 $ groupNodesByNgramsWith f m
113
114
115 groupNodesByNgramsWith :: (Text -> Text)
116 -> Map Text (Set NodeId)
117 -> Map Text (Set Text, Set NodeId)
118 groupNodesByNgramsWith f m =
119 fromListWith (<>) $ map (\(t,ns) -> (f t, (Set.singleton t, ns)))
120 $ toList m
121
122 ------------------------------------------------------------------------
123 getNodesByNgramsUser :: CorpusId
124 -> NgramsType
125 -> Cmd err (Map Text (Set NodeId))
126 getNodesByNgramsUser cId nt =
127 fromListWith (<>) <$> map (\(n,t) -> (t, Set.singleton n))
128 <$> selectNgramsByNodeUser cId nt
129 where
130
131 selectNgramsByNodeUser :: CorpusId -> NgramsType
132 -> Cmd err [(NodeId, Text)]
133 selectNgramsByNodeUser cId' nt' =
134 runPGSQuery queryNgramsByNodeUser
135 ( cId'
136 , nodeTypeId NodeDocument
137 , ngramsTypeId nt'
138 -- , 100 :: Int -- limit
139 -- , 0 :: Int -- offset
140 )
141
142 queryNgramsByNodeUser :: DPS.Query
143 queryNgramsByNodeUser = [sql|
144 SELECT nng.node2_id, ng.terms FROM node_node_ngrams nng
145 JOIN ngrams ng ON nng.ngrams_id = ng.id
146 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
147 JOIN nodes n ON nn.node2_id = n.id
148 WHERE nn.node1_id = ? -- CorpusId
149 AND n.typename = ? -- NodeTypeId
150 AND nng.ngrams_type = ? -- NgramsTypeId
151 AND nn.category > 0
152 GROUP BY nng.node2_id, ng.terms
153 ORDER BY (nng.node2_id, ng.terms) DESC
154 -- LIMIT ?
155 -- OFFSET ?
156 |]
157 ------------------------------------------------------------------------
158 -- TODO add groups
159 getOccByNgramsOnlyFast :: CorpusId
160 -> NgramsType
161 -> [Text]
162 -> Cmd err (Map Text Int)
163 getOccByNgramsOnlyFast cId nt ngs =
164 fromListWith (+) <$> selectNgramsOccurrencesOnlyByNodeUser cId nt ngs
165
166
167 getOccByNgramsOnlyFast' :: CorpusId
168 -> ListId
169 -> NgramsType
170 -> [Text]
171 -> Cmd err (Map Text Int)
172 getOccByNgramsOnlyFast' cId lId nt tms = trace (show (cId, lId)) $
173 fromListWith (+) <$> map (second round) <$> run cId lId nt tms
174
175 where
176 fields = [QualifiedIdentifier Nothing "text"]
177
178 run :: CorpusId
179 -> ListId
180 -> NgramsType
181 -> [Text]
182 -> Cmd err [(Text, Double)]
183 run cId' lId' nt' tms' = runPGSQuery query
184 ( Values fields (DPS.Only <$> tms')
185 , cId'
186 , lId'
187 , ngramsTypeId nt'
188 )
189
190 query :: DPS.Query
191 query = [sql|
192 WITH input_rows(terms) AS (?)
193 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
194 JOIN ngrams ng ON nng.ngrams_id = ng.id
195 JOIN input_rows ir ON ir.terms = ng.terms
196 WHERE nng.node1_id = ? -- CorpusId
197 AND nng.node2_id = ? -- ListId
198 AND nng.ngrams_type = ? -- NgramsTypeId
199 -- AND nn.category > 0 -- TODO
200 GROUP BY ng.terms, nng.weight
201 |]
202
203
204
205
206 -- just slower than getOccByNgramsOnlyFast
207 getOccByNgramsOnlySlow :: NodeType
208 -> CorpusId
209 -> [ListId]
210 -> NgramsType
211 -> [Text]
212 -> Cmd err (Map Text Int)
213 getOccByNgramsOnlySlow t cId ls nt ngs =
214 Map.map Set.size <$> getScore' t cId ls nt ngs
215 where
216 getScore' NodeCorpus = getNodesByNgramsOnlyUser
217 getScore' NodeDocument = getNgramsByDocOnlyUser
218 getScore' _ = getNodesByNgramsOnlyUser
219
220 getOccByNgramsOnlySafe :: CorpusId
221 -> [ListId]
222 -> NgramsType
223 -> [Text]
224 -> Cmd err (Map Text Int)
225 getOccByNgramsOnlySafe cId ls nt ngs = do
226 printDebug "getOccByNgramsOnlySafe" (cId, nt, length ngs)
227 fast <- getOccByNgramsOnlyFast cId nt ngs
228 slow <- getOccByNgramsOnlySlow NodeCorpus cId ls nt ngs
229 when (fast /= slow) $
230 printDebug "getOccByNgramsOnlySafe: difference" (diff slow fast :: PatchMap Text (Replace (Maybe Int)))
231 pure slow
232
233
234 selectNgramsOccurrencesOnlyByNodeUser :: CorpusId -> NgramsType -> [Text]
235 -> Cmd err [(Text, Int)]
236 selectNgramsOccurrencesOnlyByNodeUser cId nt tms =
237 runPGSQuery queryNgramsOccurrencesOnlyByNodeUser
238 ( Values fields (DPS.Only <$> tms)
239 , cId
240 , nodeTypeId NodeDocument
241 , ngramsTypeId nt
242 )
243 where
244 fields = [QualifiedIdentifier Nothing "text"]
245
246 -- same as queryNgramsOnlyByNodeUser but using COUNT on the node ids.
247 -- Question: with the grouping is the result exactly the same (since Set NodeId for
248 -- equivalent ngrams intersections are not empty)
249 queryNgramsOccurrencesOnlyByNodeUser :: DPS.Query
250 queryNgramsOccurrencesOnlyByNodeUser = [sql|
251 WITH input_rows(terms) AS (?)
252 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
253 JOIN ngrams ng ON nng.ngrams_id = ng.id
254 JOIN input_rows ir ON ir.terms = ng.terms
255 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
256 JOIN nodes n ON nn.node2_id = n.id
257 WHERE nn.node1_id = ? -- CorpusId
258 AND n.typename = ? -- NodeTypeId
259 AND nng.ngrams_type = ? -- NgramsTypeId
260 AND nn.category > 0
261 GROUP BY nng.node2_id, ng.terms
262 |]
263
264
265
266 queryNgramsOccurrencesOnlyByNodeUser' :: DPS.Query
267 queryNgramsOccurrencesOnlyByNodeUser' = [sql|
268 WITH input_rows(terms) AS (?)
269 SELECT ng.terms, COUNT(nng.node2_id) FROM node_node_ngrams nng
270 JOIN ngrams ng ON nng.ngrams_id = ng.id
271 JOIN input_rows ir ON ir.terms = ng.terms
272 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
273 JOIN nodes n ON nn.node2_id = n.id
274 WHERE nn.node1_id = ? -- CorpusId
275 AND n.typename = ? -- NodeTypeId
276 AND nng.ngrams_type = ? -- NgramsTypeId
277 AND nn.category > 0
278 GROUP BY nng.node2_id, ng.terms
279 |]
280
281 ------------------------------------------------------------------------
282
283 getNodesByNgramsOnlyUser :: NodeId -> [ListId] -> NgramsType -> [Text]
284 -> Cmd err (Map Text (Set NodeId))
285 getNodesByNgramsOnlyUser cId ls nt ngs =
286 Map.unionsWith (<>)
287 . map (fromListWith (<>)
288 . map (second Set.singleton))
289 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
290 (splitEvery 1000 ngs)
291
292
293 getNgramsByNodeOnlyUser :: NodeId
294 -> [ListId]
295 -> NgramsType
296 -> [Text]
297 -> Cmd err (Map NodeId (Set Text))
298 getNgramsByNodeOnlyUser cId ls nt ngs =
299 Map.unionsWith (<>)
300 . map (fromListWith (<>)
301 . map (second Set.singleton))
302 . map (map swap)
303 <$> mapM (selectNgramsOnlyByNodeUser cId ls nt)
304 (splitEvery 1000 ngs)
305
306 ------------------------------------------------------------------------
307 selectNgramsOnlyByNodeUser :: CorpusId -> [ListId] -> NgramsType -> [Text]
308 -> Cmd err [(Text, NodeId)]
309 selectNgramsOnlyByNodeUser cId ls nt tms =
310 runPGSQuery queryNgramsOnlyByNodeUser
311 ( Values fields (DPS.Only <$> tms)
312 , Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
313 , cId
314 , nodeTypeId NodeDocument
315 , ngramsTypeId nt
316 )
317 where
318 fields = [QualifiedIdentifier Nothing "text"]
319
320 queryNgramsOnlyByNodeUser :: DPS.Query
321 queryNgramsOnlyByNodeUser = [sql|
322 WITH input_rows(terms) AS (?),
323 input_list(id) AS (?)
324 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
325 JOIN ngrams ng ON nng.ngrams_id = ng.id
326 JOIN input_rows ir ON ir.terms = ng.terms
327 JOIN input_list il ON il.id = nng.node1_id
328 JOIN nodes_nodes nn ON nn.node2_id = nng.node2_id
329 JOIN nodes n ON nn.node2_id = n.id
330 WHERE nn.node1_id = ? -- CorpusId
331 AND n.typename = ? -- NodeTypeId
332 AND nng.ngrams_type = ? -- NgramsTypeId
333 AND nn.category > 0
334 GROUP BY ng.terms, nng.node2_id
335 |]
336
337
338
339 selectNgramsOnlyByNodeUser' :: CorpusId -> [ListId] -> NgramsType -> [Text]
340 -> Cmd err [(Text, Int)]
341 selectNgramsOnlyByNodeUser' cId ls nt tms =
342 runPGSQuery queryNgramsOnlyByNodeUser
343 ( Values fields (DPS.Only <$> tms)
344 , Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
345 , cId
346 , nodeTypeId NodeDocument
347 , ngramsTypeId nt
348 )
349 where
350 fields = [QualifiedIdentifier Nothing "text"]
351
352 queryNgramsOnlyByNodeUser' :: DPS.Query
353 queryNgramsOnlyByNodeUser' = [sql|
354 WITH input_rows(terms) AS (?),
355 input_list(id) AS (?)
356 SELECT ng.terms, nng.weight FROM node_node_ngrams nng
357 JOIN ngrams ng ON nng.ngrams_id = ng.id
358 JOIN input_rows ir ON ir.terms = ng.terms
359 JOIN input_list il ON il.id = nng.node2_id
360 WHERE nng.node1_id = ? -- CorpusId
361 AND nng.ngrams_type = ? -- NgramsTypeId
362 -- AND nn.category > 0
363 GROUP BY ng.terms, nng.weight
364 |]
365
366
367
368 getNgramsByDocOnlyUser :: NodeId -> [ListId] -> NgramsType -> [Text]
369 -> Cmd err (Map Text (Set NodeId))
370 getNgramsByDocOnlyUser cId ls nt ngs =
371 Map.unionsWith (<>)
372 . map (fromListWith (<>) . map (second Set.singleton))
373 <$> mapM (selectNgramsOnlyByDocUser cId ls nt) (splitEvery 1000 ngs)
374
375
376 selectNgramsOnlyByDocUser :: DocId -> [ListId] -> NgramsType -> [Text]
377 -> Cmd err [(Text, NodeId)]
378 selectNgramsOnlyByDocUser dId ls nt tms =
379 runPGSQuery queryNgramsOnlyByDocUser
380 ( Values fields (DPS.Only <$> tms)
381 , Values [QualifiedIdentifier Nothing "int4"] (DPS.Only <$> (map (\(NodeId n) -> n) ls))
382 , dId
383 , ngramsTypeId nt
384 )
385 where
386 fields = [QualifiedIdentifier Nothing "text"]
387
388 queryNgramsOnlyByDocUser :: DPS.Query
389 queryNgramsOnlyByDocUser = [sql|
390 WITH input_rows(terms) AS (?),
391 input_list(id) AS (?)
392 SELECT ng.terms, nng.node2_id FROM node_node_ngrams nng
393 JOIN ngrams ng ON nng.ngrams_id = ng.id
394 JOIN input_rows ir ON ir.terms = ng.terms
395 JOIN input_list il ON il.id = nng.node1_id
396 WHERE nng.node2_id = ? -- DocId
397 AND nng.ngrams_type = ? -- NgramsTypeId
398 GROUP BY ng.terms, nng.node2_id
399 |]
400
401 ------------------------------------------------------------------------
402 -- | TODO filter by language, database, any social field
403 getNodesByNgramsMaster :: UserCorpusId -> MasterCorpusId -> Cmd err (Map Text (Set NodeId))
404 getNodesByNgramsMaster ucId mcId = Map.unionsWith (<>)
405 . map (fromListWith (<>) . map (\(n,t) -> (t, Set.singleton n)))
406 -- . takeWhile (not . List.null)
407 -- . takeWhile (\l -> List.length l > 3)
408 <$> mapM (selectNgramsByNodeMaster 1000 ucId mcId) [0,500..10000]
409
410
411 selectNgramsByNodeMaster :: Int -> UserCorpusId -> MasterCorpusId -> Int -> Cmd err [(NodeId, Text)]
412 selectNgramsByNodeMaster n ucId mcId p = runPGSQuery
413 queryNgramsByNodeMaster'
414 ( ucId
415 , ngramsTypeId NgramsTerms
416 , nodeTypeId NodeDocument
417 , p
418 , nodeTypeId NodeDocument
419 , p
420 , n
421 , mcId
422 , nodeTypeId NodeDocument
423 , ngramsTypeId NgramsTerms
424 )
425
426 -- | TODO fix node_node_ngrams relation
427 queryNgramsByNodeMaster' :: DPS.Query
428 queryNgramsByNodeMaster' = [sql|
429 WITH nodesByNgramsUser AS (
430
431 SELECT n.id, ng.terms FROM nodes n
432 JOIN nodes_nodes nn ON n.id = nn.node2_id
433 JOIN node_node_ngrams nng ON nng.node2_id = n.id
434 JOIN ngrams ng ON nng.ngrams_id = ng.id
435 WHERE nn.node1_id = ? -- UserCorpusId
436 -- AND n.typename = ? -- NodeTypeId
437 AND nng.ngrams_type = ? -- NgramsTypeId
438 AND nn.category > 0
439 AND node_pos(n.id,?) >= ?
440 AND node_pos(n.id,?) < ?
441 GROUP BY n.id, ng.terms
442
443 ),
444
445 nodesByNgramsMaster AS (
446
447 SELECT n.id, ng.terms FROM nodes n TABLESAMPLE SYSTEM_ROWS(?)
448 JOIN node_node_ngrams nng ON n.id = nng.node2_id
449 JOIN ngrams ng ON ng.id = nng.ngrams_id
450
451 WHERE n.parent_id = ? -- Master Corpus NodeTypeId
452 AND n.typename = ? -- NodeTypeId
453 AND nng.ngrams_type = ? -- NgramsTypeId
454 GROUP BY n.id, ng.terms
455 )
456
457 SELECT m.id, m.terms FROM nodesByNgramsMaster m
458 RIGHT JOIN nodesByNgramsUser u ON u.id = m.id
459 |]
460
461