]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Cooc.hs
Merge branch 'dev-ngrams-repo' of https://gitlab.iscpif.fr/gargantext/haskell-gargant...
[gargantext.git] / src / Gargantext / Database / Cooc.hs
1 {-|
2 Module : Gargantext.Database.Cooc
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE NoImplicitPrelude #-}
13 --{-# LANGUAGE OverloadedStrings #-}
14
15 {-# LANGUAGE QuasiQuotes #-}
16 {-# LANGUAGE RankNTypes #-}
17
18 module Gargantext.Database.Cooc where
19
20 import Database.PostgreSQL.Simple.SqlQQ
21
22 import Gargantext.Prelude
23 import Gargantext.Database.Utils (Cmd, runCmdDevNoErr, runPGSQuery)
24
25 type CorpusId = Int
26 type MainListId = Int
27 type GroupListId = Int
28
29 coocTest :: IO [(Int, Int, Int)]
30 coocTest = runCmdDevNoErr $ dBcooc 421968 446602 446599
31
32 dBcooc :: CorpusId -> MainListId -> GroupListId -> Cmd err [(Int, Int, Int)]
33 dBcooc corpus mainList groupList = runPGSQuery [sql|
34 set work_mem='1GB';
35
36 --EXPLAIN ANALYZE
37 WITH COOC as (
38 SELECT
39 COALESCE(grA.ngram1_id, wlA.ngram_id) as ngA,
40 COALESCE(grB.ngram1_id, wlB.ngram_id) as ngB,
41 COUNT(*) AS score
42 FROM
43 nodes AS n
44 -- / -- X Y
45 -- SQL graph for getting the cooccurrences
46
47 -- STEP 1: X axis of the matrix
48 INNER JOIN nodes_ngrams
49 AS ngA ON ngA.node_id = n.id
50 -- \--> get the occurrences node/ngram of the corpus
51
52 INNER JOIN nodes_ngrams
53 AS wlA ON ngA.ngram_id = wlA.ngram_id
54 AND wlA.node_id = ?
55 -- \--> filter with white/main list (typename 7)
56
57 LEFT JOIN nodes_ngrams_ngrams
58 AS grA ON wlA.ngram_id = grA.ngram1_id
59 AND grA.node_id = ?
60 -- \--> adding (joining) ngrams that are grouped (typename 6)
61 LEFT JOIN nodes_ngrams
62 AS wlAA ON grA.ngram2_id = wlAA.ngram_id
63 AND wlAA.node_id = wlA.node_id
64 -- \--> adding (joining) ngrams that are not grouped
65 --LEFT JOIN ngrams AS wlAA ON grA.ngram2_id = wlAA.id
66 -- \--> for joining all synonyms even if they are not in the main list (white list)
67
68
69 -- STEP 2: Y axi of the matrix
70 INNER JOIN nodes_ngrams
71 AS ngB ON ngB.node_id = n.id
72 -- \--> get the occurrences node/ngram of the corpus
73
74 INNER JOIN nodes_ngrams
75 AS wlB ON ngB.ngram_id = wlB.ngram_id
76 AND wlB.node_id = ?
77 -- \--> filter with white/main list
78
79 LEFT JOIN nodes_ngrams_ngrams
80 AS grB ON wlB.ngram_id = grB.ngram1_id
81 AND grB.node_id = ?
82 -- \--> adding (joining) ngrams that are grouped
83 LEFT JOIN nodes_ngrams
84 AS wlBB ON grB.ngram2_id = wlBB.ngram_id
85 AND wlBB.node_id = wlB.node_id
86 -- \--> adding (joining) ngrams that are not grouped
87
88 -- LEFT JOIN ngrams AS wlBB ON grB.ngram2_id = wlBB.id
89 -- \--> for joining all synonyms even if they are not in the main list (white list)
90
91 WHERE
92 n.typename = 4
93 AND n.parent_id = ?
94 GROUP BY 1,2
95 -- ==
96 -- GROUP BY ngA, ngB
97 )
98
99 SELECT ngA, ngB, score
100 FROM COOC --> from the query above
101
102 WHERE score >= 3
103 AND
104 ngA <= ngB
105 |] (mainList, groupList, mainList, groupList, corpus)
106
107