]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Cooc.hs
[BIN/PHYLO] first draft, needs phylo connection.
[gargantext.git] / src / Gargantext / Database / Cooc.hs
1 {-|
2 Module : Gargantext.Database.Cooc
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE NoImplicitPrelude #-}
13 --{-# LANGUAGE OverloadedStrings #-}
14
15 {-# LANGUAGE QuasiQuotes #-}
16 {-# LANGUAGE RankNTypes #-}
17
18 module Gargantext.Database.Cooc where
19
20 import Database.PostgreSQL.Simple.SqlQQ
21
22 import Gargantext.Prelude
23 import Gargantext.Database.Utils (Cmd, runPGSQuery)
24 import Gargantext.API.Settings (runCmdDevNoErr, DevEnv)
25
26 type CorpusId = Int
27 type MainListId = Int
28 type GroupListId = Int
29
30 coocTest :: DevEnv -> IO [(Int, Int, Int)]
31 coocTest env = runCmdDevNoErr env $ dBcooc 421968 446602 446599
32
33 dBcooc :: CorpusId -> MainListId -> GroupListId -> Cmd err [(Int, Int, Int)]
34 dBcooc corpus mainList groupList = runPGSQuery [sql|
35 set work_mem='1GB';
36
37 --EXPLAIN ANALYZE
38 WITH COOC as (
39 SELECT
40 COALESCE(grA.ngram1_id, wlA.ngram_id) as ngA,
41 COALESCE(grB.ngram1_id, wlB.ngram_id) as ngB,
42 COUNT(*) AS score
43 FROM
44 nodes AS n
45 -- / -- X Y
46 -- SQL graph for getting the cooccurrences
47
48 -- STEP 1: X axis of the matrix
49 INNER JOIN nodes_ngrams
50 AS ngA ON ngA.node_id = n.id
51 -- \--> get the occurrences node/ngram of the corpus
52
53 INNER JOIN nodes_ngrams
54 AS wlA ON ngA.ngram_id = wlA.ngram_id
55 AND wlA.node_id = ?
56 -- \--> filter with white/main list (typename 7)
57
58 LEFT JOIN nodes_ngrams_ngrams
59 AS grA ON wlA.ngram_id = grA.ngram1_id
60 AND grA.node_id = ?
61 -- \--> adding (joining) ngrams that are grouped (typename 6)
62 LEFT JOIN nodes_ngrams
63 AS wlAA ON grA.ngram2_id = wlAA.ngram_id
64 AND wlAA.node_id = wlA.node_id
65 -- \--> adding (joining) ngrams that are not grouped
66 --LEFT JOIN ngrams AS wlAA ON grA.ngram2_id = wlAA.id
67 -- \--> for joining all synonyms even if they are not in the main list (white list)
68
69
70 -- STEP 2: Y axi of the matrix
71 INNER JOIN nodes_ngrams
72 AS ngB ON ngB.node_id = n.id
73 -- \--> get the occurrences node/ngram of the corpus
74
75 INNER JOIN nodes_ngrams
76 AS wlB ON ngB.ngram_id = wlB.ngram_id
77 AND wlB.node_id = ?
78 -- \--> filter with white/main list
79
80 LEFT JOIN nodes_ngrams_ngrams
81 AS grB ON wlB.ngram_id = grB.ngram1_id
82 AND grB.node_id = ?
83 -- \--> adding (joining) ngrams that are grouped
84 LEFT JOIN nodes_ngrams
85 AS wlBB ON grB.ngram2_id = wlBB.ngram_id
86 AND wlBB.node_id = wlB.node_id
87 -- \--> adding (joining) ngrams that are not grouped
88
89 -- LEFT JOIN ngrams AS wlBB ON grB.ngram2_id = wlBB.id
90 -- \--> for joining all synonyms even if they are not in the main list (white list)
91
92 WHERE
93 n.typename = 4
94 AND n.parent_id = ?
95 GROUP BY 1,2
96 -- ==
97 -- GROUP BY ngA, ngB
98 )
99
100 SELECT ngA, ngB, score
101 FROM COOC --> from the query above
102
103 WHERE score >= 3
104 AND
105 ngA <= ngB
106 |] (mainList, groupList, mainList, groupList, corpus)
107
108