]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Cooc.hs
[Pipeline] clustering with C++ Louvain bindings, ok.
[gargantext.git] / src / Gargantext / Database / Cooc.hs
1 {-|
2 Module : Gargantext.Database.TextSearch
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Here is a longer description of this module, containing some
11 commentary with @some markup@.
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 --{-# LANGUAGE OverloadedStrings #-}
16
17 {-# LANGUAGE QuasiQuotes #-}
18
19 module Gargantext.Database.Cooc where
20
21 import Control.Monad ((>>=))
22 import Database.PostgreSQL.Simple
23 import Database.PostgreSQL.Simple.SqlQQ
24
25 import Gargantext.Prelude
26 import Gargantext (connectGargandb)
27
28 type CorpusId = Int
29 type MainListId = Int
30 type GroupListId = Int
31
32 coocTest :: IO [(Int, Int, Int)]
33 coocTest = connectGargandb "gargantext.ini"
34 >>= \conn -> dBcooc conn 421968 446602 446599
35
36 dBcooc :: Connection -> CorpusId -> MainListId -> GroupListId -> IO [(Int, Int, Int)]
37 dBcooc conn corpus mainList groupList = query conn [sql|
38 set work_mem='1GB';
39
40 --EXPLAIN ANALYZE
41 WITH COOC as (
42 SELECT
43 COALESCE(grA.ngram1_id, wlA.ngram_id) as ngA,
44 COALESCE(grB.ngram1_id, wlB.ngram_id) as ngB,
45 COUNT(*) AS score
46 FROM
47 nodes AS n
48 -- / -- X Y
49 -- SQL graph for getting the cooccurrences
50
51 -- STEP 1: X axis of the matrix
52 INNER JOIN nodes_ngrams
53 AS ngA ON ngA.node_id = n.id
54 -- \--> get the occurrences node/ngram of the corpus
55
56 INNER JOIN nodes_ngrams
57 AS wlA ON ngA.ngram_id = wlA.ngram_id
58 AND wlA.node_id = ?
59 -- \--> filter with white/main list (typename 7)
60
61 LEFT JOIN nodes_ngrams_ngrams
62 AS grA ON wlA.ngram_id = grA.ngram1_id
63 AND grA.node_id = ?
64 -- \--> adding (joining) ngrams that are grouped (typename 6)
65 LEFT JOIN nodes_ngrams
66 AS wlAA ON grA.ngram2_id = wlAA.ngram_id
67 AND wlAA.node_id = wlA.node_id
68 -- \--> adding (joining) ngrams that are not grouped
69 --LEFT JOIN ngrams AS wlAA ON grA.ngram2_id = wlAA.id
70 -- \--> for joining all synonyms even if they are not in the main list (white list)
71
72
73 -- STEP 2: Y axi of the matrix
74 INNER JOIN nodes_ngrams
75 AS ngB ON ngB.node_id = n.id
76 -- \--> get the occurrences node/ngram of the corpus
77
78 INNER JOIN nodes_ngrams
79 AS wlB ON ngB.ngram_id = wlB.ngram_id
80 AND wlB.node_id = ?
81 -- \--> filter with white/main list
82
83 LEFT JOIN nodes_ngrams_ngrams
84 AS grB ON wlB.ngram_id = grB.ngram1_id
85 AND grB.node_id = ?
86 -- \--> adding (joining) ngrams that are grouped
87 LEFT JOIN nodes_ngrams
88 AS wlBB ON grB.ngram2_id = wlBB.ngram_id
89 AND wlBB.node_id = wlB.node_id
90 -- \--> adding (joining) ngrams that are not grouped
91
92 -- LEFT JOIN ngrams AS wlBB ON grB.ngram2_id = wlBB.id
93 -- \--> for joining all synonyms even if they are not in the main list (white list)
94
95 WHERE
96 n.typename = 4
97 AND n.parent_id = ?
98 GROUP BY 1,2
99 -- ==
100 -- GROUP BY ngA, ngB
101 )
102
103 SELECT ngA, ngB, score
104 FROM COOC --> from the query above
105
106 WHERE score >= 3
107 AND
108 ngA <= ngB
109 |] (mainList, groupList, mainList, groupList, corpus)
110
111