src/Gargantext/Database/Cooc.hs

   1 {-|
   2 Module      : Gargantext.Database.TextSearch
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Here is a longer description of this module, containing some
  11 commentary with @some markup@.
  12 -}
  13
  14 {-# LANGUAGE NoImplicitPrelude #-}
  15 --{-# LANGUAGE OverloadedStrings #-}
  16
  17 {-# LANGUAGE QuasiQuotes #-}
  18
  19 module Gargantext.Database.Cooc where
  20
  21 import Control.Monad ((>>=))
  22 import Database.PostgreSQL.Simple
  23 import Database.PostgreSQL.Simple.SqlQQ
  24
  25 import Gargantext.Prelude
  26 import Gargantext (connectGargandb)
  27
  28 type CorpusId    = Int
  29 type MainListId  = Int
  30 type GroupListId = Int
  31
  32 coocTest :: IO [(Int, Int, Int)]
  33 coocTest = connectGargandb "gargantext.ini"
  34   >>= \conn -> dBcooc conn 421968 446602 446599
  35
  36 dBcooc :: Connection -> CorpusId -> MainListId -> GroupListId -> IO [(Int, Int, Int)]
  37 dBcooc conn corpus mainList groupList = query conn [sql|
  38   set work_mem='1GB';
  39
  40   --EXPLAIN ANALYZE
  41       WITH COOC as (
  42       SELECT
  43       COALESCE(grA.ngram1_id, wlA.ngram_id) as ngA,
  44       COALESCE(grB.ngram1_id, wlB.ngram_id) as ngB,
  45       COUNT(*) AS score
  46       FROM
  47       nodes AS n
  48       --      /     --     X   Y
  49       -- SQL graph for getting the cooccurrences
  50
  51           -- STEP 1: X axis of the matrix
  52           INNER JOIN nodes_ngrams
  53                   AS ngA  ON ngA.node_id  = n.id
  54           -- \--> get the occurrences node/ngram of the corpus
  55
  56               INNER JOIN nodes_ngrams
  57                       AS wlA  ON ngA.ngram_id = wlA.ngram_id
  58                              AND wlA.node_id  = ?
  59               -- \--> filter with white/main list (typename 7)
  60
  61           LEFT JOIN  nodes_ngrams_ngrams
  62                  AS grA  ON wlA.ngram_id = grA.ngram1_id
  63                         AND grA.node_id  = ?
  64           -- \--> adding (joining) ngrams that are grouped (typename 6)
  65           LEFT JOIN  nodes_ngrams
  66                  AS wlAA ON grA.ngram2_id = wlAA.ngram_id
  67                         AND wlAA.node_id  = wlA.node_id
  68           -- \--> adding (joining) ngrams that are not grouped
  69           --LEFT JOIN  ngrams        AS wlAA ON grA.ngram2_id = wlAA.id
  70           -- \--> for joining all synonyms even if they are not in the main list (white list)
  71
  72
  73           -- STEP 2: Y axi of the matrix
  74           INNER JOIN nodes_ngrams
  75                   AS ngB  ON ngB.node_id  = n.id
  76           -- \--> get the occurrences node/ngram of the corpus
  77
  78               INNER JOIN nodes_ngrams
  79                       AS wlB  ON ngB.ngram_id = wlB.ngram_id
  80                              AND wlB.node_id  = ?
  81               -- \--> filter with white/main list
  82
  83           LEFT JOIN  nodes_ngrams_ngrams
  84                  AS grB  ON wlB.ngram_id = grB.ngram1_id
  85                         AND grB.node_id  = ?
  86           -- \--> adding (joining) ngrams that are grouped
  87           LEFT JOIN  nodes_ngrams
  88                  AS wlBB ON grB.ngram2_id = wlBB.ngram_id
  89                         AND wlBB.node_id   = wlB.node_id
  90           -- \--> adding (joining) ngrams that are not grouped
  91
  92           -- LEFT JOIN  ngrams        AS wlBB ON grB.ngram2_id = wlBB.id
  93           -- \--> for joining all synonyms even if they are not in the main list (white list)
  94
  95           WHERE
  96               n.typename  = 4
  97           AND n.parent_id = ?
  98           GROUP BY 1,2
  99           --    ==
 100           -- GROUP BY ngA, ngB
 101           )
 102
 103       SELECT ngA, ngB, score
 104           FROM COOC    --> from the query above
 105
 106    WHERE score >= 3
 107   AND
 108   ngA <= ngB
 109   |] (mainList, groupList, mainList, groupList, corpus)
 110
 111