2 Module : Gargantext.Database.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams connection to the Database.
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE MultiParamTypeClasses #-}
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE QuasiQuotes #-}
22 {-# LANGUAGE TemplateHaskell #-}
24 module Gargantext.Database.Ngrams where
27 import Control.Lens (makeLenses)
28 import Data.ByteString.Internal (ByteString)
29 import Data.Map (Map, fromList, lookup)
30 import Data.Text (Text, splitOn)
31 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
32 import Database.PostgreSQL.Simple.SqlQQ (sql)
33 import Database.PostgreSQL.Simple.ToField (toField)
34 import Database.PostgreSQL.Simple.ToRow (toRow)
35 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
36 import GHC.Generics (Generic)
37 import Gargantext.Database.Config (nodeTypeId)
38 import Gargantext.Database.Types.Node (NodeType)
39 import Gargantext.Database.Node (mkCmd, Cmd(..))
40 import Gargantext.Prelude
41 import qualified Database.PostgreSQL.Simple as DPS
44 --data NgramPoly id terms n = NgramDb { ngram_id :: id
45 -- , ngram_terms :: terms
49 --type NgramWrite = NgramPoly (Maybe (Column PGInt4))
53 --type NgramRead = NgramPoly (Column PGInt4)
57 ----type Ngram = NgramPoly Int Text Int
59 -- $(makeAdaptorAndInstance "pNgram" ''NgramPoly)
60 -- $(makeLensesWith abbreviatedFields ''NgramPoly)
62 --ngramTable :: Table NgramWrite NgramRead
63 --ngramTable = Table "ngrams" (pNgram NgramDb { ngram_id = optional "id"
64 -- , ngram_terms = required "terms"
65 -- , ngram_n = required "n"
69 --queryNgramTable :: Query NgramRead
70 --queryNgramTable = queryTable ngramTable
72 --dbGetNgrams :: PGS.Connection -> IO [NgramDb]
73 --dbGetNgrams conn = runQuery conn queryNgramTable
75 -- | Main Ngrams Types
77 -- Typed Ngrams localize the context of the ngrams
78 -- ngrams in source field of document has Sources Type
79 -- ngrams in authors field of document has Authors Type
80 -- ngrams in text (title or abstract) of documents has Terms Type
81 data NgramsType = Authors | Institutes | Sources | Terms
82 deriving (Eq, Show, Ord)
84 ngramsTypeId :: NgramsType -> Int
85 ngramsTypeId Authors = 1
86 ngramsTypeId Institutes = 2
87 ngramsTypeId Sources = 3
88 ngramsTypeId Terms = 4
90 type NgramsTerms = Text
94 ------------------------------------------------------------------------
95 -- | TODO put it in Gargantext.Text.Ngrams
96 data Ngrams = Ngrams { _ngramsTerms :: Text
98 } deriving (Generic, Show, Eq, Ord)
101 instance DPS.ToRow Ngrams where
102 toRow (Ngrams t s) = [toField t, toField s]
104 text2ngrams :: Text -> Ngrams
105 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
107 -------------------------------------------------------------------------
108 -- | TODO put it in Gargantext.Text.Ngrams
109 -- Named entity are typed ngrams of Terms Ngrams
111 NgramsT { _ngramsType :: NgramsType
113 } deriving (Generic, Show, Eq, Ord)
116 -----------------------------------------------------------------------
120 , _ngramsId :: NgramsId
121 } deriving (Show, Generic, Eq, Ord)
123 makeLenses ''NgramsIndexed
124 ------------------------------------------------------------------------
129 } deriving (Show, Generic, Eq, Ord)
131 instance DPS.FromRow NgramIds where
132 fromRow = NgramIds <$> field <*> field
134 ----------------------
135 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
136 indexNgramsT m ngrId = indexNgramsTWith f ngrId
138 f n = maybe (panic "indexNgramsT: should not happen") identity (lookup n m)
140 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams-> NgramsT NgramsIndexed
141 indexNgramsTWith f (NgramsT t n) = NgramsT t (NgramsIndexed n ((f . _ngramsTerms) n))
143 insertNgrams :: [Ngrams] -> Cmd (Map NgramsTerms NgramsId)
144 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
146 insertNgrams' :: [Ngrams] -> Cmd [NgramIds]
147 insertNgrams' ns = mkCmd $ \conn -> DPS.query conn queryInsertNgrams (DPS.Only $ Values fields ns)
149 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
151 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd ByteString
152 insertNgrams_Debug ns = mkCmd $ \conn -> DPS.formatQuery conn queryInsertNgrams (DPS.Only $ Values fields ns)
154 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
156 ----------------------
157 queryInsertNgrams :: DPS.Query
158 queryInsertNgrams = [sql|
159 WITH input_rows(terms,n) AS (?)
161 INSERT INTO ngrams (terms,n)
162 SELECT * FROM input_rows
163 ON CONFLICT (terms) DO NOTHING -- unique index created here
172 JOIN ngrams c USING (terms); -- columns of unique index
180 data NgramsTableParam =
181 NgramsTableParam { _nt_listId :: Int
182 , _nt_corpusId :: Int
185 type NgramsTableParamUser = NgramsTableParam
186 type NgramsTableParamMaster = NgramsTableParam
188 data NgramsTableData = NgramsTableData { _ntd_terms :: Text
190 , _ntd_ngramsType :: Int
191 , _ntd_weight :: Double
194 getTableNgrams :: NodeType -> NgramsType -> NgramsTableParamUser -> NgramsTableParamMaster -> Cmd [(Text, Int, Int, Double)]
195 getTableNgrams nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) =
196 mkCmd $ \conn -> DPS.query conn querySelectTableNgrams (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId)
198 nodeTId = nodeTypeId nodeT
199 ngrmTId = ngramsTypeId ngrmT
203 querySelectTableNgrams :: DPS.Query
204 querySelectTableNgrams = [sql|
206 WITH tableUser AS (select ngs.terms, ngs.n, nn1.ngrams_type,nn2.weight FROM ngrams ngs
207 JOIN nodes_ngrams nn1 ON nn1.ngram_id = ngs.id
208 JOIN nodes_ngrams nn2 ON nn2.ngram_id = ngs.id
209 JOIN nodes n ON n.id = nn2.node_id
210 WHERE nn1.node_id = ? -- User listId
211 AND n.parent_id = ? -- User CorpusId or AnnuaireId
212 AND n.typename = ? -- both type of childs (Documents or Contacts)
213 AND nn2.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
214 ), tableMaster AS (select ngs.terms, ngs.n, nn1.ngrams_type,nn2.weight FROM ngrams ngs
215 JOIN nodes_ngrams nn1 ON nn1.ngram_id = ngs.id
216 JOIN nodes_ngrams nn2 ON nn2.ngram_id = ngs.id
217 JOIN nodes n ON n.id = nn2.node_id
218 WHERE nn1.node_id = ? -- Master listId
219 AND n.parent_id = ? -- Master CorpusId or AnnuaireId
220 AND n.typename = ? -- both type of childs (Documents or Contacts)
221 AND nn2.ngrams_type = ? -- both type of ngrams (Authors or Terms?)
224 SELECT COALESCE(tu.terms,tm.terms) AS terms
225 , COALESCE(tu.n,tm.n) AS n
226 , COALESCE(tu.ngrams_type,tm.ngrams_type) AS ngrams_type
227 , SUM(COALESCE(tu.weight,tm.weight)) AS weight
228 FROM tableUser tu RIGHT JOIN tableMaster tm ON tu.terms = tm.terms
229 GROUP BY tu.terms,tm.terms,tu.n,tm.n,tu.ngrams_type,tm.ngrams_type;
233 type ListIdUser = Int
234 type ListIdMaster = Int
237 getNgramsGroup :: ListIdUser -> ListIdMaster -> Cmd [(Text, Text)]
238 getNgramsGroup lu lm = mkCmd $ \conn -> DPS.query conn querySelectNgramsGroup (lu,lm)
240 querySelectNgramsGroup :: DPS.Query
241 querySelectNgramsGroup = [sql|
243 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
244 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
245 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
247 nnn.node_id = ? -- User listId
250 SELECT n1.terms AS t1, n2.terms AS t2 FROM nodes_ngrams_ngrams nnn
251 JOIN ngrams n1 ON n1.id = nnn.ngram1_id
252 JOIN ngrams n2 ON n2.id = nnn.ngram2_id
254 nnn.node_id = ? -- Master listId
256 SELECT COALESCE(gu.t1,gm.t1) AS ngram1_id
257 , COALESCE(gu.t2,gm.t2) AS ngram2_id
258 FROM groupUser gu RIGHT JOIN groupMaster gm ON gu.t1 = gm.t1