2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams connection to the Database.
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Schema.Ngrams where
28 import Control.Lens (makeLenses, over)
29 import Control.Monad (mzero)
31 import Data.Aeson.Types (toJSONKeyText)
32 import Data.ByteString.Internal (ByteString)
33 import Data.Map (Map, fromList, lookup)
34 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
35 import Data.Text (Text, splitOn, pack)
36 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
37 import Database.PostgreSQL.Simple.SqlQQ (sql)
38 import Database.PostgreSQL.Simple.ToField (toField, ToField)
39 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
40 import Database.PostgreSQL.Simple.ToRow (toRow)
41 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
42 import GHC.Generics (Generic)
43 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
44 import Gargantext.Prelude
45 import Opaleye hiding (FromField)
46 import Prelude (Enum, Bounded, minBound, maxBound, Functor)
47 import qualified Database.PostgreSQL.Simple as PGS
51 type NgramsTerms = Text
54 data NgramsPoly id terms n = NgramsDb { _ngrams_id :: id
55 , _ngrams_terms :: terms
59 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
63 type NgramsRead = NgramsPoly (Column PGInt4)
67 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
68 (Column (Nullable PGText))
69 (Column (Nullable PGInt4))
71 type NgramsDb = NgramsPoly Int Text Int
73 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
74 makeLenses ''NgramsPoly
77 ngramsTable :: Table NgramsWrite NgramsRead
78 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { _ngrams_id = optional "id"
79 , _ngrams_terms = required "terms"
80 , _ngrams_n = required "n"
84 queryNgramsTable :: Query NgramsRead
85 queryNgramsTable = queryTable ngramsTable
87 dbGetNgramsDb :: Cmd err [NgramsDb]
88 dbGetNgramsDb = runOpaQuery queryNgramsTable
90 -- | Main Ngrams Types
92 -- Typed Ngrams localize the context of the ngrams
93 -- ngrams in source field of document has Sources Type
94 -- ngrams in authors field of document has Authors Type
95 -- ngrams in text (title or abstract) of documents has Terms Type
96 data NgramsType = Authors | Institutes | Sources | NgramsTerms
97 deriving (Eq, Show, Ord, Enum, Bounded, Generic)
99 instance FromJSON NgramsType
100 instance FromJSONKey NgramsType where
101 fromJSONKey = FromJSONKeyTextParser (parseJSON . String)
102 instance ToJSON NgramsType
103 instance ToJSONKey NgramsType where
104 toJSONKey = toJSONKeyText (pack . show)
106 newtype NgramsTypeId = NgramsTypeId Int
107 deriving (Eq, Show, Ord, Num)
109 instance ToField NgramsTypeId where
110 toField (NgramsTypeId n) = toField n
112 instance FromField NgramsTypeId where
113 fromField fld mdata = do
114 n <- fromField fld mdata
115 if (n :: Int) > 0 then return $ NgramsTypeId n
118 instance QueryRunnerColumnDefault (Nullable PGInt4) NgramsTypeId
120 queryRunnerColumnDefault = fieldQueryRunnerColumn
122 pgNgramsType :: NgramsType -> Column PGInt4
123 pgNgramsType = pgNgramsTypeId . ngramsTypeId
125 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
126 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
128 ngramsTypeId :: NgramsType -> NgramsTypeId
129 ngramsTypeId Authors = 1
130 ngramsTypeId Institutes = 2
131 ngramsTypeId Sources = 3
132 ngramsTypeId NgramsTerms = 4
134 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
135 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
137 ------------------------------------------------------------------------
138 -- | TODO put it in Gargantext.Text.Ngrams
139 data Ngrams = Ngrams { _ngramsTerms :: Text
141 } deriving (Generic, Show, Eq, Ord)
144 instance PGS.ToRow Ngrams where
145 toRow (Ngrams t s) = [toField t, toField s]
147 text2ngrams :: Text -> Ngrams
148 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
150 -------------------------------------------------------------------------
151 -- | TODO put it in Gargantext.Text.Ngrams
152 -- Named entity are typed ngrams of Terms Ngrams
154 NgramsT { _ngramsType :: NgramsType
156 } deriving (Generic, Show, Eq, Ord)
160 instance Functor NgramsT where
162 -----------------------------------------------------------------------
166 , _ngramsId :: NgramsId
167 } deriving (Show, Generic, Eq, Ord)
169 makeLenses ''NgramsIndexed
170 ------------------------------------------------------------------------
175 } deriving (Show, Generic, Eq, Ord)
177 instance PGS.FromRow NgramIds where
178 fromRow = NgramIds <$> field <*> field
180 ----------------------
181 withMap :: Map NgramsTerms NgramsId -> NgramsTerms -> NgramsId
182 withMap m n = maybe (panic "withMap: should not happen") identity (lookup n m)
184 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
185 indexNgramsT = fmap . indexNgramsWith . withMap
187 indexNgrams :: Map NgramsTerms NgramsId -> Ngrams -> NgramsIndexed
188 indexNgrams = indexNgramsWith . withMap
190 -- NP: not sure we need it anymore
191 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams -> NgramsT NgramsIndexed
192 indexNgramsTWith = fmap . indexNgramsWith
194 indexNgramsWith :: (NgramsTerms -> NgramsId) -> Ngrams -> NgramsIndexed
195 indexNgramsWith f n = NgramsIndexed n (f $ _ngramsTerms n)
197 -- TODO-ACCESS: access must not be checked here but when insertNgrams is called.
198 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
199 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
201 -- TODO-ACCESS: access must not be checked here but when insertNgrams' is called.
202 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
203 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
205 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
207 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
208 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
210 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
212 ----------------------
213 queryInsertNgrams :: PGS.Query
214 queryInsertNgrams = [sql|
215 WITH input_rows(terms,n) AS (?)
217 INSERT INTO ngrams (terms,n)
218 SELECT * FROM input_rows
219 ON CONFLICT (terms) DO NOTHING -- unique index created here
228 JOIN ngrams c USING (terms); -- columns of unique index