2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Ngrams connection to the Database.
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
26 module Gargantext.Database.Schema.Ngrams where
28 import Control.Lens (makeLenses, over)
29 import Control.Monad (mzero)
31 import Data.Aeson.Types (toJSONKeyText)
32 import Data.ByteString.Internal (ByteString)
33 import Data.Map (Map, fromList, lookup)
34 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
35 import Data.Text (Text, splitOn, pack)
36 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
37 import Database.PostgreSQL.Simple.SqlQQ (sql)
38 import Database.PostgreSQL.Simple.ToField (toField, ToField)
39 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
40 import Database.PostgreSQL.Simple.ToRow (toRow)
41 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
42 import GHC.Generics (Generic)
43 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
44 import Gargantext.Prelude
45 import Opaleye hiding (FromField)
46 import Prelude (Enum, Bounded, minBound, maxBound, Functor)
47 import qualified Database.PostgreSQL.Simple as PGS
51 type NgramsTerms = Text
54 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
55 , ngrams_terms :: terms
59 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
63 type NgramsRead = NgramsPoly (Column PGInt4)
67 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
68 (Column (Nullable PGText))
69 (Column (Nullable PGInt4))
71 type NgramsDb = NgramsPoly Int Text Int
73 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
74 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
76 ngramsTable :: Table NgramsWrite NgramsRead
77 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
78 , ngrams_terms = required "terms"
79 , ngrams_n = required "n"
83 queryNgramsTable :: Query NgramsRead
84 queryNgramsTable = queryTable ngramsTable
86 dbGetNgramsDb :: Cmd err [NgramsDb]
87 dbGetNgramsDb = runOpaQuery queryNgramsTable
89 -- | Main Ngrams Types
91 -- Typed Ngrams localize the context of the ngrams
92 -- ngrams in source field of document has Sources Type
93 -- ngrams in authors field of document has Authors Type
94 -- ngrams in text (title or abstract) of documents has Terms Type
95 data NgramsType = Authors | Institutes | Sources | NgramsTerms
96 deriving (Eq, Show, Ord, Enum, Bounded, Generic)
98 instance FromJSON NgramsType
99 instance FromJSONKey NgramsType where
100 fromJSONKey = FromJSONKeyTextParser (parseJSON . String)
101 instance ToJSON NgramsType
102 instance ToJSONKey NgramsType where
103 toJSONKey = toJSONKeyText (pack . show)
105 newtype NgramsTypeId = NgramsTypeId Int
106 deriving (Eq, Show, Ord, Num)
108 instance ToField NgramsTypeId where
109 toField (NgramsTypeId n) = toField n
111 instance FromField NgramsTypeId where
112 fromField fld mdata = do
113 n <- fromField fld mdata
114 if (n :: Int) > 0 then return $ NgramsTypeId n
117 instance QueryRunnerColumnDefault (Nullable PGInt4) NgramsTypeId
119 queryRunnerColumnDefault = fieldQueryRunnerColumn
121 pgNgramsType :: NgramsType -> Column PGInt4
122 pgNgramsType = pgNgramsTypeId . ngramsTypeId
124 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
125 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
127 ngramsTypeId :: NgramsType -> NgramsTypeId
128 ngramsTypeId Authors = 1
129 ngramsTypeId Institutes = 2
130 ngramsTypeId Sources = 3
131 ngramsTypeId NgramsTerms = 4
133 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
134 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
136 ------------------------------------------------------------------------
137 -- | TODO put it in Gargantext.Text.Ngrams
138 data Ngrams = Ngrams { _ngramsTerms :: Text
140 } deriving (Generic, Show, Eq, Ord)
143 instance PGS.ToRow Ngrams where
144 toRow (Ngrams t s) = [toField t, toField s]
146 text2ngrams :: Text -> Ngrams
147 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
149 -------------------------------------------------------------------------
150 -- | TODO put it in Gargantext.Text.Ngrams
151 -- Named entity are typed ngrams of Terms Ngrams
153 NgramsT { _ngramsType :: NgramsType
155 } deriving (Generic, Show, Eq, Ord)
159 instance Functor NgramsT where
161 -----------------------------------------------------------------------
165 , _ngramsId :: NgramsId
166 } deriving (Show, Generic, Eq, Ord)
168 makeLenses ''NgramsIndexed
169 ------------------------------------------------------------------------
174 } deriving (Show, Generic, Eq, Ord)
176 instance PGS.FromRow NgramIds where
177 fromRow = NgramIds <$> field <*> field
179 ----------------------
180 withMap :: Map NgramsTerms NgramsId -> NgramsTerms -> NgramsId
181 withMap m n = maybe (panic "withMap: should not happen") identity (lookup n m)
183 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
184 indexNgramsT = fmap . indexNgramsWith . withMap
186 indexNgrams :: Map NgramsTerms NgramsId -> Ngrams -> NgramsIndexed
187 indexNgrams = indexNgramsWith . withMap
189 -- NP: not sure we need it anymore
190 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams -> NgramsT NgramsIndexed
191 indexNgramsTWith = fmap . indexNgramsWith
193 indexNgramsWith :: (NgramsTerms -> NgramsId) -> Ngrams -> NgramsIndexed
194 indexNgramsWith f n = NgramsIndexed n (f $ _ngramsTerms n)
196 -- TODO-ACCESS: access must not be checked here but when insertNgrams is called.
197 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
198 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
200 -- TODO-ACCESS: access must not be checked here but when insertNgrams' is called.
201 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
202 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
204 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
206 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
207 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
209 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
211 ----------------------
212 queryInsertNgrams :: PGS.Query
213 queryInsertNgrams = [sql|
214 WITH input_rows(terms,n) AS (?)
216 INSERT INTO ngrams (terms,n)
217 SELECT * FROM input_rows
218 ON CONFLICT (terms) DO NOTHING -- unique index created here
227 JOIN ngrams c USING (terms); -- columns of unique index