]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
[docker] update image, add README info
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Schema.Ngrams where
27
28 import Control.Lens (makeLenses, over)
29 import Control.Monad (mzero)
30 import Data.Aeson
31 import Data.Aeson.Types (toJSONKeyText)
32 import Data.ByteString.Internal (ByteString)
33 import Data.Map (Map, fromList, lookup)
34 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
35 import Data.Text (Text, splitOn, pack)
36 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
37 import Database.PostgreSQL.Simple.SqlQQ (sql)
38 import Database.PostgreSQL.Simple.ToField (toField, ToField)
39 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
40 import Database.PostgreSQL.Simple.ToRow (toRow)
41 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
42 import GHC.Generics (Generic)
43 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
44 import Gargantext.Core.Types (TODO(..))
45 import Gargantext.Prelude
46 import Opaleye hiding (FromField)
47 import Servant (FromHttpApiData, parseUrlPiece, Proxy(..))
48 import Text.Read (read)
49 import Data.Swagger (ToParamSchema, toParamSchema, ToSchema)
50 import Prelude (Enum, Bounded, minBound, maxBound, Functor)
51 import qualified Database.PostgreSQL.Simple as PGS
52
53
54 type NgramsId = Int
55 type NgramsTerms = Text
56 type Size = Int
57
58 data NgramsPoly id terms n = NgramsDb { _ngrams_id :: id
59 , _ngrams_terms :: terms
60 , _ngrams_n :: n
61 } deriving (Show)
62
63 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
64 (Column PGText)
65 (Column PGInt4)
66
67 type NgramsRead = NgramsPoly (Column PGInt4)
68 (Column PGText)
69 (Column PGInt4)
70
71 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
72 (Column (Nullable PGText))
73 (Column (Nullable PGInt4))
74
75 type NgramsDb = NgramsPoly Int Text Int
76
77 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
78 makeLenses ''NgramsPoly
79
80
81 ngramsTable :: Table NgramsWrite NgramsRead
82 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { _ngrams_id = optional "id"
83 , _ngrams_terms = required "terms"
84 , _ngrams_n = required "n"
85 }
86 )
87
88 queryNgramsTable :: Query NgramsRead
89 queryNgramsTable = queryTable ngramsTable
90
91 dbGetNgramsDb :: Cmd err [NgramsDb]
92 dbGetNgramsDb = runOpaQuery queryNgramsTable
93
94 -- | Main Ngrams Types
95 -- | Typed Ngrams
96 -- Typed Ngrams localize the context of the ngrams
97 -- ngrams in source field of document has Sources Type
98 -- ngrams in authors field of document has Authors Type
99 -- ngrams in text (title or abstract) of documents has Terms Type
100 data NgramsType = Authors | Institutes | Sources | NgramsTerms
101 deriving (Eq, Show, Read, Ord, Enum, Bounded, Generic)
102
103 ngramsTypes :: [NgramsType]
104 ngramsTypes = [minBound..]
105
106 instance ToSchema NgramsType
107 {- where
108 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_nre_")
109 -}
110
111 instance FromJSON NgramsType
112 instance FromJSONKey NgramsType where
113 fromJSONKey = FromJSONKeyTextParser (parseJSON . String)
114 instance ToJSON NgramsType
115 instance ToJSONKey NgramsType where
116 toJSONKey = toJSONKeyText (pack . show)
117
118 newtype NgramsTypeId = NgramsTypeId Int
119 deriving (Eq, Show, Ord, Num)
120
121 instance ToField NgramsTypeId where
122 toField (NgramsTypeId n) = toField n
123
124 instance FromField NgramsTypeId where
125 fromField fld mdata = do
126 n <- fromField fld mdata
127 if (n :: Int) > 0 then return $ NgramsTypeId n
128 else mzero
129
130 instance FromHttpApiData NgramsType where
131 parseUrlPiece n = pure $ (read . cs) n
132
133 instance ToParamSchema NgramsType where
134 toParamSchema _ = toParamSchema (Proxy :: Proxy TODO)
135
136
137
138
139 instance QueryRunnerColumnDefault (Nullable PGInt4) NgramsTypeId
140 where
141 queryRunnerColumnDefault = fieldQueryRunnerColumn
142
143 pgNgramsType :: NgramsType -> Column PGInt4
144 pgNgramsType = pgNgramsTypeId . ngramsTypeId
145
146 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
147 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
148
149 ngramsTypeId :: NgramsType -> NgramsTypeId
150 ngramsTypeId Authors = 1
151 ngramsTypeId Institutes = 2
152 ngramsTypeId Sources = 3
153 ngramsTypeId NgramsTerms = 4
154
155 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
156 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
157
158 ------------------------------------------------------------------------
159 -- | TODO put it in Gargantext.Text.Ngrams
160 data Ngrams = Ngrams { _ngramsTerms :: Text
161 , _ngramsSize :: Int
162 } deriving (Generic, Show, Eq, Ord)
163
164 makeLenses ''Ngrams
165 instance PGS.ToRow Ngrams where
166 toRow (Ngrams t s) = [toField t, toField s]
167
168 text2ngrams :: Text -> Ngrams
169 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
170
171 -------------------------------------------------------------------------
172 -- | TODO put it in Gargantext.Text.Ngrams
173 -- Named entity are typed ngrams of Terms Ngrams
174 data NgramsT a =
175 NgramsT { _ngramsType :: NgramsType
176 , _ngramsT :: a
177 } deriving (Generic, Show, Eq, Ord)
178
179 makeLenses ''NgramsT
180
181 instance Functor NgramsT where
182 fmap = over ngramsT
183 -----------------------------------------------------------------------
184 data NgramsIndexed =
185 NgramsIndexed
186 { _ngrams :: Ngrams
187 , _ngramsId :: NgramsId
188 } deriving (Show, Generic, Eq, Ord)
189
190 makeLenses ''NgramsIndexed
191 ------------------------------------------------------------------------
192 data NgramIds =
193 NgramIds
194 { ngramId :: Int
195 , ngramTerms :: Text
196 } deriving (Show, Generic, Eq, Ord)
197
198 instance PGS.FromRow NgramIds where
199 fromRow = NgramIds <$> field <*> field
200
201 ----------------------
202 withMap :: Map NgramsTerms NgramsId -> NgramsTerms -> NgramsId
203 withMap m n = maybe (panic "withMap: should not happen") identity (lookup n m)
204
205 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
206 indexNgramsT = fmap . indexNgramsWith . withMap
207
208 indexNgrams :: Map NgramsTerms NgramsId -> Ngrams -> NgramsIndexed
209 indexNgrams = indexNgramsWith . withMap
210
211 -- NP: not sure we need it anymore
212 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams -> NgramsT NgramsIndexed
213 indexNgramsTWith = fmap . indexNgramsWith
214
215 indexNgramsWith :: (NgramsTerms -> NgramsId) -> Ngrams -> NgramsIndexed
216 indexNgramsWith f n = NgramsIndexed n (f $ _ngramsTerms n)
217
218 -- TODO-ACCESS: access must not be checked here but when insertNgrams is called.
219 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
220 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
221
222 -- TODO-ACCESS: access must not be checked here but when insertNgrams' is called.
223 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
224 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
225 where
226 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
227
228 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
229 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
230 where
231 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
232
233 ----------------------
234 queryInsertNgrams :: PGS.Query
235 queryInsertNgrams = [sql|
236 WITH input_rows(terms,n) AS (?)
237 , ins AS (
238 INSERT INTO ngrams (terms,n)
239 SELECT * FROM input_rows
240 ON CONFLICT (terms) DO NOTHING -- unique index created here
241 RETURNING id,terms
242 )
243
244 SELECT id, terms
245 FROM ins
246 UNION ALL
247 SELECT c.id, terms
248 FROM input_rows
249 JOIN ngrams c USING (terms); -- columns of unique index
250 |]
251
252
253