]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
[DB/FACTO/WIP] G.D.S.Prelude
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleContexts #-}
17 {-# LANGUAGE FlexibleInstances #-}
18 {-# LANGUAGE FunctionalDependencies #-}
19 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
20 {-# LANGUAGE MultiParamTypeClasses #-}
21 {-# LANGUAGE NoImplicitPrelude #-}
22 {-# LANGUAGE OverloadedStrings #-}
23 {-# LANGUAGE QuasiQuotes #-}
24 {-# LANGUAGE RankNTypes #-}
25 {-# LANGUAGE TemplateHaskell #-}
26
27 module Gargantext.Database.Schema.Ngrams where
28
29 import Control.Lens (makeLenses, over)
30 import Control.Monad (mzero)
31 import Data.Aeson
32 import Data.Aeson.Types (toJSONKeyText)
33 import Data.ByteString.Internal (ByteString)
34 import Data.Map (Map, fromList, lookup)
35 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
36 import Data.Swagger (ToParamSchema, toParamSchema, ToSchema)
37 import Data.Text (Text, splitOn, pack)
38 import GHC.Generics (Generic)
39 import Gargantext.Core.Types (TODO(..))
40 import Gargantext.Database.Admin.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
41 import Gargantext.Prelude
42 import Prelude (Enum, Bounded, minBound, maxBound, Functor)
43 import Servant (FromHttpApiData, parseUrlPiece, Proxy(..))
44 import Text.Read (read)
45 import Gargantext.Database.Schema.Prelude
46 import qualified Database.PostgreSQL.Simple as PGS
47
48
49 type NgramsId = Int
50 type NgramsTerms = Text
51 type Size = Int
52
53 data NgramsPoly id terms n = NgramsDb { _ngrams_id :: !id
54 , _ngrams_terms :: !terms
55 , _ngrams_n :: !n
56 } deriving (Show)
57
58 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
59 (Column PGText)
60 (Column PGInt4)
61
62 type NgramsRead = NgramsPoly (Column PGInt4)
63 (Column PGText)
64 (Column PGInt4)
65
66 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
67 (Column (Nullable PGText))
68 (Column (Nullable PGInt4))
69
70 type NgramsDb = NgramsPoly Int Text Int
71
72 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
73 makeLenses ''NgramsPoly
74
75
76 ngramsTable :: Table NgramsWrite NgramsRead
77 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { _ngrams_id = optional "id"
78 , _ngrams_terms = required "terms"
79 , _ngrams_n = required "n"
80 }
81 )
82
83 queryNgramsTable :: Query NgramsRead
84 queryNgramsTable = queryTable ngramsTable
85
86 dbGetNgramsDb :: Cmd err [NgramsDb]
87 dbGetNgramsDb = runOpaQuery queryNgramsTable
88
89 -- | Main Ngrams Types
90 -- | Typed Ngrams
91 -- Typed Ngrams localize the context of the ngrams
92 -- ngrams in source field of document has Sources Type
93 -- ngrams in authors field of document has Authors Type
94 -- ngrams in text (title or abstract) of documents has Terms Type
95 data NgramsType = Authors | Institutes | Sources | NgramsTerms
96 deriving (Eq, Show, Read, Ord, Enum, Bounded, Generic)
97
98 ngramsTypes :: [NgramsType]
99 ngramsTypes = [minBound..]
100
101 instance ToSchema NgramsType
102 {- where
103 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_nre_")
104 --}
105
106 instance FromJSON NgramsType
107 instance FromJSONKey NgramsType where
108 fromJSONKey = FromJSONKeyTextParser (parseJSON . String)
109 instance ToJSON NgramsType
110 instance ToJSONKey NgramsType where
111 toJSONKey = toJSONKeyText (pack . show)
112
113 newtype NgramsTypeId = NgramsTypeId Int
114 deriving (Eq, Show, Ord, Num)
115
116 instance ToField NgramsTypeId where
117 toField (NgramsTypeId n) = toField n
118
119 instance FromField NgramsTypeId where
120 fromField fld mdata = do
121 n <- fromField fld mdata
122 if (n :: Int) > 0 then return $ NgramsTypeId n
123 else mzero
124
125 instance FromHttpApiData NgramsType where
126 parseUrlPiece n = pure $ (read . cs) n
127
128 instance ToParamSchema NgramsType where
129 toParamSchema _ = toParamSchema (Proxy :: Proxy TODO)
130
131
132 instance QueryRunnerColumnDefault (Nullable PGInt4) NgramsTypeId
133 where
134 queryRunnerColumnDefault = fieldQueryRunnerColumn
135
136 pgNgramsType :: NgramsType -> Column PGInt4
137 pgNgramsType = pgNgramsTypeId . ngramsTypeId
138
139 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
140 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
141
142 ngramsTypeId :: NgramsType -> NgramsTypeId
143 ngramsTypeId Authors = 1
144 ngramsTypeId Institutes = 2
145 ngramsTypeId Sources = 3
146 ngramsTypeId NgramsTerms = 4
147
148 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
149 fromNgramsTypeId id = lookup id
150 $ fromList [ (ngramsTypeId nt,nt)
151 | nt <- [minBound .. maxBound] :: [NgramsType]
152 ]
153
154 ------------------------------------------------------------------------
155 -- | TODO put it in Gargantext.Text.Ngrams
156 data Ngrams = Ngrams { _ngramsTerms :: Text
157 , _ngramsSize :: Int
158 } deriving (Generic, Show, Eq, Ord)
159
160 makeLenses ''Ngrams
161 instance PGS.ToRow Ngrams where
162 toRow (Ngrams t s) = [toField t, toField s]
163
164 text2ngrams :: Text -> Ngrams
165 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
166
167 -------------------------------------------------------------------------
168 -- | TODO put it in Gargantext.Text.Ngrams
169 -- Named entity are typed ngrams of Terms Ngrams
170 data NgramsT a =
171 NgramsT { _ngramsType :: NgramsType
172 , _ngramsT :: a
173 } deriving (Generic, Show, Eq, Ord)
174
175 makeLenses ''NgramsT
176
177 instance Functor NgramsT where
178 fmap = over ngramsT
179 -----------------------------------------------------------------------
180 data NgramsIndexed =
181 NgramsIndexed
182 { _ngrams :: Ngrams
183 , _ngramsId :: NgramsId
184 } deriving (Show, Generic, Eq, Ord)
185
186 makeLenses ''NgramsIndexed
187 ------------------------------------------------------------------------
188 data NgramIds =
189 NgramIds
190 { ngramId :: Int
191 , ngramTerms :: Text
192 } deriving (Show, Generic, Eq, Ord)
193
194 instance PGS.FromRow NgramIds where
195 fromRow = NgramIds <$> field <*> field
196
197 ----------------------
198 withMap :: Map NgramsTerms NgramsId -> NgramsTerms -> NgramsId
199 withMap m n = maybe (panic "withMap: should not happen") identity (lookup n m)
200
201 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
202 indexNgramsT = fmap . indexNgramsWith . withMap
203
204 indexNgrams :: Map NgramsTerms NgramsId -> Ngrams -> NgramsIndexed
205 indexNgrams = indexNgramsWith . withMap
206
207 -- NP: not sure we need it anymore
208 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams -> NgramsT NgramsIndexed
209 indexNgramsTWith = fmap . indexNgramsWith
210
211 indexNgramsWith :: (NgramsTerms -> NgramsId) -> Ngrams -> NgramsIndexed
212 indexNgramsWith f n = NgramsIndexed n (f $ _ngramsTerms n)
213
214 -- TODO-ACCESS: access must not be checked here but when insertNgrams is called.
215 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
216 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
217
218 -- TODO-ACCESS: access must not be checked here but when insertNgrams' is called.
219 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
220 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
221 where
222 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
223
224 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
225 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
226 where
227 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
228
229 ----------------------
230 queryInsertNgrams :: PGS.Query
231 queryInsertNgrams = [sql|
232 WITH input_rows(terms,n) AS (?)
233 , ins AS (
234 INSERT INTO ngrams (terms,n)
235 SELECT * FROM input_rows
236 ON CONFLICT (terms) DO NOTHING -- unique index created here
237 RETURNING id,terms
238 )
239
240 SELECT id, terms
241 FROM ins
242 UNION ALL
243 SELECT c.id, terms
244 FROM input_rows
245 JOIN ngrams c USING (terms); -- columns of unique index
246 |]
247