]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
changement du seuil
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Schema.Ngrams where
27
28 import Control.Lens (makeLenses, over)
29 import Control.Monad (mzero)
30 import Data.Aeson
31 import Data.Aeson.Types (toJSONKeyText)
32 import Data.ByteString.Internal (ByteString)
33 import Data.Map (Map, fromList, lookup)
34 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
35 import Data.Text (Text, splitOn, pack)
36 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
37 import Database.PostgreSQL.Simple.SqlQQ (sql)
38 import Database.PostgreSQL.Simple.ToField (toField, ToField)
39 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
40 import Database.PostgreSQL.Simple.ToRow (toRow)
41 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
42 import GHC.Generics (Generic)
43 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
44 import Gargantext.Core.Types (TODO(..))
45 import Gargantext.Prelude
46 import Opaleye hiding (FromField)
47 import Servant (FromHttpApiData, parseUrlPiece, Proxy(..))
48 import Text.Read (read)
49 import Data.Swagger (ToParamSchema, toParamSchema)
50 import Prelude (Enum, Bounded, minBound, maxBound, Functor)
51 import qualified Database.PostgreSQL.Simple as PGS
52
53
54 type NgramsId = Int
55 type NgramsTerms = Text
56 type Size = Int
57
58 data NgramsPoly id terms n = NgramsDb { _ngrams_id :: id
59 , _ngrams_terms :: terms
60 , _ngrams_n :: n
61 } deriving (Show)
62
63 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
64 (Column PGText)
65 (Column PGInt4)
66
67 type NgramsRead = NgramsPoly (Column PGInt4)
68 (Column PGText)
69 (Column PGInt4)
70
71 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
72 (Column (Nullable PGText))
73 (Column (Nullable PGInt4))
74
75 type NgramsDb = NgramsPoly Int Text Int
76
77 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
78 makeLenses ''NgramsPoly
79
80
81 ngramsTable :: Table NgramsWrite NgramsRead
82 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { _ngrams_id = optional "id"
83 , _ngrams_terms = required "terms"
84 , _ngrams_n = required "n"
85 }
86 )
87
88 queryNgramsTable :: Query NgramsRead
89 queryNgramsTable = queryTable ngramsTable
90
91 dbGetNgramsDb :: Cmd err [NgramsDb]
92 dbGetNgramsDb = runOpaQuery queryNgramsTable
93
94 -- | Main Ngrams Types
95 -- | Typed Ngrams
96 -- Typed Ngrams localize the context of the ngrams
97 -- ngrams in source field of document has Sources Type
98 -- ngrams in authors field of document has Authors Type
99 -- ngrams in text (title or abstract) of documents has Terms Type
100 data NgramsType = Authors | Institutes | Sources | NgramsTerms
101 deriving (Eq, Show, Read, Ord, Enum, Bounded, Generic)
102
103 instance FromJSON NgramsType
104 instance FromJSONKey NgramsType where
105 fromJSONKey = FromJSONKeyTextParser (parseJSON . String)
106 instance ToJSON NgramsType
107 instance ToJSONKey NgramsType where
108 toJSONKey = toJSONKeyText (pack . show)
109
110 newtype NgramsTypeId = NgramsTypeId Int
111 deriving (Eq, Show, Ord, Num)
112
113 instance ToField NgramsTypeId where
114 toField (NgramsTypeId n) = toField n
115
116 instance FromField NgramsTypeId where
117 fromField fld mdata = do
118 n <- fromField fld mdata
119 if (n :: Int) > 0 then return $ NgramsTypeId n
120 else mzero
121
122 instance FromHttpApiData NgramsType where
123 parseUrlPiece n = pure $ (read . cs) n
124
125 instance ToParamSchema NgramsType where
126 toParamSchema _ = toParamSchema (Proxy :: Proxy TODO)
127
128
129
130
131 instance QueryRunnerColumnDefault (Nullable PGInt4) NgramsTypeId
132 where
133 queryRunnerColumnDefault = fieldQueryRunnerColumn
134
135 pgNgramsType :: NgramsType -> Column PGInt4
136 pgNgramsType = pgNgramsTypeId . ngramsTypeId
137
138 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
139 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
140
141 ngramsTypeId :: NgramsType -> NgramsTypeId
142 ngramsTypeId Authors = 1
143 ngramsTypeId Institutes = 2
144 ngramsTypeId Sources = 3
145 ngramsTypeId NgramsTerms = 4
146
147 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
148 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
149
150 ------------------------------------------------------------------------
151 -- | TODO put it in Gargantext.Text.Ngrams
152 data Ngrams = Ngrams { _ngramsTerms :: Text
153 , _ngramsSize :: Int
154 } deriving (Generic, Show, Eq, Ord)
155
156 makeLenses ''Ngrams
157 instance PGS.ToRow Ngrams where
158 toRow (Ngrams t s) = [toField t, toField s]
159
160 text2ngrams :: Text -> Ngrams
161 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
162
163 -------------------------------------------------------------------------
164 -- | TODO put it in Gargantext.Text.Ngrams
165 -- Named entity are typed ngrams of Terms Ngrams
166 data NgramsT a =
167 NgramsT { _ngramsType :: NgramsType
168 , _ngramsT :: a
169 } deriving (Generic, Show, Eq, Ord)
170
171 makeLenses ''NgramsT
172
173 instance Functor NgramsT where
174 fmap = over ngramsT
175 -----------------------------------------------------------------------
176 data NgramsIndexed =
177 NgramsIndexed
178 { _ngrams :: Ngrams
179 , _ngramsId :: NgramsId
180 } deriving (Show, Generic, Eq, Ord)
181
182 makeLenses ''NgramsIndexed
183 ------------------------------------------------------------------------
184 data NgramIds =
185 NgramIds
186 { ngramId :: Int
187 , ngramTerms :: Text
188 } deriving (Show, Generic, Eq, Ord)
189
190 instance PGS.FromRow NgramIds where
191 fromRow = NgramIds <$> field <*> field
192
193 ----------------------
194 withMap :: Map NgramsTerms NgramsId -> NgramsTerms -> NgramsId
195 withMap m n = maybe (panic "withMap: should not happen") identity (lookup n m)
196
197 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
198 indexNgramsT = fmap . indexNgramsWith . withMap
199
200 indexNgrams :: Map NgramsTerms NgramsId -> Ngrams -> NgramsIndexed
201 indexNgrams = indexNgramsWith . withMap
202
203 -- NP: not sure we need it anymore
204 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams -> NgramsT NgramsIndexed
205 indexNgramsTWith = fmap . indexNgramsWith
206
207 indexNgramsWith :: (NgramsTerms -> NgramsId) -> Ngrams -> NgramsIndexed
208 indexNgramsWith f n = NgramsIndexed n (f $ _ngramsTerms n)
209
210 -- TODO-ACCESS: access must not be checked here but when insertNgrams is called.
211 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
212 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
213
214 -- TODO-ACCESS: access must not be checked here but when insertNgrams' is called.
215 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
216 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
217 where
218 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
219
220 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
221 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
222 where
223 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
224
225 ----------------------
226 queryInsertNgrams :: PGS.Query
227 queryInsertNgrams = [sql|
228 WITH input_rows(terms,n) AS (?)
229 , ins AS (
230 INSERT INTO ngrams (terms,n)
231 SELECT * FROM input_rows
232 ON CONFLICT (terms) DO NOTHING -- unique index created here
233 RETURNING id,terms
234 )
235
236 SELECT id, terms
237 FROM ins
238 UNION ALL
239 SELECT c.id, terms
240 FROM input_rows
241 JOIN ngrams c USING (terms); -- columns of unique index
242 |]
243
244
245