]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
[FIX] Corpus Hyperdata type.
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Schema.Ngrams where
27
28 import Control.Lens (makeLenses, over)
29 import Control.Monad (mzero)
30 import Data.Aeson
31 import Data.Aeson.Types (toJSONKeyText)
32 import Data.ByteString.Internal (ByteString)
33 import Data.Map (Map, fromList, lookup)
34 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
35 import Data.Text (Text, splitOn, pack)
36 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
37 import Database.PostgreSQL.Simple.SqlQQ (sql)
38 import Database.PostgreSQL.Simple.ToField (toField, ToField)
39 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
40 import Database.PostgreSQL.Simple.ToRow (toRow)
41 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
42 import GHC.Generics (Generic)
43 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
44 import Gargantext.Prelude
45 import Opaleye hiding (FromField)
46 import Prelude (Enum, Bounded, minBound, maxBound, Functor)
47 import qualified Database.PostgreSQL.Simple as PGS
48
49
50 type NgramsId = Int
51 type NgramsTerms = Text
52 type Size = Int
53
54 data NgramsPoly id terms n = NgramsDb { _ngrams_id :: id
55 , _ngrams_terms :: terms
56 , _ngrams_n :: n
57 } deriving (Show)
58
59 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
60 (Column PGText)
61 (Column PGInt4)
62
63 type NgramsRead = NgramsPoly (Column PGInt4)
64 (Column PGText)
65 (Column PGInt4)
66
67 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
68 (Column (Nullable PGText))
69 (Column (Nullable PGInt4))
70
71 type NgramsDb = NgramsPoly Int Text Int
72
73 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
74 makeLenses ''NgramsPoly
75
76
77 ngramsTable :: Table NgramsWrite NgramsRead
78 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { _ngrams_id = optional "id"
79 , _ngrams_terms = required "terms"
80 , _ngrams_n = required "n"
81 }
82 )
83
84 queryNgramsTable :: Query NgramsRead
85 queryNgramsTable = queryTable ngramsTable
86
87 dbGetNgramsDb :: Cmd err [NgramsDb]
88 dbGetNgramsDb = runOpaQuery queryNgramsTable
89
90 -- | Main Ngrams Types
91 -- | Typed Ngrams
92 -- Typed Ngrams localize the context of the ngrams
93 -- ngrams in source field of document has Sources Type
94 -- ngrams in authors field of document has Authors Type
95 -- ngrams in text (title or abstract) of documents has Terms Type
96 data NgramsType = Authors | Institutes | Sources | NgramsTerms
97 deriving (Eq, Show, Ord, Enum, Bounded, Generic)
98
99 instance FromJSON NgramsType
100 instance FromJSONKey NgramsType where
101 fromJSONKey = FromJSONKeyTextParser (parseJSON . String)
102 instance ToJSON NgramsType
103 instance ToJSONKey NgramsType where
104 toJSONKey = toJSONKeyText (pack . show)
105
106 newtype NgramsTypeId = NgramsTypeId Int
107 deriving (Eq, Show, Ord, Num)
108
109 instance ToField NgramsTypeId where
110 toField (NgramsTypeId n) = toField n
111
112 instance FromField NgramsTypeId where
113 fromField fld mdata = do
114 n <- fromField fld mdata
115 if (n :: Int) > 0 then return $ NgramsTypeId n
116 else mzero
117
118 instance QueryRunnerColumnDefault (Nullable PGInt4) NgramsTypeId
119 where
120 queryRunnerColumnDefault = fieldQueryRunnerColumn
121
122 pgNgramsType :: NgramsType -> Column PGInt4
123 pgNgramsType = pgNgramsTypeId . ngramsTypeId
124
125 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
126 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
127
128 ngramsTypeId :: NgramsType -> NgramsTypeId
129 ngramsTypeId Authors = 1
130 ngramsTypeId Institutes = 2
131 ngramsTypeId Sources = 3
132 ngramsTypeId NgramsTerms = 4
133
134 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
135 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
136
137 ------------------------------------------------------------------------
138 -- | TODO put it in Gargantext.Text.Ngrams
139 data Ngrams = Ngrams { _ngramsTerms :: Text
140 , _ngramsSize :: Int
141 } deriving (Generic, Show, Eq, Ord)
142
143 makeLenses ''Ngrams
144 instance PGS.ToRow Ngrams where
145 toRow (Ngrams t s) = [toField t, toField s]
146
147 text2ngrams :: Text -> Ngrams
148 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
149
150 -------------------------------------------------------------------------
151 -- | TODO put it in Gargantext.Text.Ngrams
152 -- Named entity are typed ngrams of Terms Ngrams
153 data NgramsT a =
154 NgramsT { _ngramsType :: NgramsType
155 , _ngramsT :: a
156 } deriving (Generic, Show, Eq, Ord)
157
158 makeLenses ''NgramsT
159
160 instance Functor NgramsT where
161 fmap = over ngramsT
162 -----------------------------------------------------------------------
163 data NgramsIndexed =
164 NgramsIndexed
165 { _ngrams :: Ngrams
166 , _ngramsId :: NgramsId
167 } deriving (Show, Generic, Eq, Ord)
168
169 makeLenses ''NgramsIndexed
170 ------------------------------------------------------------------------
171 data NgramIds =
172 NgramIds
173 { ngramId :: Int
174 , ngramTerms :: Text
175 } deriving (Show, Generic, Eq, Ord)
176
177 instance PGS.FromRow NgramIds where
178 fromRow = NgramIds <$> field <*> field
179
180 ----------------------
181 withMap :: Map NgramsTerms NgramsId -> NgramsTerms -> NgramsId
182 withMap m n = maybe (panic "withMap: should not happen") identity (lookup n m)
183
184 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
185 indexNgramsT = fmap . indexNgramsWith . withMap
186
187 indexNgrams :: Map NgramsTerms NgramsId -> Ngrams -> NgramsIndexed
188 indexNgrams = indexNgramsWith . withMap
189
190 -- NP: not sure we need it anymore
191 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams -> NgramsT NgramsIndexed
192 indexNgramsTWith = fmap . indexNgramsWith
193
194 indexNgramsWith :: (NgramsTerms -> NgramsId) -> Ngrams -> NgramsIndexed
195 indexNgramsWith f n = NgramsIndexed n (f $ _ngramsTerms n)
196
197 -- TODO-ACCESS: access must not be checked here but when insertNgrams is called.
198 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
199 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
200
201 -- TODO-ACCESS: access must not be checked here but when insertNgrams' is called.
202 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
203 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
204 where
205 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
206
207 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
208 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
209 where
210 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
211
212 ----------------------
213 queryInsertNgrams :: PGS.Query
214 queryInsertNgrams = [sql|
215 WITH input_rows(terms,n) AS (?)
216 , ins AS (
217 INSERT INTO ngrams (terms,n)
218 SELECT * FROM input_rows
219 ON CONFLICT (terms) DO NOTHING -- unique index created here
220 RETURNING id,terms
221 )
222
223 SELECT id, terms
224 FROM ins
225 UNION ALL
226 SELECT c.id, terms
227 FROM input_rows
228 JOIN ngrams c USING (terms); -- columns of unique index
229 |]
230
231
232