]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
[FEAT] Ngrams extraction unsupervized (doc, type, function).
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE DeriveGeneric #-}
16 {-# LANGUAGE FlexibleInstances #-}
17 {-# LANGUAGE FunctionalDependencies #-}
18 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
19 {-# LANGUAGE MultiParamTypeClasses #-}
20 {-# LANGUAGE NoImplicitPrelude #-}
21 {-# LANGUAGE OverloadedStrings #-}
22 {-# LANGUAGE QuasiQuotes #-}
23 {-# LANGUAGE RankNTypes #-}
24 {-# LANGUAGE TemplateHaskell #-}
25
26 module Gargantext.Database.Schema.Ngrams where
27
28 import Control.Lens (makeLenses, over)
29 import Control.Monad (mzero)
30 import Data.Aeson
31 import Data.Aeson.Types (toJSONKeyText)
32 import Data.ByteString.Internal (ByteString)
33 import Data.Map (Map, fromList, lookup)
34 import Data.Profunctor.Product.TH (makeAdaptorAndInstance)
35 import Data.Text (Text, splitOn, pack)
36 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
37 import Database.PostgreSQL.Simple.SqlQQ (sql)
38 import Database.PostgreSQL.Simple.ToField (toField, ToField)
39 import Database.PostgreSQL.Simple.FromField (FromField, fromField)
40 import Database.PostgreSQL.Simple.ToRow (toRow)
41 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
42 import GHC.Generics (Generic)
43 import Gargantext.Database.Utils (Cmd, runPGSQuery, runOpaQuery, formatPGSQuery)
44 import Gargantext.Prelude
45 import Opaleye hiding (FromField)
46 import Prelude (Enum, Bounded, minBound, maxBound, Functor)
47 import qualified Database.PostgreSQL.Simple as PGS
48
49
50 type NgramsId = Int
51 type NgramsTerms = Text
52 type Size = Int
53
54 data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
55 , ngrams_terms :: terms
56 , ngrams_n :: n
57 } deriving (Show)
58
59 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
60 (Column PGText)
61 (Column PGInt4)
62
63 type NgramsRead = NgramsPoly (Column PGInt4)
64 (Column PGText)
65 (Column PGInt4)
66
67 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
68 (Column (Nullable PGText))
69 (Column (Nullable PGInt4))
70
71 type NgramsDb = NgramsPoly Int Text Int
72
73 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
74 -- $(makeLensesWith abbreviatedFields ''NgramsPoly)
75
76 ngramsTable :: Table NgramsWrite NgramsRead
77 ngramsTable = Table "ngrams" (pNgramsDb NgramsDb { ngrams_id = optional "id"
78 , ngrams_terms = required "terms"
79 , ngrams_n = required "n"
80 }
81 )
82
83 queryNgramsTable :: Query NgramsRead
84 queryNgramsTable = queryTable ngramsTable
85
86 dbGetNgramsDb :: Cmd err [NgramsDb]
87 dbGetNgramsDb = runOpaQuery queryNgramsTable
88
89 -- | Main Ngrams Types
90 -- | Typed Ngrams
91 -- Typed Ngrams localize the context of the ngrams
92 -- ngrams in source field of document has Sources Type
93 -- ngrams in authors field of document has Authors Type
94 -- ngrams in text (title or abstract) of documents has Terms Type
95 data NgramsType = Authors | Institutes | Sources | NgramsTerms
96 deriving (Eq, Show, Ord, Enum, Bounded, Generic)
97
98 instance FromJSON NgramsType
99 instance FromJSONKey NgramsType where
100 fromJSONKey = FromJSONKeyTextParser (parseJSON . String)
101 instance ToJSON NgramsType
102 instance ToJSONKey NgramsType where
103 toJSONKey = toJSONKeyText (pack . show)
104
105 newtype NgramsTypeId = NgramsTypeId Int
106 deriving (Eq, Show, Ord, Num)
107
108 instance ToField NgramsTypeId where
109 toField (NgramsTypeId n) = toField n
110
111 instance FromField NgramsTypeId where
112 fromField fld mdata = do
113 n <- fromField fld mdata
114 if (n :: Int) > 0 then return $ NgramsTypeId n
115 else mzero
116
117 instance QueryRunnerColumnDefault (Nullable PGInt4) NgramsTypeId
118 where
119 queryRunnerColumnDefault = fieldQueryRunnerColumn
120
121 pgNgramsType :: NgramsType -> Column PGInt4
122 pgNgramsType = pgNgramsTypeId . ngramsTypeId
123
124 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
125 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
126
127 ngramsTypeId :: NgramsType -> NgramsTypeId
128 ngramsTypeId Authors = 1
129 ngramsTypeId Institutes = 2
130 ngramsTypeId Sources = 3
131 ngramsTypeId NgramsTerms = 4
132
133 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
134 fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
135
136 ------------------------------------------------------------------------
137 -- | TODO put it in Gargantext.Text.Ngrams
138 data Ngrams = Ngrams { _ngramsTerms :: Text
139 , _ngramsSize :: Int
140 } deriving (Generic, Show, Eq, Ord)
141
142 makeLenses ''Ngrams
143 instance PGS.ToRow Ngrams where
144 toRow (Ngrams t s) = [toField t, toField s]
145
146 text2ngrams :: Text -> Ngrams
147 text2ngrams txt = Ngrams txt $ length $ splitOn " " txt
148
149 -------------------------------------------------------------------------
150 -- | TODO put it in Gargantext.Text.Ngrams
151 -- Named entity are typed ngrams of Terms Ngrams
152 data NgramsT a =
153 NgramsT { _ngramsType :: NgramsType
154 , _ngramsT :: a
155 } deriving (Generic, Show, Eq, Ord)
156
157 makeLenses ''NgramsT
158
159 instance Functor NgramsT where
160 fmap = over ngramsT
161 -----------------------------------------------------------------------
162 data NgramsIndexed =
163 NgramsIndexed
164 { _ngrams :: Ngrams
165 , _ngramsId :: NgramsId
166 } deriving (Show, Generic, Eq, Ord)
167
168 makeLenses ''NgramsIndexed
169 ------------------------------------------------------------------------
170 data NgramIds =
171 NgramIds
172 { ngramId :: Int
173 , ngramTerms :: Text
174 } deriving (Show, Generic, Eq, Ord)
175
176 instance PGS.FromRow NgramIds where
177 fromRow = NgramIds <$> field <*> field
178
179 ----------------------
180 withMap :: Map NgramsTerms NgramsId -> NgramsTerms -> NgramsId
181 withMap m n = maybe (panic "withMap: should not happen") identity (lookup n m)
182
183 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
184 indexNgramsT = fmap . indexNgramsWith . withMap
185
186 indexNgrams :: Map NgramsTerms NgramsId -> Ngrams -> NgramsIndexed
187 indexNgrams = indexNgramsWith . withMap
188
189 -- NP: not sure we need it anymore
190 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams -> NgramsT NgramsIndexed
191 indexNgramsTWith = fmap . indexNgramsWith
192
193 indexNgramsWith :: (NgramsTerms -> NgramsId) -> Ngrams -> NgramsIndexed
194 indexNgramsWith f n = NgramsIndexed n (f $ _ngramsTerms n)
195
196 -- TODO-ACCESS: access must not be checked here but when insertNgrams is called.
197 insertNgrams :: [Ngrams] -> Cmd err (Map NgramsTerms NgramsId)
198 insertNgrams ns = fromList <$> map (\(NgramIds i t) -> (t, i)) <$> (insertNgrams' ns)
199
200 -- TODO-ACCESS: access must not be checked here but when insertNgrams' is called.
201 insertNgrams' :: [Ngrams] -> Cmd err [NgramIds]
202 insertNgrams' ns = runPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
203 where
204 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
205
206 insertNgrams_Debug :: [(NgramsTerms, Size)] -> Cmd err ByteString
207 insertNgrams_Debug ns = formatPGSQuery queryInsertNgrams (PGS.Only $ Values fields ns)
208 where
209 fields = map (\t -> QualifiedIdentifier Nothing t) ["text", "int4"]
210
211 ----------------------
212 queryInsertNgrams :: PGS.Query
213 queryInsertNgrams = [sql|
214 WITH input_rows(terms,n) AS (?)
215 , ins AS (
216 INSERT INTO ngrams (terms,n)
217 SELECT * FROM input_rows
218 ON CONFLICT (terms) DO NOTHING -- unique index created here
219 RETURNING id,terms
220 )
221
222 SELECT id, terms
223 FROM ins
224 UNION ALL
225 SELECT c.id, terms
226 FROM input_rows
227 JOIN ngrams c USING (terms); -- columns of unique index
228 |]
229
230
231