]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
Update README.md
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.NgramsPostag
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# OPTIONS_GHC -fno-warn-orphans #-}
15 {-# LANGUAGE Arrows #-}
16 {-# LANGUAGE FunctionalDependencies #-}
17 {-# LANGUAGE QuasiQuotes #-}
18 {-# LANGUAGE TemplateHaskell #-}
19
20 module Gargantext.Database.Schema.Ngrams
21 where
22
23 import Codec.Serialise (Serialise())
24 import Control.Lens (over)
25 import Control.Monad (mzero)
26 import Data.Aeson
27 import Data.Aeson.Types (toJSONKeyText)
28 import Data.HashMap.Strict (HashMap)
29 import Data.Hashable (Hashable)
30 import Data.Map.Strict (fromList, lookup)
31 import Data.Maybe (fromMaybe)
32 import Data.Text (Text, splitOn, pack, strip)
33 import Database.PostgreSQL.Simple.FromField (returnError, ResultError(..))
34 import Gargantext.Core (HasDBid(..))
35 import Gargantext.Core.Types (TODO(..), Typed(..))
36 import Gargantext.Database.Schema.Prelude
37 import Gargantext.Database.Types
38 import Gargantext.Prelude
39 import Servant (FromHttpApiData(..), Proxy(..), ToHttpApiData(..))
40 import Test.QuickCheck (elements)
41 import Text.Read (read)
42 import qualified Data.ByteString.Char8 as B
43 import qualified Data.HashMap.Strict as HashMap
44 import qualified Database.PostgreSQL.Simple as PGS
45
46
47 type NgramsId = Int
48 type Size = Int
49
50 data NgramsPoly id terms n = NgramsDB { _ngrams_id :: !id
51 , _ngrams_terms :: !terms
52 , _ngrams_n :: !n
53 } deriving (Show)
54
55 type NgramsWrite = NgramsPoly (Maybe (Column SqlInt4))
56 (Column SqlText)
57 (Column SqlInt4)
58
59 type NgramsRead = NgramsPoly (Column SqlInt4)
60 (Column SqlText)
61 (Column SqlInt4)
62
63 type NgramsReadNull = NgramsPoly (Column (Nullable SqlInt4))
64 (Column (Nullable SqlText))
65 (Column (Nullable SqlInt4))
66
67 type NgramsDB = NgramsPoly Int Text Int
68
69 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
70 makeLenses ''NgramsPoly
71
72
73 ngramsTable :: Table NgramsWrite NgramsRead
74 ngramsTable = Table "ngrams" (pNgramsDb NgramsDB { _ngrams_id = optionalTableField "id"
75 , _ngrams_terms = requiredTableField "terms"
76 , _ngrams_n = requiredTableField "n"
77 }
78 )
79
80 -- | Main Ngrams Types
81 -- | Typed Ngrams
82 -- Typed Ngrams localize the context of the ngrams
83 -- ngrams in source field of document has Sources Type
84 -- ngrams in authors field of document has Authors Type
85 -- ngrams in text fields of documents has Terms Type (i.e. either title or abstract)
86 data NgramsType = Authors | Institutes | Sources | NgramsTerms
87 deriving (Eq, Show, Read, Ord, Enum, Bounded, Generic)
88
89 instance Serialise NgramsType
90 instance FromJSON NgramsType
91 where
92 parseJSON (String "Authors") = pure Authors
93 parseJSON (String "Institutes") = pure Institutes
94 parseJSON (String "Sources") = pure Sources
95 parseJSON (String "Terms") = pure NgramsTerms
96 parseJSON (String "NgramsTerms") = pure NgramsTerms
97 parseJSON _ = mzero
98
99 instance FromJSONKey NgramsType where
100 fromJSONKey = FromJSONKeyTextParser (parseJSON . String)
101
102 instance ToJSON NgramsType
103 where
104 toJSON Authors = String "Authors"
105 toJSON Institutes = String "Institutes"
106 toJSON Sources = String "Sources"
107 toJSON NgramsTerms = String "Terms"
108
109 instance ToJSONKey NgramsType where
110 toJSONKey = toJSONKeyText (pack . show)
111 instance FromHttpApiData NgramsType where
112 parseUrlPiece n = pure $ (read . cs) n
113 instance ToHttpApiData NgramsType where
114 toUrlPiece = pack . show
115 instance ToParamSchema NgramsType where
116 toParamSchema _ = toParamSchema (Proxy :: Proxy TODO)
117 instance Arbitrary NgramsType where
118 arbitrary = elements [ minBound .. maxBound ]
119
120 -- map NgramsType to its assigned id
121 instance FromField NgramsType where
122 fromField fld mdata =
123 case B.unpack `fmap` mdata of
124 Nothing -> returnError UnexpectedNull fld ""
125 Just dat -> do
126 n <- fromField fld mdata
127 if (n :: Int) > 0 then
128 case fromNgramsTypeId (NgramsTypeId n) of
129 Nothing -> returnError ConversionFailed fld dat
130 Just nt -> pure nt
131 else
132 returnError ConversionFailed fld dat
133 instance ToField NgramsType where
134 toField nt = toField $ ngramsTypeId nt
135
136
137 ngramsTypes :: [NgramsType]
138 ngramsTypes = [minBound..]
139
140 instance ToSchema NgramsType
141 {- where
142 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_nre_")
143 --}
144
145 newtype NgramsTypeId = NgramsTypeId Int
146 deriving (Eq, Show, Ord, Num)
147 instance ToField NgramsTypeId where
148 toField (NgramsTypeId n) = toField n
149 instance FromField NgramsTypeId where
150 fromField fld mdata = do
151 n <- fromField fld mdata
152 if (n :: Int) > 0 then return $ NgramsTypeId n
153 else mzero
154 instance DefaultFromField (Nullable SqlInt4) NgramsTypeId
155 where
156 defaultFromField = fromPGSFromField
157
158 pgNgramsType :: NgramsType -> Column SqlInt4
159 pgNgramsType = pgNgramsTypeId . ngramsTypeId
160
161 pgNgramsTypeId :: NgramsTypeId -> Column SqlInt4
162 pgNgramsTypeId (NgramsTypeId n) = sqlInt4 n
163
164 ngramsTypeId :: NgramsType -> NgramsTypeId
165 ngramsTypeId Authors = 1
166 ngramsTypeId Institutes = 2
167 ngramsTypeId Sources = 3
168 ngramsTypeId NgramsTerms = 4
169
170 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
171 fromNgramsTypeId id = lookup id
172 $ fromList [ (ngramsTypeId nt,nt)
173 | nt <- [minBound .. maxBound] :: [NgramsType]
174 ]
175
176 unNgramsTypeId :: NgramsTypeId -> Int
177 unNgramsTypeId (NgramsTypeId i) = i
178
179 toNgramsTypeId :: Int -> NgramsTypeId
180 toNgramsTypeId i = NgramsTypeId i
181
182 instance HasDBid NgramsType where
183 toDBid = unNgramsTypeId . ngramsTypeId
184 fromDBid = fromMaybe (panic "NgramsType id not indexed") . fromNgramsTypeId . toNgramsTypeId
185
186 ------------------------------------------------------------------------
187 ------------------------------------------------------------------------
188 -- | TODO put it in Gargantext.Core.Text.Ngrams
189 data Ngrams = UnsafeNgrams { _ngramsTerms :: Text
190 , _ngramsSize :: Int
191 }
192 deriving (Generic, Show, Eq, Ord)
193
194 instance Hashable Ngrams
195
196 makeLenses ''Ngrams
197 instance PGS.ToRow Ngrams where
198 toRow (UnsafeNgrams t s) = [toField t, toField s]
199
200 instance FromField Ngrams where
201 fromField fld mdata = do
202 x <- fromField fld mdata
203 pure $ text2ngrams x
204
205 instance PGS.ToRow Text where
206 toRow t = [toField t]
207
208 text2ngrams :: Text -> Ngrams
209 text2ngrams txt = UnsafeNgrams txt' $ length $ splitOn " " txt'
210 where
211 txt' = strip txt
212
213
214 ------------------------------------------------------------------------
215 -------------------------------------------------------------------------
216 -- | TODO put it in Gargantext.Core.Text.Ngrams
217 -- Named entity are typed ngrams of Terms Ngrams
218 data NgramsT a =
219 NgramsT { _ngramsType :: NgramsType
220 , _ngramsT :: a
221 } deriving (Generic, Show, Eq, Ord)
222
223 makeLenses ''NgramsT
224
225 instance Functor NgramsT where
226 fmap = over ngramsT
227
228 -----------------------------------------------------------------------
229 withMap :: HashMap Text NgramsId -> Text -> NgramsId
230 withMap m n = maybe (panic $ "[G.D.S.Ngrams.withMap] Should not happen" <> (cs $ show n))
231 identity (HashMap.lookup n m)
232
233 indexNgramsT :: HashMap Text NgramsId -> NgramsT Ngrams -> NgramsT (Indexed Int Ngrams)
234 indexNgramsT = fmap . indexNgramsWith . withMap
235
236 -- | TODO replace NgramsT whith Typed NgramsType Ngrams
237 indexTypedNgrams :: HashMap Text NgramsId
238 -> Typed NgramsType Ngrams
239 -> Typed NgramsType (Indexed Int Ngrams)
240 indexTypedNgrams = fmap . indexNgramsWith . withMap
241
242 indexNgrams :: HashMap Text NgramsId -> Ngrams -> Indexed Int Ngrams
243 indexNgrams = indexNgramsWith . withMap
244
245 indexNgramsWith :: (Text -> NgramsId) -> Ngrams -> Indexed Int Ngrams
246 indexNgramsWith f n = Indexed (f $ _ngramsTerms n) n