]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
[FIX] Clean Text before sending it to NLP micro services + tests + clean code for...
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.NgramsPostag
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# OPTIONS_GHC -fno-warn-orphans #-}
15 {-# LANGUAGE Arrows #-}
16 {-# LANGUAGE FunctionalDependencies #-}
17 {-# LANGUAGE QuasiQuotes #-}
18 {-# LANGUAGE TemplateHaskell #-}
19
20 module Gargantext.Database.Schema.Ngrams
21 where
22
23 import Codec.Serialise (Serialise())
24 import Control.Lens (over)
25 import Control.Monad (mzero)
26 import Data.Aeson
27 import Data.Aeson.Types (toJSONKeyText)
28 import Data.HashMap.Strict (HashMap)
29 import Data.Hashable (Hashable)
30 import Data.Map.Strict (fromList, lookup)
31 import Data.Maybe (fromMaybe)
32 import Data.Text (Text, splitOn, pack, strip)
33 import Database.PostgreSQL.Simple.FromField (returnError, ResultError(..))
34 import Gargantext.Core (HasDBid(..))
35 import Gargantext.Core.Types (TODO(..), Typed(..))
36 import Gargantext.Database.Schema.Prelude hiding (over)
37 import Gargantext.Database.Types
38 import Gargantext.Prelude
39 import Servant (FromHttpApiData(..), Proxy(..), ToHttpApiData(..))
40 import Test.QuickCheck (elements)
41 import Text.Read (read)
42 import qualified Data.ByteString.Char8 as B
43 import qualified Data.HashMap.Strict as HashMap
44 import qualified Database.PostgreSQL.Simple as PGS
45
46
47 type NgramsId = Int
48 type Size = Int
49
50 data NgramsPoly id terms n = NgramsDB { _ngrams_id :: !id
51 , _ngrams_terms :: !terms
52 , _ngrams_n :: !n
53 } deriving (Show)
54
55 type NgramsWrite = NgramsPoly (Maybe (Field SqlInt4))
56 (Field SqlText)
57 (Field SqlInt4)
58
59 type NgramsRead = NgramsPoly (Field SqlInt4)
60 (Field SqlText)
61 (Field SqlInt4)
62
63 type NgramsDB = NgramsPoly Int Text Int
64
65 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
66 makeLenses ''NgramsPoly
67
68
69 ngramsTable :: Table NgramsWrite NgramsRead
70 ngramsTable = Table "ngrams" (pNgramsDb NgramsDB { _ngrams_id = optionalTableField "id"
71 , _ngrams_terms = requiredTableField "terms"
72 , _ngrams_n = requiredTableField "n"
73 }
74 )
75
76 -- | Main Ngrams Types
77 -- | Typed Ngrams
78 -- Typed Ngrams localize the context of the ngrams
79 -- ngrams in source field of document has Sources Type
80 -- ngrams in authors field of document has Authors Type
81 -- ngrams in text fields of documents has Terms Type (i.e. either title or abstract)
82 data NgramsType = Authors | Institutes | Sources | NgramsTerms
83 deriving (Eq, Show, Read, Ord, Enum, Bounded, Generic)
84
85 instance Serialise NgramsType
86 instance FromJSON NgramsType
87 where
88 parseJSON (String "Authors") = pure Authors
89 parseJSON (String "Institutes") = pure Institutes
90 parseJSON (String "Sources") = pure Sources
91 parseJSON (String "Terms") = pure NgramsTerms
92 parseJSON (String "NgramsTerms") = pure NgramsTerms
93 parseJSON _ = mzero
94
95 instance FromJSONKey NgramsType where
96 fromJSONKey = FromJSONKeyTextParser (parseJSON . String)
97
98 instance ToJSON NgramsType
99 where
100 toJSON Authors = String "Authors"
101 toJSON Institutes = String "Institutes"
102 toJSON Sources = String "Sources"
103 toJSON NgramsTerms = String "Terms"
104
105 instance ToJSONKey NgramsType where
106 toJSONKey = toJSONKeyText (pack . show)
107 instance FromHttpApiData NgramsType where
108 parseUrlPiece n = pure $ (read . cs) n
109 instance ToHttpApiData NgramsType where
110 toUrlPiece = pack . show
111 instance ToParamSchema NgramsType where
112 toParamSchema _ = toParamSchema (Proxy :: Proxy TODO)
113 instance Arbitrary NgramsType where
114 arbitrary = elements [ minBound .. maxBound ]
115
116 -- map NgramsType to its assigned id
117 instance FromField NgramsType where
118 fromField fld mdata =
119 case B.unpack `fmap` mdata of
120 Nothing -> returnError UnexpectedNull fld ""
121 Just dat -> do
122 n <- fromField fld mdata
123 if (n :: Int) > 0 then
124 case fromNgramsTypeId (NgramsTypeId n) of
125 Nothing -> returnError ConversionFailed fld dat
126 Just nt -> pure nt
127 else
128 returnError ConversionFailed fld dat
129 instance ToField NgramsType where
130 toField nt = toField $ ngramsTypeId nt
131
132
133 ngramsTypes :: [NgramsType]
134 ngramsTypes = [minBound..]
135
136 instance ToSchema NgramsType
137 {- where
138 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_nre_")
139 --}
140
141 newtype NgramsTypeId = NgramsTypeId Int
142 deriving (Eq, Show, Ord, Num)
143 instance ToField NgramsTypeId where
144 toField (NgramsTypeId n) = toField n
145 instance FromField NgramsTypeId where
146 fromField fld mdata = do
147 n <- fromField fld mdata
148 if (n :: Int) > 0 then return $ NgramsTypeId n
149 else mzero
150 instance DefaultFromField (Nullable SqlInt4) NgramsTypeId
151 where
152 defaultFromField = fromPGSFromField
153
154 pgNgramsType :: NgramsType -> Field SqlInt4
155 pgNgramsType = pgNgramsTypeId . ngramsTypeId
156
157 pgNgramsTypeId :: NgramsTypeId -> Field SqlInt4
158 pgNgramsTypeId (NgramsTypeId n) = sqlInt4 n
159
160 ngramsTypeId :: NgramsType -> NgramsTypeId
161 ngramsTypeId Authors = 1
162 ngramsTypeId Institutes = 2
163 ngramsTypeId Sources = 3
164 ngramsTypeId NgramsTerms = 4
165
166 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
167 fromNgramsTypeId id = lookup id
168 $ fromList [ (ngramsTypeId nt,nt)
169 | nt <- [minBound .. maxBound] :: [NgramsType]
170 ]
171
172 unNgramsTypeId :: NgramsTypeId -> Int
173 unNgramsTypeId (NgramsTypeId i) = i
174
175 toNgramsTypeId :: Int -> NgramsTypeId
176 toNgramsTypeId i = NgramsTypeId i
177
178 instance HasDBid NgramsType where
179 toDBid = unNgramsTypeId . ngramsTypeId
180 fromDBid = fromMaybe (panic "NgramsType id not indexed") . fromNgramsTypeId . toNgramsTypeId
181
182 ------------------------------------------------------------------------
183 ------------------------------------------------------------------------
184 -- | TODO put it in Gargantext.Core.Text.Ngrams
185 data Ngrams = UnsafeNgrams { _ngramsTerms :: Text
186 , _ngramsSize :: Int
187 }
188 deriving (Generic, Show, Eq, Ord)
189
190 instance Hashable Ngrams
191
192 makeLenses ''Ngrams
193 instance PGS.ToRow Ngrams where
194 toRow (UnsafeNgrams t s) = [toField t, toField s]
195
196 instance FromField Ngrams where
197 fromField fld mdata = do
198 x <- fromField fld mdata
199 pure $ text2ngrams x
200
201 instance PGS.ToRow Text where
202 toRow t = [toField t]
203
204 text2ngrams :: Text -> Ngrams
205 text2ngrams txt = UnsafeNgrams txt' $ length $ splitOn " " txt'
206 where
207 txt' = strip txt
208
209
210 ------------------------------------------------------------------------
211 -------------------------------------------------------------------------
212 -- | TODO put it in Gargantext.Core.Text.Ngrams
213 -- Named entity are typed ngrams of Terms Ngrams
214 data NgramsT a =
215 NgramsT { _ngramsType :: NgramsType
216 , _ngramsT :: a
217 } deriving (Generic, Show, Eq, Ord)
218
219 makeLenses ''NgramsT
220
221 instance Functor NgramsT where
222 fmap = over ngramsT
223
224 -----------------------------------------------------------------------
225 withMap :: HashMap Text NgramsId -> Text -> NgramsId
226 withMap m n = maybe (panic $ "[G.D.S.Ngrams.withMap] Should not happen" <> (cs $ show n))
227 identity (HashMap.lookup n m)
228
229 indexNgramsT :: HashMap Text NgramsId -> NgramsT Ngrams -> NgramsT (Indexed Int Ngrams)
230 indexNgramsT = fmap . indexNgramsWith . withMap
231
232 -- | TODO replace NgramsT whith Typed NgramsType Ngrams
233 indexTypedNgrams :: HashMap Text NgramsId
234 -> Typed NgramsType Ngrams
235 -> Typed NgramsType (Indexed Int Ngrams)
236 indexTypedNgrams = fmap . indexNgramsWith . withMap
237
238 indexNgrams :: HashMap Text NgramsId -> Ngrams -> Indexed Int Ngrams
239 indexNgrams = indexNgramsWith . withMap
240
241 indexNgramsWith :: (Text -> NgramsId) -> Ngrams -> Indexed Int Ngrams
242 indexNgramsWith f n = Indexed (f $ _ngramsTerms n) n