]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
[WIP] bridgeness2 needs optim
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.NgramsPostag
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# OPTIONS_GHC -fno-warn-orphans #-}
15 {-# LANGUAGE Arrows #-}
16 {-# LANGUAGE FunctionalDependencies #-}
17 {-# LANGUAGE QuasiQuotes #-}
18 {-# LANGUAGE TemplateHaskell #-}
19
20 module Gargantext.Database.Schema.Ngrams
21 where
22
23 import Codec.Serialise (Serialise())
24 import Control.Lens (over)
25 import Control.Monad (mzero)
26 import Data.Aeson
27 import Data.Aeson.Types (toJSONKeyText)
28 import Data.HashMap.Strict (HashMap)
29 import Data.Hashable (Hashable)
30 import Data.Map (fromList, lookup)
31 import Data.Maybe (fromMaybe)
32 import Data.Text (Text, splitOn, pack, strip)
33 import Database.PostgreSQL.Simple.FromField (returnError, ResultError(..))
34 import Gargantext.Core (HasDBid(..))
35 import Gargantext.Core.Types (TODO(..), Typed(..))
36 import Gargantext.Database.Schema.Prelude
37 import Gargantext.Database.Types
38 import Gargantext.Prelude
39 import Servant (FromHttpApiData(..), Proxy(..), ToHttpApiData(..))
40 import Test.QuickCheck (elements)
41 import Text.Read (read)
42 import qualified Data.ByteString.Char8 as B
43 import qualified Data.HashMap.Strict as HashMap
44 import qualified Database.PostgreSQL.Simple as PGS
45
46
47 type NgramsId = Int
48 type Size = Int
49
50 data NgramsPoly id terms n = NgramsDB { _ngrams_id :: !id
51 , _ngrams_terms :: !terms
52 , _ngrams_n :: !n
53 } deriving (Show)
54
55 type NgramsWrite = NgramsPoly (Maybe (Column SqlInt4))
56 (Column SqlText)
57 (Column SqlInt4)
58
59 type NgramsRead = NgramsPoly (Column SqlInt4)
60 (Column SqlText)
61 (Column SqlInt4)
62
63 type NgramsReadNull = NgramsPoly (Column (Nullable SqlInt4))
64 (Column (Nullable SqlText))
65 (Column (Nullable SqlInt4))
66
67 type NgramsDB = NgramsPoly Int Text Int
68
69 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
70 makeLenses ''NgramsPoly
71
72
73 ngramsTable :: Table NgramsWrite NgramsRead
74 ngramsTable = Table "ngrams" (pNgramsDb NgramsDB { _ngrams_id = optionalTableField "id"
75 , _ngrams_terms = requiredTableField "terms"
76 , _ngrams_n = requiredTableField "n"
77 }
78 )
79
80 -- | Main Ngrams Types
81 -- | Typed Ngrams
82 -- Typed Ngrams localize the context of the ngrams
83 -- ngrams in source field of document has Sources Type
84 -- ngrams in authors field of document has Authors Type
85 -- ngrams in text fields of documents has Terms Type (i.e. either title or abstract)
86 data NgramsType = Authors | Institutes | Sources | NgramsTerms
87 deriving (Eq, Show, Read, Ord, Enum, Bounded, Generic)
88
89 instance Serialise NgramsType
90 instance FromJSON NgramsType
91 where
92 parseJSON (String "Authors") = pure Authors
93 parseJSON (String "Institutes") = pure Institutes
94 parseJSON (String "Sources") = pure Sources
95 parseJSON (String "Terms") = pure NgramsTerms
96 parseJSON _ = mzero
97
98 instance FromJSONKey NgramsType where
99 fromJSONKey = FromJSONKeyTextParser (parseJSON . String)
100
101 instance ToJSON NgramsType
102 where
103 toJSON Authors = String "Authors"
104 toJSON Institutes = String "Institutes"
105 toJSON Sources = String "Sources"
106 toJSON NgramsTerms = String "Terms"
107
108 instance ToJSONKey NgramsType where
109 toJSONKey = toJSONKeyText (pack . show)
110 instance FromHttpApiData NgramsType where
111 parseUrlPiece n = pure $ (read . cs) n
112 instance ToHttpApiData NgramsType where
113 toUrlPiece = pack . show
114 instance ToParamSchema NgramsType where
115 toParamSchema _ = toParamSchema (Proxy :: Proxy TODO)
116 instance Arbitrary NgramsType where
117 arbitrary = elements [ minBound .. maxBound ]
118
119 -- map NgramsType to its assigned id
120 instance FromField NgramsType where
121 fromField fld mdata =
122 case B.unpack `fmap` mdata of
123 Nothing -> returnError UnexpectedNull fld ""
124 Just dat -> do
125 n <- fromField fld mdata
126 if (n :: Int) > 0 then
127 case fromNgramsTypeId (NgramsTypeId n) of
128 Nothing -> returnError ConversionFailed fld dat
129 Just nt -> pure nt
130 else
131 returnError ConversionFailed fld dat
132 instance ToField NgramsType where
133 toField nt = toField $ ngramsTypeId nt
134
135
136 ngramsTypes :: [NgramsType]
137 ngramsTypes = [minBound..]
138
139 instance ToSchema NgramsType
140 {- where
141 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_nre_")
142 --}
143
144 newtype NgramsTypeId = NgramsTypeId Int
145 deriving (Eq, Show, Ord, Num)
146 instance ToField NgramsTypeId where
147 toField (NgramsTypeId n) = toField n
148 instance FromField NgramsTypeId where
149 fromField fld mdata = do
150 n <- fromField fld mdata
151 if (n :: Int) > 0 then return $ NgramsTypeId n
152 else mzero
153 instance DefaultFromField (Nullable SqlInt4) NgramsTypeId
154 where
155 defaultFromField = fromPGSFromField
156
157 pgNgramsType :: NgramsType -> Column SqlInt4
158 pgNgramsType = pgNgramsTypeId . ngramsTypeId
159
160 pgNgramsTypeId :: NgramsTypeId -> Column SqlInt4
161 pgNgramsTypeId (NgramsTypeId n) = sqlInt4 n
162
163 ngramsTypeId :: NgramsType -> NgramsTypeId
164 ngramsTypeId Authors = 1
165 ngramsTypeId Institutes = 2
166 ngramsTypeId Sources = 3
167 ngramsTypeId NgramsTerms = 4
168
169 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
170 fromNgramsTypeId id = lookup id
171 $ fromList [ (ngramsTypeId nt,nt)
172 | nt <- [minBound .. maxBound] :: [NgramsType]
173 ]
174
175 unNgramsTypeId :: NgramsTypeId -> Int
176 unNgramsTypeId (NgramsTypeId i) = i
177
178 toNgramsTypeId :: Int -> NgramsTypeId
179 toNgramsTypeId i = NgramsTypeId i
180
181 instance HasDBid NgramsType where
182 toDBid = unNgramsTypeId . ngramsTypeId
183 fromDBid = fromMaybe (panic "NgramsType id not indexed") . fromNgramsTypeId . toNgramsTypeId
184
185 ------------------------------------------------------------------------
186 ------------------------------------------------------------------------
187 -- | TODO put it in Gargantext.Core.Text.Ngrams
188 data Ngrams = UnsafeNgrams { _ngramsTerms :: Text
189 , _ngramsSize :: Int
190 }
191 deriving (Generic, Show, Eq, Ord)
192
193 instance Hashable Ngrams
194
195 makeLenses ''Ngrams
196 instance PGS.ToRow Ngrams where
197 toRow (UnsafeNgrams t s) = [toField t, toField s]
198
199 instance FromField Ngrams where
200 fromField fld mdata = do
201 x <- fromField fld mdata
202 pure $ text2ngrams x
203
204 instance PGS.ToRow Text where
205 toRow t = [toField t]
206
207 text2ngrams :: Text -> Ngrams
208 text2ngrams txt = UnsafeNgrams txt' $ length $ splitOn " " txt'
209 where
210 txt' = strip txt
211
212
213 ------------------------------------------------------------------------
214 -------------------------------------------------------------------------
215 -- | TODO put it in Gargantext.Core.Text.Ngrams
216 -- Named entity are typed ngrams of Terms Ngrams
217 data NgramsT a =
218 NgramsT { _ngramsType :: NgramsType
219 , _ngramsT :: a
220 } deriving (Generic, Show, Eq, Ord)
221
222 makeLenses ''NgramsT
223
224 instance Functor NgramsT where
225 fmap = over ngramsT
226
227 -----------------------------------------------------------------------
228 withMap :: HashMap Text NgramsId -> Text -> NgramsId
229 withMap m n = maybe (panic $ "[G.D.S.Ngrams.withMap] Should not happen" <> (cs $ show n))
230 identity (HashMap.lookup n m)
231
232 indexNgramsT :: HashMap Text NgramsId -> NgramsT Ngrams -> NgramsT (Indexed Int Ngrams)
233 indexNgramsT = fmap . indexNgramsWith . withMap
234
235 -- | TODO replace NgramsT whith Typed NgramsType Ngrams
236 indexTypedNgrams :: HashMap Text NgramsId
237 -> Typed NgramsType Ngrams
238 -> Typed NgramsType (Indexed Int Ngrams)
239 indexTypedNgrams = fmap . indexNgramsWith . withMap
240
241 indexNgrams :: HashMap Text NgramsId -> Ngrams -> Indexed Int Ngrams
242 indexNgrams = indexNgramsWith . withMap
243
244 indexNgramsWith :: (Text -> NgramsId) -> Ngrams -> Indexed Int Ngrams
245 indexNgramsWith f n = Indexed (f $ _ngramsTerms n) n