]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
Improve text2ngrams to strip spaces
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.Ngrams
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE FunctionalDependencies #-}
16 {-# LANGUAGE QuasiQuotes #-}
17 {-# LANGUAGE TemplateHaskell #-}
18
19 module Gargantext.Database.Schema.Ngrams
20 where
21
22 import Codec.Serialise (Serialise())
23 import Control.Lens (over)
24 import Control.Monad (mzero)
25 import Data.Aeson
26 import Data.Aeson.Types (toJSONKeyText)
27 import Data.Map (Map, fromList, lookup)
28 import Data.Text (Text, splitOn, pack, strip)
29 import Gargantext.Core.Types (TODO(..))
30 import Gargantext.Prelude
31 import Prelude (Functor)
32 import Servant (FromHttpApiData, parseUrlPiece, Proxy(..))
33 import Text.Read (read)
34 import Gargantext.Database.Schema.Prelude
35 import qualified Database.PostgreSQL.Simple as PGS
36
37
38 type NgramsId = Int
39 type NgramsTerms = Text
40 type Size = Int
41
42 data NgramsPoly id terms n = NgramsDB { _ngrams_id :: !id
43 , _ngrams_terms :: !terms
44 , _ngrams_n :: !n
45 } deriving (Show)
46
47 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
48 (Column PGText)
49 (Column PGInt4)
50
51 type NgramsRead = NgramsPoly (Column PGInt4)
52 (Column PGText)
53 (Column PGInt4)
54
55 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
56 (Column (Nullable PGText))
57 (Column (Nullable PGInt4))
58
59 type NgramsDB = NgramsPoly Int Text Int
60
61 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
62 makeLenses ''NgramsPoly
63
64
65 ngramsTable :: Table NgramsWrite NgramsRead
66 ngramsTable = Table "ngrams" (pNgramsDb NgramsDB { _ngrams_id = optional "id"
67 , _ngrams_terms = required "terms"
68 , _ngrams_n = required "n"
69 }
70 )
71
72
73
74 -- | Main Ngrams Types
75 -- | Typed Ngrams
76 -- Typed Ngrams localize the context of the ngrams
77 -- ngrams in source field of document has Sources Type
78 -- ngrams in authors field of document has Authors Type
79 -- ngrams in text (title or abstract) of documents has Terms Type
80 data NgramsType = Authors | Institutes | Sources | NgramsTerms
81 deriving (Eq, Show, Read, Ord, Enum, Bounded, Generic)
82
83 instance Serialise NgramsType
84
85 ngramsTypes :: [NgramsType]
86 ngramsTypes = [minBound..]
87
88 instance ToSchema NgramsType
89 {- where
90 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_nre_")
91 --}
92
93 newtype NgramsTypeId = NgramsTypeId Int
94 deriving (Eq, Show, Ord, Num)
95
96 instance ToField NgramsTypeId where
97 toField (NgramsTypeId n) = toField n
98
99 instance FromField NgramsTypeId where
100 fromField fld mdata = do
101 n <- fromField fld mdata
102 if (n :: Int) > 0 then return $ NgramsTypeId n
103 else mzero
104
105 instance FromJSON NgramsType
106 instance FromJSONKey NgramsType where
107 fromJSONKey = FromJSONKeyTextParser (parseJSON . String)
108 instance ToJSON NgramsType
109 instance ToJSONKey NgramsType where
110 toJSONKey = toJSONKeyText (pack . show)
111
112 instance FromHttpApiData NgramsType where
113 parseUrlPiece n = pure $ (read . cs) n
114
115 instance ToParamSchema NgramsType where
116 toParamSchema _ = toParamSchema (Proxy :: Proxy TODO)
117
118
119 instance QueryRunnerColumnDefault (Nullable PGInt4) NgramsTypeId
120 where
121 queryRunnerColumnDefault = fieldQueryRunnerColumn
122
123 pgNgramsType :: NgramsType -> Column PGInt4
124 pgNgramsType = pgNgramsTypeId . ngramsTypeId
125
126 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
127 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
128
129 ngramsTypeId :: NgramsType -> NgramsTypeId
130 ngramsTypeId Authors = 1
131 ngramsTypeId Institutes = 2
132 ngramsTypeId Sources = 3
133 ngramsTypeId NgramsTerms = 4
134
135 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
136 fromNgramsTypeId id = lookup id
137 $ fromList [ (ngramsTypeId nt,nt)
138 | nt <- [minBound .. maxBound] :: [NgramsType]
139 ]
140
141 ------------------------------------------------------------------------
142 -- | TODO put it in Gargantext.Core.Text.Ngrams
143 data Ngrams = UnsafeNgrams { _ngramsTerms :: Text
144 , _ngramsSize :: Int
145 } deriving (Generic, Show, Eq, Ord)
146
147 makeLenses ''Ngrams
148 instance PGS.ToRow Ngrams where
149 toRow (UnsafeNgrams t s) = [toField t, toField s]
150
151 text2ngrams :: Text -> Ngrams
152 text2ngrams txt = UnsafeNgrams txt' $ length $ splitOn " " txt'
153 where
154 txt' = strip txt
155
156 -------------------------------------------------------------------------
157 -- | TODO put it in Gargantext.Core.Text.Ngrams
158 -- Named entity are typed ngrams of Terms Ngrams
159 data NgramsT a =
160 NgramsT { _ngramsType :: NgramsType
161 , _ngramsT :: a
162 } deriving (Generic, Show, Eq, Ord)
163
164 makeLenses ''NgramsT
165
166 instance Functor NgramsT where
167 fmap = over ngramsT
168 -----------------------------------------------------------------------
169 data NgramsIndexed =
170 NgramsIndexed
171 { _ngrams :: Ngrams
172 , _ngramsId :: NgramsId
173 } deriving (Show, Generic, Eq, Ord)
174
175 makeLenses ''NgramsIndexed
176 ------------------------------------------------------------------------
177 data NgramIds =
178 NgramIds
179 { ngramId :: Int
180 , ngramTerms :: Text
181 } deriving (Show, Generic, Eq, Ord)
182
183 instance PGS.FromRow NgramIds where
184 fromRow = NgramIds <$> field <*> field
185
186 ----------------------
187 withMap :: Map NgramsTerms NgramsId -> NgramsTerms -> NgramsId
188 withMap m n = maybe (panic "withMap: should not happen") identity (lookup n m)
189
190 indexNgramsT :: Map NgramsTerms NgramsId -> NgramsT Ngrams -> NgramsT NgramsIndexed
191 indexNgramsT = fmap . indexNgramsWith . withMap
192
193 indexNgrams :: Map NgramsTerms NgramsId -> Ngrams -> NgramsIndexed
194 indexNgrams = indexNgramsWith . withMap
195
196 -- NP: not sure we need it anymore
197 indexNgramsTWith :: (NgramsTerms -> NgramsId) -> NgramsT Ngrams -> NgramsT NgramsIndexed
198 indexNgramsTWith = fmap . indexNgramsWith
199
200 indexNgramsWith :: (NgramsTerms -> NgramsId) -> Ngrams -> NgramsIndexed
201 indexNgramsWith f n = NgramsIndexed n (f $ _ngramsTerms n)
202
203
204
205