]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
Merge branch 'dev' into dev-corpora-from-write-nodes
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.NgramsPostag
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE FunctionalDependencies #-}
16 {-# LANGUAGE QuasiQuotes #-}
17 {-# LANGUAGE TemplateHaskell #-}
18
19 module Gargantext.Database.Schema.Ngrams
20 where
21
22 import Data.HashMap.Strict (HashMap)
23 import Data.Hashable (Hashable)
24 import Codec.Serialise (Serialise())
25 import Control.Lens (over)
26 import Control.Monad (mzero)
27 import Data.Aeson
28 import Data.Aeson.Types (toJSONKeyText)
29 import Data.Map (fromList, lookup)
30 import Data.Text (Text, splitOn, pack, strip)
31 import Gargantext.Core.Types (TODO(..), Typed(..))
32 import Gargantext.Prelude
33 import Servant (FromHttpApiData, parseUrlPiece, Proxy(..))
34 import Text.Read (read)
35 import Gargantext.Database.Types
36 import Gargantext.Database.Schema.Prelude
37 import qualified Database.PostgreSQL.Simple as PGS
38 import qualified Data.HashMap.Strict as HashMap
39
40
41 type NgramsId = Int
42 type Size = Int
43
44 data NgramsPoly id terms n = NgramsDB { _ngrams_id :: !id
45 , _ngrams_terms :: !terms
46 , _ngrams_n :: !n
47 } deriving (Show)
48
49 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
50 (Column PGText)
51 (Column PGInt4)
52
53 type NgramsRead = NgramsPoly (Column PGInt4)
54 (Column PGText)
55 (Column PGInt4)
56
57 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
58 (Column (Nullable PGText))
59 (Column (Nullable PGInt4))
60
61 type NgramsDB = NgramsPoly Int Text Int
62
63 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
64 makeLenses ''NgramsPoly
65
66
67 ngramsTable :: Table NgramsWrite NgramsRead
68 ngramsTable = Table "ngrams" (pNgramsDb NgramsDB { _ngrams_id = optionalTableField "id"
69 , _ngrams_terms = requiredTableField "terms"
70 , _ngrams_n = requiredTableField "n"
71 }
72 )
73
74 -- | Main Ngrams Types
75 -- | Typed Ngrams
76 -- Typed Ngrams localize the context of the ngrams
77 -- ngrams in source field of document has Sources Type
78 -- ngrams in authors field of document has Authors Type
79 -- ngrams in text fields of documents has Terms Type (i.e. either title or abstract)
80 data NgramsType = Authors | Institutes | Sources | NgramsTerms
81 deriving (Eq, Show, Read, Ord, Enum, Bounded, Generic)
82
83 instance Serialise NgramsType
84
85 ngramsTypes :: [NgramsType]
86 ngramsTypes = [minBound..]
87
88 instance ToSchema NgramsType
89 {- where
90 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_nre_")
91 --}
92
93 newtype NgramsTypeId = NgramsTypeId Int
94 deriving (Eq, Show, Ord, Num)
95
96 instance ToField NgramsTypeId where
97 toField (NgramsTypeId n) = toField n
98
99 instance FromField NgramsTypeId where
100 fromField fld mdata = do
101 n <- fromField fld mdata
102 if (n :: Int) > 0 then return $ NgramsTypeId n
103 else mzero
104
105 instance FromJSON NgramsType
106 instance FromJSONKey NgramsType where
107 fromJSONKey = FromJSONKeyTextParser (parseJSON . String)
108
109 instance ToJSON NgramsType
110 instance ToJSONKey NgramsType where
111 toJSONKey = toJSONKeyText (pack . show)
112
113 instance FromHttpApiData NgramsType where
114 parseUrlPiece n = pure $ (read . cs) n
115
116 instance ToParamSchema NgramsType where
117 toParamSchema _ = toParamSchema (Proxy :: Proxy TODO)
118
119
120 instance DefaultFromField (Nullable PGInt4) NgramsTypeId
121 where
122 defaultFromField = fieldQueryRunnerColumn
123
124 pgNgramsType :: NgramsType -> Column PGInt4
125 pgNgramsType = pgNgramsTypeId . ngramsTypeId
126
127 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
128 pgNgramsTypeId (NgramsTypeId n) = sqlInt4 n
129
130 ngramsTypeId :: NgramsType -> NgramsTypeId
131 ngramsTypeId Authors = 1
132 ngramsTypeId Institutes = 2
133 ngramsTypeId Sources = 3
134 ngramsTypeId NgramsTerms = 4
135
136 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
137 fromNgramsTypeId id = lookup id
138 $ fromList [ (ngramsTypeId nt,nt)
139 | nt <- [minBound .. maxBound] :: [NgramsType]
140 ]
141
142 ------------------------------------------------------------------------
143 ------------------------------------------------------------------------
144 -- | TODO put it in Gargantext.Core.Text.Ngrams
145 data Ngrams = UnsafeNgrams { _ngramsTerms :: Text
146 , _ngramsSize :: Int
147 }
148 deriving (Generic, Show, Eq, Ord)
149
150 instance Hashable Ngrams
151
152 makeLenses ''Ngrams
153 instance PGS.ToRow Ngrams where
154 toRow (UnsafeNgrams t s) = [toField t, toField s]
155
156 instance FromField Ngrams where
157 fromField fld mdata = do
158 x <- fromField fld mdata
159 pure $ text2ngrams x
160
161 text2ngrams :: Text -> Ngrams
162 text2ngrams txt = UnsafeNgrams txt' $ length $ splitOn " " txt'
163 where
164 txt' = strip txt
165
166
167 ------------------------------------------------------------------------
168 -------------------------------------------------------------------------
169 -- | TODO put it in Gargantext.Core.Text.Ngrams
170 -- Named entity are typed ngrams of Terms Ngrams
171 data NgramsT a =
172 NgramsT { _ngramsType :: NgramsType
173 , _ngramsT :: a
174 } deriving (Generic, Show, Eq, Ord)
175
176 makeLenses ''NgramsT
177
178 instance Functor NgramsT where
179 fmap = over ngramsT
180
181 -----------------------------------------------------------------------
182 withMap :: HashMap Text NgramsId -> Text -> NgramsId
183 withMap m n = maybe (panic $ "[G.D.S.Ngrams.withMap] Should not happen" <> (cs $ show n))
184 identity (HashMap.lookup n m)
185
186 indexNgramsT :: HashMap Text NgramsId -> NgramsT Ngrams -> NgramsT (Indexed Int Ngrams)
187 indexNgramsT = fmap . indexNgramsWith . withMap
188
189 -- | TODO replace NgramsT whith Typed NgramsType Ngrams
190 indexTypedNgrams :: HashMap Text NgramsId
191 -> Typed NgramsType Ngrams
192 -> Typed NgramsType (Indexed Int Ngrams)
193 indexTypedNgrams = fmap . indexNgramsWith . withMap
194
195 indexNgrams :: HashMap Text NgramsId -> Ngrams -> Indexed Int Ngrams
196 indexNgrams = indexNgramsWith . withMap
197
198 indexNgramsWith :: (Text -> NgramsId) -> Ngrams -> Indexed Int Ngrams
199 indexNgramsWith f n = Indexed (f $ _ngramsTerms n) n