]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
[john-snow] implement pos/lemma language
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.NgramsPostag
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE FunctionalDependencies #-}
16 {-# LANGUAGE QuasiQuotes #-}
17 {-# LANGUAGE TemplateHaskell #-}
18
19 module Gargantext.Database.Schema.Ngrams
20 where
21
22 import Data.HashMap.Strict (HashMap)
23 import Data.Hashable (Hashable)
24 import Codec.Serialise (Serialise())
25 import Control.Lens (over)
26 import Control.Monad (mzero)
27 import Data.Aeson
28 import Data.Aeson.Types (toJSONKeyText)
29 import Data.Map (fromList, lookup)
30 import Data.Text (Text, splitOn, pack, strip)
31 import Gargantext.Core.Types (TODO(..), Typed(..))
32 import Gargantext.Prelude
33 import Servant (FromHttpApiData(..), Proxy(..), ToHttpApiData(..))
34 import Text.Read (read)
35 import Gargantext.Database.Types
36 import Gargantext.Database.Schema.Prelude
37 import qualified Database.PostgreSQL.Simple as PGS
38 import qualified Data.HashMap.Strict as HashMap
39
40
41 type NgramsId = Int
42 type Size = Int
43
44 data NgramsPoly id terms n = NgramsDB { _ngrams_id :: !id
45 , _ngrams_terms :: !terms
46 , _ngrams_n :: !n
47 } deriving (Show)
48
49 type NgramsWrite = NgramsPoly (Maybe (Column SqlInt4))
50 (Column SqlText)
51 (Column SqlInt4)
52
53 type NgramsRead = NgramsPoly (Column SqlInt4)
54 (Column SqlText)
55 (Column SqlInt4)
56
57 type NgramsReadNull = NgramsPoly (Column (Nullable SqlInt4))
58 (Column (Nullable SqlText))
59 (Column (Nullable SqlInt4))
60
61 type NgramsDB = NgramsPoly Int Text Int
62
63 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
64 makeLenses ''NgramsPoly
65
66
67 ngramsTable :: Table NgramsWrite NgramsRead
68 ngramsTable = Table "ngrams" (pNgramsDb NgramsDB { _ngrams_id = optionalTableField "id"
69 , _ngrams_terms = requiredTableField "terms"
70 , _ngrams_n = requiredTableField "n"
71 }
72 )
73
74 -- | Main Ngrams Types
75 -- | Typed Ngrams
76 -- Typed Ngrams localize the context of the ngrams
77 -- ngrams in source field of document has Sources Type
78 -- ngrams in authors field of document has Authors Type
79 -- ngrams in text fields of documents has Terms Type (i.e. either title or abstract)
80 data NgramsType = Authors | Institutes | Sources | NgramsTerms
81 deriving (Eq, Show, Read, Ord, Enum, Bounded, Generic)
82
83 instance Serialise NgramsType
84
85 ngramsTypes :: [NgramsType]
86 ngramsTypes = [minBound..]
87
88 instance ToSchema NgramsType
89 {- where
90 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_nre_")
91 --}
92
93 newtype NgramsTypeId = NgramsTypeId Int
94 deriving (Eq, Show, Ord, Num)
95
96 instance ToField NgramsTypeId where
97 toField (NgramsTypeId n) = toField n
98
99 instance FromField NgramsTypeId where
100 fromField fld mdata = do
101 n <- fromField fld mdata
102 if (n :: Int) > 0 then return $ NgramsTypeId n
103 else mzero
104
105 instance FromJSON NgramsType
106 instance FromJSONKey NgramsType where
107 fromJSONKey = FromJSONKeyTextParser (parseJSON . String)
108
109 instance ToJSON NgramsType
110 instance ToJSONKey NgramsType where
111 toJSONKey = toJSONKeyText (pack . show)
112
113 instance FromHttpApiData NgramsType where
114 parseUrlPiece n = pure $ (read . cs) n
115 instance ToHttpApiData NgramsType where
116 toUrlPiece = pack . show
117
118 instance ToParamSchema NgramsType where
119 toParamSchema _ = toParamSchema (Proxy :: Proxy TODO)
120
121
122 instance DefaultFromField (Nullable SqlInt4) NgramsTypeId
123 where
124 defaultFromField = fromPGSFromField
125
126 pgNgramsType :: NgramsType -> Column SqlInt4
127 pgNgramsType = pgNgramsTypeId . ngramsTypeId
128
129 pgNgramsTypeId :: NgramsTypeId -> Column SqlInt4
130 pgNgramsTypeId (NgramsTypeId n) = sqlInt4 n
131
132 ngramsTypeId :: NgramsType -> NgramsTypeId
133 ngramsTypeId Authors = 1
134 ngramsTypeId Institutes = 2
135 ngramsTypeId Sources = 3
136 ngramsTypeId NgramsTerms = 4
137
138 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
139 fromNgramsTypeId id = lookup id
140 $ fromList [ (ngramsTypeId nt,nt)
141 | nt <- [minBound .. maxBound] :: [NgramsType]
142 ]
143
144 ------------------------------------------------------------------------
145 ------------------------------------------------------------------------
146 -- | TODO put it in Gargantext.Core.Text.Ngrams
147 data Ngrams = UnsafeNgrams { _ngramsTerms :: Text
148 , _ngramsSize :: Int
149 }
150 deriving (Generic, Show, Eq, Ord)
151
152 instance Hashable Ngrams
153
154 makeLenses ''Ngrams
155 instance PGS.ToRow Ngrams where
156 toRow (UnsafeNgrams t s) = [toField t, toField s]
157
158 instance FromField Ngrams where
159 fromField fld mdata = do
160 x <- fromField fld mdata
161 pure $ text2ngrams x
162
163 text2ngrams :: Text -> Ngrams
164 text2ngrams txt = UnsafeNgrams txt' $ length $ splitOn " " txt'
165 where
166 txt' = strip txt
167
168
169 ------------------------------------------------------------------------
170 -------------------------------------------------------------------------
171 -- | TODO put it in Gargantext.Core.Text.Ngrams
172 -- Named entity are typed ngrams of Terms Ngrams
173 data NgramsT a =
174 NgramsT { _ngramsType :: NgramsType
175 , _ngramsT :: a
176 } deriving (Generic, Show, Eq, Ord)
177
178 makeLenses ''NgramsT
179
180 instance Functor NgramsT where
181 fmap = over ngramsT
182
183 -----------------------------------------------------------------------
184 withMap :: HashMap Text NgramsId -> Text -> NgramsId
185 withMap m n = maybe (panic $ "[G.D.S.Ngrams.withMap] Should not happen" <> (cs $ show n))
186 identity (HashMap.lookup n m)
187
188 indexNgramsT :: HashMap Text NgramsId -> NgramsT Ngrams -> NgramsT (Indexed Int Ngrams)
189 indexNgramsT = fmap . indexNgramsWith . withMap
190
191 -- | TODO replace NgramsT whith Typed NgramsType Ngrams
192 indexTypedNgrams :: HashMap Text NgramsId
193 -> Typed NgramsType Ngrams
194 -> Typed NgramsType (Indexed Int Ngrams)
195 indexTypedNgrams = fmap . indexNgramsWith . withMap
196
197 indexNgrams :: HashMap Text NgramsId -> Ngrams -> Indexed Int Ngrams
198 indexNgrams = indexNgramsWith . withMap
199
200 indexNgramsWith :: (Text -> NgramsId) -> Ngrams -> Indexed Int Ngrams
201 indexNgramsWith f n = Indexed (f $ _ngramsTerms n) n