]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Schema/Ngrams.hs
[TextFlow REFACT] addin ExtractedNgrams type for simple or enriched ngrams extraction
[gargantext.git] / src / Gargantext / Database / Schema / Ngrams.hs
1 {-|
2 Module : Gargantext.Database.Schema.NgramsPostag
3 Description : Ngram connection to the Database
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Ngrams connection to the Database.
11
12 -}
13
14 {-# LANGUAGE Arrows #-}
15 {-# LANGUAGE FunctionalDependencies #-}
16 {-# LANGUAGE QuasiQuotes #-}
17 {-# LANGUAGE TemplateHaskell #-}
18
19 module Gargantext.Database.Schema.Ngrams
20 where
21
22 import Data.HashMap.Strict (HashMap)
23 import Data.Hashable (Hashable)
24 import Codec.Serialise (Serialise())
25 import Control.Lens (over)
26 import Control.Monad (mzero)
27 import Data.Aeson
28 import Data.Aeson.Types (toJSONKeyText)
29 import Data.Map (Map, fromList, lookup)
30 import Data.Text (Text, splitOn, pack, strip)
31 import Gargantext.Core.Types (TODO(..), Typed(..))
32 import Gargantext.Prelude
33 import Prelude (Functor)
34 import Servant (FromHttpApiData, parseUrlPiece, Proxy(..))
35 import Text.Read (read)
36 import Gargantext.Database.Types
37 import Gargantext.Database.Schema.Prelude
38 import qualified Database.PostgreSQL.Simple as PGS
39 import qualified Data.HashMap.Strict as HashMap
40
41
42 type NgramsId = Int
43 type Size = Int
44
45 data NgramsPoly id terms n = NgramsDB { _ngrams_id :: !id
46 , _ngrams_terms :: !terms
47 , _ngrams_n :: !n
48 } deriving (Show)
49
50 type NgramsWrite = NgramsPoly (Maybe (Column PGInt4))
51 (Column PGText)
52 (Column PGInt4)
53
54 type NgramsRead = NgramsPoly (Column PGInt4)
55 (Column PGText)
56 (Column PGInt4)
57
58 type NgramsReadNull = NgramsPoly (Column (Nullable PGInt4))
59 (Column (Nullable PGText))
60 (Column (Nullable PGInt4))
61
62 type NgramsDB = NgramsPoly Int Text Int
63
64 $(makeAdaptorAndInstance "pNgramsDb" ''NgramsPoly)
65 makeLenses ''NgramsPoly
66
67
68 ngramsTable :: Table NgramsWrite NgramsRead
69 ngramsTable = Table "ngrams" (pNgramsDb NgramsDB { _ngrams_id = optional "id"
70 , _ngrams_terms = required "terms"
71 , _ngrams_n = required "n"
72 }
73 )
74
75 -- | Main Ngrams Types
76 -- | Typed Ngrams
77 -- Typed Ngrams localize the context of the ngrams
78 -- ngrams in source field of document has Sources Type
79 -- ngrams in authors field of document has Authors Type
80 -- ngrams in text fields of documents has Terms Type (i.e. either title or abstract)
81 data NgramsType = Authors | Institutes | Sources | NgramsTerms
82 deriving (Eq, Show, Read, Ord, Enum, Bounded, Generic)
83
84 instance Serialise NgramsType
85
86 ngramsTypes :: [NgramsType]
87 ngramsTypes = [minBound..]
88
89 instance ToSchema NgramsType
90 {- where
91 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_nre_")
92 --}
93
94 newtype NgramsTypeId = NgramsTypeId Int
95 deriving (Eq, Show, Ord, Num)
96
97 instance ToField NgramsTypeId where
98 toField (NgramsTypeId n) = toField n
99
100 instance FromField NgramsTypeId where
101 fromField fld mdata = do
102 n <- fromField fld mdata
103 if (n :: Int) > 0 then return $ NgramsTypeId n
104 else mzero
105
106 instance FromJSON NgramsType
107 instance FromJSONKey NgramsType where
108 fromJSONKey = FromJSONKeyTextParser (parseJSON . String)
109
110 instance ToJSON NgramsType
111 instance ToJSONKey NgramsType where
112 toJSONKey = toJSONKeyText (pack . show)
113
114 instance FromHttpApiData NgramsType where
115 parseUrlPiece n = pure $ (read . cs) n
116
117 instance ToParamSchema NgramsType where
118 toParamSchema _ = toParamSchema (Proxy :: Proxy TODO)
119
120
121 instance QueryRunnerColumnDefault (Nullable PGInt4) NgramsTypeId
122 where
123 queryRunnerColumnDefault = fieldQueryRunnerColumn
124
125 pgNgramsType :: NgramsType -> Column PGInt4
126 pgNgramsType = pgNgramsTypeId . ngramsTypeId
127
128 pgNgramsTypeId :: NgramsTypeId -> Column PGInt4
129 pgNgramsTypeId (NgramsTypeId n) = pgInt4 n
130
131 ngramsTypeId :: NgramsType -> NgramsTypeId
132 ngramsTypeId Authors = 1
133 ngramsTypeId Institutes = 2
134 ngramsTypeId Sources = 3
135 ngramsTypeId NgramsTerms = 4
136
137 fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
138 fromNgramsTypeId id = lookup id
139 $ fromList [ (ngramsTypeId nt,nt)
140 | nt <- [minBound .. maxBound] :: [NgramsType]
141 ]
142
143 ------------------------------------------------------------------------
144 ------------------------------------------------------------------------
145 -- | TODO put it in Gargantext.Core.Text.Ngrams
146 data Ngrams = UnsafeNgrams { _ngramsTerms :: Text
147 , _ngramsSize :: Int
148 }
149 deriving (Generic, Show, Eq, Ord)
150
151 instance Hashable Ngrams
152
153 makeLenses ''Ngrams
154 instance PGS.ToRow Ngrams where
155 toRow (UnsafeNgrams t s) = [toField t, toField s]
156
157 instance FromField Ngrams where
158 fromField fld mdata = do
159 x <- fromField fld mdata
160 pure $ text2ngrams x
161
162 text2ngrams :: Text -> Ngrams
163 text2ngrams txt = UnsafeNgrams txt' $ length $ splitOn " " txt'
164 where
165 txt' = strip txt
166
167
168 ------------------------------------------------------------------------
169 -------------------------------------------------------------------------
170 -- | TODO put it in Gargantext.Core.Text.Ngrams
171 -- Named entity are typed ngrams of Terms Ngrams
172 data NgramsT a =
173 NgramsT { _ngramsType :: NgramsType
174 , _ngramsT :: a
175 } deriving (Generic, Show, Eq, Ord)
176
177 makeLenses ''NgramsT
178
179 instance Functor NgramsT where
180 fmap = over ngramsT
181
182 -----------------------------------------------------------------------
183 withMap :: HashMap Text NgramsId -> Text -> NgramsId
184 withMap m n = maybe (panic "withMap: should not happen") identity (HashMap.lookup n m)
185
186 indexNgramsT :: HashMap Text NgramsId -> NgramsT Ngrams -> NgramsT (Indexed Int Ngrams)
187 indexNgramsT = fmap . indexNgramsWith . withMap
188
189 -- | TODO replace NgramsT whith Typed NgramsType Ngrams
190 indexTypedNgrams :: HashMap Text NgramsId
191 -> Typed NgramsType Ngrams
192 -> Typed NgramsType (Indexed Int Ngrams)
193 indexTypedNgrams = fmap . indexNgramsWith . withMap
194
195 indexNgrams :: HashMap Text NgramsId -> Ngrams -> Indexed Int Ngrams
196 indexNgrams = indexNgramsWith . withMap
197
198 indexNgramsWith :: (Text -> NgramsId) -> Ngrams -> Indexed Int Ngrams
199 indexNgramsWith f n = Indexed (f $ _ngramsTerms n) n