]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Query/Table/Node/Document/Insert.hs
[istex] scroll API fetch, first draft
[gargantext.git] / src / Gargantext / Database / Query / Table / Node / Document / Insert.hs
1 {-|
2 Module : Gargantext.Database.Node.Document.Insert
3 Description : Importing context of texts (documents)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 * Purpose of this module
11
12 Enabling "common goods" of text data and respecting privacy.
13
14 Gargantext shares as "common good" the links between context of texts
15 and terms / words / ngrams.
16
17 Basically a context of text can be defined as a document (see 'Gargantext.Core.Text').
18
19 Issue to tackle in that module: each global document of Gargantext has
20 to be unique, then shared, but how to respect privacy if needed ?
21
22
23 * Methodology to get uniqueness and privacy by design
24
25 As a consequence, when importing/inserting a new document in Gargantext,
26 a policy for the uniqueness of the inserted docuemnts has to be defined.
27
28 That is the purpose of this module which defines its main concepts.
29
30 Unique identifier in database is of a 3-tuple of 3 policies that
31 together define uniqueness:
32
33 - Design policy: type of node is needed as TypenameId, that is a
34 Document or Individual or something else;
35
36 - Privacy policy: with ParentId, parent becomes unique, then it enables
37 users to get their own copy without sharing it with all the users of the
38 database (in others words parent_id is necessary to preserve privacy for
39 instance).
40
41 - Hash policy: this UniqId is a sha256 uniq id which is the result of
42 the concatenation of the parameters defined by @shaParameters@.
43
44 > -- * Example
45 > insertTest :: FromRow r => CorpusId -> [Node HyperdataDocument] -> IO [r]
46 > insertTest :: IO [ReturnId]
47 > insertTest = runCmdDev $ insertDocuments 1 452162 hyperdataDocuments
48
49 -}
50 ------------------------------------------------------------------------
51 {-# LANGUAGE QuasiQuotes #-}
52 {-# LANGUAGE TypeSynonymInstances #-}
53 ------------------------------------------------------------------------
54 module Gargantext.Database.Query.Table.Node.Document.Insert
55 where
56
57 import Control.Lens (set, view)
58 import Control.Lens.Cons
59 import Control.Lens.Prism
60 import Data.Aeson (toJSON, encode, ToJSON)
61 import Data.Maybe (fromMaybe)
62 import Data.Text (Text)
63 -- import Data.ByteString (ByteString)
64 import Data.Time.Segment (jour)
65 import Database.PostgreSQL.Simple (FromRow, Query, Only(..))
66 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
67 -- import Database.PostgreSQL.Simple.ToRow (toRow, ToRow)
68 import Database.PostgreSQL.Simple.SqlQQ
69 import Database.PostgreSQL.Simple.ToField (toField, Action{-, ToField-})
70 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
71 import GHC.Generics (Generic)
72 import Gargantext.Core (HasDBid(toDBid))
73 import Gargantext.Database.Admin.Types.Hyperdata
74 import Gargantext.Database.Admin.Types.Node
75 import Gargantext.Database.Prelude (Cmd, runPGSQuery{-, formatPGSQuery-})
76 import Gargantext.Database.Schema.Node (NodePoly(..))
77 import qualified Gargantext.Defaults as Defaults
78 import Gargantext.Prelude
79 import Gargantext.Prelude.Crypto.Hash (hash)
80 import qualified Data.Text as DT (pack, concat, take)
81
82 {-| To Print result query
83 import Data.ByteString.Internal (ByteString)
84 import Database.PostgreSQL.Simple (formatQuery)
85 -}
86
87 ---------------------------------------------------------------------------
88 -- * Main Insert functions
89
90 -- | Insert Document main function
91 -- UserId : user who is inserting the documents
92 -- ParentId : folder ID which is parent of the inserted documents
93 -- Administrator of the database has to create a uniq index as following SQL command:
94 -- `create unique index on nodes (typename, parent_id, (hyperdata ->> 'uniqId'));`
95 insertDb :: (InsertDb a, HasDBid NodeType) => UserId -> ParentId -> [a] -> Cmd err [ReturnId]
96 insertDb u p = runPGSQuery queryInsert . Only . Values fields . map (insertDb' u p)
97 where
98 fields = map (\t-> QualifiedIdentifier Nothing t) inputSqlTypes
99
100 class InsertDb a
101 where
102 insertDb' :: HasDBid NodeType => UserId -> ParentId -> a -> [Action]
103
104
105 instance InsertDb HyperdataDocument
106 where
107 insertDb' u p h = [ toField ("" :: Text)
108 , toField $ toDBid NodeDocument
109 , toField u
110 , toField p
111 , toField $ maybe "No Title" (DT.take 255) (_hd_title h)
112 , toField $ _hd_publication_date h -- TODO USE UTCTime
113 , (toField . toJSON) h
114 ]
115
116 instance InsertDb HyperdataContact
117 where
118 insertDb' u p h = [ toField ("" :: Text)
119 , toField $ toDBid NodeContact
120 , toField u
121 , toField p
122 , toField $ maybe "Contact" (DT.take 255) (Just "Name") -- (_hc_name h)
123 , toField $ jour 0 1 1 -- TODO put default date
124 , (toField . toJSON) h
125 ]
126
127 instance ToJSON a => InsertDb (Node a)
128 where
129 insertDb' _u _p (Node _nid hashId t u p n d h) = [ toField hashId
130 , toField t
131 , toField u
132 , toField p
133 , toField n
134 , toField d
135 , (toField . toJSON) h
136 ]
137
138 -- | Debug SQL function
139 --
140 -- to print rendered query (Debug purpose) use @formatQuery@ function.
141 {-
142 insertDocuments_Debug :: (Hyperdata a, ToJSON a, ToRow a, InsertDb [a])
143 => UserId -> ParentId -> [a] -> Cmd err ByteString
144 insertDocuments_Debug uId pId hs = formatPGSQuery queryInsert (Only $ Values fields inputData)
145 where
146 fields = map (\t-> QualifiedIdentifier Nothing t) inputSqlTypes
147 inputData = insertDb' uId pId hs
148 -}
149
150 -- | Input Tables: types of the tables
151 inputSqlTypes :: [Text]
152 inputSqlTypes = map DT.pack ["text", "int4","int4","int4","text","date","jsonb"]
153
154 -- | SQL query to insert documents inside the database
155 queryInsert :: Query
156 queryInsert = [sql|
157 WITH input_rows(hash_id,typename,user_id,parent_id,name,date,hyperdata) AS (?)
158 , ins AS (
159 INSERT INTO nodes (hash_id, typename,user_id,parent_id,name,date,hyperdata)
160 SELECT * FROM input_rows
161 ON CONFLICT (hash_id) DO NOTHING -- on unique index -- this does not return the ids
162 RETURNING id,hash_id
163 )
164
165 SELECT true AS source -- true for 'newly inserted'
166 , id
167 , hash_id
168 FROM ins
169 UNION ALL
170 SELECT false AS source -- false for 'not inserted'
171 , n.id
172 , hash_id
173 FROM input_rows
174 JOIN nodes n USING (hash_id); -- columns of unique index
175 |]
176
177 ------------------------------------------------------------------------
178 -- * Main Types used
179 -- ** Return Types
180
181 -- | When documents are inserted
182 -- ReturnType after insertion
183 data ReturnId = ReturnId { reInserted :: Bool -- if the document is inserted (True: is new, False: is not new)
184 , reId :: NodeId -- always return the id of the document (even new or not new)
185 -- this is the uniq id in the database
186 , reUniqId :: Text -- Hash Id with concatenation of sha parameters
187 } deriving (Show, Generic)
188
189 instance FromRow ReturnId where
190 fromRow = ReturnId <$> field <*> field <*> field
191
192 ---------------------------------------------------------------------------
193 -- * Uniqueness of document definition
194
195 class AddUniqId a
196 where
197 addUniqId :: a -> a
198
199 instance AddUniqId HyperdataDocument
200 where
201 addUniqId = addUniqIdsDoc
202 where
203 addUniqIdsDoc :: HyperdataDocument -> HyperdataDocument
204 addUniqIdsDoc doc = set hd_uniqIdBdd (Just shaBdd)
205 $ set hd_uniqId (Just shaUni) doc
206 where
207 shaUni = hash $ DT.concat $ map ($ doc) shaParametersDoc
208 shaBdd = hash $ DT.concat $ map ($ doc) ([(\d -> maybeText (_hd_bdd d))] <> shaParametersDoc)
209
210 shaParametersDoc :: [(HyperdataDocument -> Text)]
211 shaParametersDoc = [ \d -> maybeText (_hd_title d)
212 , \d -> maybeText (_hd_abstract d)
213 , \d -> maybeText (_hd_source d)
214 , \d -> maybeText (_hd_publication_date d)
215 ]
216 -- TODO put this elsewhere (fix bin/gargantext-init/Main.hs too)
217 secret :: Text
218 secret = "Database secret to change"
219
220
221 instance (AddUniqId a, ToJSON a, HasDBid NodeType) => AddUniqId (Node a)
222 where
223 addUniqId (Node nid _ t u p n d h) = Node nid hashId t u p n d h
224 where
225 hashId = Just $ "\\x" <> (hash $ DT.concat params)
226 params = [ secret
227 , cs $ show $ toDBid NodeDocument
228 , n
229 , cs $ show p
230 , cs $ encode h
231 ]
232 {-
233 addUniqId n@(Node nid _ t u p n d h) =
234 case n of
235 Node HyperdataDocument -> Node nid hashId t u p n d h
236 where
237 hashId = "\\x" <> (hash $ DT.concat params)
238 params = [ secret
239 , cs $ show $ toDBid NodeDocument
240 , n
241 , cs $ show p
242 , cs $ encode h
243 ]
244 _ -> undefined
245 -}
246
247 ---------------------------------------------------------------------------
248 -- * Uniqueness of document definition
249 -- TODO factorize with above (use the function below for tests)
250
251 instance AddUniqId HyperdataContact
252 where
253 addUniqId = addUniqIdsContact
254
255 addUniqIdsContact :: HyperdataContact -> HyperdataContact
256 addUniqIdsContact hc = set (hc_uniqIdBdd) (Just shaBdd)
257 $ set (hc_uniqId ) (Just shaUni) hc
258 where
259 shaUni = hash $ DT.concat $ map ($ hc) shaParametersContact
260 shaBdd = hash $ DT.concat $ map ($ hc) ([\d -> maybeText (view hc_bdd d)] <> shaParametersContact)
261
262 -- | TODO add more shaparameters
263 shaParametersContact :: [(HyperdataContact -> Text)]
264 shaParametersContact = [ \d -> maybeText $ view (hc_who . _Just . cw_firstName ) d
265 , \d -> maybeText $ view (hc_who . _Just . cw_lastName ) d
266 , \d -> maybeText $ view (hc_where . _head . cw_touch . _Just . ct_mail) d
267 ]
268
269 maybeText :: Maybe Text -> Text
270 maybeText = maybe (DT.pack "") identity
271
272 ---------------------------------------------------------------------------
273 class ToNode a
274 where
275 -- TODO Maybe NodeId
276 toNode :: HasDBid NodeType => UserId -> ParentId -> a -> Node a
277
278 instance ToNode HyperdataDocument where
279 toNode u p h = Node 0 Nothing (toDBid NodeDocument) u (Just p) n date h
280 where
281 n = maybe "No Title" (DT.take 255) (_hd_title h)
282 date = jour y m d
283 -- NOTE: There is no year '0' in postgres, there is year 1 AD and beofre that year 1 BC:
284 -- select '0001-01-01'::date, '0001-01-01'::date - '1 day'::interval;
285 -- 0001-01-01 0001-12-31 00:00:00 BC
286 y = fromIntegral $ fromMaybe Defaults.day $ _hd_publication_year h
287 m = fromMaybe Defaults.month $ _hd_publication_month h
288 d = fromMaybe (fromIntegral Defaults.year) $ _hd_publication_day h
289
290 -- TODO better Node
291 instance ToNode HyperdataContact where
292 toNode u p h = Node 0 Nothing (toDBid NodeContact) u (Just p) "Contact" date h
293 where
294 date = jour 2020 01 01
295
296
297