]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Query/Table/Node/Document/Insert.hs
[conduit] some more work on conduit in file parser (still doesn't compile)
[gargantext.git] / src / Gargantext / Database / Query / Table / Node / Document / Insert.hs
1 {-|
2 Module : Gargantext.Database.Node.Document.Insert
3 Description : Importing context of texts (documents)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 * Purpose of this module
11
12 Enabling "common goods" of text data and respecting privacy.
13
14 Gargantext shares as "common good" the links between context of texts
15 and terms / words / ngrams.
16
17 Basically a context of text can be defined as a document (see 'Gargantext.Core.Text').
18
19 Issue to tackle in that module: each global document of Gargantext has
20 to be unique, then shared, but how to respect privacy if needed ?
21
22
23 * Methodology to get uniqueness and privacy by design
24
25 As a consequence, when importing/inserting a new document in Gargantext,
26 a policy for the uniqueness of the inserted docuemnts has to be defined.
27
28 That is the purpose of this module which defines its main concepts.
29
30 Unique identifier in database is of a 3-tuple of 3 policies that
31 together define uniqueness:
32
33 - Design policy: type of node is needed as TypenameId, that is a
34 Document or Individual or something else;
35
36 - Privacy policy: with ParentId, parent becomes unique, then it enables
37 users to get their own copy without sharing it with all the users of the
38 database (in others words parent_id is necessary to preserve privacy for
39 instance).
40
41 - Hash policy: this UniqId is a sha256 uniq id which is the result of
42 the concatenation of the parameters defined by @shaParameters@.
43
44 > -- * Example
45 > insertTest :: FromRow r => CorpusId -> [Node HyperdataDocument] -> IO [r]
46 > insertTest :: IO [ReturnId]
47 > insertTest = runCmdDev $ insertDocuments 1 452162 hyperdataDocuments
48
49 -}
50 ------------------------------------------------------------------------
51 {-# LANGUAGE QuasiQuotes #-}
52 {-# LANGUAGE TypeSynonymInstances #-}
53 ------------------------------------------------------------------------
54 module Gargantext.Database.Query.Table.Node.Document.Insert
55 where
56
57 import Control.Lens (set, view)
58 import Control.Lens.Cons
59 import Control.Lens.Prism
60 import Data.Aeson (toJSON, encode, ToJSON)
61 import Data.Maybe (fromMaybe)
62 import Data.Text (Text)
63 -- import Data.ByteString (ByteString)
64 import Data.Time.Segment (jour)
65 import Database.PostgreSQL.Simple (FromRow, Query, Only(..))
66 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
67 -- import Database.PostgreSQL.Simple.ToRow (toRow, ToRow)
68 import Database.PostgreSQL.Simple.SqlQQ
69 import Database.PostgreSQL.Simple.ToField (toField, Action{-, ToField-})
70 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
71 import GHC.Generics (Generic)
72 import Gargantext.Core (HasDBid(toDBid))
73 import Gargantext.Database.Admin.Types.Hyperdata
74 import Gargantext.Database.Admin.Types.Node
75 import Gargantext.Database.Prelude (Cmd, runPGSQuery{-, formatPGSQuery-})
76 import Gargantext.Database.Schema.Node (NodePoly(..))
77 import Gargantext.Prelude
78 import Gargantext.Prelude.Crypto.Hash (hash)
79 import qualified Data.Text as DT (pack, concat, take)
80
81 {-| To Print result query
82 import Data.ByteString.Internal (ByteString)
83 import Database.PostgreSQL.Simple (formatQuery)
84 -}
85
86 ---------------------------------------------------------------------------
87 -- * Main Insert functions
88
89 -- | Insert Document main function
90 -- UserId : user who is inserting the documents
91 -- ParentId : folder ID which is parent of the inserted documents
92 -- Administrator of the database has to create a uniq index as following SQL command:
93 -- `create unique index on contexts table (typename, parent_id, (hyperdata ->> 'uniqId'));`
94 insertDb :: (InsertDb a, HasDBid NodeType) => UserId -> ParentId -> [a] -> Cmd err [ReturnId]
95 insertDb u p = runPGSQuery queryInsert . Only . Values fields . map (insertDb' u p)
96 where
97 fields = map (\t-> QualifiedIdentifier Nothing t) inputSqlTypes
98
99 class InsertDb a
100 where
101 insertDb' :: HasDBid NodeType => UserId -> ParentId -> a -> [Action]
102
103
104 instance InsertDb HyperdataDocument
105 where
106 insertDb' u p h = [ toField ("" :: Text)
107 , toField $ toDBid NodeDocument
108 , toField u
109 , toField p
110 , toField $ maybe "No Title" (DT.take 255) (_hd_title h)
111 , toField $ _hd_publication_date h -- TODO USE UTCTime
112 , (toField . toJSON) h
113 ]
114
115 instance InsertDb HyperdataContact
116 where
117 insertDb' u p h = [ toField ("" :: Text)
118 , toField $ toDBid NodeContact
119 , toField u
120 , toField p
121 , toField $ maybe "Contact" (DT.take 255) (Just "Name") -- (_hc_name h)
122 , toField $ jour 0 1 1 -- TODO put default date
123 , (toField . toJSON) h
124 ]
125
126 instance ToJSON a => InsertDb (Node a)
127 where
128 insertDb' _u _p (Node _nid hashId t u p n d h) = [ toField hashId
129 , toField t
130 , toField u
131 , toField p
132 , toField n
133 , toField d
134 , (toField . toJSON) h
135 ]
136
137 -- | Debug SQL function
138 --
139 -- to print rendered query (Debug purpose) use @formatQuery@ function.
140 {-
141 insertDocuments_Debug :: (Hyperdata a, ToJSON a, ToRow a, InsertDb [a])
142 => UserId -> ParentId -> [a] -> Cmd err ByteString
143 insertDocuments_Debug uId pId hs = formatPGSQuery queryInsert (Only $ Values fields inputData)
144 where
145 fields = map (\t-> QualifiedIdentifier Nothing t) inputSqlTypes
146 inputData = insertDb' uId pId hs
147 -}
148
149 -- | Input Tables: types of the tables
150 inputSqlTypes :: [Text]
151 inputSqlTypes = map DT.pack ["text", "int4","int4","int4","text","date","jsonb"]
152
153 -- | SQL query to insert documents inside the database
154 queryInsert :: Query
155 queryInsert = [sql|
156 WITH input_rows(hash_id,typename,user_id,parent_id,name,date,hyperdata) AS (?)
157 , ins AS (
158 INSERT INTO contexts (hash_id, typename,user_id,parent_id,name,date,hyperdata)
159 SELECT * FROM input_rows
160 ON CONFLICT (hash_id) DO NOTHING -- on unique index -- this does not return the ids
161 RETURNING id,hash_id
162 )
163
164 SELECT true AS source -- true for 'newly inserted'
165 , id
166 , hash_id
167 FROM ins
168 UNION ALL
169 SELECT false AS source -- false for 'not inserted'
170 , n.id
171 , hash_id
172 FROM input_rows
173 JOIN contexts n USING (hash_id); -- columns of unique index
174 |]
175
176 ------------------------------------------------------------------------
177 -- * Main Types used
178 -- ** Return Types
179
180 -- | When documents are inserted
181 -- ReturnType after insertion
182 data ReturnId = ReturnId { reInserted :: Bool -- if the document is inserted (True: is new, False: is not new)
183 , reId :: NodeId -- always return the id of the document (even new or not new)
184 -- this is the uniq id in the database
185 , reUniqId :: Text -- Hash Id with concatenation of sha parameters
186 } deriving (Show, Generic)
187
188 instance FromRow ReturnId where
189 fromRow = ReturnId <$> field <*> field <*> field
190
191 ---------------------------------------------------------------------------
192 -- * Uniqueness of document definition
193
194 class AddUniqId a
195 where
196 addUniqId :: a -> a
197
198 instance AddUniqId HyperdataDocument
199 where
200 addUniqId = addUniqIdsDoc
201 where
202 addUniqIdsDoc :: HyperdataDocument -> HyperdataDocument
203 addUniqIdsDoc doc = set hd_uniqIdBdd (Just shaBdd)
204 $ set hd_uniqId (Just shaUni) doc
205 where
206 shaUni = hash $ DT.concat $ map ($ doc) shaParametersDoc
207 shaBdd = hash $ DT.concat $ map ($ doc) ([(\d -> maybeText (_hd_bdd d))] <> shaParametersDoc)
208
209 shaParametersDoc :: [(HyperdataDocument -> Text)]
210 shaParametersDoc = [ \d -> maybeText (_hd_title d)
211 , \d -> maybeText (_hd_abstract d)
212 , \d -> maybeText (_hd_source d)
213 , \d -> maybeText (_hd_publication_date d)
214 ]
215 -- TODO put this elsewhere (fix bin/gargantext-init/Main.hs too)
216 secret :: Text
217 secret = "Database secret to change"
218
219
220 instance (AddUniqId a, ToJSON a, HasDBid NodeType) => AddUniqId (Node a)
221 where
222 addUniqId (Node nid _ t u p n d h) = Node nid hashId t u p n d h
223 where
224 hashId = Just $ "\\x" <> (hash $ DT.concat params)
225 params = [ secret
226 , cs $ show $ toDBid NodeDocument
227 , n
228 , cs $ show p
229 , cs $ encode h
230 ]
231 {-
232 addUniqId n@(Node nid _ t u p n d h) =
233 case n of
234 Node HyperdataDocument -> Node nid hashId t u p n d h
235 where
236 hashId = "\\x" <> (hash $ DT.concat params)
237 params = [ secret
238 , cs $ show $ toDBid NodeDocument
239 , n
240 , cs $ show p
241 , cs $ encode h
242 ]
243 _ -> undefined
244 -}
245
246 ---------------------------------------------------------------------------
247 -- * Uniqueness of document definition
248 -- TODO factorize with above (use the function below for tests)
249
250 instance AddUniqId HyperdataContact
251 where
252 addUniqId = addUniqIdsContact
253
254 addUniqIdsContact :: HyperdataContact -> HyperdataContact
255 addUniqIdsContact hc = set (hc_uniqIdBdd) (Just shaBdd)
256 $ set (hc_uniqId ) (Just shaUni) hc
257 where
258 shaUni = hash $ DT.concat $ map ($ hc) shaParametersContact
259 shaBdd = hash $ DT.concat $ map ($ hc) ([\d -> maybeText (view hc_bdd d)] <> shaParametersContact)
260
261 -- | TODO add more shaparameters
262 shaParametersContact :: [(HyperdataContact -> Text)]
263 shaParametersContact = [ \d -> maybeText $ view (hc_who . _Just . cw_firstName ) d
264 , \d -> maybeText $ view (hc_who . _Just . cw_lastName ) d
265 , \d -> maybeText $ view (hc_where . _head . cw_touch . _Just . ct_mail) d
266 ]
267
268 maybeText :: Maybe Text -> Text
269 maybeText = maybe (DT.pack "") identity
270
271 ---------------------------------------------------------------------------
272 class ToNode a
273 where
274 -- TODO Maybe NodeId
275 toNode :: HasDBid NodeType => UserId -> ParentId -> a -> Node a
276
277 instance ToNode HyperdataDocument where
278 toNode u p h = Node 0 Nothing (toDBid NodeDocument) u (Just p) n date h
279 where
280 n = maybe "No Title" (DT.take 255) (_hd_title h)
281 date = jour y m d
282 -- NOTE: There is no year '0' in postgres, there is year 1 AD and beofre that year 1 BC:
283 -- select '0001-01-01'::date, '0001-01-01'::date - '1 day'::interval;
284 -- 0001-01-01 0001-12-31 00:00:00 BC
285 y = maybe 1 fromIntegral $ _hd_publication_year h
286 m = fromMaybe 1 $ _hd_publication_month h
287 d = fromMaybe 1 $ _hd_publication_day h
288
289 -- TODO better Node
290 instance ToNode HyperdataContact where
291 toNode u p h = Node 0 Nothing (toDBid NodeContact) u (Just p) "Contact" date h
292 where
293 date = jour 2020 01 01
294
295
296