2 Module : Gargantext.Database.Node.Document.Insert
3 Description : Importing context of texts (documents)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 * Purpose of this module
12 Enabling "common goods" of text data and respecting privacy.
14 Gargantext shares as "common good" the links between context of texts
15 and terms / words / ngrams.
17 Basically a context of text can be defined as a document (see 'Gargantext.Core.Text').
19 Issue to tackle in that module: each global document of Gargantext has
20 to be unique, then shared, but how to respect privacy if needed ?
23 * Methodology to get uniqueness and privacy by design
25 As a consequence, when importing/inserting a new document in Gargantext,
26 a policy for the uniqueness of the inserted docuemnts has to be defined.
28 That is the purpose of this module which defines its main concepts.
30 Unique identifier in database is of a 3-tuple of 3 policies that
31 together define uniqueness:
33 - Design policy: type of node is needed as TypenameId, that is a
34 Document or Individual or something else;
36 - Privacy policy: with ParentId, parent becomes unique, then it enables
37 users to get their own copy without sharing it with all the users of the
38 database (in others words parent_id is necessary to preserve privacy for
41 - Hash policy: this UniqId is a sha256 uniq id which is the result of
42 the concatenation of the parameters defined by @shaParameters@.
45 > insertTest :: FromRow r => CorpusId -> [Node HyperdataDocument] -> IO [r]
46 > insertTest :: IO [ReturnId]
47 > insertTest = runCmdDev $ insertDocuments 1 452162 hyperdataDocuments
50 ------------------------------------------------------------------------
51 {-# LANGUAGE QuasiQuotes #-}
52 {-# LANGUAGE TypeSynonymInstances #-}
53 ------------------------------------------------------------------------
54 module Gargantext.Database.Query.Table.Node.Document.Insert
57 import Control.Lens (set, view)
58 import Control.Lens.Cons
59 import Control.Lens.Prism
60 import Data.Aeson (toJSON, ToJSON)
61 import Data.Char (isAlpha)
62 import Data.Maybe (fromMaybe)
63 import Data.Text (Text)
64 -- import Data.ByteString (ByteString)
65 import Data.Time.Segment (jour)
66 import Database.PostgreSQL.Simple (FromRow, Query, Only(..))
67 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
68 -- import Database.PostgreSQL.Simple.ToRow (toRow, ToRow)
69 import Database.PostgreSQL.Simple.SqlQQ
70 import Database.PostgreSQL.Simple.ToField (toField, Action{-, ToField-})
71 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
72 import GHC.Generics (Generic)
73 import Gargantext.Core (HasDBid(toDBid))
74 import Gargantext.Database.Admin.Types.Hyperdata
75 import Gargantext.Database.Admin.Types.Node
76 import Gargantext.Database.Prelude (Cmd, runPGSQuery{-, formatPGSQuery-})
77 import Gargantext.Database.Schema.Node (NodePoly(..))
78 import qualified Gargantext.Defaults as Defaults
79 import Gargantext.Prelude
80 import Gargantext.Prelude.Crypto.Hash (hash)
81 import qualified Data.Text as DT (pack, concat, take, filter, toLower)
83 {-| To Print result query
84 import Data.ByteString.Internal (ByteString)
85 import Database.PostgreSQL.Simple (formatQuery)
88 ---------------------------------------------------------------------------
89 -- * Main Insert functions
91 -- | Insert Document main function
92 -- UserId : user who is inserting the documents
93 -- ParentId : folder ID which is parent of the inserted documents
94 -- Administrator of the database has to create a uniq index as following SQL command:
95 -- `create unique index on contexts table (typename, parent_id, (hyperdata ->> 'uniqId'));`
96 insertDb :: (InsertDb a, HasDBid NodeType) => UserId -> Maybe ParentId -> [a] -> Cmd err [ReturnId]
97 insertDb u p = runPGSQuery queryInsert . Only . Values fields . map (insertDb' u p)
99 fields = map (\t-> QualifiedIdentifier Nothing t) inputSqlTypes
103 insertDb' :: HasDBid NodeType => UserId -> Maybe ParentId -> a -> [Action]
106 instance InsertDb HyperdataDocument
108 insertDb' u p h = [ toField ("" :: Text)
109 , toField $ toDBid NodeDocument
112 , toField $ maybe "No Title" (DT.take 255) (_hd_title h)
113 , toField $ _hd_publication_date h -- TODO USE UTCTime
114 , (toField . toJSON) (addUniqId h)
117 instance InsertDb HyperdataContact
119 insertDb' u p h = [ toField ("" :: Text)
120 , toField $ toDBid NodeContact
123 , toField $ maybe "Contact" (DT.take 255) (Just "Name") -- (_hc_name h)
124 , toField $ jour 0 1 1 -- TODO put default date
125 , (toField . toJSON) (addUniqId h)
128 instance ToJSON a => InsertDb (Node a)
130 insertDb' _u _p (Node _nid hashId t u p n d h) = [ toField hashId
136 , (toField . toJSON) h
139 -- | Debug SQL function
141 -- to print rendered query (Debug purpose) use @formatQuery@ function.
143 insertDocuments_Debug :: (Hyperdata a, ToJSON a, ToRow a, InsertDb [a])
144 => UserId -> ParentId -> [a] -> Cmd err ByteString
145 insertDocuments_Debug uId pId hs = formatPGSQuery queryInsert (Only $ Values fields inputData)
147 fields = map (\t-> QualifiedIdentifier Nothing t) inputSqlTypes
148 inputData = insertDb' uId pId hs
151 -- | Input Tables: types of the tables
152 inputSqlTypes :: [Text]
153 inputSqlTypes = map DT.pack ["text", "int4","int4","int4","text","date","jsonb"]
155 -- | SQL query to insert documents inside the database
158 WITH input_rows(hash_id,typename,user_id,parent_id,name,date,hyperdata) AS (?)
160 INSERT INTO contexts (hash_id, typename,user_id,parent_id,name,date,hyperdata)
161 SELECT * FROM input_rows
162 ON CONFLICT (hash_id) DO NOTHING -- on unique index -- this does not return the ids
166 SELECT true AS source -- true for 'newly inserted'
171 SELECT false AS source -- false for 'not inserted'
175 JOIN contexts n USING (hash_id); -- columns of unique index
178 ------------------------------------------------------------------------
182 -- | When documents are inserted
183 -- ReturnType after insertion
184 data ReturnId = ReturnId { reInserted :: Bool -- if the document is inserted (True: is new, False: is not new)
185 , reId :: NodeId -- always return the id of the document (even new or not new)
186 -- this is the uniq id in the database
187 , reUniqId :: Text -- Hash Id with concatenation of sha parameters
188 } deriving (Show, Generic)
190 instance FromRow ReturnId where
191 fromRow = ReturnId <$> field <*> field <*> field
193 ---------------------------------------------------------------------------
194 -- * Uniqueness of document definition
200 class UniqParameters a
202 uniqParameters :: ParentId -> a -> Text
204 instance AddUniqId HyperdataDocument
206 addUniqId = addUniqIdsDoc
208 addUniqIdsDoc :: HyperdataDocument -> HyperdataDocument
209 addUniqIdsDoc doc = set hd_uniqIdBdd (Just shaBdd)
210 $ set hd_uniqId (Just shaUni) doc
212 shaUni = hash $ DT.concat $ map ($ doc) shaParametersDoc
213 shaBdd = hash $ DT.concat $ map ($ doc) ([(\d -> maybeText (_hd_bdd d))] <> shaParametersDoc)
215 shaParametersDoc :: [(HyperdataDocument -> Text)]
216 shaParametersDoc = [ \d -> filterText $ maybeText (_hd_title d)
217 , \d -> filterText $ maybeText (_hd_abstract d)
218 , \d -> filterText $ maybeText (_hd_source d)
219 -- , \d -> maybeText (_hd_publication_date d)
222 instance UniqParameters HyperdataDocument
224 uniqParameters _ h = filterText $ DT.concat $ map maybeText $ [_hd_title h, _hd_abstract h, _hd_source h]
226 instance UniqParameters HyperdataContact
228 uniqParameters _ _ = ""
230 instance UniqParameters (Node a)
232 uniqParameters _ _ = undefined
235 filterText :: Text -> Text
236 filterText = DT.toLower . (DT.filter isAlpha)
239 instance (UniqParameters a, ToJSON a, HasDBid NodeType) => AddUniqId (Node a)
241 addUniqId (Node nid _ t u p n d h) = Node nid (Just newHash) t u p n d h
243 newHash = "\\x" <> (hash $ uniqParameters (fromMaybe 0 p) h)
246 ---------------------------------------------------------------------------
247 -- * Uniqueness of document definition
248 -- TODO factorize with above (use the function below for tests)
250 instance AddUniqId HyperdataContact
252 addUniqId = addUniqIdsContact
254 addUniqIdsContact :: HyperdataContact -> HyperdataContact
255 addUniqIdsContact hc = set (hc_uniqIdBdd) (Just shaBdd)
256 $ set (hc_uniqId ) (Just shaUni) hc
258 shaUni = hash $ DT.concat $ map ($ hc) shaParametersContact
259 shaBdd = hash $ DT.concat $ map ($ hc) ([\d -> maybeText (view hc_bdd d)] <> shaParametersContact)
261 -- | TODO add more shaparameters
262 shaParametersContact :: [(HyperdataContact -> Text)]
263 shaParametersContact = [ \d -> maybeText $ view (hc_who . _Just . cw_firstName ) d
264 , \d -> maybeText $ view (hc_who . _Just . cw_lastName ) d
265 , \d -> maybeText $ view (hc_where . _head . cw_touch . _Just . ct_mail) d
269 maybeText :: Maybe Text -> Text
270 maybeText = maybe (DT.pack "") identity
272 ---------------------------------------------------------------------------
276 toNode :: HasDBid NodeType => UserId -> Maybe ParentId -> a -> Node a
278 instance ToNode HyperdataDocument where
279 toNode u p h = Node 0 Nothing (toDBid NodeDocument) u p n date h
281 n = maybe "No Title" (DT.take 255) (_hd_title h)
283 -- NOTE: There is no year '0' in postgres, there is year 1 AD and beofre that year 1 BC:
284 -- select '0001-01-01'::date, '0001-01-01'::date - '1 day'::interval;
285 -- 0001-01-01 0001-12-31 00:00:00 BC
286 y = fromIntegral $ fromMaybe Defaults.day $ _hd_publication_year h
287 m = fromMaybe Defaults.month $ _hd_publication_month h
288 d = fromMaybe (fromIntegral Defaults.year) $ _hd_publication_day h
291 instance ToNode HyperdataContact where
292 toNode u p h = Node 0 Nothing (toDBid NodeContact) u p "Contact" date h
294 date = jour 2020 01 01