2 Module : Gargantext.Database.Node.Document.Insert
3 Description : Importing context of texts (documents)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 * Purpose of this module
12 Enabling "common goods" of text data and respecting privacy.
14 Gargantext shares as "common good" the links between context of texts
15 and terms / words / ngrams.
17 Basically a context of text can be defined as a document (see 'Gargantext.Text').
19 Issue to tackle in that module: each global document of Gargantext has
20 to be unique, then shared, but how to respect privacy if needed ?
23 * Methodology to get uniqueness and privacy by design
25 As a consequence, when importing/inserting a new document in Gargantext,
26 a policy for the uniqueness of the inserted docuemnts has to be defined.
28 That is the purpose of this module which defines its main concepts.
30 Unique identifier in database is of a 3-tuple of 3 policies that
31 together define uniqueness:
33 - Design policy: type of node is needed as TypenameId, that is a
34 Document or Individual or something else;
36 - Privacy policy: with ParentId, parent becomes unique, then it enables
37 users to get their own copy without sharing it with all the users of the
38 database (in others words parent_id is necessary to preserve privacy for
41 - Hash policy: this UniqId is a sha256 uniq id which is the result of
42 the concatenation of the parameters defined by @hashParameters@.
45 > insertTest :: FromRow r => CorpusId -> [Node HyperdataDocument] -> IO [r]
46 > insertTest :: IO [ReturnId]
47 > insertTest = runCmdDev $ insertDocuments 1 452162 hyperdataDocuments
50 ------------------------------------------------------------------------
51 {-# LANGUAGE DeriveDataTypeable #-}
52 {-# LANGUAGE DeriveGeneric #-}
53 {-# LANGUAGE FlexibleInstances #-}
54 {-# LANGUAGE NoImplicitPrelude #-}
55 {-# LANGUAGE OverloadedStrings #-}
56 {-# LANGUAGE QuasiQuotes #-}
57 {-# LANGUAGE RankNTypes #-}
58 {-# LANGUAGE TypeSynonymInstances #-}
59 ------------------------------------------------------------------------
60 module Gargantext.Database.Node.Document.Insert where
62 import Control.Lens (set, view)
63 import Control.Lens.Prism
64 import Control.Lens.Cons
65 import Data.Aeson (toJSON, Value)
66 import Data.Maybe (maybe)
67 import Data.Text (Text)
68 import Data.Typeable (Typeable)
69 import Database.PostgreSQL.Simple (FromRow, Query, Only(..))
70 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
71 import Database.PostgreSQL.Simple.SqlQQ
72 import Database.PostgreSQL.Simple.ToField (toField)
73 import Database.PostgreSQL.Simple.ToRow (ToRow(..))
74 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
75 import GHC.Generics (Generic)
76 import Gargantext.Database.Config (nodeTypeId)
77 import Gargantext.Database.Utils (Cmd, runPGSQuery)
78 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
79 import Gargantext.Database.Types.Node
80 import Gargantext.Prelude
81 import qualified Data.ByteString.Lazy.Char8 as DC (pack)
82 import qualified Data.Digest.Pure.SHA as SHA (sha256, showDigest)
83 import qualified Data.Text as DT (pack, unpack, concat, take)
85 -- TODO : the import of Document constructor below does not work
86 -- import Gargantext.Database.Types.Node (Document)
87 --import Gargantext.Database.Types.Node (docExample, hyperdataDocument, HyperdataDocument(..)
88 -- , hyperdataDocument_uniqId
89 -- , hyperdataDocument_title
90 -- , hyperdataDocument_abstract
91 -- , hyperdataDocument_source
92 -- , Node(..), node_typename
94 -- , node_parentId, node_name, node_hyperdata, hyperdataDocuments
97 {-| To Print result query
98 import Data.ByteString.Internal (ByteString)
99 import Database.PostgreSQL.Simple (formatQuery)
102 ---------------------------------------------------------------------------
103 -- * Main Insert functions
105 -- ** Database configuration
106 -- Administrator of the database has to create a uniq index as following SQL command:
107 -- `create unique index on nodes (typename, parent_id, (hyperdata ->> 'uniqId'));`
109 -- | Insert Document main function
110 -- UserId : user who is inserting the documents
111 -- ParentId : folder ID which is parent of the inserted documents
114 data ToDbData = ToDbDocument HyperdataDocument | ToDbContact HyperdataContact
116 insertDocuments :: UserId -> ParentId -> NodeType -> [ToDbData] -> Cmd err [ReturnId]
117 insertDocuments uId pId nodeType =
118 runPGSQuery queryInsert . Only . Values fields . prepare uId pId nodeType
120 fields = map (\t-> QualifiedIdentifier Nothing t) inputSqlTypes
122 -- | Debug SQL function
124 -- to print rendered query (Debug purpose) use @formatQuery@ function.
126 insertDocuments_Debug :: (Hyperdata a, ToJSON a, ToRow a) => UserId -> ParentId -> [a] -> Cmd ByteString
127 insertDocuments_Debug uId pId hs = formatPGSQuery queryInsert (Only $ Values fields inputData)
129 fields = map (\t-> QualifiedIdentifier Nothing t) inputSqlTypes
130 inputData = prepare uId pId hs
134 -- | Input Tables: types of the tables
135 inputSqlTypes :: [Text]
136 inputSqlTypes = map DT.pack ["int4","int4","int4","text","jsonb"]
138 -- | SQL query to insert documents inside the database
141 WITH input_rows(typename,user_id,parent_id,name,hyperdata) AS (?)
143 INSERT INTO nodes (typename,user_id,parent_id,name,hyperdata)
144 SELECT * FROM input_rows
145 ON CONFLICT ((hyperdata ->> 'uniqIdBdd')) DO NOTHING -- on unique index
146 -- ON CONFLICT (typename, parent_id, (hyperdata ->> 'uniqId')) DO NOTHING -- on unique index
147 RETURNING id,hyperdata
150 SELECT true AS source -- true for 'newly inserted'
152 , hyperdata ->> 'uniqId' as doi
155 SELECT false AS source -- false for 'not inserted'
157 , hyperdata ->> 'uniqId' as doi
159 JOIN nodes c USING (hyperdata); -- columns of unique index
162 prepare :: UserId -> ParentId -> NodeType -> [ToDbData] -> [InputData]
163 prepare uId pId nodeType = map (\h -> InputData tId uId pId (name h) (toJSON' h))
165 tId = nodeTypeId nodeType
167 toJSON' (ToDbDocument hd) = toJSON hd
168 toJSON' (ToDbContact hc) = toJSON hc
170 name h = DT.take 255 <$> maybe "No Title" identity $ f h
172 f (ToDbDocument hd) = _hyperdataDocument_title hd
173 f (ToDbContact _ ) = Just "Contact" -- TODO view FirstName . LastName
175 ------------------------------------------------------------------------
180 -- | When documents are inserted
181 -- ReturnType after insertion
182 data ReturnId = ReturnId { reInserted :: Bool -- ^ if the document is inserted (True: is new, False: is not new)
183 , reId :: NodeId -- ^ always return the id of the document (even new or not new)
184 -- this is the uniq id in the database
185 , reUniqId :: Text -- ^ Hash Id with concatenation of hash parameters
186 } deriving (Show, Generic)
188 instance FromRow ReturnId where
189 fromRow = ReturnId <$> field <*> field <*> field
193 data InputData = InputData { inTypenameId :: NodeTypeId
195 , inParentId :: ParentId
198 } deriving (Show, Generic, Typeable)
200 instance ToRow InputData where
201 toRow inputData = [ toField (inTypenameId inputData)
202 , toField (inUserId inputData)
203 , toField (inParentId inputData)
204 , toField (inName inputData)
205 , toField (inHyper inputData)
208 ---------------------------------------------------------------------------
209 -- * Uniqueness of document definition
211 addUniqIdsDoc :: HyperdataDocument -> HyperdataDocument
212 addUniqIdsDoc doc = set hyperdataDocument_uniqIdBdd (Just hashBdd)
213 $ set hyperdataDocument_uniqId (Just hash) doc
215 hash = uniqId $ DT.concat $ map ($ doc) hashParametersDoc
216 hashBdd = uniqId $ DT.concat $ map ($ doc) ([(\d -> maybe' (_hyperdataDocument_bdd d))] <> hashParametersDoc)
218 uniqId :: Text -> Text
219 uniqId = DT.pack . SHA.showDigest . SHA.sha256 . DC.pack . DT.unpack
222 hashParametersDoc :: [(HyperdataDocument -> Text)]
223 hashParametersDoc = [ \d -> maybe' (_hyperdataDocument_title d)
224 , \d -> maybe' (_hyperdataDocument_abstract d)
225 , \d -> maybe' (_hyperdataDocument_source d)
226 , \d -> maybe' (_hyperdataDocument_publication_date d)
228 ---------------------------------------------------------------------------
229 -- * Uniqueness of document definition
230 -- TODO factorize with above (use the function below for tests)
231 addUniqIdsContact :: HyperdataContact -> HyperdataContact
232 addUniqIdsContact hc = set (hc_uniqIdBdd) (Just hashBdd)
233 $ set (hc_uniqId) (Just hash) hc
235 hash = uniqId $ DT.concat $ map ($ hc) hashParametersContact
236 hashBdd = uniqId $ DT.concat $ map ($ hc) ([\d -> maybe' (view hc_bdd d)] <> hashParametersContact)
238 uniqId :: Text -> Text
239 uniqId = DT.pack . SHA.showDigest . SHA.sha256 . DC.pack . DT.unpack
241 -- | TODO add more hashparameters
242 hashParametersContact :: [(HyperdataContact -> Text)]
243 hashParametersContact = [ \d -> maybe' $ view (hc_who . _Just . cw_firstName) d
244 , \d -> maybe' $ view (hc_who . _Just . cw_lastName ) d
245 , \d -> maybe' $ view (hc_where . _head . cw_touch . _Just . ct_mail) d
250 maybe' :: Maybe Text -> Text
251 maybe' = maybe (DT.pack "") identity
253 ---------------------------------------------------------------------------