2 Module : Gargantext.Database.Node.Document.Import
3 Description : Importing context of texts (documents)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 * Main purpose of this module: enabling "common goods" of text data and respecting privacy.
12 Gargantext shares as "common good" the links between context of texts
13 and terms / words / ngrams.
15 Basically a context of text can be defined as a document (see 'Gargantext.Text').
17 Issue to tackle in that module: each global document of Gargantext has
18 to be unique, then shared, but how to respect privacy if needed ?
21 * Methodology to get uniqueness and privacy by design
23 As a consequence, when importing a new document in Gargantext, a policy
24 for the uniqueness of the inserted documents has to be defined.
26 That is the purpose of this module which defines its main concepts.
28 The unique identifier in the database is a 3-tuple of three policies that
29 together define uniqueness:
31 - Design policy: type of node is needed as TypenameId, that is a
32 Document or Individual or something else;
34 - Privacy policy: with ParentId, parent becomes unique, then it enables
35 users to get their own copy without sharing it with all the users of the
36 database (in others words parent_id is necessary to preserve privacy for
39 - Hash policy: this UniqId is a sha256 uniq id which is the result of
40 the concatenation of the parameters defined by @hashParameters@.
43 * Database configuration
45 The database administrator has to create a unique index with the following SQL command:
46 `create unique index on nodes (typename, parent_id, (hyperdata ->> 'uniqId'));`
49 {-# LANGUAGE DeriveGeneric #-}
50 {-# LANGUAGE NoImplicitPrelude #-}
51 {-# LANGUAGE QuasiQuotes #-}
52 {-# LANGUAGE DeriveDataTypeable #-}
53 {-# LANGUAGE FlexibleInstances #-}
54 {-# LANGUAGE TypeSynonymInstances #-}
56 ------------------------------------------------------------------------
57 ------------------------------------------------------------------------
58 module Gargantext.Database.Node.Document.Import where
60 import Control.Lens (set)
61 import Control.Monad ((>>=))
63 import Data.Aeson (toJSON, Value)
64 import Data.ByteString.Internal (ByteString)
65 import Data.Maybe (maybe)
66 import Data.Typeable (Typeable)
67 import Database.PostgreSQL.Simple (Connection, FromRow, Query, formatQuery, query, Only(..))
68 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
69 import Database.PostgreSQL.Simple.SqlQQ
70 import Database.PostgreSQL.Simple.ToField (toField)
71 import Database.PostgreSQL.Simple.ToRow (ToRow(..))
72 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
74 import Data.Text (Text)
75 import qualified Data.Text as DT (pack, unpack, concat)
76 import qualified Data.Digest.Pure.SHA as SHA (sha256, showDigest)
77 import qualified Data.ByteString.Lazy.Char8 as DC (pack)
79 import Gargantext (connectGargandb)
80 import Gargantext.Core.Types.Main (nodeTypeId)
81 import Gargantext.Core.Types.Node
82 -- FIXME : the import of Document constructor below does not work
83 -- import Gargantext.Core.Types.Node (Document)
84 --import Gargantext.Core.Types.Node (docExample, hyperdataDocument, HyperdataDocument(..)
85 -- , hyperdataDocument_uniqId
86 -- , hyperdataDocument_title
87 -- , hyperdataDocument_abstract
88 -- , hyperdataDocument_source
89 -- , Node(..), node_typename
91 -- , node_parentId, node_name, node_hyperdata, hyperdataDocuments
94 import Gargantext.Prelude
96 import GHC.Generics (Generic)
97 ---------------------------------------------------------------------------
98 -- * Main Insert functions
-- | Insert Document main function.
--
--   * UserId   : user who is inserting the documents
--   * ParentId : folder id which is parent of the inserted documents
--
-- Returns one 'ReturnId' per row produced by 'queryInsert', telling
-- for each document whether it was newly inserted or already present.
insertDocuments :: Connection -> UserId -> ParentId -> [HyperdataDocument] -> IO [ReturnId]
insertDocuments conn uId pId hs = query conn queryInsert (Only $ Values fields inputData)
  where
    -- SQL types of the VALUES columns (must line up with 'inputSqlTypes').
    fields    = map (\t -> QualifiedIdentifier Nothing t) inputSqlTypes
    inputData = prepare uId pId hs
-- | Debug SQL function.
--
-- Same inputs as 'insertDocuments', but renders the query with
-- @formatQuery@ instead of executing it, so the generated SQL can be
-- printed and inspected.
insertDocuments_Debug :: Connection -> UserId -> ParentId -> [HyperdataDocument] -> IO ByteString
insertDocuments_Debug conn uId pId hs = formatQuery conn queryInsert (Only $ Values fields inputData)
  where
    fields    = map (\t -> QualifiedIdentifier Nothing t) inputSqlTypes
    inputData = prepare uId pId hs
-- | Input Tables: SQL types of the columns fed to 'queryInsert',
-- in order: typename, user_id, parent_id, name, hyperdata.
inputSqlTypes :: [Text]
inputSqlTypes = map DT.pack ["int4", "int4", "int4", "text", "jsonb"]
-- | SQL query to insert documents inside the database.
--
-- The @?@ placeholder receives the VALUES rows built by 'prepare'.
-- The @ON CONFLICT ... DO NOTHING@ relies on the unique index on
-- (typename, parent_id, hyperdata ->> 'uniqId') described in the
-- module header; the final UNION reports, for every input row,
-- whether it was newly inserted (true) or already present (false).
queryInsert :: Query
queryInsert = [sql|
    WITH input_rows(typename,user_id,parent_id,name,hyperdata) AS (?)
    , ins AS (
       INSERT INTO nodes (typename,user_id,parent_id,name,hyperdata)
       SELECT * FROM input_rows
       ON CONFLICT (typename, parent_id, (hyperdata ->> 'uniqId')) DO NOTHING -- on unique index
       RETURNING id,hyperdata
       )

    SELECT true AS source             -- true for 'newly inserted'
         , id
         , hyperdata ->> 'uniqId' as doi
    FROM   ins
    UNION  ALL
    SELECT false AS source            -- false for 'not inserted'
         , c.id
         , hyperdata ->> 'uniqId' as doi
    FROM   input_rows
    JOIN   nodes c USING (hyperdata); -- columns of unique index
    |]
-- | Build one 'InputData' row per document, stamping each document
-- with its hash ('unicize') so the unique index on
-- hyperdata ->> 'uniqId' can detect duplicates.
prepare :: UserId -> ParentId -> [HyperdataDocument] -> [InputData]
prepare uId pId = map (\h -> InputData tId uId pId (DT.pack "Doc") (toJSON $ unicize h))
  where
    tId = nodeTypeId Document
151 ------------------------------------------------------------------------
-- | When documents are inserted:
-- return type after insertion, one value per attempted document.
data ReturnId = ReturnId { reInserted :: Bool       -- ^ if the document is inserted (True: is new, False: is not new)
                         , reId       :: Int        -- ^ always return the id of the document (even new or not new)
                                                    --   this is the uniq id in the database
                         , reUniqId   :: Maybe Text -- ^ Hash Id with concatenation of hash parameters
                         } deriving (Show, Generic)
-- | Decode a result row as (inserted?, id, uniqId); the field order
-- must match the SELECT columns of 'queryInsert'.
instance FromRow ReturnId where
  fromRow = ReturnId <$> field <*> field <*> field
-- | One row of input for 'queryInsert'; field order matches the
-- (typename, user_id, parent_id, name, hyperdata) columns declared
-- in 'inputSqlTypes'.
data InputData = InputData { inTypenameId :: NodeTypeId
                           , inUserId     :: UserId
                           , inParentId   :: ParentId
                           , inName       :: Text
                           , inHyper      :: Value
                           } deriving (Show, Generic, Typeable)
-- | Serialise an 'InputData' in the column order expected by
-- 'queryInsert' / 'inputSqlTypes'.
instance ToRow InputData where
  toRow inputData = [ toField (inTypenameId inputData)
                    , toField (inUserId    inputData)
                    , toField (inParentId  inputData)
                    , toField (inName      inputData)
                    , toField (inHyper     inputData)
                    ]
187 ---------------------------------------------------------------------------
188 -- * Uniqueness of document definition
-- | Projections of a document whose concatenation is hashed to form
-- its unique id (see the "Hash policy" in the module header).
-- Missing ('Nothing') fields contribute the empty string.
hashParameters :: [(HyperdataDocument -> Text)]
hashParameters = [ \d -> maybe' (_hyperdataDocument_title d)
                 , \d -> maybe' (_hyperdataDocument_abstract d)
                 , \d -> maybe' (_hyperdataDocument_source d)
                 , \d -> maybe' (_hyperdataDocument_publication_date d)
                 ]
  where
    maybe' = maybe (DT.pack "") identity
-- | Stamp a document with its unique hash, computed from the
-- default 'hashParameters'.
unicize :: HyperdataDocument -> HyperdataDocument
unicize = unicize' hashParameters
-- | Generalised 'unicize': hash the concatenation of the given
-- projections of the document and store the digest in its uniqId
-- field via the 'hyperdataDocument_uniqId' lens.
unicize' :: [(HyperdataDocument -> Text)] -> HyperdataDocument -> HyperdataDocument
unicize' fields doc = set hyperdataDocument_uniqId (Just hash) doc
  where
    hash = uniqId $ DT.concat $ map (\f -> f doc) fields
-- | Unique identifier of a document: currently its SHA256 digest.
uniqId :: Text -> Text
uniqId = sha256
-- | Hex-encoded SHA256 digest of a 'Text' value.
-- NOTE(review): goes through 'DT.unpack' then Char8 'DC.pack', which
-- truncates each Char to one byte — stable for ASCII, lossy for
-- non-Latin text; confirm whether UTF-8 encoding is wanted before
-- changing it (existing hashes in the DB depend on this behaviour).
sha256 :: Text -> Text
sha256 = DT.pack . SHA.showDigest . SHA.sha256 . DC.pack . DT.unpack
213 ---------------------------------------------------------------------------
-- | Smoke test: connect with the local gargantext.ini configuration
-- and insert the example documents under hard-coded user 1 and
-- parent node 452162.
--insertTest :: FromRow r => CorpusId -> [Node HyperdataDocument] -> IO [r]
insertTest :: IO [ReturnId]
insertTest = connectGargandb "gargantext.ini"
         >>= \conn -> insertDocuments conn 1 452162 hyperdataDocuments