]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Node/Document/Import.hs
[FEAT] Insert function of context of text in database.
[gargantext.git] / src / Gargantext / Database / Node / Document / Import.hs
1 {-|
2 Module : Gargantext.Database.Node.Document.Import
3 Description : Importing context of texts (documents)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 * Main purpose of this module: enabling "common goods" of text data and respecting privacy.
11
12 Gargantext shares as "common good" the links between context of texts
13 and terms / words / ngrams.
14
15 Basically a context of text can be defined as a document (see 'Gargantext.Text').
16
17 Issue to tackle in that module: each global document of Gargantext has
18 to be unique, then shared, but how to respect privacy if needed ?
19
20
21 * Methodology to get uniqueness and privacy by design
22
23 As a consequence, when importing a new document in Gargantext, a policy
24 for the uniqueness of the inserted documents has to be defined.
25
26 That is the purpose of this module which defines its main concepts.
27
28 The unique identifier in the database is a 3-tuple of policies that
29 together define uniqueness:
30
31 - Design policy: type of node is needed as TypenameId, that is a
32 Document or Individual or something else;
33
34 - Privacy policy: with ParentId, parent becomes unique, then it enables
35 users to get their own copy without sharing it with all the users of the
36 database (in other words parent_id is necessary to preserve privacy for
37 instance).
38
39 - Hash policy: this UniqId is a sha256 uniq id which is the result of
40 the concatenation of the parameters defined by @hashParameters@.
41
42
43 * Database configuration
44
45 The administrator of the database has to create a unique index with the following SQL command:
46 `create unique index on nodes (typename, parent_id, (hyperdata ->> 'uniqId'));`
47 -}
48
49 {-# LANGUAGE DeriveGeneric #-}
50 {-# LANGUAGE NoImplicitPrelude #-}
51 {-# LANGUAGE QuasiQuotes #-}
52 {-# LANGUAGE DeriveDataTypeable #-}
53 {-# LANGUAGE FlexibleInstances #-}
54 {-# LANGUAGE TypeSynonymInstances #-}
55
56 ------------------------------------------------------------------------
57 ------------------------------------------------------------------------
58 module Gargantext.Database.Node.Document.Import where
59
60 import Control.Lens (set)
61 import Control.Monad ((>>=))
62
63 import Data.Aeson (toJSON, Value)
64 import Data.ByteString.Internal (ByteString)
65 import Data.Maybe (maybe)
66 import Data.Typeable (Typeable)
67 import Database.PostgreSQL.Simple (Connection, FromRow, Query, formatQuery, query, Only(..))
68 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
69 import Database.PostgreSQL.Simple.SqlQQ
70 import Database.PostgreSQL.Simple.ToField (toField)
71 import Database.PostgreSQL.Simple.ToRow (ToRow(..))
72 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
73
import Data.Text (Text)
import qualified Data.Text as DT (pack, unpack, concat)
import qualified Data.Text.Encoding as DTE (encodeUtf8)
import qualified Data.Digest.Pure.SHA as SHA (sha256, showDigest)
import qualified Data.ByteString.Lazy as DBL (fromStrict)
import qualified Data.ByteString.Lazy.Char8 as DC (pack)
78
79 import Gargantext (connectGargandb)
80 import Gargantext.Core.Types.Main (nodeTypeId)
81 import Gargantext.Core.Types.Node
82 -- FIXME : the import of Document constructor below does not work
83 -- import Gargantext.Core.Types.Node (Document)
84 --import Gargantext.Core.Types.Node (docExample, hyperdataDocument, HyperdataDocument(..)
85 -- , hyperdataDocument_uniqId
86 -- , hyperdataDocument_title
87 -- , hyperdataDocument_abstract
88 -- , hyperdataDocument_source
89 -- , Node(..), node_typename
90 -- , node_userId
91 -- , node_parentId, node_name, node_hyperdata, hyperdataDocuments
92 -- , NodeTypeId
93 -- )
94 import Gargantext.Prelude
95
96 import GHC.Generics (Generic)
97 ---------------------------------------------------------------------------
98 -- * Main Insert functions
99
-- | Insert a batch of documents as nodes of the database.
--
-- * @uId@: user who is inserting the documents
-- * @pId@: id of the folder that becomes the parent of the inserted documents
-- * @hs@:  documents to insert; each one is turned into a row by 'prepare'
--
-- Runs 'queryInsert' and returns the 'ReturnId' rows it yields.
insertDocuments :: Connection -> UserId -> ParentId -> [HyperdataDocument] -> IO [ReturnId]
insertDocuments conn uId pId hs = query conn queryInsert (Only rows)
  where
    -- The rows are sent as a single typed VALUES expression.
    rows        = Values columnTypes (prepare uId pId hs)
    columnTypes = map (QualifiedIdentifier Nothing) inputSqlTypes
108
-- | Debug counterpart of 'insertDocuments'.
--
-- Takes the same arguments but, instead of running 'queryInsert', renders
-- it with @formatQuery@ and returns the resulting SQL so it can be printed.
insertDocuments_Debug :: Connection -> UserId -> ParentId -> [HyperdataDocument] -> IO ByteString
insertDocuments_Debug conn uId pId hs = formatQuery conn queryInsert (Only rows)
  where
    rows        = Values columnTypes (prepare uId pId hs)
    columnTypes = map (QualifiedIdentifier Nothing) inputSqlTypes
117
118
-- | SQL types of the columns of the input rows, in the order
-- typename, user_id, parent_id, name, hyperdata (see 'queryInsert').
inputSqlTypes :: [Text]
inputSqlTypes = DT.pack <$> ["int4", "int4", "int4", "text", "jsonb"]
122
-- | SQL query to insert documents inside the database.
--
-- The single parameter is the VALUES expression holding the input rows
-- (typename, user_id, parent_id, name, hyperdata).
--
-- Each result row is @(source, id, doi)@:
--
-- * @source = true@:  the row was newly inserted by this statement;
-- * @source = false@: a node with the same unique key already existed.
--
-- Existing nodes are looked up on the columns of the unique index
-- (typename, parent_id, hyperdata ->> 'uniqId'), the same key as the
-- ON CONFLICT clause. Matching on the whole hyperdata value instead would
-- ignore typename and parent_id, so it could return the id of an identical
-- document stored under another parent, and would miss a conflicting node
-- whose hyperdata differs outside of the hashed fields.
queryInsert :: Query
queryInsert = [sql|
    WITH input_rows(typename,user_id,parent_id,name,hyperdata) AS (?)
    , ins AS (
       INSERT INTO nodes (typename,user_id,parent_id,name,hyperdata)
       SELECT * FROM input_rows
       ON CONFLICT (typename, parent_id, (hyperdata ->> 'uniqId')) DO NOTHING -- on unique index
       RETURNING id,hyperdata
       )

    SELECT true AS source                     -- true for 'newly inserted'
         , id
         , hyperdata ->> 'uniqId'  as doi
    FROM   ins
    UNION  ALL
    SELECT false AS source                    -- false for 'not inserted'
         , c.id
         , c.hyperdata ->> 'uniqId' as doi
    FROM   input_rows i
    JOIN   nodes c
      ON   c.typename  = i.typename
     AND   c.parent_id = i.parent_id
     AND   (c.hyperdata ->> 'uniqId') = (i.hyperdata ->> 'uniqId'); -- columns of unique index
           |]
145
-- | Build one 'InputData' row per document.
--
-- Every row gets the node type id of @Document@, the given user and parent
-- ids, the fixed name @"Doc"@, and the JSON of the document after 'unicize'
-- has filled in its uniq id.
prepare :: UserId -> ParentId -> [HyperdataDocument] -> [InputData]
prepare uId pId = map toInput
  where
    toInput doc = InputData (nodeTypeId Document) uId pId (DT.pack "Doc") (toJSON (unicize doc))
150
151 ------------------------------------------------------------------------
152 -- * Main Types used
153
154 -- ** Return Types
155
-- | Row returned by 'queryInsert' for a document.
data ReturnId = ReturnId
  { reInserted :: Bool        -- ^ 'True' if the document is new (inserted), 'False' if it is not new
  , reId       :: Int         -- ^ id of the document in the database, returned whether new or not
  , reUniqId   :: Maybe Text  -- ^ hash id built from the concatenation of the hash parameters
  } deriving (Show, Generic)
163
-- | Reads the three result columns in order: inserted flag, node id, uniq id.
instance FromRow ReturnId where
  fromRow = ReturnId <$> insertedFlag <*> nodeId <*> uniqHash
    where
      insertedFlag = field
      nodeId       = field
      uniqHash     = field
166
167 -- ** Insert Types
168
-- | Id of the user who inserts the documents.
type UserId   = Int

-- | Id of the parent node of the inserted documents.
type ParentId = Int
171
-- | One input row of 'queryInsert'; the fields follow the column order
-- typename, user_id, parent_id, name, hyperdata.
data InputData = InputData
  { inTypenameId :: NodeTypeId  -- ^ value for the typename column
  , inUserId     :: UserId      -- ^ value for the user_id column
  , inParentId   :: ParentId    -- ^ value for the parent_id column
  , inName       :: Text        -- ^ value for the name column
  , inHyper      :: Value       -- ^ value for the hyperdata column (JSON)
  } deriving (Show, Generic, Typeable)
178
-- | Serialises the fields in declaration order, which is the column order
-- expected by 'queryInsert'.
instance ToRow InputData where
  toRow (InputData typeId userId parentId name hyper) =
    [ toField typeId
    , toField userId
    , toField parentId
    , toField name
    , toField hyper
    ]
186
187 ---------------------------------------------------------------------------
188 -- * Uniqueness of document definition
189
-- | Fields of a document that take part in its uniq id, in hashing order:
-- title, abstract, source, publication date.
-- A missing field contributes the empty text.
hashParameters :: [(HyperdataDocument -> Text)]
hashParameters = map (textOrEmpty .) accessors
  where
    accessors   = [ _hyperdataDocument_title
                  , _hyperdataDocument_abstract
                  , _hyperdataDocument_source
                  , _hyperdataDocument_publication_date
                  ]
    textOrEmpty = maybe (DT.pack "") identity
198
-- | Set the uniq id of a document.
--
-- The id is 'uniqId' applied to the concatenation of the texts produced by
-- 'hashParameters'; all other fields of the document are left as they are.
--
-- NOTE(review): the texts are concatenated without any separator, so two
-- documents whose fields split the same overall text differently get the
-- same uniq id.
unicize :: HyperdataDocument -> HyperdataDocument
unicize doc = set hyperdataDocument_uniqId (Just digest) doc
  where
    digest = uniqId (DT.concat [ param doc | param <- hashParameters ])
206
-- | Hexadecimal SHA-256 digest of a text.
--
-- The text is hashed through its UTF-8 encoding. Packing it with
-- "Data.ByteString.Lazy.Char8" instead truncates every character to 8 bits,
-- so distinct texts outside of Latin-1 could get the same digest.
--
-- For ASCII-only input the digest is the same as with the 8-bit packing;
-- for other input it differs, so uniq ids stored with the previous scheme
-- have to be recomputed for documents containing non-ASCII characters.
uniqId :: Text -> Text
uniqId = DT.pack . SHA.showDigest . SHA.sha256 . DBL.fromStrict . DTE.encodeUtf8
212
213 ---------------------------------------------------------------------------
214 -- * Tests
215
216 --insertTest :: FromRow r => CorpusId -> [Node HyperdataDocument] -> IO [r]
-- | Manual test: connect with the settings of @gargantext.ini@ and insert
-- the example 'hyperdataDocuments' as user 1 under parent node 452162.
insertTest :: IO [ReturnId]
insertTest = do
  conn <- connectGargandb "gargantext.ini"
  insertDocuments conn 1 452162 hyperdataDocuments
220