]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Node/Document/Import.hs
[DB-FLOW] functions to create nodeTypes.
[gargantext.git] / src / Gargantext / Database / Node / Document / Import.hs
1 {-|
2 Module : Gargantext.Database.Node.Document.Import
3 Description : Importing context of texts (documents)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 * Purpose of this module
11
12 Enabling "common goods" of text data and respecting privacy.
13
14 Gargantext shares as "common good" the links between context of texts
15 and terms / words / ngrams.
16
17 Basically a context of text can be defined as a document (see 'Gargantext.Text').
18
19 Issue to tackle in that module: each global document of Gargantext has
20 to be unique, then shared, but how to respect privacy if needed ?
21
22
23 * Methodology to get uniqueness and privacy by design
24
25 As a consequence, when importing a new document in Gargantext, a policy
26 for the uniqueness of the inserted documents has to be defined.
27
28 That is the purpose of this module which defines its main concepts.
29
30 Unique identifier in database is of a 3-tuple of 3 policies that
31 together define uniqueness:
32
33 - Design policy: type of node is needed as TypenameId, that is a
34 Document or Individual or something else;
35
36 - Privacy policy: with ParentId, parent becomes unique, then it enables
37 users to get their own copy without sharing it with all the users of the
38 database (in others words parent_id is necessary to preserve privacy for
39 instance).
40
41 - Hash policy: this UniqId is a sha256 uniq id which is the result of
42 the concatenation of the parameters defined by @hashParameters@.
43
44 > -- * Example
45 > insertTest :: FromRow r => CorpusId -> [Node HyperdataDocument] -> IO [r]
46 > insertTest :: IO [ReturnId]
47 > insertTest = connectGargandb "gargantext.ini"
48 > >>= \conn -> insertDocuments conn 1 452162 hyperdataDocuments
49
50 -}
51 ------------------------------------------------------------------------
52 {-# LANGUAGE DeriveGeneric #-}
53 {-# LANGUAGE NoImplicitPrelude #-}
54 {-# LANGUAGE QuasiQuotes #-}
55 {-# LANGUAGE DeriveDataTypeable #-}
56 {-# LANGUAGE FlexibleInstances #-}
57 {-# LANGUAGE TypeSynonymInstances #-}
58 ------------------------------------------------------------------------
59 module Gargantext.Database.Node.Document.Import where
60
61 import Control.Lens (set)
62
63 import Data.Aeson (toJSON, Value)
64 import Data.ByteString.Internal (ByteString)
65 import Data.Maybe (maybe)
66 import Data.Typeable (Typeable)
67 import Database.PostgreSQL.Simple (Connection, FromRow, Query, formatQuery, query, Only(..))
68 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
69 import Database.PostgreSQL.Simple.SqlQQ
70 import Database.PostgreSQL.Simple.ToField (toField)
71 import Database.PostgreSQL.Simple.ToRow (ToRow(..))
72 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
73
74 import Data.Text (Text)
75 import qualified Data.Text as DT (pack, unpack, concat)
76 import qualified Data.Digest.Pure.SHA as SHA (sha256, showDigest)
77 import qualified Data.ByteString.Lazy.Char8 as DC (pack)
78
79 import Gargantext.Database.Config (nodeTypeId)
80 import Gargantext.Database.Types.Node
81 -- TODO : the import of Document constructor below does not work
82 -- import Gargantext.Database.Types.Node (Document)
83 --import Gargantext.Database.Types.Node (docExample, hyperdataDocument, HyperdataDocument(..)
84 -- , hyperdataDocument_uniqId
85 -- , hyperdataDocument_title
86 -- , hyperdataDocument_abstract
87 -- , hyperdataDocument_source
88 -- , Node(..), node_typename
89 -- , node_userId
90 -- , node_parentId, node_name, node_hyperdata, hyperdataDocuments
91 -- , NodeTypeId
92 -- )
93 import Gargantext.Prelude
94
95 import GHC.Generics (Generic)
96 ---------------------------------------------------------------------------
97 -- * Main Insert functions
98
99 -- ** Database configuration
100 -- Administrator of the database has to create a uniq index as following SQL command:
101 -- `create unique index on nodes (typename, parent_id, (hyperdata ->> 'uniqId'));`
102
103 -- | Insert Document main function
104 -- UserId : user who is inserting the documents
105 -- ParentId : folder ID which is parent of the inserted documents
-- | Insert documents into the database under a given parent node.
--
-- UserId   : the user performing the insertion
-- ParentId : the folder node that will own the inserted documents
--
-- Returns one 'ReturnId' per input document, telling whether the row was
-- newly inserted and giving its node id (new or pre-existing).
insertDocuments :: Connection -> UserId -> ParentId -> [HyperdataDocument] -> IO [ReturnId]
insertDocuments conn userId parentId docs =
  query conn queryInsert (Only $ Values sqlFields rows)
  where
    -- column types for the VALUES clause, unqualified identifiers
    sqlFields = map (QualifiedIdentifier Nothing) inputSqlTypes
    rows      = prepare userId parentId docs
111
112 -- | Debug SQL function
113 --
114 -- to print rendered query (Debug purpose) use @formatQuery@ function.
-- | Debug variant of 'insertDocuments': renders the SQL that would be
-- executed (via @formatQuery@) instead of running it, so the query can
-- be inspected by hand.
insertDocuments_Debug :: Connection -> UserId -> ParentId -> [HyperdataDocument] -> IO ByteString
insertDocuments_Debug conn userId parentId docs =
  formatQuery conn queryInsert (Only $ Values sqlFields rows)
  where
    -- same inputs as 'insertDocuments'
    sqlFields = map (QualifiedIdentifier Nothing) inputSqlTypes
    rows      = prepare userId parentId docs
120
121
122 -- | Input Tables: types of the tables
-- | SQL types of the input columns, in the order expected by
-- 'queryInsert': typename, user_id, parent_id, name, hyperdata.
inputSqlTypes :: [Text]
inputSqlTypes = DT.pack <$> ["int4", "int4", "int4", "text", "jsonb"]
125
126 -- | SQL query to insert documents inside the database
-- | SQL query to insert documents inside the database.
--
-- Uses a writable CTE: @input_rows@ receives the VALUES placeholder,
-- @ins@ attempts the insert and relies on the unique index
-- (typename, parent_id, hyperdata ->> 'uniqId') to skip duplicates
-- (ON CONFLICT ... DO NOTHING).  The final SELECT returns the newly
-- inserted rows (source = true) unioned with the pre-existing rows
-- that conflicted (source = false), each with its uniqId.
--
-- NOTE(review): the final JOIN matches on the whole @hyperdata@ column
-- rather than the unique-index columns — presumably adequate because
-- uniqId is embedded in hyperdata; verify against the index definition.
queryInsert :: Query
queryInsert = [sql|
WITH input_rows(typename,user_id,parent_id,name,hyperdata) AS (?)
, ins AS (
INSERT INTO nodes (typename,user_id,parent_id,name,hyperdata)
SELECT * FROM input_rows
ON CONFLICT (typename, parent_id, (hyperdata ->> 'uniqId')) DO NOTHING -- on unique index
RETURNING id,hyperdata
)

SELECT true AS source -- true for 'newly inserted'
, id
, hyperdata ->> 'uniqId' as doi
FROM ins
UNION ALL
SELECT false AS source -- false for 'not inserted'
, c.id
, hyperdata ->> 'uniqId' as doi
FROM input_rows
JOIN nodes c USING (hyperdata); -- columns of unique index
|]
148
-- | Turn hyperdata documents into rows ready for 'queryInsert'.
-- Each row carries the Document node type id, the inserting user, the
-- parent folder, a fixed node name ("Doc"), and the JSON-encoded
-- hyperdata extended with its uniqueness hash ('addUniqId').
prepare :: UserId -> ParentId -> [HyperdataDocument] -> [InputData]
prepare userId parentId docs = toInput <$> docs
  where
    typeId    = nodeTypeId NodeDocument
    toInput h = InputData typeId userId parentId (DT.pack "Doc") (toJSON $ addUniqId h)
153
154 ------------------------------------------------------------------------
155 -- * Main Types used
156
157 -- ** Return Types
158
159 -- | When documents are inserted
160 -- ReturnType after insertion
-- | Result row produced by 'queryInsert' for each input document.
data ReturnId = ReturnId { reInserted :: Bool -- ^ True: the document was newly inserted; False: it already existed
                         , reId :: Int -- ^ node id of the document in the database,
                                       -- returned whether the row is new or pre-existing
                         , reUniqId :: Maybe Text -- ^ hash id built from the concatenated 'hashParameters'
                         } deriving (Show, Generic)
166
-- | Parse a 'ReturnId' from a result row: (source, id, doi) — the
-- column order of the SELECTs in 'queryInsert'.
instance FromRow ReturnId where
fromRow = ReturnId <$> field <*> field <*> field
169
170 -- ** Insert Types
171
-- | Id of the user performing the insertion.
type UserId = Int
-- | Id of the folder node that will own the inserted documents.
type ParentId = Int
174
-- | One row of input for 'queryInsert'; field order mirrors the SQL
-- columns (typename, user_id, parent_id, name, hyperdata).
data InputData = InputData { inTypenameId :: NodeTypeId -- ^ node type (here: Document)
                           , inUserId :: UserId -- ^ inserting user
                           , inParentId :: ParentId -- ^ owning folder node
                           , inName :: Text -- ^ node name (fixed to "Doc" by 'prepare')
                           , inHyper :: Value -- ^ JSON hyperdata, including uniqId
                           } deriving (Show, Generic, Typeable)
181
-- | Serialise an 'InputData' in the column order expected by
-- 'queryInsert': typename, user_id, parent_id, name, hyperdata.
instance ToRow InputData where
  toRow (InputData typeId userId parentId name hyper) =
    [ toField typeId
    , toField userId
    , toField parentId
    , toField name
    , toField hyper
    ]
189
190 ---------------------------------------------------------------------------
191 -- * Uniqueness of document definition
192
-- | Accessors whose concatenated values determine a document's
-- uniqueness hash; a missing field contributes the empty string.
hashParameters :: [(HyperdataDocument -> Text)]
hashParameters = map (orEmpty .) [ _hyperdataDocument_title
                                 , _hyperdataDocument_abstract
                                 , _hyperdataDocument_source
                                 , _hyperdataDocument_publication_date
                                 ]
  where
    -- Nothing -> "", Just t -> t
    orEmpty = maybe (DT.pack "") identity
201
-- | Store the uniqueness hash of a document in its uniqId field.
-- The hash covers exactly the fields selected by 'hashParameters'.
addUniqId :: HyperdataDocument -> HyperdataDocument
addUniqId doc =
  let fingerprint = DT.concat [ param doc | param <- hashParameters ]
  in  set hyperdataDocument_uniqId (Just (uniqId fingerprint)) doc
206
-- | SHA256 hex digest of a 'Text' value, rendered back as 'Text'.
uniqId :: Text -> Text
uniqId txt = DT.pack (SHA.showDigest (SHA.sha256 (DC.pack (DT.unpack txt))))
209
210 ---------------------------------------------------------------------------
211