]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Node/Document/Import.hs
[MERGE]
[gargantext.git] / src / Gargantext / Database / Node / Document / Import.hs
1 {-|
2 Module : Gargantext.Database.Node.Document.Import
3 Description : Importing context of texts (documents)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 * Purpose of this module
11
12 Enabling "common goods" of text data and respecting privacy.
13
14 Gargantext shares as "common good" the links between context of texts
15 and terms / words / ngrams.
16
17 Basically a context of text can be defined as a document (see 'Gargantext.Text').
18
19 Issue to tackle in that module: each global document of Gargantext has
20 to be unique, then shared, but how to respect privacy if needed ?
21
22
23 * Methodology to get uniqueness and privacy by design
24
25 As a consequence, when importing a new document in Gargantext, a policy
for the uniqueness of the inserted documents has to be defined.
27
28 That is the purpose of this module which defines its main concepts.
29
The unique identifier in the database is a 3-tuple of policies that
together define uniqueness:
32
33 - Design policy: type of node is needed as TypenameId, that is a
34 Document or Individual or something else;
35
36 - Privacy policy: with ParentId, parent becomes unique, then it enables
37 users to get their own copy without sharing it with all the users of the
database (in other words parent_id is necessary to preserve privacy for
39 instance).
40
41 - Hash policy: this UniqId is a sha256 uniq id which is the result of
42 the concatenation of the parameters defined by @hashParameters@.
43
> -- * Example
> insertTest :: IO [ReturnId]
47 > insertTest = connectGargandb "gargantext.ini"
48 > >>= \conn -> insertDocuments conn 1 452162 hyperdataDocuments
49
50 -}
51 ------------------------------------------------------------------------
52 {-# LANGUAGE DeriveGeneric #-}
53 {-# LANGUAGE NoImplicitPrelude #-}
54 {-# LANGUAGE QuasiQuotes #-}
55 {-# LANGUAGE DeriveDataTypeable #-}
56 {-# LANGUAGE FlexibleInstances #-}
57 {-# LANGUAGE TypeSynonymInstances #-}
58 ------------------------------------------------------------------------
59 module Gargantext.Database.Node.Document.Import where
60
61 import Control.Lens (set)
62
63 import Data.Aeson (toJSON, Value)
64 import Data.ByteString.Internal (ByteString)
65 import Data.Maybe (maybe)
66 import Data.Typeable (Typeable)
67 import Database.PostgreSQL.Simple (Connection, FromRow, Query, formatQuery, query, Only(..))
68 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
69 import Database.PostgreSQL.Simple.SqlQQ
70 import Database.PostgreSQL.Simple.ToField (toField)
71 import Database.PostgreSQL.Simple.ToRow (ToRow(..))
72 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
73
import Data.Text (Text)
import qualified Data.Text as DT (pack, unpack, concat)
import qualified Data.Text.Encoding as TE (encodeUtf8)
import qualified Data.Digest.Pure.SHA as SHA (sha256, showDigest)
import qualified Data.ByteString.Lazy as DBL (fromStrict)
import qualified Data.ByteString.Lazy.Char8 as DC (pack)
78
79 import Gargantext.Database.Config (nodeTypeId)
80 import Gargantext.Database.Node (mkCmd, Cmd(..))
81 import Gargantext.Database.Types.Node
82 -- TODO : the import of Document constructor below does not work
83 -- import Gargantext.Database.Types.Node (Document)
84 --import Gargantext.Database.Types.Node (docExample, hyperdataDocument, HyperdataDocument(..)
85 -- , hyperdataDocument_uniqId
86 -- , hyperdataDocument_title
87 -- , hyperdataDocument_abstract
88 -- , hyperdataDocument_source
89 -- , Node(..), node_typename
90 -- , node_userId
91 -- , node_parentId, node_name, node_hyperdata, hyperdataDocuments
92 -- , NodeTypeId
93 -- )
94 import Gargantext.Prelude
95
96 import GHC.Generics (Generic)
97 ---------------------------------------------------------------------------
98 -- * Main Insert functions
99
100 -- ** Database configuration
101 -- Administrator of the database has to create a uniq index as following SQL command:
102 -- `create unique index on nodes (typename, parent_id, (hyperdata ->> 'uniqId'));`
103
-- | Insert a batch of documents into the database.
--
-- * 'UserId': the user performing the insertion
-- * 'ParentId': the folder node the documents are attached to
--
-- Returns one 'ReturnId' per returned row, telling whether each document
-- was newly inserted or already present (see 'queryInsert').
insertDocuments :: UserId -> ParentId -> [HyperdataDocument] -> Cmd [ReturnId]
insertDocuments uId pId hs =
  mkCmd $ \conn -> query conn queryInsert (Only $ Values sqlFields rows)
  where
    -- SQL types of the VALUES columns, qualified for postgresql-simple
    sqlFields = map (QualifiedIdentifier Nothing) inputSqlTypes
    -- one InputData row per document, hash-stamped and JSON-encoded
    rows      = prepare uId pId hs
112
-- | Debug variant of 'insertDocuments'.
--
-- Instead of executing the insertion, renders the SQL that would be sent
-- (via @formatQuery@) so it can be inspected.
insertDocuments_Debug :: Connection -> UserId -> ParentId -> [HyperdataDocument] -> IO ByteString
insertDocuments_Debug conn uId pId hs =
  formatQuery conn queryInsert (Only $ Values sqlFields rows)
  where
    -- same shape as in 'insertDocuments': column types then row data
    sqlFields = map (QualifiedIdentifier Nothing) inputSqlTypes
    rows      = prepare uId pId hs
121
122
-- | SQL types of the input columns, in the order expected by
-- 'queryInsert': typename, user_id, parent_id, name, hyperdata.
inputSqlTypes :: [Text]
inputSqlTypes = [ DT.pack sqlType | sqlType <- ["int4", "int4", "int4", "text", "jsonb"] ]
126
-- | SQL query to insert documents inside the database
--
-- The single @?@ placeholder is bound to a VALUES list of 'InputData'
-- rows.  Rows colliding with the unique index on
-- (typename, parent_id, hyperdata ->> 'uniqId') are skipped by
-- ON CONFLICT DO NOTHING; the final SELECT reports, for every input row,
-- whether it was newly inserted (true) or already present (false),
-- together with its node id and its uniqId hash.
--
-- NOTE(review): the "not inserted" branch joins back on the whole
-- @hyperdata@ jsonb column (USING (hyperdata)) rather than on the
-- unique-index columns themselves — presumably equivalent here because
-- uniqId is embedded in hyperdata, but confirm against the index
-- definition; it also looks like a candidate for optimization.
queryInsert :: Query
queryInsert = [sql|
    WITH input_rows(typename,user_id,parent_id,name,hyperdata) AS (?)
    , ins AS (
       INSERT INTO nodes (typename,user_id,parent_id,name,hyperdata)
       SELECT * FROM input_rows
       ON CONFLICT (typename, parent_id, (hyperdata ->> 'uniqId')) DO NOTHING -- on unique index
       RETURNING id,hyperdata
       )

    SELECT true AS source -- true for 'newly inserted'
         , id
         , hyperdata ->> 'uniqId' as doi
    FROM ins
    UNION ALL
    SELECT false AS source -- false for 'not inserted'
         , c.id
         , hyperdata ->> 'uniqId' as doi
    FROM input_rows
    JOIN nodes c USING (hyperdata); -- columns of unique index
|]
149
-- | Turn raw documents into insertable rows: each one is tagged with the
-- Document node type, the inserting user, the target folder and its
-- content hash ('addUniqId'), and its hyperdata is serialised to JSON.
prepare :: UserId -> ParentId -> [HyperdataDocument] -> [InputData]
prepare uId pId hs =
  [ InputData docTypeId uId pId docName (toJSON (addUniqId h)) | h <- hs ]
  where
    -- constant per call: node type id for documents, and the node name
    docTypeId = nodeTypeId NodeDocument
    docName   = DT.pack "Doc"
154
155 ------------------------------------------------------------------------
156 -- * Main Types used
157
158 -- ** Return Types
159
-- | Result of one attempted document insertion: one 'ReturnId' per row
-- returned by 'queryInsert', whether the document was new or not.
data ReturnId = ReturnId { reInserted :: Bool -- ^ if the document is inserted (True: is new, False: is not new)
                         , reId :: Int -- ^ always return the id of the document (even new or not new)
                                       -- this is the uniq id in the database
                         , reUniqId :: Maybe Text -- ^ Hash Id with concatenation of hash parameters
                         } deriving (Show, Generic)
167
-- | Decode a 'queryInsert' result row in column order:
-- inserted flag, node id, uniqId hash.
instance FromRow ReturnId where
  fromRow = do
    inserted <- field
    nodeId   <- field
    hashId   <- field
    pure $ ReturnId inserted nodeId hashId
170
-- ** Insert Types

-- | Id of the user performing the insertion (column @user_id@ of @nodes@).
type UserId = Int

-- | Id of the folder node the documents are attached to
-- (column @parent_id@ of @nodes@).
type ParentId = Int
175
-- | One row of the VALUES list spliced into 'queryInsert'.
-- Field order must match the (typename,user_id,parent_id,name,hyperdata)
-- column list of the query and the SQL types in 'inputSqlTypes'.
data InputData = InputData { inTypenameId :: NodeTypeId -- ^ node type (Document)
                           , inUserId :: UserId
                           , inParentId :: ParentId
                           , inName :: Text -- ^ node name (set to "Doc" by 'prepare')
                           , inHyper :: Value -- ^ JSON of the document, uniqId included
                           } deriving (Show, Generic, Typeable)
182
-- | Serialise an 'InputData' in the column order expected by 'queryInsert'.
instance ToRow InputData where
  toRow (InputData tId uId pId name hyper) =
    [ toField tId
    , toField uId
    , toField pId
    , toField name
    , toField hyper
    ]
190
191 ---------------------------------------------------------------------------
192 -- * Uniqueness of document definition
193
-- | Projections of a document that feed its content hash, listed in the
-- order in which they are concatenated; a missing field hashes as "".
hashParameters :: [(HyperdataDocument -> Text)]
hashParameters = [ orEmpty . _hyperdataDocument_title
                 , orEmpty . _hyperdataDocument_abstract
                 , orEmpty . _hyperdataDocument_source
                 , orEmpty . _hyperdataDocument_publication_date
                 ]
  where
    -- default absent fields to the empty text
    orEmpty = maybe (DT.pack "") identity
202
-- | Stamp a document with its content hash — the value the database's
-- unique index on @hyperdata ->> 'uniqId'@ deduplicates on.
addUniqId :: HyperdataDocument -> HyperdataDocument
addUniqId doc = set hyperdataDocument_uniqId (Just contentHash) doc
  where
    -- concatenate every hash parameter of the document, then hash
    contentHash = uniqId (DT.concat [ param doc | param <- hashParameters ])
207
-- | sha256 hash of a 'Text', rendered as lowercase hexadecimal.
--
-- The input is encoded to UTF-8 before hashing.  The previous
-- implementation went through Char8 'DC.pack', which keeps only the low
-- byte of each code point: any two texts whose characters agree modulo
-- 256 hashed identically, defeating uniqueness for non-Latin-1 content.
-- For pure ASCII input the resulting hash is unchanged; for non-ASCII
-- input it differs from the old (lossy) value.
uniqId :: Text -> Text
uniqId = DT.pack . SHA.showDigest . SHA.sha256 . DBL.fromStrict . TE.encodeUtf8
210
211 ---------------------------------------------------------------------------
212