]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Node/Document/Insert.hs
[API][NGRAMS-TABLE] WIP for meeting (adding file).
[gargantext.git] / src / Gargantext / Database / Node / Document / Insert.hs
1 {-|
2 Module : Gargantext.Database.Node.Document.Insert
3 Description : Importing context of texts (documents)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 * Purpose of this module
11
12 Enabling "common goods" of text data and respecting privacy.
13
14 Gargantext shares as "common good" the links between context of texts
15 and terms / words / ngrams.
16
17 Basically a context of text can be defined as a document (see 'Gargantext.Text').
18
19 Issue to tackle in that module: each global document of Gargantext has
20 to be unique, then shared, but how to respect privacy if needed ?
21
22
23 * Methodology to get uniqueness and privacy by design
24
25 As a consequence, when importing/inserting a new document in Gargantext,
26 a policy for the uniqueness of the inserted documents has to be defined.
27
28 That is the purpose of this module which defines its main concepts.
29
30 The unique identifier in the database is a 3-tuple of 3 policies that
31 together define uniqueness:
32
33 - Design policy: type of node is needed as TypenameId, that is a
34 Document or Individual or something else;
35
36 - Privacy policy: with ParentId, parent becomes unique, then it enables
37 users to get their own copy without sharing it with all the users of the
38 database (in other words parent_id is necessary to preserve privacy for
39 instance).
40
41 - Hash policy: this UniqId is a sha256 uniq id which is the result of
42 the concatenation of the parameters defined by @hashParameters@.
43
44 > -- * Example
45 > insertTest :: FromRow r => CorpusId -> [Node HyperdataDocument] -> IO [r]
46 > insertTest :: IO [ReturnId]
47 > insertTest = connectGargandb "gargantext.ini"
48 > >>= \conn -> insertDocuments conn 1 452162 hyperdataDocuments
49
50 -}
51 ------------------------------------------------------------------------
52 {-# LANGUAGE DeriveDataTypeable #-}
53 {-# LANGUAGE DeriveGeneric #-}
54 {-# LANGUAGE FlexibleInstances #-}
55 {-# LANGUAGE NoImplicitPrelude #-}
56 {-# LANGUAGE OverloadedStrings #-}
57 {-# LANGUAGE QuasiQuotes #-}
58 {-# LANGUAGE TypeSynonymInstances #-}
59 ------------------------------------------------------------------------
60 module Gargantext.Database.Node.Document.Insert where
61
62 import Control.Lens (set)
63
64 import Data.Aeson (toJSON, Value)
65 import Data.ByteString.Internal (ByteString)
66 import Data.Maybe (maybe)
67 import Data.Typeable (Typeable)
68 import Database.PostgreSQL.Simple (FromRow, Query, formatQuery, query, Only(..))
69 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
70 import Database.PostgreSQL.Simple.SqlQQ
71 import Database.PostgreSQL.Simple.ToField (toField)
72 import Database.PostgreSQL.Simple.ToRow (ToRow(..))
73 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
74
75 import Data.Text (Text)
76 import qualified Data.Text as DT (pack, unpack, concat)
77 import qualified Data.Digest.Pure.SHA as SHA (sha256, showDigest)
78 import qualified Data.ByteString.Lazy.Char8 as DC (pack)
79
80 import Gargantext.Database.Config (nodeTypeId)
81 import Gargantext.Database.Node (mkCmd, Cmd(..))
82 import Gargantext.Database.Types.Node
83 -- TODO : the import of Document constructor below does not work
84 -- import Gargantext.Database.Types.Node (Document)
85 --import Gargantext.Database.Types.Node (docExample, hyperdataDocument, HyperdataDocument(..)
86 -- , hyperdataDocument_uniqId
87 -- , hyperdataDocument_title
88 -- , hyperdataDocument_abstract
89 -- , hyperdataDocument_source
90 -- , Node(..), node_typename
91 -- , node_userId
92 -- , node_parentId, node_name, node_hyperdata, hyperdataDocuments
93 -- , NodeTypeId
94 -- )
95 import Gargantext.Prelude
96
97 import GHC.Generics (Generic)
98 ---------------------------------------------------------------------------
99 -- * Main Insert functions
100
101 -- ** Database configuration
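102 -- The administrator of the database has to create a unique index matching the ON CONFLICT target used by 'queryInsert', e.g. with the following SQL command:
103 -- `create unique index on nodes ((hyperdata ->> 'uniqIdBdd'));` (or, for the alternative target kept in comments there: `create unique index on nodes (typename, parent_id, (hyperdata ->> 'uniqId'));`)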
104
-- | Insert a batch of documents as nodes, in a single 'queryInsert' call.
--
-- * @uId@: user who is inserting the documents
-- * @pId@: folder ID which becomes the parent of the inserted documents
-- * @hs@ : the documents; they are serialized as given ('prepare' only calls
--          'toJSON'), so this function does not itself call 'addUniqIds'.
--
-- The result holds one 'ReturnId' per row returned by 'queryInsert'.
insertDocuments :: UserId -> ParentId -> [HyperdataDocument] -> Cmd [ReturnId]
insertDocuments uId pId hs = mkCmd runInsert
  where
    -- Run the insert on the connection handed over by 'mkCmd'.
    runInsert conn = query conn queryInsert (Only rows)
    -- Typed VALUES table substituted for the single placeholder of the query.
    rows           = Values columnTypes (prepare uId pId hs)
    columnTypes    = map (QualifiedIdentifier Nothing) inputSqlTypes
113
-- | Debugging counterpart of 'insertDocuments'.
--
-- Builds exactly the same parameters but hands them to @formatQuery@
-- instead of @query@, so the result is the rendered SQL as a 'ByteString'
-- rather than the rows returned by the database.
insertDocuments_Debug :: UserId -> ParentId -> [HyperdataDocument] -> Cmd ByteString
insertDocuments_Debug uId pId hs = mkCmd render
  where
    render c = formatQuery c queryInsert (Only (Values sqlTypes payload))
    sqlTypes = map (QualifiedIdentifier Nothing) inputSqlTypes
    payload  = prepare uId pId hs
122
123
-- | SQL type names of the columns of the input table, in the order of the
-- fields produced by the 'ToRow' instance of 'InputData'
-- (typename, user_id, parent_id, name, hyperdata).
inputSqlTypes :: [Text]
inputSqlTypes =
  [ "int4"   -- typename
  , "int4"   -- user_id
  , "int4"   -- parent_id
  , "text"   -- name
  , "jsonb"  -- hyperdata
  ]
127
-- | SQL query to insert documents inside the database.
--
-- The single placeholder receives the typed VALUES table built from
-- 'InputData' rows. Rows whose @hyperdata ->> 'uniqIdBdd'@ already exists
-- are skipped by the @ON CONFLICT ... DO NOTHING@ clause.
--
-- The query returns three columns, read back as a 'ReturnId':
--
-- 1. a boolean: true for a newly inserted row, false for an input row that
--    matches an already existing node;
-- 2. the node id (new or pre-existing);
-- 3. the @uniqId@ found in the hyperdata.
--
-- The second branch joins the input rows to the existing nodes on the same
-- expression as the conflict target (@hyperdata ->> 'uniqIdBdd'@). Joining
-- on the whole @hyperdata@ value would miss rows that were rejected by the
-- conflict clause but whose JSON differs in any other field, so such rows
-- would be reported neither as inserted nor as already present.
queryInsert :: Query
queryInsert = [sql|
    WITH input_rows(typename,user_id,parent_id,name,hyperdata) AS (?)
    , ins AS (
       INSERT INTO nodes (typename,user_id,parent_id,name,hyperdata)
       SELECT * FROM input_rows
       ON CONFLICT ((hyperdata ->> 'uniqIdBdd')) DO NOTHING -- on unique index
       -- ON CONFLICT (typename, parent_id, (hyperdata ->> 'uniqId')) DO NOTHING -- on unique index
       RETURNING id,hyperdata
       )

    SELECT true AS source                     -- true for 'newly inserted'
         , id
         , hyperdata ->> 'uniqId'  as doi
    FROM   ins
    UNION  ALL
    SELECT false AS source                    -- false for 'not inserted'
         , c.id
         , i.hyperdata ->> 'uniqId' as doi
    FROM   input_rows i
    JOIN   nodes c
      ON   (c.hyperdata ->> 'uniqIdBdd') = (i.hyperdata ->> 'uniqIdBdd'); -- expression of the unique index
           |]
151
-- | Turn documents into the rows sent to 'queryInsert'.
--
-- Every row gets the node type id of 'NodeDocument', the given user and
-- parent ids, the document title as node name (or a fixed placeholder when
-- the title is 'Nothing') and the document itself encoded with 'toJSON'.
prepare :: UserId -> ParentId -> [HyperdataDocument] -> [InputData]
prepare uId pId = map toInput
  where
    tId = nodeTypeId NodeDocument

    toInput h = InputData tId uId pId (titleOf h) (toJSON h)

    -- Node name: the title when present, a placeholder otherwise.
    titleOf h = case _hyperdataDocument_title h of
      Nothing -> "No Title of Document"
      Just t  -> t
159
160 ------------------------------------------------------------------------
161 -- * Main Types used
162
163 -- ** Return Types
164
-- | One row of the result of 'queryInsert', i.e. what is known about a
-- document after an insertion attempt.
data ReturnId = ReturnId
  { reInserted :: Bool  -- ^ 'True' when the document is new (it has just been
                        --   inserted), 'False' when it is not new
  , reId       :: Int   -- ^ id of the document in the database, returned
                        --   whether the document is new or not
  , reUniqId   :: Text  -- ^ hash id built from the concatenation of the hash
                        --   parameters
  } deriving (Show, Generic)
172
-- | Reads the three result columns in order: inserted flag, id, uniq id.
instance FromRow ReturnId where
  fromRow =
    ReturnId
      <$> field  -- reInserted
      <*> field  -- reId
      <*> field  -- reUniqId
175
176 -- ** Insert Types
177
-- | Identifier of the user who inserts the documents.
type UserId   = Int

-- | Identifier of the parent node (folder) of the inserted documents.
type ParentId = Int
180
-- | One row of the input table sent to the database; the field order is the
-- column order used by the 'ToRow' instance.
data InputData = InputData
  { inTypenameId :: NodeTypeId  -- ^ node type id
  , inUserId     :: UserId      -- ^ inserting user
  , inParentId   :: ParentId    -- ^ parent node
  , inName       :: Text        -- ^ node name
  , inHyper      :: Value       -- ^ JSON-encoded hyperdata
  } deriving (Show, Generic, Typeable)
187
-- | Serializes the fields in declaration order:
-- type id, user id, parent id, name, hyperdata.
instance ToRow InputData where
  toRow row = [typeF, userF, parentF, nameF, hyperF]
    where
      typeF   = toField (inTypenameId row)
      userF   = toField (inUserId row)
      parentF = toField (inParentId row)
      nameF   = toField (inName row)
      hyperF  = toField (inHyper row)
195
196 ---------------------------------------------------------------------------
197 -- * Uniqueness of document definition
198
-- | Fill in both uniqueness hashes of a document.
--
-- * @uniqId@ is the 'uniqId' digest of the concatenation of the fields
--   listed in 'hashParameters';
-- * @uniqIdBdd@ is the same digest computed with the @bdd@ field prepended
--   to that list.
--
-- Fields that are 'Nothing' contribute the empty text.
addUniqIds :: HyperdataDocument -> HyperdataDocument
addUniqIds doc = set hyperdataDocument_uniqIdBdd (Just bddDigest) withPlainDigest
  where
    withPlainDigest = set hyperdataDocument_uniqId (Just plainDigest) doc
    plainDigest     = digestOf hashParameters
    bddDigest       = digestOf (bddField : hashParameters)
    bddField d      = maybe' (_hyperdataDocument_bdd d)
    -- Apply every extractor to the document, glue the pieces, hash the result.
    digestOf fields = uniqId (DT.concat (map (\f -> f doc) fields))
205
-- | SHA-256 digest of a text, rendered back as text with @SHA.showDigest@.
--
-- NOTE(review): the text goes through @DT.unpack@ and the Char8 @DC.pack@,
-- which (per the bytestring documentation) keeps only the low 8 bits of each
-- character. Texts that differ only in characters above code point 255 may
-- therefore hash to the same value. Switching to a UTF-8 encoding would
-- change the digests of already stored documents, so confirm before changing.
uniqId :: Text -> Text
uniqId txt = DT.pack (SHA.showDigest digest)
  where
    digest = SHA.sha256 (DC.pack (DT.unpack txt))
208
209
-- | Field extractors whose results are concatenated, in this order, to build
-- the uniqueness hash of a document. A field that is 'Nothing' yields the
-- empty text.
hashParameters :: [(HyperdataDocument -> Text)]
hashParameters = map (maybe' .)
  [ _hyperdataDocument_title
  , _hyperdataDocument_abstract
  , _hyperdataDocument_source
  , _hyperdataDocument_publication_date
  ]
216
-- | Unwrap an optional text, using the empty text for 'Nothing'.
maybe' :: Maybe Text -> Text
maybe' Nothing  = DT.pack ""
maybe' (Just t) = t
219
220 ---------------------------------------------------------------------------
221