]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Node/Document/Insert.hs
[DB][FLOW] fix duplicate ngrams insertion.
[gargantext.git] / src / Gargantext / Database / Node / Document / Insert.hs
1 {-|
2 Module : Gargantext.Database.Node.Document.Insert
3 Description : Importing context of texts (documents)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 * Purpose of this module
11
12 Enabling "common goods" of text data and respecting privacy.
13
14 Gargantext shares as "common good" the links between context of texts
15 and terms / words / ngrams.
16
17 Basically a context of text can be defined as a document (see 'Gargantext.Text').
18
19 Issue to tackle in that module: each global document of Gargantext has
20 to be unique, then shared, but how to respect privacy if needed ?
21
22
23 * Methodology to get uniqueness and privacy by design
24
25 As a consequence, when importing/inserting a new document in Gargantext,
26 a policy for the uniqueness of the inserted docuemnts has to be defined.
27
28 That is the purpose of this module which defines its main concepts.
29
30 Unique identifier in database is of a 3-tuple of 3 policies that
31 together define uniqueness:
32
33 - Design policy: type of node is needed as TypenameId, that is a
34 Document or Individual or something else;
35
36 - Privacy policy: with ParentId, parent becomes unique, then it enables
37 users to get their own copy without sharing it with all the users of the
38 database (in others words parent_id is necessary to preserve privacy for
39 instance).
40
41 - Hash policy: this UniqId is a sha256 uniq id which is the result of
42 the concatenation of the parameters defined by @hashParameters@.
43
44 > -- * Example
45 > insertTest :: FromRow r => CorpusId -> [Node HyperdataDocument] -> IO [r]
46 > insertTest :: IO [ReturnId]
47 > insertTest = connectGargandb "gargantext.ini"
48 > >>= \conn -> insertDocuments conn 1 452162 hyperdataDocuments
49
50 -}
51 ------------------------------------------------------------------------
52 {-# LANGUAGE DeriveDataTypeable #-}
53 {-# LANGUAGE DeriveGeneric #-}
54 {-# LANGUAGE FlexibleInstances #-}
55 {-# LANGUAGE NoImplicitPrelude #-}
56 {-# LANGUAGE OverloadedStrings #-}
57 {-# LANGUAGE QuasiQuotes #-}
58 {-# LANGUAGE TypeSynonymInstances #-}
59 ------------------------------------------------------------------------
60 module Gargantext.Database.Node.Document.Insert where
61
62 import Control.Lens (set)
63 import Data.Aeson (toJSON, Value)
64 import Data.Maybe (maybe)
65 import Data.Text (Text)
66 import Data.Typeable (Typeable)
67 import Database.PostgreSQL.Simple (FromRow, Query, query, Only(..))
68 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
69 import Database.PostgreSQL.Simple.SqlQQ
70 import Database.PostgreSQL.Simple.ToField (toField)
71 import Database.PostgreSQL.Simple.ToRow (ToRow(..))
72 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
73 import GHC.Generics (Generic)
74 import Gargantext.Database.Config (nodeTypeId)
75 import Gargantext.Database.Node (mkCmd, Cmd(..))
76 import Gargantext.Database.Node.Contact (HyperdataContact(..))
77 import Gargantext.Database.Types.Node
78 import Gargantext.Prelude
79 import qualified Data.ByteString.Lazy.Char8 as DC (pack)
80 import qualified Data.Digest.Pure.SHA as SHA (sha256, showDigest)
81 import qualified Data.Text as DT (pack, unpack, concat, take)
82
83 -- TODO : the import of Document constructor below does not work
84 -- import Gargantext.Database.Types.Node (Document)
85 --import Gargantext.Database.Types.Node (docExample, hyperdataDocument, HyperdataDocument(..)
86 -- , hyperdataDocument_uniqId
87 -- , hyperdataDocument_title
88 -- , hyperdataDocument_abstract
89 -- , hyperdataDocument_source
90 -- , Node(..), node_typename
91 -- , node_userId
92 -- , node_parentId, node_name, node_hyperdata, hyperdataDocuments
93 -- , NodeTypeId
94 -- )
95 {-| To Print result query
96 import Data.ByteString.Internal (ByteString)
97 import Database.PostgreSQL.Simple (formatQuery)
98 -}
99
100 ---------------------------------------------------------------------------
101 -- * Main Insert functions
102
103 -- ** Database configuration
104 -- Administrator of the database has to create a uniq index as following SQL command:
105 -- `create unique index on nodes (typename, parent_id, (hyperdata ->> 'uniqId'));`
106
107 -- | Insert Document main function
108 -- UserId : user who is inserting the documents
109 -- ParentId : folder ID which is parent of the inserted documents
110
111
112 data ToDbData = ToDbDocument HyperdataDocument | ToDbContact HyperdataContact
113
114 insertDocuments :: UserId -> ParentId -> [ToDbData] -> Cmd [ReturnId]
115 insertDocuments uId pId hs = mkCmd $ \c -> query c queryInsert (Only $ Values fields $ prepare uId pId hs)
116 where
117 fields = map (\t-> QualifiedIdentifier Nothing t) inputSqlTypes
118
119 -- | Debug SQL function
120 --
121 -- to print rendered query (Debug purpose) use @formatQuery@ function.
122 {-
123 insertDocuments_Debug :: (Hyperdata a, ToJSON a, ToRow a) => UserId -> ParentId -> [a] -> Cmd ByteString
124 insertDocuments_Debug uId pId hs = mkCmd $ \conn -> formatQuery conn queryInsert (Only $ Values fields inputData)
125 where
126 fields = map (\t-> QualifiedIdentifier Nothing t) inputSqlTypes
127 inputData = prepare uId pId hs
128 -}
129
130
131 -- | Input Tables: types of the tables
132 inputSqlTypes :: [Text]
133 inputSqlTypes = map DT.pack ["int4","int4","int4","text","jsonb"]
134
135 -- | SQL query to insert documents inside the database
136 queryInsert :: Query
137 queryInsert = [sql|
138 WITH input_rows(typename,user_id,parent_id,name,hyperdata) AS (?)
139 , ins AS (
140 INSERT INTO nodes (typename,user_id,parent_id,name,hyperdata)
141 SELECT * FROM input_rows
142 ON CONFLICT ((hyperdata ->> 'uniqIdBdd')) DO NOTHING -- on unique index
143 -- ON CONFLICT (typename, parent_id, (hyperdata ->> 'uniqId')) DO NOTHING -- on unique index
144 RETURNING id,hyperdata
145 )
146
147 SELECT true AS source -- true for 'newly inserted'
148 , id
149 , hyperdata ->> 'uniqId' as doi
150 FROM ins
151 UNION ALL
152 SELECT false AS source -- false for 'not inserted'
153 , c.id
154 , hyperdata ->> 'uniqId' as doi
155 FROM input_rows
156 JOIN nodes c USING (hyperdata); -- columns of unique index
157 |]
158
159 prepare :: UserId -> ParentId -> [ToDbData] -> [InputData]
160 prepare uId pId = map (\h -> InputData tId uId pId (name h) (toJSON' h))
161 where
162 tId = nodeTypeId NodeDocument
163
164 toJSON' (ToDbDocument hd) = toJSON hd
165 toJSON' (ToDbContact hc) = toJSON hc
166
167 name h = DT.take 255 <$> maybe "No Title" identity $ f h
168 where
169 f (ToDbDocument hd) = _hyperdataDocument_title hd
170 f (ToDbContact _ ) = Just "Contact" -- TODO view FirstName . LastName
171
172 ------------------------------------------------------------------------
173 -- * Main Types used
174
175 -- ** Return Types
176
177 -- | When documents are inserted
178 -- ReturnType after insertion
179 data ReturnId = ReturnId { reInserted :: Bool -- ^ if the document is inserted (True: is new, False: is not new)
180 , reId :: Int -- ^ always return the id of the document (even new or not new)
181 -- this is the uniq id in the database
182 , reUniqId :: Text -- ^ Hash Id with concatenation of hash parameters
183 } deriving (Show, Generic)
184
185 instance FromRow ReturnId where
186 fromRow = ReturnId <$> field <*> field <*> field
187
188 -- ** Insert Types
189
190 type UserId = Int
191 type ParentId = Int
192
193 data InputData = InputData { inTypenameId :: NodeTypeId
194 , inUserId :: UserId
195 , inParentId :: ParentId
196 , inName :: Text
197 , inHyper :: Value
198 } deriving (Show, Generic, Typeable)
199
200 instance ToRow InputData where
201 toRow inputData = [ toField (inTypenameId inputData)
202 , toField (inUserId inputData)
203 , toField (inParentId inputData)
204 , toField (inName inputData)
205 , toField (inHyper inputData)
206 ]
207
208 ---------------------------------------------------------------------------
209 -- * Uniqueness of document definition
210
211 addUniqIds :: HyperdataDocument -> HyperdataDocument
212 addUniqIds doc = set hyperdataDocument_uniqIdBdd (Just hashBdd)
213 $ set hyperdataDocument_uniqId (Just hash) doc
214 where
215 hash = uniqId $ DT.concat $ map ($ doc) hashParameters
216 hashBdd = uniqId $ DT.concat $ map ($ doc) ([(\d -> maybe' (_hyperdataDocument_bdd d))] <> hashParameters)
217
218 uniqId :: Text -> Text
219 uniqId = DT.pack . SHA.showDigest . SHA.sha256 . DC.pack . DT.unpack
220
221
222 hashParameters :: [(HyperdataDocument -> Text)]
223 hashParameters = [ \d -> maybe' (_hyperdataDocument_title d)
224 , \d -> maybe' (_hyperdataDocument_abstract d)
225 , \d -> maybe' (_hyperdataDocument_source d)
226 , \d -> maybe' (_hyperdataDocument_publication_date d)
227 ]
228
229 maybe' :: Maybe Text -> Text
230 maybe' = maybe (DT.pack "") identity
231
232 ---------------------------------------------------------------------------
233