{-|
Module      : Gargantext.Database.Node.Document.Insert
Description : Importing context of texts (documents)
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

* Purpose of this module

Enabling the "common good" of text data while respecting privacy.

Gargantext shares as a "common good" the links between contexts of texts
and terms / words / ngrams.

Basically, a context of text can be defined as a document (see 'Gargantext.Text').

Issue to tackle in this module: each global document of Gargantext has
to be unique, hence shared, but how do we respect privacy when needed?


* Methodology to get uniqueness and privacy by design

As a consequence, when importing/inserting a new document into Gargantext,
a policy for the uniqueness of the inserted documents has to be defined.

That is the purpose of this module, which defines its main concepts.

The unique identifier in the database is built from three policies that
together define uniqueness:

- Design policy: the type of node is needed as TypenameId, that is, a
  Document, an Individual or something else;

- Privacy policy: with ParentId, the parent makes the document unique to a
  given context, which enables users to get their own copy without sharing
  it with all the users of the database (in other words, parent_id is
  necessary to preserve privacy);

- Hash policy: the UniqId is a sha256 hash of the concatenation of the
  parameters defined by @hashParametersDoc@ (and @hashParametersContact@
  for contacts).

> -- * Example
> insertTest :: IO [ReturnId]
> insertTest = runCmdDev $ insertDocuments 1 452162 NodeDocument
>            $ map (ToDbDocument . addUniqIdsDoc) hyperdataDocuments

-}
------------------------------------------------------------------------
{-# LANGUAGE DeriveDataTypeable   #-}
{-# LANGUAGE DeriveGeneric        #-}
{-# LANGUAGE FlexibleInstances    #-}
{-# LANGUAGE NoImplicitPrelude    #-}
{-# LANGUAGE OverloadedStrings    #-}
{-# LANGUAGE QuasiQuotes          #-}
{-# LANGUAGE RankNTypes           #-}
{-# LANGUAGE TypeSynonymInstances #-}
------------------------------------------------------------------------
module Gargantext.Database.Node.Document.Insert where

import Control.Lens (set, view)
import Control.Lens.Prism
import Control.Lens.Cons
import Data.Aeson (toJSON, Value)
import Data.Maybe (maybe)
import Data.Text (Text)
import Data.Typeable (Typeable)
import Database.PostgreSQL.Simple (FromRow, Query, Only(..))
import Database.PostgreSQL.Simple.FromRow (fromRow, field)
import Database.PostgreSQL.Simple.SqlQQ
import Database.PostgreSQL.Simple.ToField (toField)
import Database.PostgreSQL.Simple.ToRow (ToRow(..))
import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
import GHC.Generics (Generic)
import Gargantext.Database.Config (nodeTypeId)
import Gargantext.Database.Utils (Cmd, runPGSQuery)
import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
import Gargantext.Database.Types.Node
import Gargantext.Prelude
import qualified Data.ByteString.Lazy.Char8 as DC (pack)
import qualified Data.Digest.Pure.SHA       as SHA (sha256, showDigest)
import qualified Data.Text                  as DT (pack, unpack, concat, take)

-- TODO: the import of the Document constructor below does not work
-- import Gargantext.Database.Types.Node (Document)
--import Gargantext.Database.Types.Node (docExample, hyperdataDocument, HyperdataDocument(..)
--                                      , hyperdataDocument_uniqId
--                                      , hyperdataDocument_title
--                                      , hyperdataDocument_abstract
--                                      , hyperdataDocument_source
--                                      , Node(..), node_typename
--                                      , node_userId
--                                      , node_parentId, node_name, node_hyperdata, hyperdataDocuments
--                                      , NodeTypeId
--                                      )
{-| To print the result of a query:
import Data.ByteString.Internal (ByteString)
import Database.PostgreSQL.Simple (formatQuery)
-}

---------------------------------------------------------------------------
-- * Main Insert functions

-- ** Database configuration
-- The database administrator has to create a unique index with the following SQL command:
-- `create unique index on nodes (typename, parent_id, (hyperdata ->> 'uniqId'));`
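-- Note: the ON CONFLICT clause of queryInsert below targets
-- (hyperdata ->> 'uniqIdBdd'); PostgreSQL only infers such a conflict target
-- from a matching unique expression index, so one is needed as well,
-- e.g. (a sketch): `create unique index on nodes ((hyperdata ->> 'uniqIdBdd'));`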

-- | Insert documents: main function
-- UserId   : the user who is inserting the documents
-- ParentId : the id of the folder that is the parent of the inserted documents

data ToDbData = ToDbDocument HyperdataDocument | ToDbContact HyperdataContact

insertDocuments :: UserId -> ParentId -> NodeType -> [ToDbData] -> Cmd err [ReturnId]
insertDocuments uId pId nodeType =
  runPGSQuery queryInsert . Only . Values fields . prepare uId pId nodeType
    where
      fields = map (\t-> QualifiedIdentifier Nothing t) inputSqlTypes
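
-- Example usage (a sketch: the user id, the parent id and the runCmdDev
-- runner are placeholders, and NodeDocument is assumed to be the relevant
-- NodeType constructor):
--
-- > _ <- runCmdDev $ insertDocuments 1 parentId NodeDocument
-- >                $ map (ToDbDocument . addUniqIdsDoc) docs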

-- | Debug SQL function
--
-- To print the rendered query (for debugging), use the @formatQuery@ function.
{-
insertDocuments_Debug :: (Hyperdata a, ToJSON a, ToRow a) => UserId -> ParentId -> [a] -> Cmd ByteString
insertDocuments_Debug uId pId hs = formatPGSQuery queryInsert (Only $ Values fields inputData)
  where
    fields    = map (\t-> QualifiedIdentifier Nothing t) inputSqlTypes
    inputData = prepare uId pId hs
-}

-- | Input table: SQL types of the columns fed to the query as input_rows
inputSqlTypes :: [Text]
inputSqlTypes = map DT.pack ["int4","int4","int4","text","jsonb"]

-- | SQL query to insert documents into the database
queryInsert :: Query
queryInsert = [sql|
    WITH input_rows(typename,user_id,parent_id,name,hyperdata) AS (?)
    , ins AS (
       INSERT INTO nodes (typename,user_id,parent_id,name,hyperdata)
       SELECT * FROM input_rows
       ON CONFLICT ((hyperdata ->> 'uniqIdBdd')) DO NOTHING -- on unique index
       -- ON CONFLICT (typename, parent_id, (hyperdata ->> 'uniqId')) DO NOTHING -- on unique index
       RETURNING id,hyperdata
      )

    SELECT true AS source -- true for 'newly inserted'
         , id
         , hyperdata ->> 'uniqId' as doi
    FROM ins
    UNION ALL
    SELECT false AS source -- false for 'not inserted'
         , c.id
         , hyperdata ->> 'uniqId' as doi
    FROM input_rows
    JOIN nodes c USING (hyperdata); -- columns of unique index
    |]
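-- Note: each returned row is (source :: bool, id :: int, doi :: text) and is
-- decoded positionally into a ReturnId below; despite its alias, the doi
-- column carries the computed uniqId hash.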

prepare :: UserId -> ParentId -> NodeType -> [ToDbData] -> [InputData]
prepare uId pId nodeType = map (\h -> InputData tId uId pId (name h) (toJSON' h))
  where
    tId = nodeTypeId nodeType

    toJSON' (ToDbDocument hd) = toJSON hd
    toJSON' (ToDbContact  hc) = toJSON hc

    name h = DT.take 255 $ maybe "No Title" identity (f h)
      where
        f (ToDbDocument hd) = _hyperdataDocument_title hd
        f (ToDbContact  _ ) = Just "Contact" -- TODO view FirstName . LastName

------------------------------------------------------------------------
-- * Main Types used

-- ** Return Types

-- | Return type of a document insertion
data ReturnId = ReturnId { reInserted :: Bool -- ^ whether the document was newly inserted (True: new, False: already present)
                         , reId       :: Int  -- ^ the id of the document in the database,
                                              --   returned whether it is new or not
                         , reUniqId   :: Text -- ^ hash id built from the concatenation of the hash parameters
                         } deriving (Show, Generic)

instance FromRow ReturnId where
  fromRow = ReturnId <$> field <*> field <*> field

-- ** Insert Types

type UserId = Int
type ParentId = Int

data InputData = InputData { inTypenameId :: NodeTypeId
                           , inUserId     :: UserId
                           , inParentId   :: ParentId
                           , inName       :: Text
                           , inHyper      :: Value
                           } deriving (Show, Generic, Typeable)

instance ToRow InputData where
  toRow inputData = [ toField (inTypenameId inputData)
                    , toField (inUserId     inputData)
                    , toField (inParentId   inputData)
                    , toField (inName       inputData)
                    , toField (inHyper      inputData)
                    ]
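-- Note: the order of the fields here has to match both inputSqlTypes and the
-- input_rows(typename,user_id,parent_id,name,hyperdata) columns of queryInsert.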

---------------------------------------------------------------------------
-- * Uniqueness of document definition

addUniqIdsDoc :: HyperdataDocument -> HyperdataDocument
addUniqIdsDoc doc = set hyperdataDocument_uniqIdBdd (Just hashBdd)
                  $ set hyperdataDocument_uniqId    (Just hash) doc
  where
    hash    = uniqId $ DT.concat $ map ($ doc) hashParametersDoc
    hashBdd = uniqId $ DT.concat $ map ($ doc) ([(\d -> maybe' (_hyperdataDocument_bdd d))] <> hashParametersDoc)

    uniqId :: Text -> Text
    uniqId = DT.pack . SHA.showDigest . SHA.sha256 . DC.pack . DT.unpack


hashParametersDoc :: [(HyperdataDocument -> Text)]
hashParametersDoc = [ \d -> maybe' (_hyperdataDocument_title    d)
                    , \d -> maybe' (_hyperdataDocument_abstract d)
                    , \d -> maybe' (_hyperdataDocument_source   d)
                    , \d -> maybe' (_hyperdataDocument_publication_date d)
                    ]
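-- A sketch of the resulting hash (field values are hypothetical): for a
-- document whose title, abstract, source and publication date are "T", "A",
-- "S" and "D", addUniqIdsDoc sets uniqId to the sha256 digest of "TASD", and
-- uniqIdBdd to the digest of the bdd field prepended to that string.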
---------------------------------------------------------------------------
-- * Uniqueness of contact definition
-- TODO factorize with above (use the function below for tests)
addUniqIdsContact :: HyperdataContact -> HyperdataContact
addUniqIdsContact hc = set (hc_uniqIdBdd) (Just hashBdd)
                     $ set (hc_uniqId)    (Just hash) hc
  where
    hash    = uniqId $ DT.concat $ map ($ hc) hashParametersContact
    hashBdd = uniqId $ DT.concat $ map ($ hc) ([\d -> maybe' (view hc_bdd d)] <> hashParametersContact)

    uniqId :: Text -> Text
    uniqId = DT.pack . SHA.showDigest . SHA.sha256 . DC.pack . DT.unpack

-- | TODO add more hash parameters
hashParametersContact :: [(HyperdataContact -> Text)]
hashParametersContact = [ \d -> maybe' $ view (hc_who . _Just . cw_firstName) d
                        , \d -> maybe' $ view (hc_who . _Just . cw_lastName ) d
                        , \d -> maybe' $ view (hc_where . _head . cw_touch . _Just . ct_mail) d
                        ]


maybe' :: Maybe Text -> Text
maybe' = maybe (DT.pack "") identity

---------------------------------------------------------------------------