]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Node/Document/Import.hs
need to fix text parser and apply Pandoc to it,the title parser work
[gargantext.git] / src / Gargantext / Database / Node / Document / Import.hs
1 {-|
2 Module : Gargantext.Database.Node.Document.Import
3 Description : Importing context of texts (documents)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 * Purpose of this module
11
12 Enabling "common goods" of text data and respecting privacy.
13
14 Gargantext shares as "common good" the links between context of texts
15 and terms / words / ngrams.
16
17 Basically a context of text can be defined as a document (see 'Gargantext.Text').
18
19 Issue to tackle in that module: each global document of Gargantext has
20 to be unique, then shared, but how to respect privacy if needed ?
21
22
23 * Methodology to get uniqueness and privacy by design
24
25 As a consequence, when importing a new document in Gargantext, a policy
26 for the uniqueness of the inserted docuemnts has to be defined.
27
28 That is the purpose of this module which defines its main concepts.
29
30 Unique identifier in database is of a 3-tuple of 3 policies that
31 together define uniqueness:
32
33 - Design policy: type of node is needed as TypenameId, that is a
34 Document or Individual or something else;
35
36 - Privacy policy: with ParentId, parent becomes unique, then it enables
37 users to get their own copy without sharing it with all the users of the
38 database (in others words parent_id is necessary to preserve privacy for
39 instance).
40
41 - Hash policy: this UniqId is a sha256 uniq id which is the result of
42 the concatenation of the parameters defined by @hashParameters@.
43
44 -}
45 ------------------------------------------------------------------------
46 {-# LANGUAGE DeriveGeneric #-}
47 {-# LANGUAGE NoImplicitPrelude #-}
48 {-# LANGUAGE QuasiQuotes #-}
49 {-# LANGUAGE DeriveDataTypeable #-}
50 {-# LANGUAGE FlexibleInstances #-}
51 {-# LANGUAGE TypeSynonymInstances #-}
52 ------------------------------------------------------------------------
53 module Gargantext.Database.Node.Document.Import where
54
55 import Control.Lens (set)
56 import Control.Monad ((>>=))
57
58 import Data.Aeson (toJSON, Value)
59 import Data.ByteString.Internal (ByteString)
60 import Data.Maybe (maybe)
61 import Data.Typeable (Typeable)
62 import Database.PostgreSQL.Simple (Connection, FromRow, Query, formatQuery, query, Only(..))
63 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
64 import Database.PostgreSQL.Simple.SqlQQ
65 import Database.PostgreSQL.Simple.ToField (toField)
66 import Database.PostgreSQL.Simple.ToRow (ToRow(..))
67 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
68
69 import Data.Text (Text)
70 import qualified Data.Text as DT (pack, unpack, concat)
71 import qualified Data.Digest.Pure.SHA as SHA (sha256, showDigest)
72 import qualified Data.ByteString.Lazy.Char8 as DC (pack)
73
74 import Gargantext (connectGargandb)
75 import Gargantext.Database.Config (nodeTypeId)
76 import Gargantext.Database.Types.Node
77 -- FIXME : the import of Document constructor below does not work
78 -- import Gargantext.Database.Types.Node (Document)
79 --import Gargantext.Database.Types.Node (docExample, hyperdataDocument, HyperdataDocument(..)
80 -- , hyperdataDocument_uniqId
81 -- , hyperdataDocument_title
82 -- , hyperdataDocument_abstract
83 -- , hyperdataDocument_source
84 -- , Node(..), node_typename
85 -- , node_userId
86 -- , node_parentId, node_name, node_hyperdata, hyperdataDocuments
87 -- , NodeTypeId
88 -- )
89 import Gargantext.Prelude
90
91 import GHC.Generics (Generic)
92 ---------------------------------------------------------------------------
93 -- * Main Insert functions
94
95 -- ** Database configuration
96 -- Administrator of the database has to create a uniq index as following SQL command:
97 -- `create unique index on nodes (typename, parent_id, (hyperdata ->> 'uniqId'));`
98
99 -- | Insert Document main function
100 -- UserId : user who is inserting the documents
101 -- ParentId : folder ID which is parent of the inserted documents
102 insertDocuments :: Connection -> UserId -> ParentId -> [HyperdataDocument] -> IO [ReturnId]
103 insertDocuments conn uId pId hs = query conn queryInsert (Only $ Values fields inputData)
104 where
105 fields = map (\t-> QualifiedIdentifier Nothing t) inputSqlTypes
106 inputData = prepare uId pId hs
107
108 -- | Debug SQL function
109 --
110 -- to print rendered query (Debug purpose) use @formatQuery@ function.
111 insertDocuments_Debug :: Connection -> UserId -> ParentId -> [HyperdataDocument] -> IO ByteString
112 insertDocuments_Debug conn uId pId hs = formatQuery conn queryInsert (Only $ Values fields inputData)
113 where
114 fields = map (\t-> QualifiedIdentifier Nothing t) inputSqlTypes
115 inputData = prepare uId pId hs
116
117
118 -- | Input Tables: types of the tables
119 inputSqlTypes :: [Text]
120 inputSqlTypes = map DT.pack ["int4","int4","int4","text","jsonb"]
121
122 -- | SQL query to insert documents inside the database
123 queryInsert :: Query
124 queryInsert = [sql|
125 WITH input_rows(typename,user_id,parent_id,name,hyperdata) AS (?)
126 , ins AS (
127 INSERT INTO nodes (typename,user_id,parent_id,name,hyperdata)
128 SELECT * FROM input_rows
129 ON CONFLICT (typename, parent_id, (hyperdata ->> 'uniqId')) DO NOTHING -- on unique index
130 RETURNING id,hyperdata
131 )
132
133 SELECT true AS source -- true for 'newly inserted'
134 , id
135 , hyperdata ->> 'uniqId' as doi
136 FROM ins
137 UNION ALL
138 SELECT false AS source -- false for 'not inserted'
139 , c.id
140 , hyperdata ->> 'uniqId' as doi
141 FROM input_rows
142 JOIN nodes c USING (hyperdata); -- columns of unique index
143 |]
144
145 prepare :: UserId -> ParentId -> [HyperdataDocument] -> [InputData]
146 prepare uId pId = map (\h -> InputData tId uId pId (DT.pack "Doc") (toJSON $ unicize h))
147 where
148 tId = nodeTypeId Document
149
150 ------------------------------------------------------------------------
151 -- * Main Types used
152
153 -- ** Return Types
154
155 -- | When documents are inserted
156 -- ReturnType after insertion
157 data ReturnId = ReturnId { reInserted :: Bool -- ^ if the document is inserted (True: is new, False: is not new)
158 , reId :: Int -- ^ always return the id of the document (even new or not new)
159 -- this is the uniq id in the database
160 , reUniqId :: Maybe Text -- ^ Hash Id with concatenation of hash parameters
161 } deriving (Show, Generic)
162
163 instance FromRow ReturnId where
164 fromRow = ReturnId <$> field <*> field <*> field
165
166 -- ** Insert Types
167
168 type UserId = Int
169 type ParentId = Int
170
171 data InputData = InputData { inTypenameId :: NodeTypeId
172 , inUserId :: UserId
173 , inParentId :: ParentId
174 , inName :: Text
175 , inHyper :: Value
176 } deriving (Show, Generic, Typeable)
177
178 instance ToRow InputData where
179 toRow inputData = [ toField (inTypenameId inputData)
180 , toField (inUserId inputData)
181 , toField (inParentId inputData)
182 , toField (inName inputData)
183 , toField (inHyper inputData)
184 ]
185
186 ---------------------------------------------------------------------------
187 -- * Uniqueness of document definition
188
189 hashParameters :: [(HyperdataDocument -> Text)]
190 hashParameters = [ \d -> maybe' (_hyperdataDocument_title d)
191 , \d -> maybe' (_hyperdataDocument_abstract d)
192 , \d -> maybe' (_hyperdataDocument_source d)
193 , \d -> maybe' (_hyperdataDocument_publication_date d)
194 ]
195 where
196 maybe' = maybe (DT.pack "") identity
197
198 unicize :: HyperdataDocument -> HyperdataDocument
199 unicize = unicize' hashParameters
200 where
201 unicize' :: [(HyperdataDocument -> Text)] -> HyperdataDocument -> HyperdataDocument
202 unicize' fields doc = set hyperdataDocument_uniqId (Just hash) doc
203 where
204 hash = uniqId $ DT.concat $ map (\f -> f doc) fields
205
206 uniqId :: Text -> Text
207 uniqId txt = (sha256 txt)
208 where
209 sha256 :: Text -> Text
210 sha256 = DT.pack . SHA.showDigest . SHA.sha256 . DC.pack . DT.unpack
211
212 ---------------------------------------------------------------------------
213 -- * Tests
214
215 --insertTest :: FromRow r => CorpusId -> [Node HyperdataDocument] -> IO [r]
216 insertTest :: IO [ReturnId]
217 insertTest = connectGargandb "gargantext.ini"
218 >>= \conn -> insertDocuments conn 1 452162 hyperdataDocuments
219