]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Database/Node/Document/Insert.hs
add a filter for fis with too few ngrams
[gargantext.git] / src / Gargantext / Database / Node / Document / Insert.hs
1 {-|
2 Module : Gargantext.Database.Node.Document.Insert
3 Description : Importing context of texts (documents)
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 * Purpose of this module
11
12 Enabling "common goods" of text data and respecting privacy.
13
14 Gargantext shares as "common good" the links between context of texts
15 and terms / words / ngrams.
16
17 Basically a context of text can be defined as a document (see 'Gargantext.Text').
18
19 Issue to tackle in that module: each global document of Gargantext has
20 to be unique, then shared, but how to respect privacy if needed ?
21
22
23 * Methodology to get uniqueness and privacy by design
24
25 As a consequence, when importing/inserting a new document in Gargantext,
26 a policy for the uniqueness of the inserted docuemnts has to be defined.
27
28 That is the purpose of this module which defines its main concepts.
29
30 Unique identifier in database is of a 3-tuple of 3 policies that
31 together define uniqueness:
32
33 - Design policy: type of node is needed as TypenameId, that is a
34 Document or Individual or something else;
35
36 - Privacy policy: with ParentId, parent becomes unique, then it enables
37 users to get their own copy without sharing it with all the users of the
38 database (in others words parent_id is necessary to preserve privacy for
39 instance).
40
41 - Hash policy: this UniqId is a sha256 uniq id which is the result of
42 the concatenation of the parameters defined by @hashParameters@.
43
44 > -- * Example
45 > insertTest :: FromRow r => CorpusId -> [Node HyperdataDocument] -> IO [r]
46 > insertTest :: IO [ReturnId]
47 > insertTest = runCmdDev $ insertDocuments 1 452162 hyperdataDocuments
48
49 -}
50 ------------------------------------------------------------------------
51 {-# LANGUAGE DeriveGeneric #-}
52 {-# LANGUAGE FlexibleInstances #-}
53 {-# LANGUAGE NoImplicitPrelude #-}
54 {-# LANGUAGE OverloadedStrings #-}
55 {-# LANGUAGE QuasiQuotes #-}
56 {-# LANGUAGE RankNTypes #-}
57 {-# LANGUAGE TypeSynonymInstances #-}
58 ------------------------------------------------------------------------
59 module Gargantext.Database.Node.Document.Insert where
60
61 import Control.Lens (set, view)
62 import Control.Lens.Prism
63 import Control.Lens.Cons
64 import Data.Aeson (toJSON)
65 import Data.Maybe (maybe)
66 import Data.Text (Text)
67 import Database.PostgreSQL.Simple (FromRow, Query, Only(..))
68 import Database.PostgreSQL.Simple.FromRow (fromRow, field)
69 import Database.PostgreSQL.Simple.SqlQQ
70 import Database.PostgreSQL.Simple.ToField (toField, Action)
71 import Database.PostgreSQL.Simple.Types (Values(..), QualifiedIdentifier(..))
72 import GHC.Generics (Generic)
73 import Gargantext.Database.Config (nodeTypeId)
74 import Gargantext.Database.Utils (Cmd, runPGSQuery)
75 import Gargantext.Database.Node.Contact -- (HyperdataContact(..), ContactWho(..))
76 import Gargantext.Database.Types.Node
77 import Gargantext.Prelude
78 import qualified Data.ByteString.Lazy.Char8 as DC (pack)
79 import qualified Data.Digest.Pure.SHA as SHA (sha256, showDigest)
80 import qualified Data.Text as DT (pack, unpack, concat, take)
81 import Gargantext.Prelude.Utils (hash)
82 -- TODO : the import of Document constructor below does not work
83 -- import Gargantext.Database.Types.Node (Document)
84 --import Gargantext.Database.Types.Node (docExample, hyperdataDocument, HyperdataDocument(..)
85 -- , hyperdataDocument_uniqId
86 -- , hyperdataDocument_title
87 -- , hyperdataDocument_abstract
88 -- , hyperdataDocument_source
89 -- , Node(..), node_typename
90 -- , node_userId
91 -- , node_parentId, node_name, node_hyperdata, hyperdataDocuments
92 -- , NodeTypeId
93 -- )
94 {-| To Print result query
95 import Data.ByteString.Internal (ByteString)
96 import Database.PostgreSQL.Simple (formatQuery)
97 -}
98
99 ---------------------------------------------------------------------------
100 -- * Main Insert functions
101
102 -- | Insert Document main function
103 -- UserId : user who is inserting the documents
104 -- ParentId : folder ID which is parent of the inserted documents
105 -- Administrator of the database has to create a uniq index as following SQL command:
106 -- `create unique index on nodes (typename, parent_id, (hyperdata ->> 'uniqId'));`
107 insertDb :: InsertDb a => UserId -> ParentId -> [a] -> Cmd err [ReturnId]
108 insertDb u p = runPGSQuery queryInsert . Only . Values fields . map (insertDb' u p)
109 where
110 fields = map (\t-> QualifiedIdentifier Nothing t) inputSqlTypes
111
112 class InsertDb a
113 where
114 insertDb' :: UserId -> ParentId -> a -> [Action]
115
116
117 instance InsertDb HyperdataDocument
118 where
119 insertDb' u p h = [ toField $ nodeTypeId NodeDocument
120 , toField u
121 , toField p
122 , toField $ maybe "No Title" (DT.take 255) (_hyperdataDocument_title h)
123 , (toField . toJSON) h
124 ]
125
126 instance InsertDb HyperdataContact
127 where
128 insertDb' u p h = [ toField $ nodeTypeId NodeContact
129 , toField u
130 , toField p
131 , toField $ maybe "Contact" (DT.take 255) (Just "Name") -- (_hc_name h)
132 , (toField . toJSON) h
133 ]
134
135
136 -- | Debug SQL function
137 --
138 -- to print rendered query (Debug purpose) use @formatQuery@ function.
139 {-
140 insertDocuments_Debug :: (Hyperdata a, ToJSON a, ToRow a) => UserId -> ParentId -> [a] -> Cmd ByteString
141 insertDocuments_Debug uId pId hs = formatPGSQuery queryInsert (Only $ Values fields inputData)
142 where
143 fields = map (\t-> QualifiedIdentifier Nothing t) inputSqlTypes
144 inputData = prepare uId pId hs
145 -}
146
147
148 -- | Input Tables: types of the tables
149 inputSqlTypes :: [Text]
150 inputSqlTypes = map DT.pack ["int4","int4","int4","text","jsonb"]
151
152 -- | SQL query to insert documents inside the database
153 queryInsert :: Query
154 queryInsert = [sql|
155 WITH input_rows(typename,user_id,parent_id,name,hyperdata) AS (?)
156 , ins AS (
157 INSERT INTO nodes (typename,user_id,parent_id,name,hyperdata)
158 SELECT * FROM input_rows
159 ON CONFLICT ((hyperdata ->> 'uniqIdBdd')) DO NOTHING -- on unique index
160 -- ON CONFLICT (typename, parent_id, (hyperdata ->> 'uniqId')) DO NOTHING -- on unique index
161 RETURNING id,hyperdata
162 )
163
164 SELECT true AS source -- true for 'newly inserted'
165 , id
166 , hyperdata ->> 'uniqId' as doi
167 FROM ins
168 UNION ALL
169 SELECT false AS source -- false for 'not inserted'
170 , c.id
171 , hyperdata ->> 'uniqId' as doi
172 FROM input_rows
173 JOIN nodes c USING (hyperdata); -- columns of unique index
174 |]
175
176 ------------------------------------------------------------------------
177 -- * Main Types used
178
179 -- ** Return Types
180
181 -- | When documents are inserted
182 -- ReturnType after insertion
183 data ReturnId = ReturnId { reInserted :: Bool -- ^ if the document is inserted (True: is new, False: is not new)
184 , reId :: NodeId -- ^ always return the id of the document (even new or not new)
185 -- this is the uniq id in the database
186 , reUniqId :: Text -- ^ Hash Id with concatenation of hash parameters
187 } deriving (Show, Generic)
188
189 instance FromRow ReturnId where
190 fromRow = ReturnId <$> field <*> field <*> field
191
192 ---------------------------------------------------------------------------
193 -- * Uniqueness of document definition
194
195 class AddUniqId a
196 where
197 addUniqId :: a -> a
198
199 instance AddUniqId HyperdataDocument
200 where
201 addUniqId = addUniqIdsDoc
202 where
203 addUniqIdsDoc :: HyperdataDocument -> HyperdataDocument
204 addUniqIdsDoc doc = set hyperdataDocument_uniqIdBdd (Just hashBdd)
205 $ set hyperdataDocument_uniqId (Just hashUni) doc
206 where
207 hashUni = hash $ DT.concat $ map ($ doc) hashParametersDoc
208 hashBdd = hash $ DT.concat $ map ($ doc) ([(\d -> maybeText (_hyperdataDocument_bdd d))] <> hashParametersDoc)
209
210 hashParametersDoc :: [(HyperdataDocument -> Text)]
211 hashParametersDoc = [ \d -> maybeText (_hyperdataDocument_title d)
212 , \d -> maybeText (_hyperdataDocument_abstract d)
213 , \d -> maybeText (_hyperdataDocument_source d)
214 , \d -> maybeText (_hyperdataDocument_publication_date d)
215 ]
216
217 ---------------------------------------------------------------------------
218 -- * Uniqueness of document definition
219 -- TODO factorize with above (use the function below for tests)
220
221 instance AddUniqId HyperdataContact
222 where
223 addUniqId = addUniqIdsContact
224
225 addUniqIdsContact :: HyperdataContact -> HyperdataContact
226 addUniqIdsContact hc = set (hc_uniqIdBdd) (Just hashBdd)
227 $ set (hc_uniqId ) (Just hashUni) hc
228 where
229 hashUni = uniqId $ DT.concat $ map ($ hc) hashParametersContact
230 hashBdd = uniqId $ DT.concat $ map ($ hc) ([\d -> maybeText (view hc_bdd d)] <> hashParametersContact)
231
232 uniqId :: Text -> Text
233 uniqId = DT.pack . SHA.showDigest . SHA.sha256 . DC.pack . DT.unpack
234
235 -- | TODO add more hashparameters
236 hashParametersContact :: [(HyperdataContact -> Text)]
237 hashParametersContact = [ \d -> maybeText $ view (hc_who . _Just . cw_firstName) d
238 , \d -> maybeText $ view (hc_who . _Just . cw_lastName ) d
239 , \d -> maybeText $ view (hc_where . _head . cw_touch . _Just . ct_mail) d
240 ]
241
242
243
244 maybeText :: Maybe Text -> Text
245 maybeText = maybe (DT.pack "") identity
246
247 ---------------------------------------------------------------------------