2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17 {-# OPTIONS_GHC -fno-warn-orphans #-}
19 module Gargantext.API.Node.Corpus.New
22 import Control.Lens hiding (elements, Empty)
24 import Data.Aeson.TH (deriveJSON)
25 import qualified Data.ByteString as BS
26 import qualified Data.ByteString.Base64 as BSB64
28 import Data.Maybe (fromMaybe)
30 import Data.Text (Text)
31 import qualified Data.Text as T
32 import qualified Data.Text.Encoding as TE
33 import GHC.Generics (Generic)
35 import Servant.Job.Core
36 import Servant.Job.Types
37 import Servant.Job.Utils (jsonOptions)
38 -- import Servant.Multipart
39 -- import Test.QuickCheck (elements)
40 import Test.QuickCheck.Arbitrary
41 import Web.FormUrlEncoded (FromForm)
43 import Gargantext.Prelude
45 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
46 import qualified Gargantext.API.Admin.Orchestrator.Types as T
47 import Gargantext.API.Admin.Settings (HasSettings)
48 import Gargantext.API.Node.Corpus.New.File
49 import Gargantext.Core (Lang(..){-, allLangs-})
50 import Gargantext.Core.Types.Individu (User(..))
51 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
52 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
53 import Gargantext.Database.Action.Flow.Utils (getUserId)
54 import Gargantext.Database.Action.Node (mkNodeWithParent)
55 import Gargantext.Database.Admin.Types.Hyperdata
56 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
57 import Gargantext.Database.Query.Table.Node (getNodeWith)
58 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
59 import Gargantext.Database.Schema.Node (node_hyperdata)
60 import qualified Gargantext.Prelude.Utils as GPU
61 import qualified Gargantext.Core.Text.Corpus.API as API
62 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
64 ------------------------------------------------------------------------
66 data Query = Query { query_query :: Text
67 , query_node_id :: Int
69 , query_databases :: [DataOrigin]
71 deriving (Eq, Generic)
73 deriveJSON (unPrefix "query_") 'Query
75 instance Arbitrary Query where
76 arbitrary = elements [ Query q n la fs
77 | q <- ["honeybee* AND collapse"
82 , fs <- take 3 $ repeat allDataOrigins
85 instance ToSchema Query where
86 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
89 ------------------------------------------------------------------------
95 type PostApi = Summary "New Corpus endpoint"
96 :> ReqBody '[JSON] Query
97 :> Post '[JSON] CorpusId
98 type GetApi = Get '[JSON] ApiInfo
101 -- | TODO manage several apis
103 -- TODO this is only the POST
105 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
106 api uid (Query q _ as) = do
107 cId <- case head as of
108 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
109 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
111 docs <- liftBase $ API.get a q (Just 1000)
112 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
118 ------------------------------------------------
119 -- TODO use this route for Client implementation
120 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
122 instance Arbitrary ApiInfo where
123 arbitrary = ApiInfo <$> arbitrary
125 deriveJSON (unPrefix "") 'ApiInfo
127 instance ToSchema ApiInfo
129 info :: FlowCmdM env err m => UserId -> m ApiInfo
130 info _u = pure $ ApiInfo API.externalAPIs
132 ------------------------------------------------------------------------
134 data Database = Empty
139 deriving (Eq, Show, Generic)
141 deriveJSON (unPrefix "") ''Database
142 instance ToSchema Database
144 database2origin :: Database -> DataOrigin
145 database2origin Empty = InternalOrigin T.IsTex
146 database2origin PubMed = ExternalOrigin T.PubMed
147 database2origin HAL = ExternalOrigin T.HAL
148 database2origin IsTex = ExternalOrigin T.IsTex
149 database2origin Isidore = ExternalOrigin T.Isidore
151 ------------------------------------------------------------------------
152 data WithQuery = WithQuery
154 , _wq_databases :: !Database
156 , _wq_node_id :: !Int
160 makeLenses ''WithQuery
161 instance FromJSON WithQuery where
162 parseJSON = genericParseJSON $ jsonOptions "_wq_"
163 instance ToSchema WithQuery where
164 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
166 -------------------------------------------------------
167 data NewWithForm = NewWithForm
168 { _wf_filetype :: !FileType
170 , _wf_lang :: !(Maybe Lang)
172 } deriving (Eq, Show, Generic)
174 makeLenses ''NewWithForm
175 instance FromForm NewWithForm
176 instance FromJSON NewWithForm where
177 parseJSON = genericParseJSON $ jsonOptions "_wf_"
178 instance ToSchema NewWithForm where
179 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
181 -------------------------------------------------------
182 data NewWithFile = NewWithFile
183 { _wfi_b64_data :: !Text
184 , _wfi_lang :: !(Maybe Lang)
186 } deriving (Eq, Show, Generic)
188 makeLenses ''NewWithFile
189 instance FromForm NewWithFile
190 instance FromJSON NewWithFile where
191 parseJSON = genericParseJSON $ jsonOptions "_wfi_"
192 instance ToSchema NewWithFile where
193 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wfi_")
195 instance GPU.SaveFile NewWithFile where
196 saveFile' fp (NewWithFile b64d _ _) = do
197 let eDecoded = BSB64.decode $ TE.encodeUtf8 b64d
199 Left err -> panic $ T.pack $ "Error decoding: " <> err
200 Right decoded -> BS.writeFile fp decoded
201 -- BS.writeFile fp $ BSB64.decodeLenient $ TE.encodeUtf8 b64d
203 --instance GPU.ReadFile NewWithFile where
204 -- readFile' = TIO.readFile
206 ------------------------------------------------------------------------
207 type AsyncJobs event ctI input output =
208 AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output
209 ------------------------------------------------------------------------
211 type AddWithQuery = Summary "Add with Query to corpus endpoint"
213 :> Capture "corpus_id" CorpusId
215 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
218 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
220 :> Capture "corpus_id" CorpusId
223 :> MultipartForm Mem (MultipartData Mem)
224 :> QueryParam "fileType" FileType
226 :> AsyncJobs JobLog '[JSON] () JobLog
230 ------------------------------------------------------------------------
231 -- TODO WithQuery also has a corpus id
232 addToCorpusWithQuery :: FlowCmdM env err m
238 addToCorpusWithQuery u cid (WithQuery q dbs l _nid) logStatus = do
240 logStatus JobLog { _scst_succeeded = Just 0
241 , _scst_failed = Just 0
242 , _scst_remaining = Just 5
243 , _scst_events = Just []
245 printDebug "addToCorpusWithQuery" (cid, dbs)
247 -- TODO if cid is folder -> create Corpus
248 -- if cid is corpus -> add to corpus
249 -- if cid is root -> create corpus in Private
250 txts <- mapM (\db -> getDataText db (Multi l) q Nothing) [database2origin dbs]
252 logStatus JobLog { _scst_succeeded = Just 2
253 , _scst_failed = Just 0
254 , _scst_remaining = Just 1
255 , _scst_events = Just []
258 cids <- mapM (\txt -> flowDataText u txt (Multi l) cid) txts
259 printDebug "corpus id" cids
261 pure JobLog { _scst_succeeded = Just 3
262 , _scst_failed = Just 0
263 , _scst_remaining = Just 0
264 , _scst_events = Just []
268 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
270 :> Capture "corpus_id" CorpusId
274 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
276 addToCorpusWithForm :: FlowCmdM env err m
282 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do
284 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
285 printDebug "[addToCorpusWithForm] fileType" ft
286 logStatus JobLog { _scst_succeeded = Just 0
287 , _scst_failed = Just 0
288 , _scst_remaining = Just 2
289 , _scst_events = Just []
293 CSV_HAL -> Parser.parseFormat Parser.CsvHal
294 CSV -> Parser.parseFormat Parser.CsvGargV3
295 WOS -> Parser.parseFormat Parser.WOS
296 PresseRIS -> Parser.parseFormat Parser.RisPresse
298 -- TODO granularity of the logStatus
299 docs <- liftBase $ splitEvery 500
303 printDebug "Parsing corpus finished : " cid
304 logStatus JobLog { _scst_succeeded = Just 1
305 , _scst_failed = Just 0
306 , _scst_remaining = Just 1
307 , _scst_events = Just []
311 printDebug "Starting extraction : " cid
312 -- TODO granularity of the logStatus
313 _cid' <- flowCorpus user
315 (Multi $ fromMaybe EN l)
316 (map (map toHyperdataDocument) docs)
318 printDebug "Extraction finished : " cid
319 pure JobLog { _scst_succeeded = Just 2
320 , _scst_failed = Just 0
321 , _scst_remaining = Just 0
322 , _scst_events = Just []
326 addToCorpusWithFile :: FlowCmdM env err m
332 addToCorpusWithFile cid input filetype logStatus = do
333 logStatus JobLog { _scst_succeeded = Just 10
334 , _scst_failed = Just 2
335 , _scst_remaining = Just 138
336 , _scst_events = Just []
338 printDebug "addToCorpusWithFile" cid
339 _h <- postUpload cid filetype input
341 pure JobLog { _scst_succeeded = Just 137
342 , _scst_failed = Just 13
343 , _scst_remaining = Just 0
344 , _scst_events = Just []
350 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
352 :> Capture "corpus_id" CorpusId
356 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
358 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
364 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
366 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
367 logStatus JobLog { _scst_succeeded = Just 0
368 , _scst_failed = Just 0
369 , _scst_remaining = Just 1
370 , _scst_events = Just []
373 fPath <- GPU.writeFile nwf
374 printDebug "[addToCorpusWithFile] File saved as: " fPath
376 uId <- getUserId user
377 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
381 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
382 let hl = node ^. node_hyperdata
383 _ <- updateHyperdata nId $ hl { _hff_name = fName
384 , _hff_path = T.pack fPath }
386 printDebug "[addToCorpusWithFile] Created node with id: " nId
389 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
390 pure $ JobLog { _scst_succeeded = Just 1
391 , _scst_failed = Just 0
392 , _scst_remaining = Just 0
393 , _scst_events = Just []