]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
[upload] file upload works now
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17 {-# OPTIONS_GHC -fno-warn-orphans #-}
18
19 module Gargantext.API.Node.Corpus.New
20 where
21
22 import Control.Lens hiding (elements, Empty)
23 import Data.Aeson
24 import Data.Aeson.TH (deriveJSON)
25 import Data.Either
26 import Data.Maybe (fromMaybe)
27 import Data.Swagger
28 import Data.Text (Text)
29 import qualified Data.Text as T
30 import qualified Data.Text.IO as TIO
31 import GHC.Generics (Generic)
32 import Servant
33 import Servant.Job.Core
34 import Servant.Job.Types
35 import Servant.Job.Utils (jsonOptions)
36 -- import Servant.Multipart
37 -- import Test.QuickCheck (elements)
38 import Test.QuickCheck.Arbitrary
39 import Web.FormUrlEncoded (FromForm)
40
41 import Gargantext.Prelude
42
43 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
44 import qualified Gargantext.API.Admin.Orchestrator.Types as T
45 import Gargantext.API.Admin.Settings (HasSettings)
46 import Gargantext.API.Node.Corpus.New.File
47 import Gargantext.Core (Lang(..){-, allLangs-})
48 import Gargantext.Core.Types.Individu (User(..))
49 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
50 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
51 import Gargantext.Database.Action.Flow.Utils (getUserId)
52 import Gargantext.Database.Action.Node (mkNodeWithParent)
53 import Gargantext.Database.Admin.Types.Hyperdata
54 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
55 import Gargantext.Database.Query.Table.Node (getNodeWith)
56 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
57 import Gargantext.Database.Schema.Node (node_hyperdata)
58 import qualified Gargantext.Prelude.Utils as GPU
59 import qualified Gargantext.Text.Corpus.API as API
60 import qualified Gargantext.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
61
62 ------------------------------------------------------------------------
63 {-
64 data Query = Query { query_query :: Text
65 , query_node_id :: Int
66 , query_lang :: Lang
67 , query_databases :: [DataOrigin]
68 }
69 deriving (Eq, Generic)
70
71 deriveJSON (unPrefix "query_") 'Query
72
73 instance Arbitrary Query where
74 arbitrary = elements [ Query q n la fs
75 | q <- ["honeybee* AND collapse"
76 ,"covid 19"
77 ]
78 , n <- [0..10]
79 , la <- allLangs
80 , fs <- take 3 $ repeat allDataOrigins
81 ]
82
83 instance ToSchema Query where
84 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
85 -}
86
87 ------------------------------------------------------------------------
88
89 {-
90 type Api = PostApi
91 :<|> GetApi
92
93 type PostApi = Summary "New Corpus endpoint"
94 :> ReqBody '[JSON] Query
95 :> Post '[JSON] CorpusId
96 type GetApi = Get '[JSON] ApiInfo
97 -}
98
99 -- | TODO manage several apis
100 -- TODO-ACCESS
101 -- TODO this is only the POST
102 {-
103 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
104 api uid (Query q _ as) = do
105 cId <- case head as of
106 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
107 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
108 Just a -> do
109 docs <- liftBase $ API.get a q (Just 1000)
110 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
111 pure cId'
112
113 pure cId
114 -}
115
116 ------------------------------------------------
117 -- TODO use this route for Client implementation
118 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
119 deriving (Generic)
120 instance Arbitrary ApiInfo where
121 arbitrary = ApiInfo <$> arbitrary
122
123 deriveJSON (unPrefix "") 'ApiInfo
124
125 instance ToSchema ApiInfo
126
127 info :: FlowCmdM env err m => UserId -> m ApiInfo
128 info _u = pure $ ApiInfo API.externalAPIs
129
130 ------------------------------------------------------------------------
131
132 data Database = Empty
133 | PubMed
134 | HAL
135 | IsTex
136 | Isidore
137 deriving (Eq, Show, Generic)
138
139 deriveJSON (unPrefix "") ''Database
140 instance ToSchema Database
141
142 database2origin :: Database -> DataOrigin
143 database2origin Empty = InternalOrigin T.IsTex
144 database2origin PubMed = ExternalOrigin T.PubMed
145 database2origin HAL = ExternalOrigin T.HAL
146 database2origin IsTex = ExternalOrigin T.IsTex
147 database2origin Isidore = ExternalOrigin T.Isidore
148
149 ------------------------------------------------------------------------
150 data WithQuery = WithQuery
151 { _wq_query :: !Text
152 , _wq_databases :: !Database
153 , _wq_lang :: !Lang
154 , _wq_node_id :: !Int
155 }
156 deriving Generic
157
158 makeLenses ''WithQuery
159 instance FromJSON WithQuery where
160 parseJSON = genericParseJSON $ jsonOptions "_wq_"
161 instance ToSchema WithQuery where
162 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
163
164 -------------------------------------------------------
165 data NewWithForm = NewWithForm
166 { _wf_filetype :: !FileType
167 , _wf_data :: !Text
168 , _wf_lang :: !(Maybe Lang)
169 , _wf_name :: !Text
170 } deriving (Eq, Show, Generic)
171
172 makeLenses ''NewWithForm
173 instance FromForm NewWithForm
174 instance FromJSON NewWithForm where
175 parseJSON = genericParseJSON $ jsonOptions "_wf_"
176 instance ToSchema NewWithForm where
177 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
178
179 -------------------------------------------------------
180 data NewWithFile = NewWithFile
181 { _wfi_data :: !Text
182 , _wfi_lang :: !(Maybe Lang)
183 , _wfi_name :: !Text
184 } deriving (Eq, Show, Generic)
185
186 makeLenses ''NewWithFile
187 instance FromForm NewWithFile
188 instance FromJSON NewWithFile where
189 parseJSON = genericParseJSON $ jsonOptions "_wfi_"
190 instance ToSchema NewWithFile where
191 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wfi_")
192
193 instance GPU.SaveFile NewWithFile where
194 saveFile' fp (NewWithFile d _ _) = TIO.writeFile fp d
195
196 --instance GPU.ReadFile NewWithFile where
197 -- readFile' = TIO.readFile
198
199 ------------------------------------------------------------------------
200 type AsyncJobs event ctI input output =
201 AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output
202 ------------------------------------------------------------------------
203
204 type AddWithQuery = Summary "Add with Query to corpus endpoint"
205 :> "corpus"
206 :> Capture "corpus_id" CorpusId
207 :> "query"
208 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
209
210 {-
211 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
212 :> "corpus"
213 :> Capture "corpus_id" CorpusId
214 :> "add"
215 :> "file"
216 :> MultipartForm Mem (MultipartData Mem)
217 :> QueryParam "fileType" FileType
218 :> "async"
219 :> AsyncJobs JobLog '[JSON] () JobLog
220 -}
221
222
223 ------------------------------------------------------------------------
224 -- TODO WithQuery also has a corpus id
225 addToCorpusWithQuery :: FlowCmdM env err m
226 => User
227 -> CorpusId
228 -> WithQuery
229 -> (JobLog -> m ())
230 -> m JobLog
231 addToCorpusWithQuery u cid (WithQuery q dbs l _nid) logStatus = do
232 -- TODO ...
233 logStatus JobLog { _scst_succeeded = Just 0
234 , _scst_failed = Just 0
235 , _scst_remaining = Just 5
236 , _scst_events = Just []
237 }
238 printDebug "addToCorpusWithQuery" (cid, dbs)
239 -- TODO add cid
240 -- TODO if cid is folder -> create Corpus
241 -- if cid is corpus -> add to corpus
242 -- if cid is root -> create corpus in Private
243 txts <- mapM (\db -> getDataText db (Multi l) q Nothing) [database2origin dbs]
244
245 logStatus JobLog { _scst_succeeded = Just 2
246 , _scst_failed = Just 0
247 , _scst_remaining = Just 1
248 , _scst_events = Just []
249 }
250
251 cids <- mapM (\txt -> flowDataText u txt (Multi l) cid) txts
252 printDebug "corpus id" cids
253 -- TODO ...
254 pure JobLog { _scst_succeeded = Just 3
255 , _scst_failed = Just 0
256 , _scst_remaining = Just 0
257 , _scst_events = Just []
258 }
259
260
261 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
262 :> "corpus"
263 :> Capture "corpus_id" CorpusId
264 :> "add"
265 :> "form"
266 :> "async"
267 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
268
269 addToCorpusWithForm :: FlowCmdM env err m
270 => User
271 -> CorpusId
272 -> NewWithForm
273 -> (JobLog -> m ())
274 -> m JobLog
275 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do
276
277 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
278 printDebug "[addToCorpusWithForm] fileType" ft
279 logStatus JobLog { _scst_succeeded = Just 0
280 , _scst_failed = Just 0
281 , _scst_remaining = Just 2
282 , _scst_events = Just []
283 }
284 let
285 parse = case ft of
286 CSV_HAL -> Parser.parseFormat Parser.CsvHal
287 CSV -> Parser.parseFormat Parser.CsvGargV3
288 WOS -> Parser.parseFormat Parser.WOS
289 PresseRIS -> Parser.parseFormat Parser.RisPresse
290
291 -- TODO granularity of the logStatus
292 docs <- liftBase $ splitEvery 500
293 <$> take 1000000
294 <$> parse (cs d)
295
296 printDebug "Parsing corpus finished : " cid
297 logStatus JobLog { _scst_succeeded = Just 1
298 , _scst_failed = Just 0
299 , _scst_remaining = Just 1
300 , _scst_events = Just []
301 }
302
303
304 printDebug "Starting extraction : " cid
305 -- TODO granularity of the logStatus
306 _cid' <- flowCorpus user
307 (Right [cid])
308 (Multi $ fromMaybe EN l)
309 (map (map toHyperdataDocument) docs)
310
311 printDebug "Extraction finished : " cid
312 pure JobLog { _scst_succeeded = Just 2
313 , _scst_failed = Just 0
314 , _scst_remaining = Just 0
315 , _scst_events = Just []
316 }
317
318 {-
319 addToCorpusWithFile :: FlowCmdM env err m
320 => CorpusId
321 -> MultipartData Mem
322 -> Maybe FileType
323 -> (JobLog -> m ())
324 -> m JobLog
325 addToCorpusWithFile cid input filetype logStatus = do
326 logStatus JobLog { _scst_succeeded = Just 10
327 , _scst_failed = Just 2
328 , _scst_remaining = Just 138
329 , _scst_events = Just []
330 }
331 printDebug "addToCorpusWithFile" cid
332 _h <- postUpload cid filetype input
333
334 pure JobLog { _scst_succeeded = Just 137
335 , _scst_failed = Just 13
336 , _scst_remaining = Just 0
337 , _scst_events = Just []
338 }
339 -}
340
341
342
343 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
344 :> "corpus"
345 :> Capture "corpus_id" CorpusId
346 :> "add"
347 :> "file"
348 :> "async"
349 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
350
351 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
352 => User
353 -> CorpusId
354 -> NewWithFile
355 -> (JobLog -> m ())
356 -> m JobLog
357 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
358
359 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
360 logStatus JobLog { _scst_succeeded = Just 0
361 , _scst_failed = Just 0
362 , _scst_remaining = Just 1
363 , _scst_events = Just []
364 }
365
366 fPath <- GPU.writeFile nwf
367 printDebug "[addToCorpusWithFile] File saved as: " fPath
368
369 uId <- getUserId user
370 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
371
372 _ <- case nIds of
373 [nId] -> do
374 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
375 let hl = node ^. node_hyperdata
376 _ <- updateHyperdata nId $ hl { _hff_name = fName
377 , _hff_path = T.pack fPath }
378
379 printDebug "[addToCorpusWithFile] Created node with id: " nId
380 _ -> pure ()
381
382 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
383 pure $ JobLog { _scst_succeeded = Just 1
384 , _scst_failed = Just 0
385 , _scst_remaining = Just 0
386 , _scst_events = Just []
387 }