]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
[upload] fix csv upload job response
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21 import Control.Lens hiding (elements, Empty)
22 import Data.Aeson
23 import Data.Aeson.TH (deriveJSON)
24 import Data.Either
25 import Data.Maybe (fromMaybe)
26 import Data.Swagger
27 import Data.Text (Text)
28 import qualified Data.Text as T
29 import GHC.Generics (Generic)
30 import qualified Prelude as Prelude
31 import Protolude (readFile)
32 import Servant
33 import Servant.Job.Utils (jsonOptions)
34 -- import Servant.Multipart
35 -- import Test.QuickCheck (elements)
36 import Test.QuickCheck.Arbitrary
37
38 import Gargantext.Prelude
39
40 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
41 import Gargantext.API.Admin.Types (HasSettings)
42 import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal)
43 import Gargantext.API.Node.Corpus.New.File
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..){-, allLangs-})
48 import qualified Gargantext.Core.Text.Corpus.API as API
49 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
50 import Gargantext.Core.Types.Individu (User(..))
51 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
52 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
53 import Gargantext.Database.Action.Mail (sendMail)
54 import Gargantext.Database.Action.Node (mkNodeWithParent)
55 import Gargantext.Database.Action.User (getUserId)
56 import Gargantext.Database.Admin.Types.Hyperdata
57 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
58 import Gargantext.Database.Query.Table.Node (getNodeWith)
59 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
60 import Gargantext.Database.Schema.Node (node_hyperdata)
61 import qualified Gargantext.Database.GargDB as GargDB
62
63 ------------------------------------------------------------------------
64 {-
65 data Query = Query { query_query :: Text
66 , query_node_id :: Int
67 , query_lang :: Lang
68 , query_databases :: [DataOrigin]
69 }
70 deriving (Eq, Generic)
71
72 deriveJSON (unPrefix "query_") 'Query
73
74 instance Arbitrary Query where
75 arbitrary = elements [ Query q n la fs
76 | q <- ["honeybee* AND collapse"
77 ,"covid 19"
78 ]
79 , n <- [0..10]
80 , la <- allLangs
81 , fs <- take 3 $ repeat allDataOrigins
82 ]
83
84 instance ToSchema Query where
85 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
86 -}
87
88 ------------------------------------------------------------------------
89
90 {-
91 type Api = PostApi
92 :<|> GetApi
93
94 type PostApi = Summary "New Corpus endpoint"
95 :> ReqBody '[JSON] Query
96 :> Post '[JSON] CorpusId
97 type GetApi = Get '[JSON] ApiInfo
98 -}
99
100 -- | TODO manage several apis
101 -- TODO-ACCESS
102 -- TODO this is only the POST
103 {-
104 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
105 api uid (Query q _ as) = do
106 cId <- case head as of
107 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
108 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
109 Just a -> do
110 docs <- liftBase $ API.get a q (Just 1000)
111 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
112 pure cId'
113
114 pure cId
115 -}
116
117 ------------------------------------------------
118 -- TODO use this route for Client implementation
119 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
120 deriving (Generic)
121 instance Arbitrary ApiInfo where
122 arbitrary = ApiInfo <$> arbitrary
123
124 deriveJSON (unPrefix "") 'ApiInfo
125
126 instance ToSchema ApiInfo
127
128 info :: FlowCmdM env err m => UserId -> m ApiInfo
129 info _u = pure $ ApiInfo API.externalAPIs
130
131 ------------------------------------------------------------------------
132 ------------------------------------------------------------------------
133 data WithQuery = WithQuery
134 { _wq_query :: !Text
135 , _wq_databases :: !Database
136 , _wq_datafield :: !Datafield
137 , _wq_lang :: !Lang
138 , _wq_node_id :: !Int
139 }
140 deriving Generic
141
142 makeLenses ''WithQuery
143 instance FromJSON WithQuery where
144 parseJSON = genericParseJSON $ jsonOptions "_wq_"
145 instance ToJSON WithQuery where
146 toJSON = genericToJSON $ jsonOptions "_wq_"
147 instance ToSchema WithQuery where
148 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
149
150 ------------------------------------------------------------------------
151
152 type AddWithQuery = Summary "Add with Query to corpus endpoint"
153 :> "corpus"
154 :> Capture "corpus_id" CorpusId
155 :> "query"
156 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
157
158 {-
159 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
160 :> "corpus"
161 :> Capture "corpus_id" CorpusId
162 :> "add"
163 :> "file"
164 :> MultipartForm Mem (MultipartData Mem)
165 :> QueryParam "fileType" FileType
166 :> "async"
167 :> AsyncJobs JobLog '[JSON] () JobLog
168 -}
169
170
171 ------------------------------------------------------------------------
172 -- TODO WithQuery also has a corpus id
173 addToCorpusWithQuery :: FlowCmdM env err m
174 => User
175 -> CorpusId
176 -> WithQuery
177 -> Maybe Integer
178 -> (JobLog -> m ())
179 -> m JobLog
180 addToCorpusWithQuery user cid (WithQuery q dbs datafield l _nid) maybeLimit logStatus = do
181 -- TODO ...
182 logStatus JobLog { _scst_succeeded = Just 0
183 , _scst_failed = Just 0
184 , _scst_remaining = Just 3
185 , _scst_events = Just []
186 }
187 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
188 printDebug "[addToCorpusWithQuery] datafield" datafield
189
190 case datafield of
191 Web -> do
192 printDebug "[addToCorpusWithQuery] processing web request" datafield
193
194 _ <- triggerSearxSearch cid q l
195
196 pure JobLog { _scst_succeeded = Just 3
197 , _scst_failed = Just 0
198 , _scst_remaining = Just 0
199 , _scst_events = Just []
200 }
201
202 _ -> do
203 -- TODO add cid
204 -- TODO if cid is folder -> create Corpus
205 -- if cid is corpus -> add to corpus
206 -- if cid is root -> create corpus in Private
207 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
208
209 logStatus JobLog { _scst_succeeded = Just 2
210 , _scst_failed = Just 0
211 , _scst_remaining = Just 1
212 , _scst_events = Just []
213 }
214
215 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid) txts
216 printDebug "corpus id" cids
217 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
218 sendMail user
219 -- TODO ...
220 pure JobLog { _scst_succeeded = Just 3
221 , _scst_failed = Just 0
222 , _scst_remaining = Just 0
223 , _scst_events = Just []
224 }
225
226
227 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
228 :> "corpus"
229 :> Capture "corpus_id" CorpusId
230 :> "add"
231 :> "form"
232 :> "async"
233 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
234
235 addToCorpusWithForm :: FlowCmdM env err m
236 => User
237 -> CorpusId
238 -> NewWithForm
239 -> (JobLog -> m ())
240 -> JobLog
241 -> m JobLog
242 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
243 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
244 printDebug "[addToCorpusWithForm] fileType" ft
245 logStatus jobLog
246 let
247 parse = case ft of
248 CSV_HAL -> Parser.parseFormat Parser.CsvHal
249 CSV -> Parser.parseFormat Parser.CsvGargV3
250 WOS -> Parser.parseFormat Parser.WOS
251 PresseRIS -> Parser.parseFormat Parser.RisPresse
252
253 -- TODO granularity of the logStatus
254 eDocs <- liftBase $ parse $ cs d
255 case eDocs of
256 Right docs' -> do
257 let docs = splitEvery 500 $ take 1000000 docs'
258
259 printDebug "Parsing corpus finished : " cid
260 logStatus jobLog2
261
262 printDebug "Starting extraction : " cid
263 -- TODO granularity of the logStatus
264 _cid' <- flowCorpus user
265 (Right [cid])
266 (Multi $ fromMaybe EN l)
267 (map (map toHyperdataDocument) docs)
268
269 printDebug "Extraction finished : " cid
270 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
271 sendMail user
272
273 logStatus jobLog3
274 pure $ jobLog3
275 Left e -> do
276 printDebug "Error" e
277
278 logStatus jobLogE
279 pure jobLogE
280 where
281 jobLog2 = jobLogSuccess jobLog
282 jobLog3 = jobLogSuccess jobLog2
283 jobLogE = jobLogFailTotal jobLog
284
285 parseCsvGargV3Path :: [Char] -> IO (Either Prelude.String [HyperdataDocument])
286 parseCsvGargV3Path fp = do
287 contents <- readFile fp
288 Parser.parseFormat Parser.CsvGargV3 $ cs contents
289
290 {-
291 addToCorpusWithFile :: FlowCmdM env err m
292 => CorpusId
293 -> MultipartData Mem
294 -> Maybe FileType
295 -> (JobLog -> m ())
296 -> m JobLog
297 addToCorpusWithFile cid input filetype logStatus = do
298 logStatus JobLog { _scst_succeeded = Just 10
299 , _scst_failed = Just 2
300 , _scst_remaining = Just 138
301 , _scst_events = Just []
302 }
303 printDebug "addToCorpusWithFile" cid
304 _h <- postUpload cid filetype input
305
306 pure JobLog { _scst_succeeded = Just 137
307 , _scst_failed = Just 13
308 , _scst_remaining = Just 0
309 , _scst_events = Just []
310 }
311 -}
312
313
314
315 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
316 :> "corpus"
317 :> Capture "corpus_id" CorpusId
318 :> "add"
319 :> "file"
320 :> "async"
321 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
322
323 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
324 => User
325 -> CorpusId
326 -> NewWithFile
327 -> (JobLog -> m ())
328 -> m JobLog
329 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
330
331 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
332 logStatus JobLog { _scst_succeeded = Just 0
333 , _scst_failed = Just 0
334 , _scst_remaining = Just 1
335 , _scst_events = Just []
336 }
337
338 fPath <- GargDB.writeFile nwf
339 printDebug "[addToCorpusWithFile] File saved as: " fPath
340
341 uId <- getUserId user
342 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
343
344 _ <- case nIds of
345 [nId] -> do
346 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
347 let hl = node ^. node_hyperdata
348 _ <- updateHyperdata nId $ hl { _hff_name = fName
349 , _hff_path = T.pack fPath }
350
351 printDebug "[addToCorpusWithFile] Created node with id: " nId
352 _ -> pure ()
353
354 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
355
356 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
357 sendMail user
358
359 pure $ JobLog { _scst_succeeded = Just 1
360 , _scst_failed = Just 0
361 , _scst_remaining = Just 0
362 , _scst_events = Just []
363 }