]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
[REFACT] Text -> Core
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17 {-# OPTIONS_GHC -fno-warn-orphans #-}
18
19 module Gargantext.API.Node.Corpus.New
20 where
21
22 import Control.Lens hiding (elements, Empty)
23 import Data.Aeson
24 import Data.Aeson.TH (deriveJSON)
25 import qualified Data.ByteString as BS
26 import qualified Data.ByteString.Base64 as BSB64
27 import Data.Either
28 import Data.Maybe (fromMaybe)
29 import Data.Swagger
30 import Data.Text (Text)
31 import qualified Data.Text as T
32 import qualified Data.Text.Encoding as TE
33 import GHC.Generics (Generic)
34 import Servant
35 import Servant.Job.Core
36 import Servant.Job.Types
37 import Servant.Job.Utils (jsonOptions)
38 -- import Servant.Multipart
39 -- import Test.QuickCheck (elements)
40 import Test.QuickCheck.Arbitrary
41 import Web.FormUrlEncoded (FromForm)
42
43 import Gargantext.Prelude
44
45 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
46 import qualified Gargantext.API.Admin.Orchestrator.Types as T
47 import Gargantext.API.Admin.Settings (HasSettings)
48 import Gargantext.API.Node.Corpus.New.File
49 import Gargantext.Core (Lang(..){-, allLangs-})
50 import Gargantext.Core.Types.Individu (User(..))
51 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
52 import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
53 import Gargantext.Database.Action.Flow.Utils (getUserId)
54 import Gargantext.Database.Action.Node (mkNodeWithParent)
55 import Gargantext.Database.Admin.Types.Hyperdata
56 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
57 import Gargantext.Database.Query.Table.Node (getNodeWith)
58 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
59 import Gargantext.Database.Schema.Node (node_hyperdata)
60 import qualified Gargantext.Prelude.Utils as GPU
61 import qualified Gargantext.Core.Text.Corpus.API as API
62 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
63
64 ------------------------------------------------------------------------
65 {-
66 data Query = Query { query_query :: Text
67 , query_node_id :: Int
68 , query_lang :: Lang
69 , query_databases :: [DataOrigin]
70 }
71 deriving (Eq, Generic)
72
73 deriveJSON (unPrefix "query_") 'Query
74
75 instance Arbitrary Query where
76 arbitrary = elements [ Query q n la fs
77 | q <- ["honeybee* AND collapse"
78 ,"covid 19"
79 ]
80 , n <- [0..10]
81 , la <- allLangs
82 , fs <- take 3 $ repeat allDataOrigins
83 ]
84
85 instance ToSchema Query where
86 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
87 -}
88
89 ------------------------------------------------------------------------
90
91 {-
92 type Api = PostApi
93 :<|> GetApi
94
95 type PostApi = Summary "New Corpus endpoint"
96 :> ReqBody '[JSON] Query
97 :> Post '[JSON] CorpusId
98 type GetApi = Get '[JSON] ApiInfo
99 -}
100
101 -- | TODO manage several apis
102 -- TODO-ACCESS
103 -- TODO this is only the POST
104 {-
105 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
106 api uid (Query q _ as) = do
107 cId <- case head as of
108 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
109 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
110 Just a -> do
111 docs <- liftBase $ API.get a q (Just 1000)
112 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
113 pure cId'
114
115 pure cId
116 -}
117
118 ------------------------------------------------
119 -- TODO use this route for Client implementation
120 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
121 deriving (Generic)
122 instance Arbitrary ApiInfo where
123 arbitrary = ApiInfo <$> arbitrary
124
125 deriveJSON (unPrefix "") 'ApiInfo
126
127 instance ToSchema ApiInfo
128
129 info :: FlowCmdM env err m => UserId -> m ApiInfo
130 info _u = pure $ ApiInfo API.externalAPIs
131
132 ------------------------------------------------------------------------
133
134 data Database = Empty
135 | PubMed
136 | HAL
137 | IsTex
138 | Isidore
139 deriving (Eq, Show, Generic)
140
141 deriveJSON (unPrefix "") ''Database
142 instance ToSchema Database
143
144 database2origin :: Database -> DataOrigin
145 database2origin Empty = InternalOrigin T.IsTex
146 database2origin PubMed = ExternalOrigin T.PubMed
147 database2origin HAL = ExternalOrigin T.HAL
148 database2origin IsTex = ExternalOrigin T.IsTex
149 database2origin Isidore = ExternalOrigin T.Isidore
150
151 ------------------------------------------------------------------------
152 data WithQuery = WithQuery
153 { _wq_query :: !Text
154 , _wq_databases :: !Database
155 , _wq_lang :: !Lang
156 , _wq_node_id :: !Int
157 }
158 deriving Generic
159
160 makeLenses ''WithQuery
161 instance FromJSON WithQuery where
162 parseJSON = genericParseJSON $ jsonOptions "_wq_"
163 instance ToSchema WithQuery where
164 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
165
166 -------------------------------------------------------
167 data NewWithForm = NewWithForm
168 { _wf_filetype :: !FileType
169 , _wf_data :: !Text
170 , _wf_lang :: !(Maybe Lang)
171 , _wf_name :: !Text
172 } deriving (Eq, Show, Generic)
173
174 makeLenses ''NewWithForm
175 instance FromForm NewWithForm
176 instance FromJSON NewWithForm where
177 parseJSON = genericParseJSON $ jsonOptions "_wf_"
178 instance ToSchema NewWithForm where
179 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
180
181 -------------------------------------------------------
182 data NewWithFile = NewWithFile
183 { _wfi_b64_data :: !Text
184 , _wfi_lang :: !(Maybe Lang)
185 , _wfi_name :: !Text
186 } deriving (Eq, Show, Generic)
187
188 makeLenses ''NewWithFile
189 instance FromForm NewWithFile
190 instance FromJSON NewWithFile where
191 parseJSON = genericParseJSON $ jsonOptions "_wfi_"
192 instance ToSchema NewWithFile where
193 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wfi_")
194
195 instance GPU.SaveFile NewWithFile where
196 saveFile' fp (NewWithFile b64d _ _) = do
197 let eDecoded = BSB64.decode $ TE.encodeUtf8 b64d
198 case eDecoded of
199 Left err -> panic $ T.pack $ "Error decoding: " <> err
200 Right decoded -> BS.writeFile fp decoded
201 -- BS.writeFile fp $ BSB64.decodeLenient $ TE.encodeUtf8 b64d
202
203 --instance GPU.ReadFile NewWithFile where
204 -- readFile' = TIO.readFile
205
206 ------------------------------------------------------------------------
207 type AsyncJobs event ctI input output =
208 AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output
209 ------------------------------------------------------------------------
210
211 type AddWithQuery = Summary "Add with Query to corpus endpoint"
212 :> "corpus"
213 :> Capture "corpus_id" CorpusId
214 :> "query"
215 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
216
217 {-
218 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
219 :> "corpus"
220 :> Capture "corpus_id" CorpusId
221 :> "add"
222 :> "file"
223 :> MultipartForm Mem (MultipartData Mem)
224 :> QueryParam "fileType" FileType
225 :> "async"
226 :> AsyncJobs JobLog '[JSON] () JobLog
227 -}
228
229
230 ------------------------------------------------------------------------
231 -- TODO WithQuery also has a corpus id
232 addToCorpusWithQuery :: FlowCmdM env err m
233 => User
234 -> CorpusId
235 -> WithQuery
236 -> (JobLog -> m ())
237 -> m JobLog
238 addToCorpusWithQuery u cid (WithQuery q dbs l _nid) logStatus = do
239 -- TODO ...
240 logStatus JobLog { _scst_succeeded = Just 0
241 , _scst_failed = Just 0
242 , _scst_remaining = Just 5
243 , _scst_events = Just []
244 }
245 printDebug "addToCorpusWithQuery" (cid, dbs)
246 -- TODO add cid
247 -- TODO if cid is folder -> create Corpus
248 -- if cid is corpus -> add to corpus
249 -- if cid is root -> create corpus in Private
250 txts <- mapM (\db -> getDataText db (Multi l) q Nothing) [database2origin dbs]
251
252 logStatus JobLog { _scst_succeeded = Just 2
253 , _scst_failed = Just 0
254 , _scst_remaining = Just 1
255 , _scst_events = Just []
256 }
257
258 cids <- mapM (\txt -> flowDataText u txt (Multi l) cid) txts
259 printDebug "corpus id" cids
260 -- TODO ...
261 pure JobLog { _scst_succeeded = Just 3
262 , _scst_failed = Just 0
263 , _scst_remaining = Just 0
264 , _scst_events = Just []
265 }
266
267
268 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
269 :> "corpus"
270 :> Capture "corpus_id" CorpusId
271 :> "add"
272 :> "form"
273 :> "async"
274 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
275
276 addToCorpusWithForm :: FlowCmdM env err m
277 => User
278 -> CorpusId
279 -> NewWithForm
280 -> (JobLog -> m ())
281 -> m JobLog
282 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do
283
284 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
285 printDebug "[addToCorpusWithForm] fileType" ft
286 logStatus JobLog { _scst_succeeded = Just 0
287 , _scst_failed = Just 0
288 , _scst_remaining = Just 2
289 , _scst_events = Just []
290 }
291 let
292 parse = case ft of
293 CSV_HAL -> Parser.parseFormat Parser.CsvHal
294 CSV -> Parser.parseFormat Parser.CsvGargV3
295 WOS -> Parser.parseFormat Parser.WOS
296 PresseRIS -> Parser.parseFormat Parser.RisPresse
297
298 -- TODO granularity of the logStatus
299 docs <- liftBase $ splitEvery 500
300 <$> take 1000000
301 <$> parse (cs d)
302
303 printDebug "Parsing corpus finished : " cid
304 logStatus JobLog { _scst_succeeded = Just 1
305 , _scst_failed = Just 0
306 , _scst_remaining = Just 1
307 , _scst_events = Just []
308 }
309
310
311 printDebug "Starting extraction : " cid
312 -- TODO granularity of the logStatus
313 _cid' <- flowCorpus user
314 (Right [cid])
315 (Multi $ fromMaybe EN l)
316 (map (map toHyperdataDocument) docs)
317
318 printDebug "Extraction finished : " cid
319 pure JobLog { _scst_succeeded = Just 2
320 , _scst_failed = Just 0
321 , _scst_remaining = Just 0
322 , _scst_events = Just []
323 }
324
325 {-
326 addToCorpusWithFile :: FlowCmdM env err m
327 => CorpusId
328 -> MultipartData Mem
329 -> Maybe FileType
330 -> (JobLog -> m ())
331 -> m JobLog
332 addToCorpusWithFile cid input filetype logStatus = do
333 logStatus JobLog { _scst_succeeded = Just 10
334 , _scst_failed = Just 2
335 , _scst_remaining = Just 138
336 , _scst_events = Just []
337 }
338 printDebug "addToCorpusWithFile" cid
339 _h <- postUpload cid filetype input
340
341 pure JobLog { _scst_succeeded = Just 137
342 , _scst_failed = Just 13
343 , _scst_remaining = Just 0
344 , _scst_events = Just []
345 }
346 -}
347
348
349
350 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
351 :> "corpus"
352 :> Capture "corpus_id" CorpusId
353 :> "add"
354 :> "file"
355 :> "async"
356 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
357
358 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
359 => User
360 -> CorpusId
361 -> NewWithFile
362 -> (JobLog -> m ())
363 -> m JobLog
364 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
365
366 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
367 logStatus JobLog { _scst_succeeded = Just 0
368 , _scst_failed = Just 0
369 , _scst_remaining = Just 1
370 , _scst_events = Just []
371 }
372
373 fPath <- GPU.writeFile nwf
374 printDebug "[addToCorpusWithFile] File saved as: " fPath
375
376 uId <- getUserId user
377 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
378
379 _ <- case nIds of
380 [nId] -> do
381 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
382 let hl = node ^. node_hyperdata
383 _ <- updateHyperdata nId $ hl { _hff_name = fName
384 , _hff_path = T.pack fPath }
385
386 printDebug "[addToCorpusWithFile] Created node with id: " nId
387 _ -> pure ()
388
389 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
390 pure $ JobLog { _scst_succeeded = Just 1
391 , _scst_failed = Just 0
392 , _scst_remaining = Just 0
393 , _scst_events = Just []
394 }