]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
[corpus new] add info about current doc id
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21
22 import Conduit
23 import Control.Lens hiding (elements, Empty)
24 import Data.Aeson
25 import Data.Aeson.TH (deriveJSON)
26 import qualified Data.ByteString.Base64 as BSB64
27 import Data.Either
28 import Data.Maybe (fromMaybe)
29 import Data.Swagger
30 import Data.Text (Text)
31 import qualified Data.Text as T
32 import GHC.Generics (Generic)
33 import Servant
34 import Servant.Job.Utils (jsonOptions)
35 -- import Servant.Multipart
36 import qualified Data.Text.Encoding as TE
37 -- import Test.QuickCheck (elements)
38 import Test.QuickCheck.Arbitrary
39
40 import Gargantext.Prelude
41
42 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
43 import Gargantext.API.Admin.Types (HasSettings)
44 import Gargantext.API.Job (addEvent, jobLogSuccess, jobLogFailTotal, jobLogFailTotalWithMessage)
45 import Gargantext.API.Node.Corpus.New.File
46 import Gargantext.API.Node.Corpus.Searx
47 import Gargantext.API.Node.Corpus.Types
48 import Gargantext.API.Node.Types
49 import Gargantext.Core (Lang(..){-, allLangs-})
50 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
51 import qualified Gargantext.Core.Text.Corpus.API as API
52 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
53 import Gargantext.Core.Types.Individu (User(..))
54 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
55 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, DataText(..), TermType(..){-, allDataOrigins-})
56 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
57 import Gargantext.Database.Action.Mail (sendMail)
58 import Gargantext.Database.Action.Node (mkNodeWithParent)
59 import Gargantext.Database.Action.User (getUserId)
60 import Gargantext.Database.Admin.Types.Hyperdata
61 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
62 import Gargantext.Database.Prelude (hasConfig)
63 import Gargantext.Database.Query.Table.Node (getNodeWith)
64 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
65 import Gargantext.Database.Schema.Node (node_hyperdata)
66 import qualified Gargantext.Database.GargDB as GargDB
67 import Gargantext.Prelude.Config (gc_max_docs_parsers)
68 ------------------------------------------------------------------------
69 {-
70 data Query = Query { query_query :: Text
71 , query_node_id :: Int
72 , query_lang :: Lang
73 , query_databases :: [DataOrigin]
74 }
75 deriving (Eq, Generic)
76
77 deriveJSON (unPrefix "query_") 'Query
78
79 instance Arbitrary Query where
80 arbitrary = elements [ Query q n la fs
81 | q <- ["honeybee* AND collapse"
82 ,"covid 19"
83 ]
84 , n <- [0..10]
85 , la <- allLangs
86 , fs <- take 3 $ repeat allDataOrigins
87 ]
88
89 instance ToSchema Query where
90 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
91 -}
92
93 ------------------------------------------------------------------------
94
95 {-
96 type Api = PostApi
97 :<|> GetApi
98
99 type PostApi = Summary "New Corpus endpoint"
100 :> ReqBody '[JSON] Query
101 :> Post '[JSON] CorpusId
102 type GetApi = Get '[JSON] ApiInfo
103 -}
104
105 -- | TODO manage several apis
106 -- TODO-ACCESS
107 -- TODO this is only the POST
108 {-
109 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
110 api uid (Query q _ as) = do
111 cId <- case head as of
112 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
113 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
114 Just a -> do
115 docs <- liftBase $ API.get a q (Just 1000)
116 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
117 pure cId'
118
119 pure cId
120 -}
121
122 ------------------------------------------------
123 -- TODO use this route for Client implementation
124 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
125 deriving (Generic)
126 instance Arbitrary ApiInfo where
127 arbitrary = ApiInfo <$> arbitrary
128
129 deriveJSON (unPrefix "") 'ApiInfo
130
131 instance ToSchema ApiInfo
132
133 info :: FlowCmdM env err m => UserId -> m ApiInfo
134 info _u = pure $ ApiInfo API.externalAPIs
135
136 ------------------------------------------------------------------------
137 ------------------------------------------------------------------------
138 data WithQuery = WithQuery
139 { _wq_query :: !Text
140 , _wq_databases :: !Database
141 , _wq_datafield :: !(Maybe Datafield)
142 , _wq_lang :: !Lang
143 , _wq_node_id :: !Int
144 , _wq_flowListWith :: !FlowSocialListWith
145 }
146 deriving Generic
147
148 makeLenses ''WithQuery
149 instance FromJSON WithQuery where
150 parseJSON = genericParseJSON $ jsonOptions "_wq_"
151 instance ToJSON WithQuery where
152 toJSON = genericToJSON $ jsonOptions "_wq_"
153 instance ToSchema WithQuery where
154 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
155
156 ------------------------------------------------------------------------
157
158 type AddWithQuery = Summary "Add with Query to corpus endpoint"
159 :> "corpus"
160 :> Capture "corpus_id" CorpusId
161 :> "query"
162 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
163
164 {-
165 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
166 :> "corpus"
167 :> Capture "corpus_id" CorpusId
168 :> "add"
169 :> "file"
170 :> MultipartForm Mem (MultipartData Mem)
171 :> QueryParam "fileType" FileType
172 :> "async"
173 :> AsyncJobs JobLog '[JSON] () JobLog
174 -}
175
176
177 ------------------------------------------------------------------------
178 -- TODO WithQuery also has a corpus id
179 addToCorpusWithQuery :: FlowCmdM env err m
180 => User
181 -> CorpusId
182 -> WithQuery
183 -> Maybe Integer
184 -> (JobLog -> m ())
185 -> m JobLog
186 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
187 , _wq_databases = dbs
188 , _wq_datafield = datafield
189 , _wq_lang = l
190 , _wq_flowListWith = flw }) maybeLimit logStatus = do
191 -- TODO ...
192 logStatus JobLog { _scst_succeeded = Just 0
193 , _scst_failed = Just 0
194 , _scst_remaining = Just 3
195 , _scst_events = Just []
196 }
197 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
198 printDebug "[addToCorpusWithQuery] datafield" datafield
199 printDebug "[addToCorpusWithQuery] flowListWith" flw
200
201 case datafield of
202 Just Web -> do
203 printDebug "[addToCorpusWithQuery] processing web request" datafield
204
205 _ <- triggerSearxSearch user cid q l logStatus
206
207 pure JobLog { _scst_succeeded = Just 3
208 , _scst_failed = Just 0
209 , _scst_remaining = Just 0
210 , _scst_events = Just []
211 }
212
213 _ -> do
214 -- TODO add cid
215 -- TODO if cid is folder -> create Corpus
216 -- if cid is corpus -> add to corpus
217 -- if cid is root -> create corpus in Private
218 eTxts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
219 let lTxts = lefts eTxts
220 case lTxts of
221 [] -> do
222 let txts = rights eTxts
223 -- TODO Sum lenghts of each txt elements
224 let jl = JobLog { _scst_succeeded = Just 2
225 , _scst_failed = Just 0
226 , _scst_remaining = Just $ 1 + length txts
227 , _scst_events = Just []
228 }
229 logStatus jl
230
231 cids <- mapM (\txt -> do
232 let id = case txt of
233 (DataNew (i, _)) -> i
234 _ -> (Just 0)
235 logStatus $ addEvent "INFO: doc id" (T.pack $ show id) jl
236 flowDataText user txt (Multi l) cid Nothing logStatus) txts
237 printDebug "corpus id" cids
238 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
239 sendMail user
240 -- TODO ...
241 pure JobLog { _scst_succeeded = Just 3
242 , _scst_failed = Just 0
243 , _scst_remaining = Just 0
244 , _scst_events = Just []
245 }
246
247 (err:_) -> do
248 pure $ addEvent "ERROR" (T.pack $ show err) $
249 JobLog { _scst_succeeded = Just 2
250 , _scst_failed = Just 1
251 , _scst_remaining = Just 0
252 , _scst_events = Just []
253 }
254
255
256 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
257 :> "corpus"
258 :> Capture "corpus_id" CorpusId
259 :> "add"
260 :> "form"
261 :> "async"
262 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
263
264 addToCorpusWithForm :: (FlowCmdM env err m)
265 => User
266 -> CorpusId
267 -> NewWithForm
268 -> (JobLog -> m ())
269 -> JobLog
270 -> m JobLog
271 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
272 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
273 printDebug "[addToCorpusWithForm] fileType" ft
274 logStatus jobLog
275 let
276 parse = case ft of
277 CSV_HAL -> Parser.parseFormat Parser.CsvHal
278 CSV -> Parser.parseFormat Parser.CsvGargV3
279 WOS -> Parser.parseFormat Parser.WOS
280 PresseRIS -> Parser.parseFormat Parser.RisPresse
281 ZIP -> Parser.parseFormat Parser.ZIP
282
283 -- TODO granularity of the logStatus
284 let data' = case ft of
285 ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
286 Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
287 Right decoded -> decoded
288 _ -> cs d
289 eDocs <- liftBase $ parse data'
290 case eDocs of
291 Right docs -> do
292 -- TODO Add progress (jobStatus) update for docs - this is a
293 -- long action
294
295 limit' <- view $ hasConfig . gc_max_docs_parsers
296 let limit = fromIntegral limit'
297 if length docs > limit then do
298 printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show $ length docs)
299 let panicMsg' = [ "[addToCorpusWithForm] number of docs ("
300 , show $ length docs
301 , ") exceeds the MAX_DOCS_PARSERS limit ("
302 , show limit
303 , ")" ]
304 let panicMsg = T.concat $ T.pack <$> panicMsg'
305 logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
306 panic panicMsg
307 else
308 pure ()
309
310 printDebug "Parsing corpus finished : " cid
311 logStatus jobLog2
312
313 printDebug "Starting extraction : " cid
314 -- TODO granularity of the logStatus
315 _cid' <- flowCorpus user
316 (Right [cid])
317 (Multi $ fromMaybe EN l)
318 Nothing
319 (Just $ fromIntegral $ length docs, yieldMany docs .| mapC toHyperdataDocument)
320 --(map (map toHyperdataDocument) docs)
321 logStatus
322
323 printDebug "Extraction finished : " cid
324 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
325 sendMail user
326
327 logStatus jobLog3
328 pure $ jobLog3
329 Left e -> do
330 printDebug "[addToCorpusWithForm] parse error" e
331
332 let evt = ScraperEvent { _scev_message = Just $ T.pack e
333 , _scev_level = Just "ERROR"
334 , _scev_date = Nothing }
335
336 logStatus $ over (scst_events . _Just) (\evt' -> evt' <> [evt]) jobLogE
337 pure jobLogE
338 where
339 jobLog2 = jobLogSuccess jobLog
340 jobLog3 = jobLogSuccess jobLog2
341 jobLogE = jobLogFailTotal jobLog
342
343 {-
344 addToCorpusWithFile :: FlowCmdM env err m
345 => CorpusId
346 -> MultipartData Mem
347 -> Maybe FileType
348 -> (JobLog -> m ())
349 -> m JobLog
350 addToCorpusWithFile cid input filetype logStatus = do
351 logStatus JobLog { _scst_succeeded = Just 10
352 , _scst_failed = Just 2
353 , _scst_remaining = Just 138
354 , _scst_events = Just []
355 }
356 printDebug "addToCorpusWithFile" cid
357 _h <- postUpload cid filetype input
358
359 pure JobLog { _scst_succeeded = Just 137
360 , _scst_failed = Just 13
361 , _scst_remaining = Just 0
362 , _scst_events = Just []
363 }
364 -}
365
366
367
368 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
369 :> "corpus"
370 :> Capture "corpus_id" CorpusId
371 :> "add"
372 :> "file"
373 :> "async"
374 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
375
376 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
377 => User
378 -> CorpusId
379 -> NewWithFile
380 -> (JobLog -> m ())
381 -> m JobLog
382 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
383
384 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
385 logStatus JobLog { _scst_succeeded = Just 0
386 , _scst_failed = Just 0
387 , _scst_remaining = Just 1
388 , _scst_events = Just []
389 }
390
391 fPath <- GargDB.writeFile nwf
392 printDebug "[addToCorpusWithFile] File saved as: " fPath
393
394 uId <- getUserId user
395 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
396
397 _ <- case nIds of
398 [nId] -> do
399 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
400 let hl = node ^. node_hyperdata
401 _ <- updateHyperdata nId $ hl { _hff_name = fName
402 , _hff_path = T.pack fPath }
403
404 printDebug "[addToCorpusWithFile] Created node with id: " nId
405 _ -> pure ()
406
407 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
408
409 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
410 sendMail user
411
412 pure $ JobLog { _scst_succeeded = Just 1
413 , _scst_failed = Just 0
414 , _scst_remaining = Just 0
415 , _scst_events = Just []
416 }
417