]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
[hal] use conduit for hal parsing
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21
22 import Control.Lens hiding (elements, Empty)
23 import Data.Aeson
24 import Data.Aeson.TH (deriveJSON)
25 import qualified Data.ByteString.Base64 as BSB64
26 import Data.Either
27 import Data.Maybe (fromMaybe)
28 import Data.Swagger
29 import Data.Text (Text)
30 import qualified Data.Text as T
31 import GHC.Generics (Generic)
32 import Servant
33 import Servant.Job.Utils (jsonOptions)
34 -- import Servant.Multipart
35 import qualified Data.Text.Encoding as TE
36 -- import Test.QuickCheck (elements)
37 import Test.QuickCheck.Arbitrary
38
39 import Gargantext.Prelude
40
41 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
42 import Gargantext.API.Admin.Types (HasSettings)
43 import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal, jobLogFailTotalWithMessage)
44 import Gargantext.API.Node.Corpus.New.File
45 import Gargantext.API.Node.Corpus.Searx
46 import Gargantext.API.Node.Corpus.Types
47 import Gargantext.API.Node.Types
48 import Gargantext.Core (Lang(..){-, allLangs-})
49 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
50 import qualified Gargantext.Core.Text.Corpus.API as API
51 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
52 import Gargantext.Core.Types.Individu (User(..))
53 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
54 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
55 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
56 import Gargantext.Database.Action.Mail (sendMail)
57 import Gargantext.Database.Action.Node (mkNodeWithParent)
58 import Gargantext.Database.Action.User (getUserId)
59 import Gargantext.Database.Admin.Types.Hyperdata
60 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
61 import Gargantext.Database.Prelude (hasConfig)
62 import Gargantext.Database.Query.Table.Node (getNodeWith)
63 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
64 import Gargantext.Database.Schema.Node (node_hyperdata)
65 import qualified Gargantext.Database.GargDB as GargDB
66 import Gargantext.Prelude.Config (gc_max_docs_parsers)
67 ------------------------------------------------------------------------
68 {-
69 data Query = Query { query_query :: Text
70 , query_node_id :: Int
71 , query_lang :: Lang
72 , query_databases :: [DataOrigin]
73 }
74 deriving (Eq, Generic)
75
76 deriveJSON (unPrefix "query_") 'Query
77
78 instance Arbitrary Query where
79 arbitrary = elements [ Query q n la fs
80 | q <- ["honeybee* AND collapse"
81 ,"covid 19"
82 ]
83 , n <- [0..10]
84 , la <- allLangs
85 , fs <- take 3 $ repeat allDataOrigins
86 ]
87
88 instance ToSchema Query where
89 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
90 -}
91
92 ------------------------------------------------------------------------
93
94 {-
95 type Api = PostApi
96 :<|> GetApi
97
98 type PostApi = Summary "New Corpus endpoint"
99 :> ReqBody '[JSON] Query
100 :> Post '[JSON] CorpusId
101 type GetApi = Get '[JSON] ApiInfo
102 -}
103
104 -- | TODO manage several apis
105 -- TODO-ACCESS
106 -- TODO this is only the POST
107 {-
108 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
109 api uid (Query q _ as) = do
110 cId <- case head as of
111 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
112 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
113 Just a -> do
114 docs <- liftBase $ API.get a q (Just 1000)
115 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
116 pure cId'
117
118 pure cId
119 -}
120
121 ------------------------------------------------
122 -- TODO use this route for Client implementation
123 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
124 deriving (Generic)
125 instance Arbitrary ApiInfo where
126 arbitrary = ApiInfo <$> arbitrary
127
128 deriveJSON (unPrefix "") 'ApiInfo
129
130 instance ToSchema ApiInfo
131
132 info :: FlowCmdM env err m => UserId -> m ApiInfo
133 info _u = pure $ ApiInfo API.externalAPIs
134
135 ------------------------------------------------------------------------
136 ------------------------------------------------------------------------
137 data WithQuery = WithQuery
138 { _wq_query :: !Text
139 , _wq_databases :: !Database
140 , _wq_datafield :: !(Maybe Datafield)
141 , _wq_lang :: !Lang
142 , _wq_node_id :: !Int
143 , _wq_flowListWith :: !FlowSocialListWith
144 }
145 deriving Generic
146
147 makeLenses ''WithQuery
148 instance FromJSON WithQuery where
149 parseJSON = genericParseJSON $ jsonOptions "_wq_"
150 instance ToJSON WithQuery where
151 toJSON = genericToJSON $ jsonOptions "_wq_"
152 instance ToSchema WithQuery where
153 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
154
155 ------------------------------------------------------------------------
156
157 type AddWithQuery = Summary "Add with Query to corpus endpoint"
158 :> "corpus"
159 :> Capture "corpus_id" CorpusId
160 :> "query"
161 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
162
163 {-
164 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
165 :> "corpus"
166 :> Capture "corpus_id" CorpusId
167 :> "add"
168 :> "file"
169 :> MultipartForm Mem (MultipartData Mem)
170 :> QueryParam "fileType" FileType
171 :> "async"
172 :> AsyncJobs JobLog '[JSON] () JobLog
173 -}
174
175
176 ------------------------------------------------------------------------
177 -- TODO WithQuery also has a corpus id
178 addToCorpusWithQuery :: FlowCmdM env err m
179 => User
180 -> CorpusId
181 -> WithQuery
182 -> Maybe Integer
183 -> (JobLog -> m ())
184 -> m JobLog
185 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
186 , _wq_databases = dbs
187 , _wq_datafield = datafield
188 , _wq_lang = l
189 , _wq_flowListWith = flw }) maybeLimit logStatus = do
190 -- TODO ...
191 logStatus JobLog { _scst_succeeded = Just 0
192 , _scst_failed = Just 0
193 , _scst_remaining = Just 3
194 , _scst_events = Just []
195 }
196 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
197 printDebug "[addToCorpusWithQuery] datafield" datafield
198 printDebug "[addToCorpusWithQuery] flowListWith" flw
199
200 case datafield of
201 Just Web -> do
202 printDebug "[addToCorpusWithQuery] processing web request" datafield
203
204 _ <- triggerSearxSearch user cid q l logStatus
205
206 pure JobLog { _scst_succeeded = Just 3
207 , _scst_failed = Just 0
208 , _scst_remaining = Just 0
209 , _scst_events = Just []
210 }
211
212 _ -> do
213 -- TODO add cid
214 -- TODO if cid is folder -> create Corpus
215 -- if cid is corpus -> add to corpus
216 -- if cid is root -> create corpus in Private
217 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
218
219 logStatus JobLog { _scst_succeeded = Just 2
220 , _scst_failed = Just 0
221 , _scst_remaining = Just $ 1 + length txts
222 , _scst_events = Just []
223 }
224
225 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid Nothing logStatus) txts
226 printDebug "corpus id" cids
227 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
228 sendMail user
229 -- TODO ...
230 pure JobLog { _scst_succeeded = Just 3
231 , _scst_failed = Just 0
232 , _scst_remaining = Just 0
233 , _scst_events = Just []
234 }
235
236
237 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
238 :> "corpus"
239 :> Capture "corpus_id" CorpusId
240 :> "add"
241 :> "form"
242 :> "async"
243 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
244
245 addToCorpusWithForm :: (FlowCmdM env err m)
246 => User
247 -> CorpusId
248 -> NewWithForm
249 -> (JobLog -> m ())
250 -> JobLog
251 -> m JobLog
252 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
253 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
254 printDebug "[addToCorpusWithForm] fileType" ft
255 logStatus jobLog
256 let
257 parse = case ft of
258 CSV_HAL -> Parser.parseFormat Parser.CsvHal
259 CSV -> Parser.parseFormat Parser.CsvGargV3
260 WOS -> Parser.parseFormat Parser.WOS
261 PresseRIS -> Parser.parseFormat Parser.RisPresse
262 ZIP -> Parser.parseFormat Parser.ZIP
263
264 -- TODO granularity of the logStatus
265 let data' = case ft of
266 ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
267 Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
268 Right decoded -> decoded
269 _ -> cs d
270 eDocs <- liftBase $ parse data'
271 case eDocs of
272 Right docs' -> do
273 -- TODO Add progress (jobStatus) update for docs - this is a
274 -- long action
275 limit' <- view $ hasConfig . gc_max_docs_parsers
276 let limit = fromIntegral limit'
277 if length docs' > limit then do
278 printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show $ length docs')
279 let panicMsg' = [ "[addToCorpusWithForm] number of docs ("
280 , show $ length docs'
281 , ") exceeds the MAX_DOCS_PARSERS limit ("
282 , show limit
283 , ")" ]
284 let panicMsg = T.concat $ T.pack <$> panicMsg'
285 logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
286 panic panicMsg
287 else
288 pure ()
289 let docs = splitEvery 500 $ take limit docs'
290
291 printDebug "Parsing corpus finished : " cid
292 logStatus jobLog2
293
294 printDebug "Starting extraction : " cid
295 -- TODO granularity of the logStatus
296 _cid' <- flowCorpus user
297 (Right [cid])
298 (Multi $ fromMaybe EN l)
299 Nothing
300 (map (map toHyperdataDocument) docs)
301 logStatus
302
303 printDebug "Extraction finished : " cid
304 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
305 sendMail user
306
307 logStatus jobLog3
308 pure $ jobLog3
309 Left e -> do
310 printDebug "[addToCorpusWithForm] parse error" e
311
312 let evt = ScraperEvent { _scev_message = Just $ T.pack e
313 , _scev_level = Just "ERROR"
314 , _scev_date = Nothing }
315
316 logStatus $ over (scst_events . _Just) (\evt' -> evt' <> [evt]) jobLogE
317 pure jobLogE
318 where
319 jobLog2 = jobLogSuccess jobLog
320 jobLog3 = jobLogSuccess jobLog2
321 jobLogE = jobLogFailTotal jobLog
322
323 {-
324 addToCorpusWithFile :: FlowCmdM env err m
325 => CorpusId
326 -> MultipartData Mem
327 -> Maybe FileType
328 -> (JobLog -> m ())
329 -> m JobLog
330 addToCorpusWithFile cid input filetype logStatus = do
331 logStatus JobLog { _scst_succeeded = Just 10
332 , _scst_failed = Just 2
333 , _scst_remaining = Just 138
334 , _scst_events = Just []
335 }
336 printDebug "addToCorpusWithFile" cid
337 _h <- postUpload cid filetype input
338
339 pure JobLog { _scst_succeeded = Just 137
340 , _scst_failed = Just 13
341 , _scst_remaining = Just 0
342 , _scst_events = Just []
343 }
344 -}
345
346
347
348 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
349 :> "corpus"
350 :> Capture "corpus_id" CorpusId
351 :> "add"
352 :> "file"
353 :> "async"
354 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
355
356 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
357 => User
358 -> CorpusId
359 -> NewWithFile
360 -> (JobLog -> m ())
361 -> m JobLog
362 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
363
364 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
365 logStatus JobLog { _scst_succeeded = Just 0
366 , _scst_failed = Just 0
367 , _scst_remaining = Just 1
368 , _scst_events = Just []
369 }
370
371 fPath <- GargDB.writeFile nwf
372 printDebug "[addToCorpusWithFile] File saved as: " fPath
373
374 uId <- getUserId user
375 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
376
377 _ <- case nIds of
378 [nId] -> do
379 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
380 let hl = node ^. node_hyperdata
381 _ <- updateHyperdata nId $ hl { _hff_name = fName
382 , _hff_path = T.pack fPath }
383
384 printDebug "[addToCorpusWithFile] Created node with id: " nId
385 _ -> pure ()
386
387 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
388
389 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
390 sendMail user
391
392 pure $ JobLog { _scst_succeeded = Just 1
393 , _scst_failed = Just 0
394 , _scst_remaining = Just 0
395 , _scst_events = Just []
396 }
397