]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
[text-api] first rewrite using Conduit
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21
22 import Control.Lens hiding (elements, Empty)
23 import Data.Aeson
24 import Data.Aeson.TH (deriveJSON)
25 import qualified Data.ByteString.Base64 as BSB64
26 import Data.Either
27 import Data.Maybe (fromMaybe)
28 import Data.Swagger
29 import Data.Text (Text)
30 import qualified Data.Text as T
31 import GHC.Generics (Generic)
32 import Servant
33 import Servant.Job.Utils (jsonOptions)
34 -- import Servant.Multipart
35 import qualified Data.Text.Encoding as TE
36 -- import Test.QuickCheck (elements)
37 import Test.QuickCheck.Arbitrary
38
39 import Gargantext.Prelude
40
41 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
42 import Gargantext.API.Admin.Types (HasSettings)
43 import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal, jobLogFailTotalWithMessage)
44 import Gargantext.API.Node.Corpus.New.File
45 import Gargantext.API.Node.Corpus.Searx
46 import Gargantext.API.Node.Corpus.Types
47 import Gargantext.API.Node.Types
48 import Gargantext.Core (Lang(..){-, allLangs-})
49 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
50 import qualified Gargantext.Core.Text.Corpus.API as API
51 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
52 import Gargantext.Core.Types.Individu (User(..))
53 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
54 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
55 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
56 import Gargantext.Database.Action.Mail (sendMail)
57 import Gargantext.Database.Action.Node (mkNodeWithParent)
58 import Gargantext.Database.Action.User (getUserId)
59 import Gargantext.Database.Admin.Types.Hyperdata
60 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
61 import Gargantext.Database.Prelude (hasConfig)
62 import Gargantext.Database.Query.Table.Node (getNodeWith)
63 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
64 import Gargantext.Database.Schema.Node (node_hyperdata)
65 import qualified Gargantext.Database.GargDB as GargDB
66 import Gargantext.Prelude.Config (gc_max_docs_parsers)
67 ------------------------------------------------------------------------
68 {-
69 data Query = Query { query_query :: Text
70 , query_node_id :: Int
71 , query_lang :: Lang
72 , query_databases :: [DataOrigin]
73 }
74 deriving (Eq, Generic)
75
76 deriveJSON (unPrefix "query_") 'Query
77
78 instance Arbitrary Query where
79 arbitrary = elements [ Query q n la fs
80 | q <- ["honeybee* AND collapse"
81 ,"covid 19"
82 ]
83 , n <- [0..10]
84 , la <- allLangs
85 , fs <- take 3 $ repeat allDataOrigins
86 ]
87
88 instance ToSchema Query where
89 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
90 -}
91
92 ------------------------------------------------------------------------
93
94 {-
95 type Api = PostApi
96 :<|> GetApi
97
98 type PostApi = Summary "New Corpus endpoint"
99 :> ReqBody '[JSON] Query
100 :> Post '[JSON] CorpusId
101 type GetApi = Get '[JSON] ApiInfo
102 -}
103
104 -- | TODO manage several apis
105 -- TODO-ACCESS
106 -- TODO this is only the POST
107 {-
108 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
109 api uid (Query q _ as) = do
110 cId <- case head as of
111 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
112 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
113 Just a -> do
114 docs <- liftBase $ API.get a q (Just 1000)
115 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
116 pure cId'
117
118 pure cId
119 -}
120
121 ------------------------------------------------
122 -- TODO use this route for Client implementation
123 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
124 deriving (Generic)
125 instance Arbitrary ApiInfo where
126 arbitrary = ApiInfo <$> arbitrary
127
128 deriveJSON (unPrefix "") 'ApiInfo
129
130 instance ToSchema ApiInfo
131
132 info :: FlowCmdM env err m => UserId -> m ApiInfo
133 info _u = pure $ ApiInfo API.externalAPIs
134
135 ------------------------------------------------------------------------
136 ------------------------------------------------------------------------
137 data WithQuery = WithQuery
138 { _wq_query :: !Text
139 , _wq_databases :: !Database
140 , _wq_datafield :: !(Maybe Datafield)
141 , _wq_lang :: !Lang
142 , _wq_node_id :: !Int
143 , _wq_flowListWith :: !FlowSocialListWith
144 }
145 deriving Generic
146
147 makeLenses ''WithQuery
148 instance FromJSON WithQuery where
149 parseJSON = genericParseJSON $ jsonOptions "_wq_"
150 instance ToJSON WithQuery where
151 toJSON = genericToJSON $ jsonOptions "_wq_"
152 instance ToSchema WithQuery where
153 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
154
155 ------------------------------------------------------------------------
156
157 type AddWithQuery = Summary "Add with Query to corpus endpoint"
158 :> "corpus"
159 :> Capture "corpus_id" CorpusId
160 :> "query"
161 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
162
163 {-
164 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
165 :> "corpus"
166 :> Capture "corpus_id" CorpusId
167 :> "add"
168 :> "file"
169 :> MultipartForm Mem (MultipartData Mem)
170 :> QueryParam "fileType" FileType
171 :> "async"
172 :> AsyncJobs JobLog '[JSON] () JobLog
173 -}
174
175
176 ------------------------------------------------------------------------
177 -- TODO WithQuery also has a corpus id
178 addToCorpusWithQuery :: FlowCmdM env err m
179 => User
180 -> CorpusId
181 -> WithQuery
182 -> Maybe Integer
183 -> (JobLog -> m ())
184 -> m JobLog
185 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
186 , _wq_databases = dbs
187 , _wq_datafield = datafield
188 , _wq_lang = l
189 , _wq_flowListWith = flw }) maybeLimit logStatus = do
190 -- TODO ...
191 logStatus JobLog { _scst_succeeded = Just 0
192 , _scst_failed = Just 0
193 , _scst_remaining = Just 3
194 , _scst_events = Just []
195 }
196 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
197 printDebug "[addToCorpusWithQuery] datafield" datafield
198 printDebug "[addToCorpusWithQuery] flowListWith" flw
199
200 case datafield of
201 Just Web -> do
202 printDebug "[addToCorpusWithQuery] processing web request" datafield
203
204 _ <- triggerSearxSearch user cid q l logStatus
205
206 pure JobLog { _scst_succeeded = Just 3
207 , _scst_failed = Just 0
208 , _scst_remaining = Just 0
209 , _scst_events = Just []
210 }
211
212 _ -> do
213 -- TODO add cid
214 -- TODO if cid is folder -> create Corpus
215 -- if cid is corpus -> add to corpus
216 -- if cid is root -> create corpus in Private
217 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
218
219 -- TODO Sum lenghts of each txt elements
220 logStatus JobLog { _scst_succeeded = Just 2
221 , _scst_failed = Just 0
222 , _scst_remaining = Just $ 1 + length txts
223 , _scst_events = Just []
224 }
225
226 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid Nothing logStatus) txts
227 printDebug "corpus id" cids
228 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
229 sendMail user
230 -- TODO ...
231 pure JobLog { _scst_succeeded = Just 3
232 , _scst_failed = Just 0
233 , _scst_remaining = Just 0
234 , _scst_events = Just []
235 }
236
237
238 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
239 :> "corpus"
240 :> Capture "corpus_id" CorpusId
241 :> "add"
242 :> "form"
243 :> "async"
244 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
245
246 addToCorpusWithForm :: (FlowCmdM env err m)
247 => User
248 -> CorpusId
249 -> NewWithForm
250 -> (JobLog -> m ())
251 -> JobLog
252 -> m JobLog
253 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
254 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
255 printDebug "[addToCorpusWithForm] fileType" ft
256 logStatus jobLog
257 let
258 parse = case ft of
259 CSV_HAL -> Parser.parseFormat Parser.CsvHal
260 CSV -> Parser.parseFormat Parser.CsvGargV3
261 WOS -> Parser.parseFormat Parser.WOS
262 PresseRIS -> Parser.parseFormat Parser.RisPresse
263 ZIP -> Parser.parseFormat Parser.ZIP
264
265 -- TODO granularity of the logStatus
266 let data' = case ft of
267 ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
268 Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
269 Right decoded -> decoded
270 _ -> cs d
271 eDocs <- liftBase $ parse data'
272 case eDocs of
273 Right docs' -> do
274 -- TODO Add progress (jobStatus) update for docs - this is a
275 -- long action
276 limit' <- view $ hasConfig . gc_max_docs_parsers
277 let limit = fromIntegral limit'
278 if length docs' > limit then do
279 printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show $ length docs')
280 let panicMsg' = [ "[addToCorpusWithForm] number of docs ("
281 , show $ length docs'
282 , ") exceeds the MAX_DOCS_PARSERS limit ("
283 , show limit
284 , ")" ]
285 let panicMsg = T.concat $ T.pack <$> panicMsg'
286 logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
287 panic panicMsg
288 else
289 pure ()
290 let docs = splitEvery 500 $ take limit docs'
291
292 printDebug "Parsing corpus finished : " cid
293 logStatus jobLog2
294
295 printDebug "Starting extraction : " cid
296 -- TODO granularity of the logStatus
297 _cid' <- flowCorpus user
298 (Right [cid])
299 (Multi $ fromMaybe EN l)
300 Nothing
301 (map (map toHyperdataDocument) docs)
302 logStatus
303
304 printDebug "Extraction finished : " cid
305 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
306 sendMail user
307
308 logStatus jobLog3
309 pure $ jobLog3
310 Left e -> do
311 printDebug "[addToCorpusWithForm] parse error" e
312
313 let evt = ScraperEvent { _scev_message = Just $ T.pack e
314 , _scev_level = Just "ERROR"
315 , _scev_date = Nothing }
316
317 logStatus $ over (scst_events . _Just) (\evt' -> evt' <> [evt]) jobLogE
318 pure jobLogE
319 where
320 jobLog2 = jobLogSuccess jobLog
321 jobLog3 = jobLogSuccess jobLog2
322 jobLogE = jobLogFailTotal jobLog
323
324 {-
325 addToCorpusWithFile :: FlowCmdM env err m
326 => CorpusId
327 -> MultipartData Mem
328 -> Maybe FileType
329 -> (JobLog -> m ())
330 -> m JobLog
331 addToCorpusWithFile cid input filetype logStatus = do
332 logStatus JobLog { _scst_succeeded = Just 10
333 , _scst_failed = Just 2
334 , _scst_remaining = Just 138
335 , _scst_events = Just []
336 }
337 printDebug "addToCorpusWithFile" cid
338 _h <- postUpload cid filetype input
339
340 pure JobLog { _scst_succeeded = Just 137
341 , _scst_failed = Just 13
342 , _scst_remaining = Just 0
343 , _scst_events = Just []
344 }
345 -}
346
347
348
349 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
350 :> "corpus"
351 :> Capture "corpus_id" CorpusId
352 :> "add"
353 :> "file"
354 :> "async"
355 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
356
357 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
358 => User
359 -> CorpusId
360 -> NewWithFile
361 -> (JobLog -> m ())
362 -> m JobLog
363 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
364
365 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
366 logStatus JobLog { _scst_succeeded = Just 0
367 , _scst_failed = Just 0
368 , _scst_remaining = Just 1
369 , _scst_events = Just []
370 }
371
372 fPath <- GargDB.writeFile nwf
373 printDebug "[addToCorpusWithFile] File saved as: " fPath
374
375 uId <- getUserId user
376 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
377
378 _ <- case nIds of
379 [nId] -> do
380 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
381 let hl = node ^. node_hyperdata
382 _ <- updateHyperdata nId $ hl { _hff_name = fName
383 , _hff_path = T.pack fPath }
384
385 printDebug "[addToCorpusWithFile] Created node with id: " nId
386 _ -> pure ()
387
388 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
389
390 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
391 sendMail user
392
393 pure $ JobLog { _scst_succeeded = Just 1
394 , _scst_failed = Just 0
395 , _scst_remaining = Just 0
396 , _scst_events = Just []
397 }
398