]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/API/Node/Corpus/New.hs
[FEAT] FrameWrite Corpus improvement
[gargantext.git] / src / Gargantext / API / Node / Corpus / New.hs
1 {-|
2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 New corpus means either:
11 - new corpus
12 - new data in existing corpus
13 -}
14
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
17
18 module Gargantext.API.Node.Corpus.New
19 where
20
21 import Conduit
22 import Control.Lens hiding (elements, Empty)
23 import Data.Aeson
24 import Data.Aeson.TH (deriveJSON)
25 import qualified Data.ByteString.Base64 as BSB64
26 import Data.Conduit.Internal (zipSources)
27 import Data.Either
28 import Data.Maybe (fromMaybe)
29 import Data.Swagger
30 import Data.Text (Text)
31 import qualified Data.Text as T
32 import GHC.Generics (Generic)
33 import Servant
34 import Servant.Job.Utils (jsonOptions)
35 -- import Servant.Multipart
36 import qualified Data.Text.Encoding as TE
37 -- import Test.QuickCheck (elements)
38 import Test.QuickCheck.Arbitrary
39
40 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
41 import Gargantext.API.Admin.Types (HasSettings)
42 import Gargantext.API.Job (addEvent, jobLogSuccess, jobLogFailTotal)
43 import Gargantext.API.Node.Corpus.New.Types
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..){-, allLangs-})
48 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
49 import Gargantext.Core.Types.Individu (User(..))
50 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
51 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
52 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
53 import Gargantext.Database.Action.Mail (sendMail)
54 import Gargantext.Database.Action.Node (mkNodeWithParent)
55 import Gargantext.Database.Action.User (getUserId)
56 import Gargantext.Database.Admin.Types.Hyperdata
57 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
58 import Gargantext.Database.Prelude (hasConfig)
59 import Gargantext.Database.Query.Table.Node (getNodeWith)
60 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
61 import Gargantext.Database.Schema.Node (node_hyperdata)
62 import Gargantext.Prelude
63 import Gargantext.Prelude.Config (gc_max_docs_parsers)
64 import qualified Gargantext.Core.Text.Corpus.API as API
65 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
66 import qualified Gargantext.Database.GargDB as GargDB
67 ------------------------------------------------------------------------
68 {-
69 data Query = Query { query_query :: Text
70 , query_node_id :: Int
71 , query_lang :: Lang
72 , query_databases :: [DataOrigin]
73 }
74 deriving (Eq, Generic)
75
76 deriveJSON (unPrefix "query_") 'Query
77
78 instance Arbitrary Query where
79 arbitrary = elements [ Query q n la fs
80 | q <- ["honeybee* AND collapse"
81 ,"covid 19"
82 ]
83 , n <- [0..10]
84 , la <- allLangs
85 , fs <- take 3 $ repeat allDataOrigins
86 ]
87
88 instance ToSchema Query where
89 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
90 -}
91
92 ------------------------------------------------------------------------
93
94 {-
95 type Api = PostApi
96 :<|> GetApi
97
98 type PostApi = Summary "New Corpus endpoint"
99 :> ReqBody '[JSON] Query
100 :> Post '[JSON] CorpusId
101 type GetApi = Get '[JSON] ApiInfo
102 -}
103
104 -- | TODO manage several apis
105 -- TODO-ACCESS
106 -- TODO this is only the POST
107 {-
108 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
109 api uid (Query q _ as) = do
110 cId <- case head as of
111 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
112 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
113 Just a -> do
114 docs <- liftBase $ API.get a q (Just 1000)
115 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
116 pure cId'
117
118 pure cId
119 -}
120
121 ------------------------------------------------
122 -- TODO use this route for Client implementation
123 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
124 deriving (Generic)
125 instance Arbitrary ApiInfo where
126 arbitrary = ApiInfo <$> arbitrary
127
128 deriveJSON (unPrefix "") 'ApiInfo
129
130 instance ToSchema ApiInfo
131
132 info :: FlowCmdM env err m => UserId -> m ApiInfo
133 info _u = do
134 ext <- API.externalAPIs
135
136 pure $ ApiInfo ext
137
138 ------------------------------------------------------------------------
139 ------------------------------------------------------------------------
140 data WithQuery = WithQuery
141 { _wq_query :: !Text
142 , _wq_databases :: !Database
143 , _wq_datafield :: !(Maybe Datafield)
144 , _wq_lang :: !Lang
145 , _wq_node_id :: !Int
146 , _wq_flowListWith :: !FlowSocialListWith
147 }
148 deriving Generic
149
150 makeLenses ''WithQuery
151 instance FromJSON WithQuery where
152 parseJSON = genericParseJSON $ jsonOptions "_wq_"
153 instance ToJSON WithQuery where
154 toJSON = genericToJSON $ jsonOptions "_wq_"
155 instance ToSchema WithQuery where
156 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
157
158 ------------------------------------------------------------------------
159
160 type AddWithQuery = Summary "Add with Query to corpus endpoint"
161 :> "corpus"
162 :> Capture "corpus_id" CorpusId
163 :> "query"
164 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
165
166 {-
167 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
168 :> "corpus"
169 :> Capture "corpus_id" CorpusId
170 :> "add"
171 :> "file"
172 :> MultipartForm Mem (MultipartData Mem)
173 :> QueryParam "fileType" FileType
174 :> "async"
175 :> AsyncJobs JobLog '[JSON] () JobLog
176 -}
177
178
179 ------------------------------------------------------------------------
180 -- TODO WithQuery also has a corpus id
181
182
183 addToCorpusWithQuery :: FlowCmdM env err m
184 => User
185 -> CorpusId
186 -> WithQuery
187 -> Maybe Integer
188 -> (JobLog -> m ())
189 -> m JobLog
190 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
191 , _wq_databases = dbs
192 , _wq_datafield = datafield
193 , _wq_lang = l
194 , _wq_flowListWith = flw }) maybeLimit logStatus = do
195 -- TODO ...
196 logStatus JobLog { _scst_succeeded = Just 0
197 , _scst_failed = Just 0
198 , _scst_remaining = Just 3
199 , _scst_events = Just []
200 }
201 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
202 printDebug "[addToCorpusWithQuery] datafield" datafield
203 printDebug "[addToCorpusWithQuery] flowListWith" flw
204
205 case datafield of
206 Just Web -> do
207 printDebug "[addToCorpusWithQuery] processing web request" datafield
208
209 _ <- triggerSearxSearch user cid q l logStatus
210
211 pure JobLog { _scst_succeeded = Just 3
212 , _scst_failed = Just 0
213 , _scst_remaining = Just 0
214 , _scst_events = Just []
215 }
216
217 _ -> do
218 -- TODO add cid
219 -- TODO if cid is folder -> create Corpus
220 -- if cid is corpus -> add to corpus
221 -- if cid is root -> create corpus in Private
222 printDebug "[G.A.N.C.New] getDataText with query" q
223 databaseOrigin <- database2origin dbs
224 eTxts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [databaseOrigin]
225
226 let lTxts = lefts eTxts
227 printDebug "[G.A.N.C.New] lTxts" lTxts
228 case lTxts of
229 [] -> do
230 let txts = rights eTxts
231 -- TODO Sum lenghts of each txt elements
232 logStatus $ JobLog { _scst_succeeded = Just 2
233 , _scst_failed = Just 0
234 , _scst_remaining = Just $ 1 + length txts
235 , _scst_events = Just []
236 }
237
238 cids <- mapM (\txt -> do
239 flowDataText user txt (Multi l) cid (Just flw) logStatus) txts
240 printDebug "corpus id" cids
241 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
242 sendMail user
243 -- TODO ...
244 pure JobLog { _scst_succeeded = Just 3
245 , _scst_failed = Just 0
246 , _scst_remaining = Just 0
247 , _scst_events = Just []
248 }
249
250 (err:_) -> do
251 printDebug "Error: " err
252 let jl = addEvent "ERROR" (T.pack $ show err) $
253 JobLog { _scst_succeeded = Just 2
254 , _scst_failed = Just 1
255 , _scst_remaining = Just 0
256 , _scst_events = Just []
257 }
258 logStatus jl
259 pure jl
260
261
262 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
263 :> "corpus"
264 :> Capture "corpus_id" CorpusId
265 :> "add"
266 :> "form"
267 :> "async"
268 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
269
270 addToCorpusWithForm :: (FlowCmdM env err m)
271 => User
272 -> CorpusId
273 -> NewWithForm
274 -> (JobLog -> m ())
275 -> JobLog
276 -> m JobLog
277 addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) logStatus jobLog = do
278 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
279 printDebug "[addToCorpusWithForm] fileType" ft
280 printDebug "[addToCorpusWithForm] fileFormat" ff
281 logStatus jobLog
282 limit' <- view $ hasConfig . gc_max_docs_parsers
283 let limit = fromIntegral limit' :: Integer
284 let
285 parseC = case ft of
286 CSV_HAL -> Parser.parseFormatC Parser.CsvHal
287 CSV -> Parser.parseFormatC Parser.CsvGargV3
288 WOS -> Parser.parseFormatC Parser.WOS
289 PresseRIS -> Parser.parseFormatC Parser.RisPresse
290
291 -- TODO granularity of the logStatus
292 let data' = case ff of
293 Plain -> cs d
294 ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
295 Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
296 Right decoded -> decoded
297 eDocsC <- liftBase $ parseC ff data'
298 case eDocsC of
299 Right (mCount, docsC) -> do
300 -- TODO Add progress (jobStatus) update for docs - this is a
301 -- long action
302
303 let docsC' = zipSources (yieldMany [1..]) docsC
304 .| mapMC (\(idx, doc) ->
305 if idx > limit then do
306 --printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
307 let panicMsg' = [ "[addToCorpusWithForm] number of docs "
308 , "exceeds the MAX_DOCS_PARSERS limit ("
309 , show limit
310 , ")" ]
311 let panicMsg = T.concat $ T.pack <$> panicMsg'
312 --logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
313 panic panicMsg
314 else
315 pure doc)
316 .| mapC toHyperdataDocument
317
318 --printDebug "Parsing corpus finished : " cid
319 --logStatus jobLog2
320
321 --printDebug "Starting extraction : " cid
322 -- TODO granularity of the logStatus
323 printDebug "flowCorpus with (corpus_id, lang)" (cid, l)
324
325 _cid' <- flowCorpus user
326 (Right [cid])
327 (Multi $ fromMaybe EN l)
328 (Just sel)
329 --(Just $ fromIntegral $ length docs, docsC')
330 (mCount, transPipe liftBase docsC') -- TODO fix number of docs
331 --(map (map toHyperdataDocument) docs)
332 logStatus
333
334 printDebug "Extraction finished : " cid
335 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
336 -- TODO uncomment this
337 --sendMail user
338
339 logStatus jobLog3
340 pure jobLog3
341 Left e -> do
342 printDebug "[addToCorpusWithForm] parse error" e
343
344 let evt = ScraperEvent { _scev_message = Just $ T.pack e
345 , _scev_level = Just "ERROR"
346 , _scev_date = Nothing }
347
348 logStatus $ over (scst_events . _Just) (\evt' -> evt' <> [evt]) jobLogE
349 pure jobLogE
350 where
351 jobLog2 = jobLogSuccess jobLog
352 jobLog3 = jobLogSuccess jobLog2
353 jobLogE = jobLogFailTotal jobLog
354
355 {-
356 addToCorpusWithFile :: FlowCmdM env err m
357 => CorpusId
358 -> MultipartData Mem
359 -> Maybe FileType
360 -> (JobLog -> m ())
361 -> m JobLog
362 addToCorpusWithFile cid input filetype logStatus = do
363 logStatus JobLog { _scst_succeeded = Just 10
364 , _scst_failed = Just 2
365 , _scst_remaining = Just 138
366 , _scst_events = Just []
367 }
368 printDebug "addToCorpusWithFile" cid
369 _h <- postUpload cid filetype input
370
371 pure JobLog { _scst_succeeded = Just 137
372 , _scst_failed = Just 13
373 , _scst_remaining = Just 0
374 , _scst_events = Just []
375 }
376 -}
377
378
379
380 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
381 :> "corpus"
382 :> Capture "corpus_id" CorpusId
383 :> "add"
384 :> "file"
385 :> "async"
386 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
387
388 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
389 => User
390 -> CorpusId
391 -> NewWithFile
392 -> (JobLog -> m ())
393 -> m JobLog
394 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
395
396 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
397 logStatus JobLog { _scst_succeeded = Just 0
398 , _scst_failed = Just 0
399 , _scst_remaining = Just 1
400 , _scst_events = Just []
401 }
402
403 fPath <- GargDB.writeFile nwf
404 printDebug "[addToCorpusWithFile] File saved as: " fPath
405
406 uId <- getUserId user
407 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
408
409 _ <- case nIds of
410 [nId] -> do
411 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
412 let hl = node ^. node_hyperdata
413 _ <- updateHyperdata nId $ hl { _hff_name = fName
414 , _hff_path = T.pack fPath }
415
416 printDebug "[addToCorpusWithFile] Created node with id: " nId
417 _ -> pure ()
418
419 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
420
421 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
422 sendMail user
423
424 pure $ JobLog { _scst_succeeded = Just 1
425 , _scst_failed = Just 0
426 , _scst_remaining = Just 0
427 , _scst_events = Just []
428 }