{-|
Module      : Gargantext.API.Node.Corpus.New
Description : New corpus API
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental

New corpus means either:
- new corpus
- new data in existing corpus
-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators   #-}

module Gargantext.API.Node.Corpus.New
      where
import Conduit
import Control.Lens hiding (elements, Empty)
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import qualified Data.ByteString.Base64 as BSB64
import Data.Either
import Data.Maybe (fromMaybe)
import Data.Swagger
import Data.Text (Text)
import qualified Data.Text as T
import GHC.Generics (Generic)
import Servant
import Servant.Job.Utils (jsonOptions)
-- import Servant.Multipart
import qualified Data.Text.Encoding as TE
import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary

import Gargantext.Prelude

import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Job (addEvent, jobLogSuccess, jobLogFailTotal, jobLogFailTotalWithMessage)
import Gargantext.API.Node.Corpus.New.File
import Gargantext.API.Node.Corpus.Searx
import Gargantext.API.Node.Corpus.Types
import Gargantext.API.Node.Types
import Gargantext.Core (Lang(..){-, allLangs-})
import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Action.Mail (sendMail)
import Gargantext.Database.Action.Node (mkNodeWithParent)
import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
import Gargantext.Database.Prelude (hasConfig)
import Gargantext.Database.Query.Table.Node (getNodeWith)
import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
import Gargantext.Database.Schema.Node (node_hyperdata)
import qualified Gargantext.Database.GargDB as GargDB
import Gargantext.Prelude.Config (gc_max_docs_parsers)
------------------------------------------------------------------------
data Query = Query { query_query     :: Text
                   , query_node_id   :: Int
                   , query_lang      :: Lang
                   , query_databases :: [DataOrigin]
                   }
  deriving (Eq, Generic)

deriveJSON (unPrefix "query_") 'Query
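-- A rough sketch of the JSON this produces: 'unPrefix "query_"' drops the
-- record prefix from the field names, so a 'Query' value should encode along
-- these lines (field values are only illustrative):
--
--   { "query":     "honeybee* AND collapse"
--   , "node_id":   42
--   , "lang":      "EN"
--   , "databases": [...]
--   }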
instance Arbitrary Query where
  arbitrary = elements [ Query q n la fs
                       | q <- ["honeybee* AND collapse"]
                       -- the n and la ranges here are only illustrative
                       , n <- [0..10]
                       , la <- [EN]
                       , fs <- take 3 $ repeat allDataOrigins
                       ]

instance ToSchema Query where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
------------------------------------------------------------------------
type PostApi = Summary "New Corpus endpoint"
             :> ReqBody '[JSON] Query
             :> Post '[JSON] CorpusId
type GetApi = Get '[JSON] ApiInfo
-- | TODO manage several apis
-- TODO this is only the POST
{- Kept commented out: 'flowCorpusSearchInDatabase' is not imported here and
   the 'Query' pattern below has fewer fields than the record above.
api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
api uid (Query q _ as) = do
  cId <- case head as of
    Nothing      -> flowCorpusSearchInDatabase (UserDBId uid) EN q
    Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
    Just a       -> do
      docs <- liftBase $ API.get a q (Just 1000)
      cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
      pure cId'

  pure cId
-}
------------------------------------------------
-- TODO use this route for Client implementation
data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
  deriving (Generic)

instance Arbitrary ApiInfo where
  arbitrary = ApiInfo <$> arbitrary

deriveJSON (unPrefix "") 'ApiInfo
instance ToSchema ApiInfo

info :: FlowCmdM env err m => UserId -> m ApiInfo
info _u = pure $ ApiInfo API.externalAPIs
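-- The handler above simply reports which external sources are available.
-- An illustrative response (the actual list comes from 'API.externalAPIs',
-- so the values below are an assumption):
--
--   { "api_info": ["PubMed", "HAL", ...] }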
------------------------------------------------------------------------
------------------------------------------------------------------------
data WithQuery = WithQuery
  { _wq_query        :: !Text
  , _wq_databases    :: !Database
  , _wq_datafield    :: !(Maybe Datafield)
  , _wq_lang         :: !Lang
  , _wq_node_id      :: !Int
  , _wq_flowListWith :: !FlowSocialListWith
  }
  deriving Generic

makeLenses ''WithQuery

instance FromJSON WithQuery where
  parseJSON = genericParseJSON $ jsonOptions "_wq_"
instance ToJSON WithQuery where
  toJSON = genericToJSON $ jsonOptions "_wq_"
instance ToSchema WithQuery where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
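-- 'jsonOptions "_wq_"' strips the field prefix, so the request body for the
-- query-based import looks roughly like this (values are only an example;
-- the encodings of Database, Datafield and FlowSocialListWith are left out):
--
--   { "query":        "covid"
--   , "databases":    ...
--   , "datafield":    null
--   , "lang":         "EN"
--   , "node_id":      42
--   , "flowListWith": ...
--   }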
------------------------------------------------------------------------
type AddWithQuery = Summary "Add with Query to corpus endpoint"
                  :> Capture "corpus_id" CorpusId
                  :> AsyncJobs JobLog '[JSON] WithQuery JobLog
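-- Presumably the usual servant-job workflow applies to this endpoint: the
-- client POSTs a WithQuery body to start an asynchronous job, then polls it
-- and watches the JobLog counters (_scst_remaining, _scst_succeeded,
-- _scst_failed) until the job finishes.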
{- Multipart variant, kept commented out: 'Servant.Multipart' is not imported
   (see the commented import above) and the 'NewWithFile' route below is used
   instead.
type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
                 :> Capture "corpus_id" CorpusId
                 :> MultipartForm Mem (MultipartData Mem)
                 :> QueryParam "fileType" FileType
                 :> AsyncJobs JobLog '[JSON] () JobLog
-}
------------------------------------------------------------------------
-- TODO WithQuery also has a corpus id
addToCorpusWithQuery :: FlowCmdM env err m
                     => User
                     -> CorpusId
                     -> WithQuery
                     -> Maybe Integer
                     -> (JobLog -> m ())
                     -> m JobLog
addToCorpusWithQuery user cid (WithQuery { _wq_query = q
                                         , _wq_databases = dbs
                                         , _wq_datafield = datafield
                                         , _wq_lang = l
                                         , _wq_flowListWith = flw }) maybeLimit logStatus = do
  logStatus JobLog { _scst_succeeded = Just 0
                   , _scst_failed    = Just 0
                   , _scst_remaining = Just 3
                   , _scst_events    = Just []
                   }
  printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
  printDebug "[addToCorpusWithQuery] datafield" datafield
  printDebug "[addToCorpusWithQuery] flowListWith" flw

  case datafield of
    Just Web -> do
      printDebug "[addToCorpusWithQuery] processing web request" datafield

      _ <- triggerSearxSearch user cid q l logStatus

      pure JobLog { _scst_succeeded = Just 3
                  , _scst_failed    = Just 0
                  , _scst_remaining = Just 0
                  , _scst_events    = Just []
                  }

    _ -> do
      -- TODO if cid is folder -> create Corpus
      --      if cid is corpus -> add to corpus
      --      if cid is root   -> create corpus in Private
      eTxts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]

      let lTxts = lefts eTxts
      case lTxts of
        [] -> do
          let txts = rights eTxts
          -- TODO Sum lengths of each txt element
          logStatus $ JobLog { _scst_succeeded = Just 2
                             , _scst_failed    = Just 0
                             , _scst_remaining = Just $ 1 + length txts
                             , _scst_events    = Just []
                             }

          cids <- mapM (\txt -> do
                          flowDataText user txt (Multi l) cid Nothing logStatus) txts
          printDebug "corpus id" cids
          printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
          sendMail user

          pure JobLog { _scst_succeeded = Just 3
                      , _scst_failed    = Just 0
                      , _scst_remaining = Just 0
                      , _scst_events    = Just []
                      }

        (err:_) -> do
          pure $ addEvent "ERROR" (T.pack $ show err) $
                   JobLog { _scst_succeeded = Just 2
                          , _scst_failed    = Just 1
                          , _scst_remaining = Just 0
                          , _scst_events    = Just []
                          }
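-- Note on the JobLog accounting above: the handler starts with
-- remaining = 3, reports two steps done (plus one remaining slot per fetched
-- text) once the external data has been retrieved, and ends with
-- succeeded = 3 / remaining = 0 on success, or failed = 1 plus an "ERROR"
-- event when a fetch returned a Left.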
type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
                 :> Capture "corpus_id" CorpusId
                 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
addToCorpusWithForm :: (FlowCmdM env err m)
                    => User
                    -> CorpusId
                    -> NewWithForm
                    -> (JobLog -> m ())
                    -> JobLog
                    -> m JobLog
addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
  printDebug "[addToCorpusWithForm] Parsing corpus: " cid
  printDebug "[addToCorpusWithForm] fileType" ft
  let
    parse = case ft of
      CSV_HAL   -> Parser.parseFormat Parser.CsvHal
      CSV       -> Parser.parseFormat Parser.CsvGargV3
      WOS       -> Parser.parseFormat Parser.WOS
      PresseRIS -> Parser.parseFormat Parser.RisPresse
      ZIP       -> Parser.parseFormat Parser.ZIP

  -- TODO granularity of the logStatus
  let data' = case ft of
        ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
          Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
          Right decoded -> decoded
        _   -> TE.encodeUtf8 d  -- other formats are sent as plain text
  eDocs <- liftBase $ parse data'
  case eDocs of
    Right docs -> do
      -- TODO Add progress (jobStatus) update for docs - this is a
      -- long action
      limit' <- view $ hasConfig . gc_max_docs_parsers
      let limit = fromIntegral limit'
      if length docs > limit then do
        printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show $ length docs)
        let panicMsg' = [ "[addToCorpusWithForm] number of docs ("
                        , show $ length docs
                        , ") exceeds the MAX_DOCS_PARSERS limit ("
                        , show limit
                        , ")" ]
        let panicMsg = T.concat $ T.pack <$> panicMsg'
        logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
        panic panicMsg
      else
        pure ()

      printDebug "Parsing corpus finished : " cid
      logStatus jobLog2

      printDebug "Starting extraction : " cid
      -- TODO granularity of the logStatus
      _cid' <- flowCorpus user
                          (Right [cid])
                          (Multi $ fromMaybe EN l)
                          (Just $ fromIntegral $ length docs, yieldMany docs .| mapC toHyperdataDocument)
                          --(map (map toHyperdataDocument) docs)

      printDebug "Extraction finished : " cid
      printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
      sendMail user

      pure jobLog3

    Left e -> do
      printDebug "[addToCorpusWithForm] parse error" e

      let evt = ScraperEvent { _scev_message = Just $ T.pack e
                             , _scev_level   = Just "ERROR"
                             , _scev_date    = Nothing }

      logStatus $ over (scst_events . _Just) (\evt' -> evt' <> [evt]) jobLogE
      pure jobLogE

  where
    jobLog2 = jobLogSuccess jobLog
    jobLog3 = jobLogSuccess jobLog2
    jobLogE = jobLogFailTotal jobLog
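-- For ZIP uploads the form field is expected to carry the archive as a
-- base64 string, which is what the 'BSB64.decode . TE.encodeUtf8' step above
-- undoes; for instance:
--
-- >>> BSB64.decode (TE.encodeUtf8 "aGVsbG8=")
-- Right "hello"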
{- Multipart variant of addToCorpusWithFile, likewise kept commented out while
   'Servant.Multipart' is not imported; the 'NewWithFile'-based handler below
   matches the AddWithFile route type that follows.
addToCorpusWithFile :: FlowCmdM env err m
                    => CorpusId
                    -> MultipartData Mem
                    -> Maybe FileType
                    -> (JobLog -> m ())
                    -> m JobLog
addToCorpusWithFile cid input filetype logStatus = do
  logStatus JobLog { _scst_succeeded = Just 10
                   , _scst_failed    = Just 2
                   , _scst_remaining = Just 138
                   , _scst_events    = Just []
                   }
  printDebug "addToCorpusWithFile" cid
  _h <- postUpload cid filetype input

  pure JobLog { _scst_succeeded = Just 137
              , _scst_failed    = Just 13
              , _scst_remaining = Just 0
              , _scst_events    = Just []
              }
-}
type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
                 :> Capture "corpus_id" CorpusId
                 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
                    => User
                    -> CorpusId
                    -> NewWithFile
                    -> (JobLog -> m ())
                    -> m JobLog
addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do

  printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
  logStatus JobLog { _scst_succeeded = Just 0
                   , _scst_failed    = Just 0
                   , _scst_remaining = Just 1
                   , _scst_events    = Just []
                   }

  fPath <- GargDB.writeFile nwf
  printDebug "[addToCorpusWithFile] File saved as: " fPath

  uId <- getUserId user
  nIds <- mkNodeWithParent NodeFile (Just cid) uId fName

  case nIds of
    [nId] -> do
      node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
      let hl = node ^. node_hyperdata
      _ <- updateHyperdata nId $ hl { _hff_name = fName
                                    , _hff_path = T.pack fPath }
      printDebug "[addToCorpusWithFile] Created node with id: " nId
    _ -> pure ()

  printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid

  printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
  sendMail user

  pure $ JobLog { _scst_succeeded = Just 1
                , _scst_failed    = Just 0
                , _scst_remaining = Just 0
                , _scst_events    = Just []
                }