2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
18 module Gargantext.API.Node.Corpus.New
22 import Control.Lens hiding (elements, Empty)
24 import Data.Aeson.TH (deriveJSON)
25 import qualified Data.ByteString.Base64 as BSB64
27 import Data.Maybe (fromMaybe)
29 import Data.Text (Text)
30 import qualified Data.Text as T
31 import GHC.Generics (Generic)
33 import Servant.Job.Utils (jsonOptions)
34 -- import Servant.Multipart
35 import qualified Data.Text.Encoding as TE
36 -- import Test.QuickCheck (elements)
37 import Test.QuickCheck.Arbitrary
39 import Gargantext.Prelude
41 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
42 import Gargantext.API.Admin.Types (HasSettings)
43 import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal, jobLogFailTotalWithMessage)
44 import Gargantext.API.Node.Corpus.New.File
45 import Gargantext.API.Node.Corpus.Searx
46 import Gargantext.API.Node.Corpus.Types
47 import Gargantext.API.Node.Types
48 import Gargantext.Core (Lang(..){-, allLangs-})
49 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
50 import qualified Gargantext.Core.Text.Corpus.API as API
51 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
52 import Gargantext.Core.Types.Individu (User(..))
53 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
54 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
55 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
56 import Gargantext.Database.Action.Mail (sendMail)
57 import Gargantext.Database.Action.Node (mkNodeWithParent)
58 import Gargantext.Database.Action.User (getUserId)
59 import Gargantext.Database.Admin.Types.Hyperdata
60 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
61 import Gargantext.Database.Prelude (hasConfig)
62 import Gargantext.Database.Query.Table.Node (getNodeWith)
63 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
64 import Gargantext.Database.Schema.Node (node_hyperdata)
65 import qualified Gargantext.Database.GargDB as GargDB
66 import Gargantext.Prelude.Config (gc_max_docs_parsers)
67 ------------------------------------------------------------------------
69 data Query = Query { query_query :: Text
70 , query_node_id :: Int
72 , query_databases :: [DataOrigin]
74 deriving (Eq, Generic)
76 deriveJSON (unPrefix "query_") 'Query
78 instance Arbitrary Query where
79 arbitrary = elements [ Query q n la fs
80 | q <- ["honeybee* AND collapse"
85 , fs <- take 3 $ repeat allDataOrigins
88 instance ToSchema Query where
89 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
92 ------------------------------------------------------------------------
98 type PostApi = Summary "New Corpus endpoint"
99 :> ReqBody '[JSON] Query
100 :> Post '[JSON] CorpusId
101 type GetApi = Get '[JSON] ApiInfo
104 -- | TODO manage several apis
106 -- TODO this is only the POST
108 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
109 api uid (Query q _ as) = do
110 cId <- case head as of
111 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
112 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
114 docs <- liftBase $ API.get a q (Just 1000)
115 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
121 ------------------------------------------------
122 -- TODO use this route for Client implementation
123 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
125 instance Arbitrary ApiInfo where
126 arbitrary = ApiInfo <$> arbitrary
128 deriveJSON (unPrefix "") 'ApiInfo
130 instance ToSchema ApiInfo
132 info :: FlowCmdM env err m => UserId -> m ApiInfo
133 info _u = pure $ ApiInfo API.externalAPIs
135 ------------------------------------------------------------------------
136 ------------------------------------------------------------------------
137 data WithQuery = WithQuery
139 , _wq_databases :: !Database
140 , _wq_datafield :: !(Maybe Datafield)
142 , _wq_node_id :: !Int
143 , _wq_flowListWith :: !FlowSocialListWith
147 makeLenses ''WithQuery
148 instance FromJSON WithQuery where
149 parseJSON = genericParseJSON $ jsonOptions "_wq_"
150 instance ToJSON WithQuery where
151 toJSON = genericToJSON $ jsonOptions "_wq_"
152 instance ToSchema WithQuery where
153 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
155 ------------------------------------------------------------------------
157 type AddWithQuery = Summary "Add with Query to corpus endpoint"
159 :> Capture "corpus_id" CorpusId
161 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
164 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
166 :> Capture "corpus_id" CorpusId
169 :> MultipartForm Mem (MultipartData Mem)
170 :> QueryParam "fileType" FileType
172 :> AsyncJobs JobLog '[JSON] () JobLog
176 ------------------------------------------------------------------------
177 -- TODO WithQuery also has a corpus id
178 addToCorpusWithQuery :: FlowCmdM env err m
185 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
186 , _wq_databases = dbs
187 , _wq_datafield = datafield
189 , _wq_flowListWith = flw }) maybeLimit logStatus = do
191 logStatus JobLog { _scst_succeeded = Just 0
192 , _scst_failed = Just 0
193 , _scst_remaining = Just 3
194 , _scst_events = Just []
196 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
197 printDebug "[addToCorpusWithQuery] datafield" datafield
198 printDebug "[addToCorpusWithQuery] flowListWith" flw
202 printDebug "[addToCorpusWithQuery] processing web request" datafield
204 _ <- triggerSearxSearch user cid q l logStatus
206 pure JobLog { _scst_succeeded = Just 3
207 , _scst_failed = Just 0
208 , _scst_remaining = Just 0
209 , _scst_events = Just []
214 -- TODO if cid is folder -> create Corpus
215 -- if cid is corpus -> add to corpus
216 -- if cid is root -> create corpus in Private
217 txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
219 logStatus JobLog { _scst_succeeded = Just 2
220 , _scst_failed = Just 0
221 , _scst_remaining = Just $ 1 + length txts
222 , _scst_events = Just []
225 cids <- mapM (\txt -> flowDataText user txt (Multi l) cid Nothing logStatus) txts
226 printDebug "corpus id" cids
227 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
230 pure JobLog { _scst_succeeded = Just 3
231 , _scst_failed = Just 0
232 , _scst_remaining = Just 0
233 , _scst_events = Just []
237 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
239 :> Capture "corpus_id" CorpusId
243 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
245 addToCorpusWithForm :: (FlowCmdM env err m)
252 addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
253 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
254 printDebug "[addToCorpusWithForm] fileType" ft
258 CSV_HAL -> Parser.parseFormat Parser.CsvHal
259 CSV -> Parser.parseFormat Parser.CsvGargV3
260 WOS -> Parser.parseFormat Parser.WOS
261 PresseRIS -> Parser.parseFormat Parser.RisPresse
262 ZIP -> Parser.parseFormat Parser.ZIP
264 -- TODO granularity of the logStatus
265 let data' = case ft of
266 ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
267 Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
268 Right decoded -> decoded
270 eDocs <- liftBase $ parse data'
273 -- TODO Add progress (jobStatus) update for docs - this is a
275 limit' <- view $ hasConfig . gc_max_docs_parsers
276 let limit = fromIntegral limit'
277 if length docs' > limit then do
278 printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show $ length docs')
279 let panicMsg' = [ "[addToCorpusWithForm] number of docs ("
280 , show $ length docs'
281 , ") exceeds the MAX_DOCS_PARSERS limit ("
284 let panicMsg = T.concat $ T.pack <$> panicMsg'
285 logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
289 let docs = splitEvery 500 $ take limit docs'
291 printDebug "Parsing corpus finished : " cid
294 printDebug "Starting extraction : " cid
295 -- TODO granularity of the logStatus
296 _cid' <- flowCorpus user
298 (Multi $ fromMaybe EN l)
300 (map (map toHyperdataDocument) docs)
303 printDebug "Extraction finished : " cid
304 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
310 printDebug "[addToCorpusWithForm] parse error" e
312 let evt = ScraperEvent { _scev_message = Just $ T.pack e
313 , _scev_level = Just "ERROR"
314 , _scev_date = Nothing }
316 logStatus $ over (scst_events . _Just) (\evt' -> evt' <> [evt]) jobLogE
319 jobLog2 = jobLogSuccess jobLog
320 jobLog3 = jobLogSuccess jobLog2
321 jobLogE = jobLogFailTotal jobLog
324 addToCorpusWithFile :: FlowCmdM env err m
330 addToCorpusWithFile cid input filetype logStatus = do
331 logStatus JobLog { _scst_succeeded = Just 10
332 , _scst_failed = Just 2
333 , _scst_remaining = Just 138
334 , _scst_events = Just []
336 printDebug "addToCorpusWithFile" cid
337 _h <- postUpload cid filetype input
339 pure JobLog { _scst_succeeded = Just 137
340 , _scst_failed = Just 13
341 , _scst_remaining = Just 0
342 , _scst_events = Just []
348 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
350 :> Capture "corpus_id" CorpusId
354 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
356 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
362 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
364 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
365 logStatus JobLog { _scst_succeeded = Just 0
366 , _scst_failed = Just 0
367 , _scst_remaining = Just 1
368 , _scst_events = Just []
371 fPath <- GargDB.writeFile nwf
372 printDebug "[addToCorpusWithFile] File saved as: " fPath
374 uId <- getUserId user
375 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
379 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
380 let hl = node ^. node_hyperdata
381 _ <- updateHyperdata nId $ hl { _hff_name = fName
382 , _hff_path = T.pack fPath }
384 printDebug "[addToCorpusWithFile] Created node with id: " nId
387 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
389 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
392 pure $ JobLog { _scst_succeeded = Just 1
393 , _scst_failed = Just 0
394 , _scst_remaining = Just 0
395 , _scst_events = Just []