2 Module : Gargantext.API.Node.Corpus.New
3 Description : New corpus API
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 New corpus means either:
12 - new data in existing corpus
15 {-# LANGUAGE TemplateHaskell #-}
16 {-# LANGUAGE TypeOperators #-}
18 module Gargantext.API.Node.Corpus.New
22 import Control.Lens hiding (elements, Empty)
24 import Data.Aeson.TH (deriveJSON)
25 import qualified Data.ByteString.Base64 as BSB64
26 import Data.Conduit.Internal (zipSources)
28 import Data.Maybe (fromMaybe)
30 import Data.Text (Text)
31 import qualified Data.Text as T
32 import GHC.Generics (Generic)
34 import Servant.Job.Utils (jsonOptions)
35 -- import Servant.Multipart
36 import qualified Data.Text.Encoding as TE
37 -- import Test.QuickCheck (elements)
38 import Test.QuickCheck.Arbitrary
40 import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
41 import Gargantext.API.Admin.Types (HasSettings)
42 import Gargantext.API.Job (addEvent, jobLogSuccess, jobLogFailTotal)
43 import Gargantext.API.Node.Corpus.New.Types
44 import Gargantext.API.Node.Corpus.Searx
45 import Gargantext.API.Node.Corpus.Types
46 import Gargantext.API.Node.Types
47 import Gargantext.Core (Lang(..){-, allLangs-})
48 import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
49 import Gargantext.Core.Types.Individu (User(..))
50 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
51 import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
52 import Gargantext.Database.Action.Flow.Types (FlowCmdM)
53 import Gargantext.Database.Action.Mail (sendMail)
54 import Gargantext.Database.Action.Node (mkNodeWithParent)
55 import Gargantext.Database.Action.User (getUserId)
56 import Gargantext.Database.Admin.Types.Hyperdata
57 import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
58 import Gargantext.Database.Prelude (hasConfig)
59 import Gargantext.Database.Query.Table.Node (getNodeWith)
60 import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
61 import Gargantext.Database.Schema.Node (node_hyperdata)
62 import Gargantext.Prelude
63 import Gargantext.Prelude.Config (gc_max_docs_parsers)
64 import qualified Gargantext.Core.Text.Corpus.API as API
65 import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
66 import qualified Gargantext.Database.GargDB as GargDB
67 ------------------------------------------------------------------------
69 data Query = Query { query_query :: Text
70 , query_node_id :: Int
72 , query_databases :: [DataOrigin]
74 deriving (Eq, Generic)
76 deriveJSON (unPrefix "query_") 'Query
78 instance Arbitrary Query where
79 arbitrary = elements [ Query q n la fs
80 | q <- ["honeybee* AND collapse"
85 , fs <- take 3 $ repeat allDataOrigins
88 instance ToSchema Query where
89 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
92 ------------------------------------------------------------------------
98 type PostApi = Summary "New Corpus endpoint"
99 :> ReqBody '[JSON] Query
100 :> Post '[JSON] CorpusId
101 type GetApi = Get '[JSON] ApiInfo
104 -- | TODO manage several apis
106 -- TODO this is only the POST
108 api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
109 api uid (Query q _ as) = do
110 cId <- case head as of
111 Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q
112 Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
114 docs <- liftBase $ API.get a q (Just 1000)
115 cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
121 ------------------------------------------------
122 -- TODO use this route for Client implementation
123 data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]}
125 instance Arbitrary ApiInfo where
126 arbitrary = ApiInfo <$> arbitrary
128 deriveJSON (unPrefix "") 'ApiInfo
130 instance ToSchema ApiInfo
132 info :: FlowCmdM env err m => UserId -> m ApiInfo
134 ext <- API.externalAPIs
138 ------------------------------------------------------------------------
139 ------------------------------------------------------------------------
140 data WithQuery = WithQuery
142 , _wq_databases :: !Database
143 , _wq_datafield :: !(Maybe Datafield)
145 , _wq_node_id :: !Int
146 , _wq_flowListWith :: !FlowSocialListWith
150 makeLenses ''WithQuery
151 instance FromJSON WithQuery where
152 parseJSON = genericParseJSON $ jsonOptions "_wq_"
153 instance ToJSON WithQuery where
154 toJSON = genericToJSON $ jsonOptions "_wq_"
155 instance ToSchema WithQuery where
156 declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
158 ------------------------------------------------------------------------
160 type AddWithQuery = Summary "Add with Query to corpus endpoint"
162 :> Capture "corpus_id" CorpusId
164 :> AsyncJobs JobLog '[JSON] WithQuery JobLog
167 type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
169 :> Capture "corpus_id" CorpusId
172 :> MultipartForm Mem (MultipartData Mem)
173 :> QueryParam "fileType" FileType
175 :> AsyncJobs JobLog '[JSON] () JobLog
179 ------------------------------------------------------------------------
180 -- TODO WithQuery also has a corpus id
183 addToCorpusWithQuery :: FlowCmdM env err m
190 addToCorpusWithQuery user cid (WithQuery { _wq_query = q
191 , _wq_databases = dbs
192 , _wq_datafield = datafield
194 , _wq_flowListWith = flw }) maybeLimit logStatus = do
196 logStatus JobLog { _scst_succeeded = Just 0
197 , _scst_failed = Just 0
198 , _scst_remaining = Just 3
199 , _scst_events = Just []
201 printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
202 printDebug "[addToCorpusWithQuery] datafield" datafield
203 printDebug "[addToCorpusWithQuery] flowListWith" flw
207 printDebug "[addToCorpusWithQuery] processing web request" datafield
209 _ <- triggerSearxSearch user cid q l logStatus
211 pure JobLog { _scst_succeeded = Just 3
212 , _scst_failed = Just 0
213 , _scst_remaining = Just 0
214 , _scst_events = Just []
219 -- TODO if cid is folder -> create Corpus
220 -- if cid is corpus -> add to corpus
221 -- if cid is root -> create corpus in Private
222 printDebug "[G.A.N.C.New] getDataText with query" q
223 databaseOrigin <- database2origin dbs
224 eTxts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [databaseOrigin]
226 let lTxts = lefts eTxts
227 printDebug "[G.A.N.C.New] lTxts" lTxts
230 let txts = rights eTxts
231 -- TODO Sum lenghts of each txt elements
232 logStatus $ JobLog { _scst_succeeded = Just 2
233 , _scst_failed = Just 0
234 , _scst_remaining = Just $ 1 + length txts
235 , _scst_events = Just []
238 cids <- mapM (\txt -> do
239 flowDataText user txt (Multi l) cid Nothing logStatus) txts
240 printDebug "corpus id" cids
241 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
244 pure JobLog { _scst_succeeded = Just 3
245 , _scst_failed = Just 0
246 , _scst_remaining = Just 0
247 , _scst_events = Just []
251 printDebug "Error: " err
252 let jl = addEvent "ERROR" (T.pack $ show err) $
253 JobLog { _scst_succeeded = Just 2
254 , _scst_failed = Just 1
255 , _scst_remaining = Just 0
256 , _scst_events = Just []
262 type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
264 :> Capture "corpus_id" CorpusId
268 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
270 addToCorpusWithForm :: (FlowCmdM env err m)
277 addToCorpusWithForm user cid (NewWithForm ft ff d l _n) logStatus jobLog = do
278 printDebug "[addToCorpusWithForm] Parsing corpus: " cid
279 printDebug "[addToCorpusWithForm] fileType" ft
280 printDebug "[addToCorpusWithForm] fileFormat" ff
282 limit' <- view $ hasConfig . gc_max_docs_parsers
283 let limit = fromIntegral limit' :: Integer
286 CSV_HAL -> Parser.parseFormatC Parser.CsvHal
287 CSV -> Parser.parseFormatC Parser.CsvGargV3
288 WOS -> Parser.parseFormatC Parser.WOS
289 PresseRIS -> Parser.parseFormatC Parser.RisPresse
291 -- TODO granularity of the logStatus
292 let data' = case ff of
294 ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
295 Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
296 Right decoded -> decoded
297 eDocsC <- liftBase $ parseC ff data'
299 Right (mCount, docsC) -> do
300 -- TODO Add progress (jobStatus) update for docs - this is a
303 let docsC' = zipSources (yieldMany [1..]) docsC
304 .| mapMC (\(idx, doc) ->
305 if idx > limit then do
306 --printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
307 let panicMsg' = [ "[addToCorpusWithForm] number of docs "
308 , "exceeds the MAX_DOCS_PARSERS limit ("
311 let panicMsg = T.concat $ T.pack <$> panicMsg'
312 --logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
316 .| mapC toHyperdataDocument
318 --printDebug "Parsing corpus finished : " cid
321 --printDebug "Starting extraction : " cid
322 -- TODO granularity of the logStatus
323 printDebug "flowCorpus with (corpus_id, lang)" (cid, l)
325 _cid' <- flowCorpus user
327 (Multi $ fromMaybe EN l)
329 --(Just $ fromIntegral $ length docs, docsC')
330 (mCount, transPipe liftBase docsC') -- TODO fix number of docs
331 --(map (map toHyperdataDocument) docs)
334 printDebug "Extraction finished : " cid
335 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
336 -- TODO uncomment this
342 printDebug "[addToCorpusWithForm] parse error" e
344 let evt = ScraperEvent { _scev_message = Just $ T.pack e
345 , _scev_level = Just "ERROR"
346 , _scev_date = Nothing }
348 logStatus $ over (scst_events . _Just) (\evt' -> evt' <> [evt]) jobLogE
351 jobLog2 = jobLogSuccess jobLog
352 jobLog3 = jobLogSuccess jobLog2
353 jobLogE = jobLogFailTotal jobLog
356 addToCorpusWithFile :: FlowCmdM env err m
362 addToCorpusWithFile cid input filetype logStatus = do
363 logStatus JobLog { _scst_succeeded = Just 10
364 , _scst_failed = Just 2
365 , _scst_remaining = Just 138
366 , _scst_events = Just []
368 printDebug "addToCorpusWithFile" cid
369 _h <- postUpload cid filetype input
371 pure JobLog { _scst_succeeded = Just 137
372 , _scst_failed = Just 13
373 , _scst_remaining = Just 0
374 , _scst_events = Just []
380 type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
382 :> Capture "corpus_id" CorpusId
386 :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog
388 addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m)
394 addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do
396 printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
397 logStatus JobLog { _scst_succeeded = Just 0
398 , _scst_failed = Just 0
399 , _scst_remaining = Just 1
400 , _scst_events = Just []
403 fPath <- GargDB.writeFile nwf
404 printDebug "[addToCorpusWithFile] File saved as: " fPath
406 uId <- getUserId user
407 nIds <- mkNodeWithParent NodeFile (Just cid) uId fName
411 node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
412 let hl = node ^. node_hyperdata
413 _ <- updateHyperdata nId $ hl { _hff_name = fName
414 , _hff_path = T.pack fPath }
416 printDebug "[addToCorpusWithFile] Created node with id: " nId
419 printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
421 printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
424 pure $ JobLog { _scst_succeeded = Just 1
425 , _scst_failed = Just 0
426 , _scst_remaining = Just 0
427 , _scst_events = Just []