{-| Module : Gargantext.API.Node.Corpus.New Description : New corpus API Copyright : (c) CNRS, 2017-Present License : AGPL + CECILL v3 Maintainer : team@gargantext.org Stability : experimental Portability : POSIX New corpus means either: - new corpus - new data in existing corpus -} {-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE TypeOperators #-} module Gargantext.API.Node.Corpus.New where import Conduit import Control.Lens hiding (elements, Empty) import Data.Aeson import Data.Aeson.TH (deriveJSON) import qualified Data.ByteString.Base64 as BSB64 import Data.Either import Data.Maybe (fromMaybe) import Data.Swagger import Data.Text (Text) import qualified Data.Text as T import GHC.Generics (Generic) import Servant import Servant.Job.Utils (jsonOptions) -- import Servant.Multipart import qualified Data.Text.Encoding as TE -- import Test.QuickCheck (elements) import Test.QuickCheck.Arbitrary import Gargantext.Prelude import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events) import Gargantext.API.Admin.Types (HasSettings) import Gargantext.API.Job (addEvent, jobLogSuccess, jobLogFailTotal, jobLogFailTotalWithMessage) import Gargantext.API.Node.Corpus.New.File import Gargantext.API.Node.Corpus.Searx import Gargantext.API.Node.Corpus.Types import Gargantext.API.Node.Types import Gargantext.Core (Lang(..){-, allLangs-}) import Gargantext.Core.Text.List.Social (FlowSocialListWith(..)) import qualified Gargantext.Core.Text.Corpus.API as API import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat) import Gargantext.Core.Types.Individu (User(..)) import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger) import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, DataText(..), TermType(..){-, allDataOrigins-}) import Gargantext.Database.Action.Flow.Types (FlowCmdM) import Gargantext.Database.Action.Mail (sendMail) import Gargantext.Database.Action.Node (mkNodeWithParent) import Gargantext.Database.Action.User (getUserId) import Gargantext.Database.Admin.Types.Hyperdata import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId) import Gargantext.Database.Prelude (hasConfig) import Gargantext.Database.Query.Table.Node (getNodeWith) import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata) import Gargantext.Database.Schema.Node (node_hyperdata) import qualified Gargantext.Database.GargDB as GargDB import Gargantext.Prelude.Config (gc_max_docs_parsers) ------------------------------------------------------------------------ {- data Query = Query { query_query :: Text , query_node_id :: Int , query_lang :: Lang , query_databases :: [DataOrigin] } deriving (Eq, Generic) deriveJSON (unPrefix "query_") 'Query instance Arbitrary Query where arbitrary = elements [ Query q n la fs | q <- ["honeybee* AND collapse" ,"covid 19" ] , n <- [0..10] , la <- allLangs , fs <- take 3 $ repeat allDataOrigins ] instance ToSchema Query where declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_") -} ------------------------------------------------------------------------ {- type Api = PostApi :<|> GetApi type PostApi = Summary "New Corpus endpoint" :> ReqBody '[JSON] Query :> Post '[JSON] CorpusId type GetApi = Get '[JSON] ApiInfo -} -- | TODO manage several apis -- TODO-ACCESS -- TODO this is only the POST {- api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId api uid (Query q _ as) = do cId <- case head as of Nothing -> flowCorpusSearchInDatabase (UserDBId uid) EN q Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q Just a -> do docs <- liftBase $ API.get a q (Just 1000) cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs] pure cId' pure cId -} ------------------------------------------------ -- TODO use this route for Client implementation data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs]} deriving (Generic) instance Arbitrary ApiInfo where arbitrary = ApiInfo <$> arbitrary deriveJSON (unPrefix "") 'ApiInfo instance ToSchema ApiInfo info :: FlowCmdM env err m => UserId -> m ApiInfo info _u = pure $ ApiInfo API.externalAPIs ------------------------------------------------------------------------ ------------------------------------------------------------------------ data WithQuery = WithQuery { _wq_query :: !Text , _wq_databases :: !Database , _wq_datafield :: !(Maybe Datafield) , _wq_lang :: !Lang , _wq_node_id :: !Int , _wq_flowListWith :: !FlowSocialListWith } deriving Generic makeLenses ''WithQuery instance FromJSON WithQuery where parseJSON = genericParseJSON $ jsonOptions "_wq_" instance ToJSON WithQuery where toJSON = genericToJSON $ jsonOptions "_wq_" instance ToSchema WithQuery where declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_") ------------------------------------------------------------------------ type AddWithQuery = Summary "Add with Query to corpus endpoint" :> "corpus" :> Capture "corpus_id" CorpusId :> "query" :> AsyncJobs JobLog '[JSON] WithQuery JobLog {- type AddWithFile = Summary "Add with MultipartData to corpus endpoint" :> "corpus" :> Capture "corpus_id" CorpusId :> "add" :> "file" :> MultipartForm Mem (MultipartData Mem) :> QueryParam "fileType" FileType :> "async" :> AsyncJobs JobLog '[JSON] () JobLog -} ------------------------------------------------------------------------ -- TODO WithQuery also has a corpus id addToCorpusWithQuery :: FlowCmdM env err m => User -> CorpusId -> WithQuery -> Maybe Integer -> (JobLog -> m ()) -> m JobLog addToCorpusWithQuery user cid (WithQuery { _wq_query = q , _wq_databases = dbs , _wq_datafield = datafield , _wq_lang = l , _wq_flowListWith = flw }) maybeLimit logStatus = do -- TODO ... logStatus JobLog { _scst_succeeded = Just 0 , _scst_failed = Just 0 , _scst_remaining = Just 3 , _scst_events = Just [] } printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs) printDebug "[addToCorpusWithQuery] datafield" datafield printDebug "[addToCorpusWithQuery] flowListWith" flw case datafield of Just Web -> do printDebug "[addToCorpusWithQuery] processing web request" datafield _ <- triggerSearxSearch user cid q l logStatus pure JobLog { _scst_succeeded = Just 3 , _scst_failed = Just 0 , _scst_remaining = Just 0 , _scst_events = Just [] } _ -> do -- TODO add cid -- TODO if cid is folder -> create Corpus -- if cid is corpus -> add to corpus -- if cid is root -> create corpus in Private eTxts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs] let lTxts = lefts eTxts case lTxts of [] -> do let txts = rights eTxts -- TODO Sum lenghts of each txt elements let jl = JobLog { _scst_succeeded = Just 2 , _scst_failed = Just 0 , _scst_remaining = Just $ 1 + length txts , _scst_events = Just [] } logStatus jl cids <- mapM (\txt -> do let id = case txt of (DataNew (i, _)) -> i _ -> (Just 0) logStatus $ addEvent "INFO: doc id" (T.pack $ show id) jl flowDataText user txt (Multi l) cid Nothing logStatus) txts printDebug "corpus id" cids printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text) sendMail user -- TODO ... pure JobLog { _scst_succeeded = Just 3 , _scst_failed = Just 0 , _scst_remaining = Just 0 , _scst_events = Just [] } (err:_) -> do pure $ addEvent "ERROR" (T.pack $ show err) $ JobLog { _scst_succeeded = Just 2 , _scst_failed = Just 1 , _scst_remaining = Just 0 , _scst_events = Just [] } type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint" :> "corpus" :> Capture "corpus_id" CorpusId :> "add" :> "form" :> "async" :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog addToCorpusWithForm :: (FlowCmdM env err m) => User -> CorpusId -> NewWithForm -> (JobLog -> m ()) -> JobLog -> m JobLog addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do printDebug "[addToCorpusWithForm] Parsing corpus: " cid printDebug "[addToCorpusWithForm] fileType" ft logStatus jobLog let parse = case ft of CSV_HAL -> Parser.parseFormat Parser.CsvHal CSV -> Parser.parseFormat Parser.CsvGargV3 WOS -> Parser.parseFormat Parser.WOS PresseRIS -> Parser.parseFormat Parser.RisPresse ZIP -> Parser.parseFormat Parser.ZIP -- TODO granularity of the logStatus let data' = case ft of ZIP -> case BSB64.decode $ TE.encodeUtf8 d of Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err Right decoded -> decoded _ -> cs d eDocs <- liftBase $ parse data' case eDocs of Right docs -> do -- TODO Add progress (jobStatus) update for docs - this is a -- long action limit' <- view $ hasConfig . gc_max_docs_parsers let limit = fromIntegral limit' if length docs > limit then do printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show $ length docs) let panicMsg' = [ "[addToCorpusWithForm] number of docs (" , show $ length docs , ") exceeds the MAX_DOCS_PARSERS limit (" , show limit , ")" ] let panicMsg = T.concat $ T.pack <$> panicMsg' logStatus $ jobLogFailTotalWithMessage panicMsg jobLog panic panicMsg else pure () printDebug "Parsing corpus finished : " cid logStatus jobLog2 printDebug "Starting extraction : " cid -- TODO granularity of the logStatus _cid' <- flowCorpus user (Right [cid]) (Multi $ fromMaybe EN l) Nothing (Just $ fromIntegral $ length docs, yieldMany docs .| mapC toHyperdataDocument) --(map (map toHyperdataDocument) docs) logStatus printDebug "Extraction finished : " cid printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text) sendMail user logStatus jobLog3 pure $ jobLog3 Left e -> do printDebug "[addToCorpusWithForm] parse error" e let evt = ScraperEvent { _scev_message = Just $ T.pack e , _scev_level = Just "ERROR" , _scev_date = Nothing } logStatus $ over (scst_events . _Just) (\evt' -> evt' <> [evt]) jobLogE pure jobLogE where jobLog2 = jobLogSuccess jobLog jobLog3 = jobLogSuccess jobLog2 jobLogE = jobLogFailTotal jobLog {- addToCorpusWithFile :: FlowCmdM env err m => CorpusId -> MultipartData Mem -> Maybe FileType -> (JobLog -> m ()) -> m JobLog addToCorpusWithFile cid input filetype logStatus = do logStatus JobLog { _scst_succeeded = Just 10 , _scst_failed = Just 2 , _scst_remaining = Just 138 , _scst_events = Just [] } printDebug "addToCorpusWithFile" cid _h <- postUpload cid filetype input pure JobLog { _scst_succeeded = Just 137 , _scst_failed = Just 13 , _scst_remaining = Just 0 , _scst_events = Just [] } -} type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint" :> "corpus" :> Capture "corpus_id" CorpusId :> "add" :> "file" :> "async" :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m) => User -> CorpusId -> NewWithFile -> (JobLog -> m ()) -> m JobLog addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) logStatus = do printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid logStatus JobLog { _scst_succeeded = Just 0 , _scst_failed = Just 0 , _scst_remaining = Just 1 , _scst_events = Just [] } fPath <- GargDB.writeFile nwf printDebug "[addToCorpusWithFile] File saved as: " fPath uId <- getUserId user nIds <- mkNodeWithParent NodeFile (Just cid) uId fName _ <- case nIds of [nId] -> do node <- getNodeWith nId (Proxy :: Proxy HyperdataFile) let hl = node ^. node_hyperdata _ <- updateHyperdata nId $ hl { _hff_name = fName , _hff_path = T.pack fPath } printDebug "[addToCorpusWithFile] Created node with id: " nId _ -> pure () printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text) sendMail user pure $ JobLog { _scst_succeeded = Just 1 , _scst_failed = Just 0 , _scst_remaining = Just 0 , _scst_events = Just [] }