{-|
Module      : Gargantext.API.Node.Corpus.New
Description : New corpus API
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

New corpus means either:
- new corpus
- new data in existing corpus
-}

{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators   #-}

module Gargantext.API.Node.Corpus.New
      where

import Conduit
import Control.Lens hiding (elements, Empty)
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import qualified Data.ByteString.Base64 as BSB64
import Data.Conduit.Internal (zipSources)
import Data.Either
import Data.Maybe (fromMaybe)
import Data.Swagger
import Data.Text (Text)
import qualified Data.Text as T
import GHC.Generics (Generic)
import Servant
import Servant.Job.Utils (jsonOptions)
-- import Servant.Multipart
import qualified Data.Text.Encoding as TE
-- import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary

import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
import Gargantext.API.Admin.Types (HasSettings)
import Gargantext.API.Node.Corpus.New.Types
import Gargantext.API.Node.Corpus.Searx
import Gargantext.API.Node.Corpus.Types
import Gargantext.API.Node.Types
import Gargantext.Core (Lang(..){-, allLangs-})
import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
import Gargantext.Database.Action.Flow.Types (FlowCmdM)
import Gargantext.Database.Action.Mail (sendMail)
import Gargantext.Database.Action.Node (mkNodeWithParent)
import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
import Gargantext.Database.Prelude (hasConfig)
import Gargantext.Database.Query.Table.Node (getNodeWith)
import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
import Gargantext.Database.Schema.Node (node_hyperdata)
import Gargantext.Prelude
import Gargantext.Prelude.Config (gc_max_docs_parsers)
import Gargantext.Utils.Jobs (JobHandle, MonadJobStatus(..))
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileType(..), parseFormatC)
import qualified Gargantext.Database.GargDB as GargDB

------------------------------------------------------------------------
{-
data Query = Query { query_query      :: Text
                   , query_node_id    :: Int
                   , query_lang       :: Lang
                   , query_databases  :: [DataOrigin]
                   }
                   deriving (Eq, Generic)

deriveJSON (unPrefix "query_") 'Query

instance Arbitrary Query where
    arbitrary = elements [ Query q n la fs
                         | q <- ["honeybee* AND collapse"
                                ,"covid 19"
                                ]
                         , n <- [0..10]
                         , la <- allLangs
                         , fs <- take 3 $ repeat allDataOrigins
                         ]

instance ToSchema Query where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "query_")
-}

------------------------------------------------------------------------

{-
type Api = PostApi
        :<|> GetApi

type PostApi = Summary "New Corpus endpoint"
             :> ReqBody '[JSON] Query
             :> Post '[JSON] CorpusId
type GetApi = Get '[JSON] ApiInfo
-}

-- | TODO manage several apis
-- TODO-ACCESS
-- TODO this is only the POST
{-
api :: (FlowCmdM env err m) => UserId -> Query -> m CorpusId
api uid (Query q _ as) = do
  cId <- case head as of
    Nothing      -> flowCorpusSearchInDatabase (UserDBId uid) EN q
    Just API.All -> flowCorpusSearchInDatabase (UserDBId uid) EN q
    Just a       -> do
      docs <- liftBase $ API.get a q (Just 1000)
      cId' <- flowCorpus (UserDBId uid) (Left q) (Multi EN) [docs]
      pure cId'

  pure cId
-}
------------------------------------------------
-- TODO use this route for Client implementation
data ApiInfo = ApiInfo { api_info :: [API.ExternalAPIs] }
  deriving (Generic)

instance Arbitrary ApiInfo where
  arbitrary = ApiInfo <$> arbitrary

deriveJSON (unPrefix "") 'ApiInfo
instance ToSchema ApiInfo

info :: FlowCmdM env err m => UserId -> m ApiInfo
info _u = do
  ext <- API.externalAPIs
  pure $ ApiInfo ext

------------------------------------------------------------------------
------------------------------------------------------------------------
data WithQuery = WithQuery
  { _wq_query        :: !Text
  , _wq_databases    :: !Database
  , _wq_datafield    :: !(Maybe Datafield)
  , _wq_lang         :: !Lang
  , _wq_node_id      :: !Int
  , _wq_flowListWith :: !FlowSocialListWith
  }
  deriving Generic

makeLenses ''WithQuery
instance FromJSON WithQuery where
  parseJSON = genericParseJSON $ jsonOptions "_wq_"
instance ToJSON WithQuery where
  toJSON = genericToJSON $ jsonOptions "_wq_"
instance ToSchema WithQuery where
  declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")

------------------------------------------------------------------------

type AddWithQuery = Summary "Add with Query to corpus endpoint"
   :> "corpus"
     :> Capture "corpus_id" CorpusId
   :> "query"
     :> AsyncJobs JobLog '[JSON] WithQuery JobLog

{-
type AddWithFile = Summary "Add with MultipartData to corpus endpoint"
   :> "corpus"
     :> Capture "corpus_id" CorpusId
   :> "add"
   :> "file"
     :> MultipartForm Mem (MultipartData Mem)
     :> QueryParam "fileType" FileType
   :> "async"
     :> AsyncJobs JobLog '[JSON] () JobLog
-}

------------------------------------------------------------------------
-- TODO WithQuery also has a corpus id

addToCorpusWithQuery :: (FlowCmdM env err m, MonadJobStatus m)
                     => User
                     -> CorpusId
                     -> WithQuery
                     -> Maybe Integer
                     -> JobHandle m
                     -> m ()
addToCorpusWithQuery user cid (WithQuery { _wq_query = q
                                         , _wq_databases = dbs
                                         , _wq_datafield = datafield
                                         , _wq_lang = l
                                         , _wq_flowListWith = flw }) maybeLimit jobHandle = do
  -- TODO ...
  markStarted 3 jobHandle

  -- printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
  -- printDebug "[addToCorpusWithQuery] datafield" datafield
  -- printDebug "[addToCorpusWithQuery] flowListWith" flw

  case datafield of
    Just Web -> do
      -- printDebug "[addToCorpusWithQuery] processing web request" datafield

      _ <- triggerSearxSearch user cid q l jobHandle

      markComplete jobHandle

    _ -> do
      -- TODO add cid
      -- TODO if cid is folder -> create Corpus
      --      if cid is corpus -> add to corpus
      --      if cid is root   -> create corpus in Private
      -- printDebug "[G.A.N.C.New] getDataText with query" q
      databaseOrigin <- database2origin dbs
      eTxts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [databaseOrigin]

      let lTxts = lefts eTxts
      -- printDebug "[G.A.N.C.New] lTxts" lTxts
      case lTxts of
        [] -> do
          let txts = rights eTxts
          -- TODO Sum lengths of each txt element
          -- NOTE(adinapoli) Some other weird arithmetic to have the
          -- following 'JobLog' as output:
          -- JobLog
          --   { _scst_succeeded = Just 2
          --   , _scst_failed    = Just 0
          --   , _scst_remaining = Just $ 1 + length txts
          --   , _scst_events    = Just []
          --   }
          markStarted (3 + length txts) jobHandle
          markProgress 2 jobHandle

          _cids <- mapM (\txt -> do
                            flowDataText user txt (Multi l) cid (Just flw) jobHandle) txts
          -- printDebug "corpus id" cids
          -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
          sendMail user
          -- TODO ...
          markComplete jobHandle

        (err:_) -> do
          -- printDebug "Error: " err
          markFailure 1 (Just $ T.pack (show err)) jobHandle
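
-- NOTE: hedged usage sketch, kept in a comment so it is not part of the
-- build.  It only shows how a 'WithQuery' payload for 'AddWithQuery' is
-- assembled from the fields declared above; the 'Database' and
-- 'FlowSocialListWith' values are taken as arguments because their
-- constructors live in other modules.  'mkExampleQuery' is a hypothetical
-- name, not part of this module's API.
{-
mkExampleQuery :: Int -> Database -> FlowSocialListWith -> WithQuery
mkExampleQuery nodeId db socialList = WithQuery
  { _wq_query        = "covid 19"
  , _wq_databases    = db
  , _wq_datafield    = Just Web   -- 'Just Web' routes the job to 'triggerSearxSearch' above
  , _wq_lang         = EN
  , _wq_node_id      = nodeId
  , _wq_flowListWith = socialList
  }
-}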

type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
   :> "corpus"
     :> Capture "corpus_id" CorpusId
   :> "add"
   :> "form"
   :> "async"
     :> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog

addToCorpusWithForm :: (FlowCmdM env err m, MonadJobStatus m)
                    => User
                    -> CorpusId
                    -> NewWithForm
                    -> JobHandle m
                    -> m ()
addToCorpusWithForm user cid (NewWithForm ft ff d l _n sel) jobHandle = do
  -- printDebug "[addToCorpusWithForm] Parsing corpus: " cid
  -- printDebug "[addToCorpusWithForm] fileType" ft
  -- printDebug "[addToCorpusWithForm] fileFormat" ff
  limit' <- view $ hasConfig . gc_max_docs_parsers
  let limit = fromIntegral limit' :: Integer
  let
    parseC = case ft of
      CSV_HAL   -> Parser.parseFormatC Parser.CsvHal
      CSV       -> Parser.parseFormatC Parser.CsvGargV3
      WOS       -> Parser.parseFormatC Parser.WOS
      PresseRIS -> Parser.parseFormatC Parser.RisPresse

  -- TODO granularity of the logStatus
  let data' = case ff of
        Plain -> cs d
        ZIP   -> case BSB64.decode $ TE.encodeUtf8 d of
          Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
          Right decoded -> decoded
  eDocsC <- liftBase $ parseC ff data'
  case eDocsC of
    Right (mCount, docsC) -> do
      -- TODO Add progress (jobStatus) update for docs - this is a
      -- long action
      let docsC' = zipSources (yieldMany [1..]) docsC
                  .| mapMC (\(idx, doc) ->
                        if idx > limit then do
                          --printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show limit)
                          let panicMsg' = [ "[addToCorpusWithForm] number of docs "
                                          , "exceeds the MAX_DOCS_PARSERS limit ("
                                          , show limit
                                          , ")" ]
                          let panicMsg = T.concat $ T.pack <$> panicMsg'
                          --logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
                          panic panicMsg
                        else
                          pure doc)
                  .| mapC toHyperdataDocument

      --printDebug "Parsing corpus finished : " cid
      --logStatus jobLog2

      --printDebug "Starting extraction     : " cid
      -- TODO granularity of the logStatus
      -- printDebug "flowCorpus with (corpus_id, lang)" (cid, l)

      _cid' <- flowCorpus user
                          (Right [cid])
                          (Multi $ fromMaybe EN l)
                          (Just sel)
                          --(Just $ fromIntegral $ length docs, docsC')
                          (mCount, transPipe liftBase docsC') -- TODO fix number of docs
                          --(map (map toHyperdataDocument) docs)
                          jobHandle

      -- printDebug "Extraction finished   : " cid
      -- printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
      -- TODO uncomment this
      --sendMail user

      markComplete jobHandle
    Left e -> do
      printDebug "[addToCorpusWithForm] parse error" e
      markFailed (Just $ T.pack e) jobHandle
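
-- NOTE: hedged, self-contained sketch (kept in a comment) of the
-- counting/limit pattern used in 'addToCorpusWithForm' above: every parsed
-- document is paired with its 1-based index via 'zipSources', and the
-- pipeline aborts once the configured maximum is exceeded.  'guardDocLimit'
-- is a hypothetical helper name, not part of this module's API.
{-
guardDocLimit :: Monad m
              => Integer                -- ^ maximum number of documents
              -> ConduitT () doc m ()   -- ^ stream of parsed documents
              -> ConduitT () doc m ()
guardDocLimit maxDocs docs =
  zipSources (yieldMany [(1 :: Integer) ..]) docs
    .| mapMC (\(idx, doc) ->
         if idx > maxDocs
           then panic $ T.pack "[guardDocLimit] MAX_DOCS_PARSERS limit exceeded"
           else pure doc)
-}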

{-
addToCorpusWithFile :: FlowCmdM env err m
                    => CorpusId
                    -> MultipartData Mem
                    -> Maybe FileType
                    -> (JobLog -> m ())
                    -> m JobLog
addToCorpusWithFile cid input filetype logStatus = do
  logStatus JobLog { _scst_succeeded = Just 10
                   , _scst_failed    = Just 2
                   , _scst_remaining = Just 138
                   , _scst_events    = Just []
                   }
  printDebug "addToCorpusWithFile" cid
  _h <- postUpload cid filetype input

  pure      JobLog { _scst_succeeded = Just 137
                   , _scst_failed    = Just 13
                   , _scst_remaining = Just 0
                   , _scst_events    = Just []
                   }
-}

type AddWithFile = Summary "Add with FileUrlEncoded to corpus endpoint"
   :> "corpus"
     :> Capture "corpus_id" CorpusId
   :> "add"
   :> "file"
   :> "async"
     :> AsyncJobs JobLog '[FormUrlEncoded] NewWithFile JobLog

addToCorpusWithFile :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m)
                    => User
                    -> CorpusId
                    -> NewWithFile
                    -> JobHandle m
                    -> m ()
addToCorpusWithFile user cid nwf@(NewWithFile _d _l fName) jobHandle = do

  printDebug "[addToCorpusWithFile] Uploading file to corpus: " cid
  markStarted 1 jobHandle

  fPath <- GargDB.writeFile nwf
  printDebug "[addToCorpusWithFile] File saved as: " fPath

  uId <- getUserId user
  nIds <- mkNodeWithParent NodeFile (Just cid) uId fName

  _ <- case nIds of
    [nId] -> do
        node <- getNodeWith nId (Proxy :: Proxy HyperdataFile)
        let hl = node ^. node_hyperdata
        _ <- updateHyperdata nId $ hl { _hff_name = fName
                                      , _hff_path = T.pack fPath }

        printDebug "[addToCorpusWithFile] Created node with id: " nId
    _     -> pure ()

  printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid

  printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
  sendMail user

  markComplete jobHandle
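
-- NOTE: hedged sketch of how the three asynchronous routes declared in this
-- module fit together, kept in a comment because the actual composition into
-- the server API happens elsewhere; 'NewCorpusAsyncAPI' is a hypothetical
-- name used only for illustration.
{-
type NewCorpusAsyncAPI = AddWithQuery
                    :<|> AddWithForm
                    :<|> AddWithFile
-}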