{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
-{-# OPTIONS_GHC -fno-warn-orphans #-}
module Gargantext.API.Node.Corpus.New
where
import Control.Lens hiding (elements, Empty)
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
-import qualified Data.ByteString as BS
import qualified Data.ByteString.Base64 as BSB64
import Data.Either
import Data.Maybe (fromMaybe)
import Data.Swagger
import Data.Text (Text)
import qualified Data.Text as T
-import qualified Data.Text.Encoding as TE
import GHC.Generics (Generic)
import Servant
-import Servant.Job.Core
-import Servant.Job.Types
import Servant.Job.Utils (jsonOptions)
-- import Servant.Multipart
+import qualified Data.Text.Encoding as TE
-- import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary
-import Web.FormUrlEncoded (FromForm)
import Gargantext.Prelude
-import Gargantext.API.Admin.Orchestrator.Types (JobLog(..))
-import qualified Gargantext.API.Admin.Orchestrator.Types as T
-import Gargantext.API.Admin.Settings (HasSettings)
+import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
+import Gargantext.API.Admin.Types (HasSettings)
+import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal, jobLogFailTotalWithMessage)
import Gargantext.API.Node.Corpus.New.File
+import Gargantext.API.Node.Corpus.Searx
+import Gargantext.API.Node.Corpus.Types
+import Gargantext.API.Node.Types
import Gargantext.Core (Lang(..){-, allLangs-})
+import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
+import qualified Gargantext.Core.Text.Corpus.API as API
+import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
-import Gargantext.Database.Action.Flow (FlowCmdM, flowCorpus, getDataText, flowDataText, TermType(..), DataOrigin(..){-, allDataOrigins-})
-import Gargantext.Database.Action.Flow.Utils (getUserId)
+import Gargantext.Database.Action.Flow (flowCorpus, getDataText, flowDataText, TermType(..){-, allDataOrigins-})
+import Gargantext.Database.Action.Flow.Types (FlowCmdM)
+import Gargantext.Database.Action.Mail (sendMail)
import Gargantext.Database.Action.Node (mkNodeWithParent)
+import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
+import Gargantext.Database.Prelude (hasConfig)
import Gargantext.Database.Query.Table.Node (getNodeWith)
import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
import Gargantext.Database.Schema.Node (node_hyperdata)
-import qualified Gargantext.Prelude.Utils as GPU
-import qualified Gargantext.Core.Text.Corpus.API as API
-import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
-
+import qualified Gargantext.Database.GargDB as GargDB
+import Gargantext.Prelude.Config (gc_max_docs_parsers)
------------------------------------------------------------------------
{-
data Query = Query { query_query :: Text
info _u = pure $ ApiInfo API.externalAPIs
------------------------------------------------------------------------
-
-data Database = Empty
- | PubMed
- | HAL
- | IsTex
- | Isidore
- deriving (Eq, Show, Generic)
-
-deriveJSON (unPrefix "") ''Database
-instance ToSchema Database
-
-database2origin :: Database -> DataOrigin
-database2origin Empty = InternalOrigin T.IsTex
-database2origin PubMed = ExternalOrigin T.PubMed
-database2origin HAL = ExternalOrigin T.HAL
-database2origin IsTex = ExternalOrigin T.IsTex
-database2origin Isidore = ExternalOrigin T.Isidore
-
------------------------------------------------------------------------
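+-- | Payload of the "add with query" job. '_wq_datafield' selects the data
+-- source (a 'Just Web' value routes the query to Searx below), and
+-- '_wq_flowListWith' carries the social-list settings used when building
+-- the term lists.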
data WithQuery = WithQuery
- { _wq_query :: !Text
- , _wq_databases :: !Database
- , _wq_lang :: !Lang
- , _wq_node_id :: !Int
+ { _wq_query :: !Text
+ , _wq_databases :: !Database
+ , _wq_datafield :: !(Maybe Datafield)
+ , _wq_lang :: !Lang
+ , _wq_node_id :: !Int
+ , _wq_flowListWith :: !FlowSocialListWith
}
deriving Generic
makeLenses ''WithQuery
instance FromJSON WithQuery where
parseJSON = genericParseJSON $ jsonOptions "_wq_"
+instance ToJSON WithQuery where
+ toJSON = genericToJSON $ jsonOptions "_wq_"
instance ToSchema WithQuery where
declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wq_")
--------------------------------------------------------
-data NewWithForm = NewWithForm
- { _wf_filetype :: !FileType
- , _wf_data :: !Text
- , _wf_lang :: !(Maybe Lang)
- , _wf_name :: !Text
- } deriving (Eq, Show, Generic)
-
-makeLenses ''NewWithForm
-instance FromForm NewWithForm
-instance FromJSON NewWithForm where
- parseJSON = genericParseJSON $ jsonOptions "_wf_"
-instance ToSchema NewWithForm where
- declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wf_")
-
--------------------------------------------------------
-data NewWithFile = NewWithFile
- { _wfi_b64_data :: !Text
- , _wfi_lang :: !(Maybe Lang)
- , _wfi_name :: !Text
- } deriving (Eq, Show, Generic)
-
-makeLenses ''NewWithFile
-instance FromForm NewWithFile
-instance FromJSON NewWithFile where
- parseJSON = genericParseJSON $ jsonOptions "_wfi_"
-instance ToSchema NewWithFile where
- declareNamedSchema = genericDeclareNamedSchema (unPrefixSwagger "_wfi_")
-
-instance GPU.SaveFile NewWithFile where
- saveFile' fp (NewWithFile b64d _ _) = do
- let eDecoded = BSB64.decode $ TE.encodeUtf8 b64d
- case eDecoded of
- Left err -> panic $ T.pack $ "Error decoding: " <> err
- Right decoded -> BS.writeFile fp decoded
- -- BS.writeFile fp $ BSB64.decodeLenient $ TE.encodeUtf8 b64d
-
---instance GPU.ReadFile NewWithFile where
--- readFile' = TIO.readFile
-
-------------------------------------------------------------------------
-type AsyncJobs event ctI input output =
- AsyncJobsAPI' 'Unsafe 'Safe ctI '[JSON] Maybe event input output
------------------------------------------------------------------------
type AddWithQuery = Summary "Add with Query to corpus endpoint"
=> User
-> CorpusId
-> WithQuery
+ -> Maybe Integer
-> (JobLog -> m ())
-> m JobLog
-addToCorpusWithQuery u cid (WithQuery q dbs l _nid) logStatus = do
+addToCorpusWithQuery user cid (WithQuery { _wq_query = q
+ , _wq_databases = dbs
+ , _wq_datafield = datafield
+ , _wq_lang = l
+ , _wq_flowListWith = flw }) maybeLimit logStatus = do
-- TODO ...
logStatus JobLog { _scst_succeeded = Just 0
, _scst_failed = Just 0
- , _scst_remaining = Just 5
- , _scst_events = Just []
- }
- printDebug "addToCorpusWithQuery" (cid, dbs)
- -- TODO add cid
- -- TODO if cid is folder -> create Corpus
- -- if cid is corpus -> add to corpus
- -- if cid is root -> create corpus in Private
- txts <- mapM (\db -> getDataText db (Multi l) q Nothing) [database2origin dbs]
-
- logStatus JobLog { _scst_succeeded = Just 2
- , _scst_failed = Just 0
- , _scst_remaining = Just 1
- , _scst_events = Just []
- }
-
- cids <- mapM (\txt -> flowDataText u txt (Multi l) cid) txts
- printDebug "corpus id" cids
- -- TODO ...
- pure JobLog { _scst_succeeded = Just 3
- , _scst_failed = Just 0
- , _scst_remaining = Just 0
+ , _scst_remaining = Just 3
, _scst_events = Just []
}
+ printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
+ printDebug "[addToCorpusWithQuery] datafield" datafield
+ printDebug "[addToCorpusWithQuery] flowListWith" flw
+
+ case datafield of
+ Just Web -> do
+ printDebug "[addToCorpusWithQuery] processing web request" datafield
+
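+ -- Web queries bypass the database flow entirely: the search is
+ -- delegated to the Searx metasearch engine (see
+ -- Gargantext.API.Node.Corpus.Searx).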
+ _ <- triggerSearxSearch cid q l
+
+ pure JobLog { _scst_succeeded = Just 3
+ , _scst_failed = Just 0
+ , _scst_remaining = Just 0
+ , _scst_events = Just []
+ }
+
+ _ -> do
+ -- TODO add cid
+ -- TODO if cid is folder -> create Corpus
+ -- if cid is corpus -> add to corpus
+ -- if cid is root -> create corpus in Private
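+ -- fetch the documents from the selected database, capped by the
+ -- optional limit passed with the request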
+ txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
+
+ logStatus JobLog { _scst_succeeded = Just 2
+ , _scst_failed = Just 0
+ , _scst_remaining = Just $ 1 + length txts
+ , _scst_events = Just []
+ }
+
+ cids <- mapM (\txt -> flowDataText user txt (Multi l) cid Nothing logStatus) txts
+ printDebug "corpus id" cids
+ printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
+ sendMail user
+ -- TODO ...
+ pure JobLog { _scst_succeeded = Just 3
+ , _scst_failed = Just 0
+ , _scst_remaining = Just 0
+ , _scst_events = Just []
+ }
type AddWithForm = Summary "Add with FormUrlEncoded to corpus endpoint"
:> "async"
:> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
-addToCorpusWithForm :: FlowCmdM env err m
+addToCorpusWithForm :: (FlowCmdM env err m)
=> User
-> CorpusId
-> NewWithForm
-> (JobLog -> m ())
+ -> JobLog
-> m JobLog
-addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus = do
-
+addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
printDebug "[addToCorpusWithForm] Parsing corpus: " cid
printDebug "[addToCorpusWithForm] fileType" ft
- logStatus JobLog { _scst_succeeded = Just 0
- , _scst_failed = Just 0
- , _scst_remaining = Just 2
- , _scst_events = Just []
- }
+ logStatus jobLog
let
parse = case ft of
CSV_HAL -> Parser.parseFormat Parser.CsvHal
CSV -> Parser.parseFormat Parser.CsvGargV3
WOS -> Parser.parseFormat Parser.WOS
PresseRIS -> Parser.parseFormat Parser.RisPresse
-
+ ZIP -> Parser.parseFormat Parser.ZIP
+
-- TODO granularity of the logStatus
- docs <- liftBase $ splitEvery 500
- <$> take 1000000
- <$> parse (cs d)
-
- printDebug "Parsing corpus finished : " cid
- logStatus JobLog { _scst_succeeded = Just 1
- , _scst_failed = Just 0
- , _scst_remaining = Just 1
- , _scst_events = Just []
- }
-
-
- printDebug "Starting extraction : " cid
- -- TODO granularity of the logStatus
- _cid' <- flowCorpus user
- (Right [cid])
- (Multi $ fromMaybe EN l)
- (map (map toHyperdataDocument) docs)
-
- printDebug "Extraction finished : " cid
- pure JobLog { _scst_succeeded = Just 2
- , _scst_failed = Just 0
- , _scst_remaining = Just 0
- , _scst_events = Just []
- }
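+ -- ZIP payloads arrive base64-encoded in the form data, so decode them
+ -- before parsing; every other format is handed to the parser as-is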
+ let data' = case ft of
+ ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
+ Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
+ Right decoded -> decoded
+ _ -> cs d
+ eDocs <- liftBase $ parse data'
+ case eDocs of
+ Right docs' -> do
+ -- TODO Add progress (jobStatus) update for docs - this is a
+ -- long action
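+ -- guard against oversized uploads: refuse to parse more documents
+ -- than the MAX_DOCS_PARSERS limit set in the instance configuration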
+ limit' <- view $ hasConfig . gc_max_docs_parsers
+ let limit = fromIntegral limit'
+ if length docs' > limit then do
+ printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show $ length docs')
+ let panicMsg' = [ "[addToCorpusWithForm] number of docs ("
+ , show $ length docs'
+ , ") exceeds the MAX_DOCS_PARSERS limit ("
+ , show limit
+ , ")" ]
+ let panicMsg = T.concat $ T.pack <$> panicMsg'
+ logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
+ panic panicMsg
+ else
+ pure ()
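+ -- insert the documents in batches of 500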
+ let docs = splitEvery 500 $ take limit docs'
+
+ printDebug "Parsing corpus finished : " cid
+ logStatus jobLog2
+
+ printDebug "Starting extraction : " cid
+ -- TODO granularity of the logStatus
+ _cid' <- flowCorpus user
+ (Right [cid])
+ (Multi $ fromMaybe EN l)
+ Nothing
+ (map (map toHyperdataDocument) docs)
+ logStatus
+
+ printDebug "Extraction finished : " cid
+ printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
+ sendMail user
+
+ logStatus jobLog3
+ pure $ jobLog3
+ Left e -> do
+ printDebug "[addToCorpusWithForm] parse error" e
+
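+ -- surface the parser failure to the client as an ERROR-level event
+ -- attached to the (fully failed) job log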
+ let evt = ScraperEvent { _scev_message = Just $ T.pack e
+ , _scev_level = Just "ERROR"
+ , _scev_date = Nothing }
+
+ logStatus $ over (scst_events . _Just) (\evt' -> evt' <> [evt]) jobLogE
+ pure jobLogE
+ where
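+ -- each jobLogSuccess step moves one unit from "remaining" to
+ -- "succeeded"; jobLogFailTotal marks everything still remaining as failed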
+ jobLog2 = jobLogSuccess jobLog
+ jobLog3 = jobLogSuccess jobLog2
+ jobLogE = jobLogFailTotal jobLog
{-
addToCorpusWithFile :: FlowCmdM env err m
, _scst_events = Just []
}
- fPath <- GPU.writeFile nwf
+ fPath <- GargDB.writeFile nwf
printDebug "[addToCorpusWithFile] File saved as: " fPath
uId <- getUserId user
_ -> pure ()
printDebug "[addToCorpusWithFile] File upload to corpus finished: " cid
+
+ printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
+ sendMail user
+
pure $ JobLog { _scst_succeeded = Just 1
, _scst_failed = Just 0
, _scst_remaining = Just 0
, _scst_events = Just []
}
+