import Control.Lens hiding (elements, Empty)
import Data.Aeson
import Data.Aeson.TH (deriveJSON)
+import qualified Data.ByteString.Base64 as BSB64
import Data.Either
import Data.Maybe (fromMaybe)
import Data.Swagger
import Data.Text (Text)
import qualified Data.Text as T
import GHC.Generics (Generic)
-import qualified Prelude as Prelude
-import Protolude (readFile)
import Servant
import Servant.Job.Utils (jsonOptions)
-- import Servant.Multipart
+import qualified Data.Text.Encoding as TE
-- import Test.QuickCheck (elements)
import Test.QuickCheck.Arbitrary
import Gargantext.Prelude
-import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs)
+import Gargantext.API.Admin.Orchestrator.Types (JobLog(..), AsyncJobs, ScraperEvent(..), scst_events)
import Gargantext.API.Admin.Types (HasSettings)
-import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal)
+import Gargantext.API.Job (jobLogSuccess, jobLogFailTotal, jobLogFailTotalWithMessage)
import Gargantext.API.Node.Corpus.New.File
import Gargantext.API.Node.Corpus.Searx
import Gargantext.API.Node.Corpus.Types
import Gargantext.API.Node.Types
import Gargantext.Core (Lang(..){-, allLangs-})
+import Gargantext.Core.Text.List.Social (FlowSocialListWith(..))
import qualified Gargantext.Core.Text.Corpus.API as API
import qualified Gargantext.Core.Text.Corpus.Parsers as Parser (FileFormat(..), parseFormat)
import Gargantext.Core.Types.Individu (User(..))
import Gargantext.Database.Action.User (getUserId)
import Gargantext.Database.Admin.Types.Hyperdata
import Gargantext.Database.Admin.Types.Node (CorpusId, NodeType(..), UserId)
+import Gargantext.Database.Prelude (hasConfig)
import Gargantext.Database.Query.Table.Node (getNodeWith)
import Gargantext.Database.Query.Table.Node.UpdateOpaleye (updateHyperdata)
import Gargantext.Database.Schema.Node (node_hyperdata)
import qualified Gargantext.Database.GargDB as GargDB
-
+import Gargantext.Prelude.Config (gc_max_docs_parsers)
------------------------------------------------------------------------
{-
data Query = Query { query_query :: Text
------------------------------------------------------------------------
------------------------------------------------------------------------
-- | Payload of an asynchronous "add to corpus with query" request.
--
-- Resolved from the patch hunk: the post-change record makes the
-- datafield optional ('Maybe Datafield') and adds the social-list
-- flow configuration ('FlowSocialListWith') as a required field.
data WithQuery = WithQuery
  { _wq_query        :: !Text                -- ^ the search query itself
  , _wq_databases    :: !Database            -- ^ external database to query
  , _wq_datafield    :: !(Maybe Datafield)   -- ^ optional datafield (e.g. Web) selecting the ingestion path
  , _wq_lang         :: !Lang                -- ^ language of the query/corpus
  , _wq_node_id      :: !Int                 -- ^ target node id
  , _wq_flowListWith :: !FlowSocialListWith  -- ^ how to build the social list for this flow
  }
  deriving Generic
-> Maybe Integer
-> (JobLog -> m ())
-> m JobLog
-addToCorpusWithQuery user cid (WithQuery q dbs datafield l _nid) maybeLimit logStatus = do
+addToCorpusWithQuery user cid (WithQuery { _wq_query = q
+ , _wq_databases = dbs
+ , _wq_datafield = datafield
+ , _wq_lang = l
+ , _wq_flowListWith = flw }) maybeLimit logStatus = do
-- TODO ...
logStatus JobLog { _scst_succeeded = Just 0
, _scst_failed = Just 0
}
printDebug "[addToCorpusWithQuery] (cid, dbs)" (cid, dbs)
printDebug "[addToCorpusWithQuery] datafield" datafield
+ printDebug "[addToCorpusWithQuery] flowListWith" flw
case datafield of
- Web -> do
+ Just Web -> do
printDebug "[addToCorpusWithQuery] processing web request" datafield
_ <- triggerSearxSearch cid q l
-- TODO if cid is folder -> create Corpus
-- if cid is corpus -> add to corpus
-- if cid is root -> create corpus in Private
- txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
+ txts <- mapM (\db -> getDataText db (Multi l) q maybeLimit) [database2origin dbs]
logStatus JobLog { _scst_succeeded = Just 2
, _scst_failed = Just 0
- , _scst_remaining = Just 1
+ , _scst_remaining = Just $ 1 + length txts
, _scst_events = Just []
}
- cids <- mapM (\txt -> flowDataText user txt (Multi l) cid Nothing) txts
+ cids <- mapM (\txt -> flowDataText user txt (Multi l) cid Nothing logStatus) txts
printDebug "corpus id" cids
printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
sendMail user
:> "async"
:> AsyncJobs JobLog '[FormUrlEncoded] NewWithForm JobLog
-addToCorpusWithForm :: FlowCmdM env err m
+addToCorpusWithForm :: (FlowCmdM env err m)
=> User
-> CorpusId
-> NewWithForm
CSV -> Parser.parseFormat Parser.CsvGargV3
WOS -> Parser.parseFormat Parser.WOS
PresseRIS -> Parser.parseFormat Parser.RisPresse
-
+ ZIP -> Parser.parseFormat Parser.ZIP
+
-- TODO granularity of the logStatus
- eDocs <- liftBase $ parse $ cs d
+ let data' = case ft of
+ ZIP -> case BSB64.decode $ TE.encodeUtf8 d of
+ Left err -> panic $ T.pack "[addToCorpusWithForm] error decoding base64: " <> T.pack err
+ Right decoded -> decoded
+ _ -> cs d
+ eDocs <- liftBase $ parse data'
case eDocs of
Right docs' -> do
- let docs = splitEvery 500 $ take 1000000 docs'
+ -- TODO Add progress (jobStatus) update for docs - this is a
+ -- long action
+ limit' <- view $ hasConfig . gc_max_docs_parsers
+ let limit = fromIntegral limit'
+ if length docs' > limit then do
+ printDebug "[addToCorpusWithForm] number of docs exceeds the limit" (show $ length docs')
+ let panicMsg' = [ "[addToCorpusWithForm] number of docs ("
+ , show $ length docs'
+ , ") exceeds the MAX_DOCS_PARSERS limit ("
+ , show limit
+ , ")" ]
+ let panicMsg = T.concat $ T.pack <$> panicMsg'
+ logStatus $ jobLogFailTotalWithMessage panicMsg jobLog
+ panic panicMsg
+ else
+ pure ()
+ let docs = splitEvery 500 $ take limit docs'
printDebug "Parsing corpus finished : " cid
logStatus jobLog2
(Multi $ fromMaybe EN l)
Nothing
(map (map toHyperdataDocument) docs)
+ logStatus
printDebug "Extraction finished : " cid
printDebug "sending email" ("xxxxxxxxxxxxxxxxxxxxx" :: Text)
logStatus jobLog3
pure $ jobLog3
Left e -> do
- printDebug "Error" e
+ printDebug "[addToCorpusWithForm] parse error" e
- logStatus jobLogE
+ let evt = ScraperEvent { _scev_message = Just $ T.pack e
+ , _scev_level = Just "ERROR"
+ , _scev_date = Nothing }
+
+ logStatus $ over (scst_events . _Just) (\evt' -> evt' <> [evt]) jobLogE
pure jobLogE
where
jobLog2 = jobLogSuccess jobLog
jobLog3 = jobLogSuccess jobLog2
jobLogE = jobLogFailTotal jobLog
-parseCsvGargV3Path :: [Char] -> IO (Either Prelude.String [HyperdataDocument])
-parseCsvGargV3Path fp = do
- contents <- readFile fp
- Parser.parseFormat Parser.CsvGargV3 $ cs contents
-
{-
addToCorpusWithFile :: FlowCmdM env err m
=> CorpusId
, _scst_remaining = Just 0
, _scst_events = Just []
}
+