{-# LANGUAGE PackageImports #-}
-module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
+module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC)
where
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
+import Conduit
import Control.Concurrent.Async as CCA (mapConcurrently)
+import Control.Monad.Trans.Control (MonadBaseControl)
import Control.Monad (join)
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Either(Either(..))
import Data.Ord()
import Data.String (String())
import Data.String()
-import Data.Text (Text)
+import Data.Text (Text, intercalate, pack, unpack)
import Data.Text.Encoding (decodeUtf8)
import Data.Tuple.Extra (both, first, second)
-import Gargantext.Core (Lang(..))
-import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
-import Gargantext.Prelude
-import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
-import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
-import Gargantext.Core.Text.Learn (detectLangDefault)
import System.FilePath (FilePath(), takeExtension)
import qualified Data.ByteString as DB
import qualified Data.ByteString.Char8 as DBC
import qualified Data.ByteString.Lazy as DBL
import qualified Data.Map as DM
import qualified Data.Text as DT
+import qualified Prelude
+import System.IO.Temp (emptySystemTempFile)
+
+import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
+import Gargantext.Core (Lang(..))
+import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
+import Gargantext.Prelude
+import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
+import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
+-- import Gargantext.Core.Text.Learn (detectLangDefault)
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
-- | According to the format of Input file,
-- different parser are available.
-data FileFormat = WOS | RIS | RisPresse
- | CsvGargV3 | CsvHal
+data FileType = WOS | RIS | RisPresse | CsvGargV3 | CsvHal
deriving (Show)
-- Implemented (ISI Format)
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | XML -- Not Implemented / see :
+parseFormatC :: MonadBaseControl IO m
+ => FileType
+ -> FileFormat
+ -> DB.ByteString
+ -> m (Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument IO ()))
+parseFormatC CsvGargV3 Plain bs = do
+ let eParsedC = parseCsvC $ DBL.fromStrict bs
+ case eParsedC of
+ Left err -> pure $ Left err
+ Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
+parseFormatC CsvHal Plain bs = do
+ let eParsedC = parseCsvC $ DBL.fromStrict bs
+ case eParsedC of
+ Left err -> pure $ Left err
+ Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
+parseFormatC RisPresse Plain bs = do
+ --docs <- enrichWith RisPresse
+ let eDocs = runParser' RisPresse bs
+ pure $ (\docs ->
+ ( Just $ fromIntegral $ length docs
+ , yieldMany docs
+ .| mapC presseEnrich
+ .| mapC (map $ both decodeUtf8)
+ .| mapMC (toDoc RIS)) ) <$> eDocs
+parseFormatC WOS Plain bs = do
+ let eDocs = runParser' WOS bs
+ pure $ (\docs ->
+ ( Just $ fromIntegral $ length docs
+ , yieldMany docs
+ .| mapC (map $ first WOS.keys)
+ .| mapC (map $ both decodeUtf8)
+ .| mapMC (toDoc WOS)) ) <$> eDocs
+parseFormatC ft ZIP bs = do
+ path <- liftBase $ emptySystemTempFile "parsed-zip"
+ liftBase $ DB.writeFile path bs
+ fileContents <- liftBase $ withArchive path $ do
+ files <- DM.keys <$> getEntries
+ mapM getEntry files
+ --printDebug "[parseFormatC] fileContents" fileContents
+ eContents <- mapM (parseFormatC ft Plain) fileContents
+ --printDebug "[parseFormatC] contents" contents
+ --pure $ Left $ "Not implemented for ZIP"
+ let (errs, contents) = partitionEithers eContents
+ case errs of
+ [] ->
+ case contents of
+ [] -> pure $ Left "No files in zip"
+ _ -> do
+ let lenghts = fst <$> contents
+ let contents' = snd <$> contents
+ let totalLength = sum $ sum <$> lenghts -- Trick: sum (Just 1) = 1, sum Nothing = 0
+ pure $ Right ( Just totalLength
+ , sequenceConduits contents' >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
+ _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
+
+parseFormatC _ _ _ = undefined
-parseFormat :: FileFormat -> DB.ByteString -> IO [HyperdataDocument]
-parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
-parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
-parseFormat RisPresse bs = mapM (toDoc RIS)
- <$> snd
- <$> enrichWith RisPresse
- $ partitionEithers
- $ [runParser' RisPresse bs]
-parseFormat WOS bs = mapM (toDoc WOS)
- <$> snd
- <$> enrichWith WOS
- $ partitionEithers
- $ [runParser' WOS bs]
-parseFormat _ _ = undefined
+-- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
+-- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
+-- parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
+-- parseFormat RisPresse bs = do
+-- docs <- mapM (toDoc RIS)
+-- <$> snd
+-- <$> enrichWith RisPresse
+-- $ partitionEithers
+-- $ [runParser' RisPresse bs]
+-- pure $ Right docs
+-- parseFormat WOS bs = do
+-- docs <- mapM (toDoc WOS)
+-- <$> snd
+-- <$> enrichWith WOS
+-- $ partitionEithers
+-- $ [runParser' WOS bs]
+-- pure $ Right docs
+-- parseFormat ZIP bs = do
+-- path <- emptySystemTempFile "parsed-zip"
+-- DB.writeFile path bs
+-- parsedZip <- withArchive path $ do
+-- DM.keys <$> getEntries
+-- pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
+-- parseFormat _ _ = undefined
-- | Parse file into documents
-- TODO manage errors here
-- TODO: to debug maybe add the filepath in error message
-parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
-parseFile CsvHal p = parseHal p
-parseFile CsvGargV3 p = parseCsv p
-parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
-parseFile WOS p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
-parseFile ff p = join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
-
-toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
+
+parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
+parseFile CsvHal Plain p = parseHal p
+parseFile CsvGargV3 Plain p = parseCsv p
+parseFile RisPresse Plain p = do
+ docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
+ pure $ Right docs
+parseFile WOS Plain p = do
+ docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
+ pure $ Right docs
+parseFile ff _ p = do
+ docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
+ pure $ Right docs
+
+toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
-- TODO use language for RIS
toDoc ff d = do
- let abstract = lookup "abstract" d
- let lang = maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
-
+ -- let abstract = lookup "abstract" d
+ let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
+
let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
-
- (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
-
- pure $ HyperdataDocument (Just $ DT.pack $ show ff)
- (lookup "doi" d)
- (lookup "URL" d)
- Nothing
- Nothing
- Nothing
- (lookup "title" d)
- Nothing
- (lookup "authors" d)
- (lookup "source" d)
- (lookup "abstract" d)
- (fmap (DT.pack . show) utcTime)
- (pub_year)
- (pub_month)
- (pub_day)
- Nothing
- Nothing
- Nothing
- (Just $ (DT.pack . show) lang)
-
-enrichWith :: FileFormat
+
+ (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
+
+ pure HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
+ , _hd_doi = lookup "doi" d
+ , _hd_url = lookup "URL" d
+ , _hd_uniqId = Nothing
+ , _hd_uniqIdBdd = Nothing
+ , _hd_page = Nothing
+ , _hd_title = lookup "title" d
+ , _hd_authors = Nothing
+ , _hd_institutes = lookup "authors" d
+ , _hd_source = lookup "source" d
+ , _hd_abstract = lookup "abstract" d
+ , _hd_publication_date = fmap (DT.pack . show) utcTime
+ , _hd_publication_year = pub_year
+ , _hd_publication_month = pub_month
+ , _hd_publication_day = pub_day
+ , _hd_publication_hour = Nothing
+ , _hd_publication_minute = Nothing
+ , _hd_publication_second = Nothing
+ , _hd_language_iso2 = Just $ (DT.pack . show) lang }
+
+enrichWith :: FileType
-> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith RisPresse = enrichWith' presseEnrich
enrichWith WOS = enrichWith' (map (first WOS.keys))
-readFileWith :: FileFormat -> FilePath
+readFileWith :: FileType -> FilePath
-> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
readFileWith format path = do
files <- case takeExtension path of
-- | withParser:
-- According to the format of the text, choose the right parser.
--- TODO withParser :: FileFormat -> Parser [Document]
-withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
+-- TODO withParser :: FileType -> Parser [Document]
+withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS = WOS.parser
withParser RIS = RIS.parser
--withParser ODT = odtParser
--withParser XML = xmlParser
withParser _ = panic "[ERROR] Parser not implemented yet"
-runParser :: FileFormat -> DB.ByteString
+runParser :: FileType -> DB.ByteString
-> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ runParser' format text
-runParser' :: FileFormat -> DB.ByteString
+runParser' :: FileType -> DB.ByteString
-> (Either String [[(DB.ByteString, DB.ByteString)]])
runParser' format text = parseOnly (withParser format) text