{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE PackageImports #-}
+{-# LANGUAGE OverloadedStrings #-}
-module Gargantext.Text.Parsers (parse, FileFormat(..), clean)
+module Gargantext.Text.Parsers (FileFormat(..), clean, parseFile)
where
-import System.FilePath (FilePath(), takeExtension)
+--import Data.ByteString (ByteString)
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
-
+import Control.Concurrent.Async as CCA (mapConcurrently)
+import Control.Monad (join)
+import qualified Data.ByteString.Char8 as DBC
+import Data.Attoparsec.ByteString (parseOnly, Parser)
+import Data.Either(Either(..))
import Data.Either.Extra (partitionEithers)
import Data.List (concat)
-import qualified Data.Map as DM
-import qualified Data.ByteString as DB
+import Data.List (lookup)
import Data.Ord()
+import Data.String (String())
import Data.String()
-import Data.Either(Either(..))
-import Data.Attoparsec.ByteString (parseOnly, Parser)
-
import Data.Text (Text)
-import qualified Data.Text as DT
-
--- Activate Async for to parse in parallel
-import Control.Concurrent.Async as CCA (mapConcurrently)
-
import Data.Text.Encoding (decodeUtf8)
-import Data.String (String())
-
-------------------------------------------------------------------------
+import Data.Tuple.Extra (both, first, second)
+import System.FilePath (FilePath(), takeExtension)
+import qualified Data.ByteString as DB
+import qualified Data.Map as DM
+import qualified Data.Text as DT
+import Gargantext.Core (Lang(..))
import Gargantext.Prelude
-import Gargantext.Text.Parsers.WOS (wosParser)
+import Gargantext.Database.Types.Node (HyperdataDocument(..))
+import qualified Gargantext.Text.Parsers.WOS as WOS
+import qualified Gargantext.Text.Parsers.RIS as RIS
+import Gargantext.Text.Parsers.RIS.Presse (presseEnrich)
+import qualified Gargantext.Text.Parsers.Date as Date
+import Gargantext.Text.Parsers.CSV (parseHal)
+import Gargantext.Text.Terms.Stop (detectLang)
------------------------------------------------------------------------
type ParseError = String
-- | According to the format of Input file,
-- different parser are available.
-data FileFormat = WOS -- Implemented (ISI Format)
+data FileFormat = WOS | RIS | RisPresse
+ | CsvGargV3 | CsvHalFormat
+ deriving (Show)
+
+-- Implemented (ISI Format)
-- | DOC -- Not Implemented / import Pandoc
-- | ODT -- Not Implemented / import Pandoc
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | XML -- Not Implemented / see :
--- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
--- TODO: to debug maybe add the filepath in error message
+{-
+parseFormat :: FileFormat -> ByteString -> [HyperdataDocument]
+parseFormat = undefined
+-}
-parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
-parse format path = do
+-- | Parse file into documents
+-- TODO manage errors here
+-- TODO: to debug maybe add the filepath in error message
+parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
+parseFile CsvHalFormat p = parseHal p
+parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
+parseFile WOS p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
+parseFile ff p = join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
+
+toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
+-- TODO use language for RIS
+toDoc ff d = do
+ let abstract = lookup "abstract" d
+ let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
+
+ let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
+
+ (utcTime, (pub_year, pub_month, pub_day)) <- Date.split lang dateToParse
+
+ pure $ HyperdataDocument (Just $ DT.pack $ show ff)
+ (lookup "doi" d)
+ (lookup "URL" d)
+ Nothing
+ Nothing
+ Nothing
+ (lookup "title" d)
+ Nothing
+ (lookup "authors" d)
+ (lookup "source" d)
+ (lookup "abstract" d)
+ (fmap (DT.pack . show) utcTime)
+ (pub_year)
+ (pub_month)
+ (pub_day)
+ Nothing
+ Nothing
+ Nothing
+ (Just $ (DT.pack . show) lang)
+
+enrichWith :: FileFormat
+ -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
+enrichWith RisPresse = enrichWith' presseEnrich
+enrichWith WOS = enrichWith' (map (first WOS.keys))
+enrichWith _ = enrichWith' identity
+
+
+enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
+ -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
+enrichWith' f = second (map both' . map f . concat)
+ where
+ both' = map (both decodeUtf8)
+
+readFileWith :: FileFormat -> FilePath
+ -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
+readFileWith format path = do
files <- case takeExtension path of
".zip" -> openZip path
- _ -> pure <$> DB.readFile path
- (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
- pure (as, map toText $ concat bs)
- where
- -- TODO : decode with bayesian inference on encodings
- toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
+ _ -> pure <$> clean <$> DB.readFile path
+ partitionEithers <$> mapConcurrently (runParser format) files
-- | withParser:
-- According to the format of the text, choose the right parser.
-- TODO withParser :: FileFormat -> Parser [Document]
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
-withParser WOS = wosParser
---withParser DOC = docParser
+withParser WOS = WOS.parser
+withParser RIS = RIS.parser
--withParser ODT = odtParser
--withParser XML = xmlParser
---withParser _ = error "[ERROR] Parser not implemented yet"
+withParser _ = panic "[ERROR] Parser not implemented yet"
-runParser :: FileFormat -> DB.ByteString
+runParser :: FileFormat -> DB.ByteString
-> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ parseOnly (withParser format) text
bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
pure bs
-clean :: Text -> Text
-clean txt = DT.map clean' txt
+clean :: DB.ByteString -> DB.ByteString
+clean txt = DBC.map clean' txt
where
clean' '’' = '\''
+ clean' '\r' = ' '
clean' c = c
-
-