-}
{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE PackageImports #-}
+{-# LANGUAGE OverloadedStrings #-}
-module Gargantext.Text.Parsers -- (parse, FileFormat(..))
+module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs)
where
-import Gargantext.Prelude
+import System.FilePath (FilePath(), takeExtension)
+import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
-import System.FilePath (takeExtension, FilePath())
-import Data.Attoparsec.ByteString (parseOnly, Parser)
-import qualified Data.ByteString as DB
-import qualified Data.Map as DM
+import Control.Monad (join)
+import qualified Data.Time as DT
import Data.Either.Extra (partitionEithers)
+import Data.Time (UTCTime(..))
+import Data.List (concat)
+import qualified Data.Map as DM
+import qualified Data.ByteString as DB
import Data.Ord()
-import Data.Foldable (concat)
import Data.String()
-import Data.Either.Extra(Either())
+import Data.Either(Either(..))
+import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Text (Text)
-import Data.Text.Encoding (decodeUtf8)
-----
---import Control.Monad (join)
-import Codec.Archive.Zip (withArchive, getEntry, getEntries)
-import Path.IO (resolveFile')
------- import qualified Data.ByteString.Lazy as B
---import Control.Applicative ( (<$>) )
+import qualified Data.Text as DT
+
+-- Activate Async for to parse in parallel
import Control.Concurrent.Async as CCA (mapConcurrently)
+import Data.Text.Encoding (decodeUtf8)
import Data.String (String())
-import Gargantext.Text.Parsers.WOS (wosParser)
----- import Gargantext.Parsers.XML (xmlParser)
----- import Gargantext.Parsers.DOC (docParser)
----- import Gargantext.Parsers.ODT (odtParser)
+import Data.List (lookup)
---import Gargantext.Prelude (pm)
---import Gargantext.Types.Main (ErrorMessage(), Corpus)
+------------------------------------------------------------------------
+import Gargantext.Core (Lang(..))
+import Gargantext.Prelude
+import Gargantext.Database.Types.Node (HyperdataDocument(..))
+import Gargantext.Text.Parsers.WOS (wosParser)
+import Gargantext.Text.Parsers.Date (parseDate)
+import Gargantext.Text.Parsers.CSV (parseHal)
+import Gargantext.Text.Terms.Stop (detectLang)
+------------------------------------------------------------------------
--- FIXME
---type Field = Text
type ParseError = String
---
---data Corpus = Corpus { _corpusErrors :: [ParseError]
--- , _corpusMap :: Map FilePath (Map Field Text)
--- }
+--type Field = Text
+--type Document = DM.Map Field Text
+--type FilesParsed = DM.Map FilePath FileParsed
+--data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
+-- , _fileParsed_result :: [Document]
+-- } deriving (Show)
-- | According to the format of Input file,
-- different parser are available.
-data FileFormat = WOS -- Implemented (ISI Format)
+data FileFormat = WOS | CsvHalFormat -- | CsvGargV3
+ deriving (Show)
+
+-- Implemented (ISI Format)
-- | DOC -- Not Implemented / import Pandoc
-- | ODT -- Not Implemented / import Pandoc
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- TODO: to debug maybe add the filepath in error message
+-- | Parse file into documents
+-- TODO manage errors here
+parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
+parseDocs WOS path = join $ mapM (toDoc WOS) <$> snd <$> parse WOS path
+parseDocs CsvHalFormat p = parseHal p
+
+type Year = Int
+type Month = Int
+type Day = Int
+
+-- | Parse date to Ints
+-- TODO add hours, minutes and seconds
+parseDate' :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
+parseDate' _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
+parseDate' l (Just txt) = do
+ utcTime <- parseDate l txt
+ let (UTCTime day _) = utcTime
+ let (y,m,d) = DT.toGregorian day
+ pure (Just utcTime, (Just (fromIntegral y), Just m,Just d))
+
+
+toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
+toDoc WOS d = do
+ let abstract = lookup "abstract" d
+ let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
+
+ let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
+
+ (utcTime, (pub_year, pub_month, pub_day)) <- parseDate' lang dateToParse
+
+ pure $ HyperdataDocument (Just $ DT.pack $ show WOS)
+ (lookup "doi" d)
+ (lookup "URL" d)
+ Nothing
+ Nothing
+ Nothing
+ (lookup "title" d)
+ Nothing
+ (lookup "authors" d)
+ (lookup "source" d)
+ (lookup "abstract" d)
+ (fmap (DT.pack . show) utcTime)
+ (pub_year)
+ (pub_month)
+ (pub_day)
+ Nothing
+ Nothing
+ Nothing
+ (Just $ (DT.pack . show) lang)
+toDoc _ _ = undefined
+
parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
parse format path = do
files <- case takeExtension path of
-- | withParser:
--- According the format of the text, choosing the right parser.
+-- According to the format of the text, choose the right parser.
-- TODO withParser :: FileFormat -> Parser [Document]
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS = wosParser
--withParser DOC = docParser
--withParser ODT = odtParser
--withParser XML = xmlParser
---withParser _ = error "[ERROR] Parser not implemented yet"
+withParser _ = panic "[ERROR] Parser not implemented yet"
-runParser :: FileFormat -> DB.ByteString
+runParser :: FileFormat -> DB.ByteString
-> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ parseOnly (withParser format) text
openZip :: FilePath -> IO [DB.ByteString]
openZip fp = do
- path <- resolveFile' fp
- entries <- withArchive path (DM.keys <$> getEntries)
- bs <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
+ entries <- withArchive fp (DM.keys <$> getEntries)
+ bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
pure bs
+clean :: Text -> Text
+clean txt = DT.map clean' txt
+ where
+ clean' '’' = '\''
+ clean' c = c