src/Gargantext/Core/Text/Corpus/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Corpus.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describe how to add a new parser to Gargantext,
  18 please follow the types.
  19 -}
  20
  21 {-# LANGUAGE PackageImports    #-}
  22
  23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC)
  24     where
  25
  26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
  27 import Conduit
  28 import Control.Concurrent.Async as CCA (mapConcurrently)
  29 import Control.Monad.Trans.Control (MonadBaseControl)
  30 import Control.Monad (join)
  31 import Data.Attoparsec.ByteString (parseOnly, Parser)
  32 import Data.Either(Either(..))
  33 import Data.Either.Extra (partitionEithers)
  34 import Data.List (concat, lookup)
  35 import Data.Ord()
  36 import Data.String (String())
  37 import Data.String()
  38 import Data.Text (Text, intercalate, pack, unpack)
  39 import Data.Text.Encoding (decodeUtf8)
  40 import Data.Tuple.Extra (both, first, second)
  41 import System.FilePath (FilePath(), takeExtension)
  42 import qualified Data.ByteString       as DB
  43 import qualified Data.ByteString.Char8 as DBC
  44 import qualified Data.ByteString.Lazy  as DBL
  45 import qualified Data.Map              as DM
  46 import qualified Data.Text             as DT
  47 import qualified Prelude
  48 import System.IO.Temp (emptySystemTempFile)
  49
  50 import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
  51 import Gargantext.Core (Lang(..))
  52 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
  53 import Gargantext.Prelude
  54 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
  55 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
  56 -- import Gargantext.Core.Text.Learn (detectLangDefault)
  57 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
  58 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS  as RIS
  59 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS  as WOS
  60 ------------------------------------------------------------------------
  61
  62 type ParseError = String
  63 --type Field      = Text
  64 --type Document   = DM.Map Field Text
  65 --type FilesParsed = DM.Map FilePath FileParsed
  66 --data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  67 --                              , _fileParsed_result :: [Document]
  68 --                              } deriving (Show)
  69
  70
  71 -- | According to the format of Input file,
  72 -- different parser are available.
  73 data FileType = WOS | RIS | RisPresse | CsvGargV3 | CsvHal
  74   deriving (Show)
  75
  76 -- Implemented (ISI Format)
  77 --                | DOC        -- Not Implemented / import Pandoc
  78 --                | ODT        -- Not Implemented / import Pandoc
  79 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  80 --                | XML        -- Not Implemented / see :
  81
  82 parseFormatC :: MonadBaseControl IO m
  83              => FileType
  84              -> FileFormat
  85              -> DB.ByteString
  86              -> m (Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument IO ()))
  87 parseFormatC CsvGargV3 Plain bs = do
  88   let eParsedC = parseCsvC $ DBL.fromStrict bs
  89   case eParsedC of
  90     Left err -> pure $ Left err
  91     Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
  92 parseFormatC CsvHal    Plain bs = do
  93   let eParsedC = parseCsvC $ DBL.fromStrict bs
  94   case eParsedC of
  95     Left err -> pure $ Left err
  96     Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
  97 parseFormatC RisPresse Plain bs = do
  98   --docs <- enrichWith RisPresse
  99   let eDocs = runParser' RisPresse bs
 100   pure $ (\docs ->
 101             ( Just $ fromIntegral $ length docs
 102             , yieldMany docs
 103               .| mapC presseEnrich
 104               .| mapC (map $ both decodeUtf8)
 105               .| mapMC (toDoc RIS)) ) <$> eDocs
 106 parseFormatC WOS Plain bs = do
 107   let eDocs = runParser' WOS bs
 108   pure $ (\docs ->
 109             ( Just $ fromIntegral $ length docs
 110             , yieldMany docs
 111               .| mapC (map $ first WOS.keys)
 112               .| mapC (map $ both decodeUtf8)
 113               .| mapMC (toDoc WOS)) ) <$> eDocs
 114 parseFormatC ft ZIP bs = do
 115   path <- liftBase $ emptySystemTempFile "parsed-zip"
 116   liftBase $ DB.writeFile path bs
 117   fileContents <- liftBase $ withArchive path $ do
 118     files <- DM.keys <$> getEntries
 119     mapM getEntry files
 120   --printDebug "[parseFormatC] fileContents" fileContents
 121   eContents <- mapM (parseFormatC ft Plain) fileContents
 122   --printDebug "[parseFormatC] contents" contents
 123   --pure $ Left $ "Not implemented for ZIP"
 124   let (errs, contents) = partitionEithers eContents
 125   case errs of
 126     [] ->
 127       case contents of
 128         [] -> pure $ Left "No files in zip"
 129         _  -> do
 130           let lenghts = fst <$> contents
 131           let contents' = snd <$> contents
 132           let totalLength = sum $ sum <$> lenghts  -- Trick: sum (Just 1) = 1, sum Nothing = 0
 133           pure $ Right ( Just totalLength
 134                        , sequenceConduits contents' >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
 135     _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
 136
 137 parseFormatC _ _ _ = undefined
 138
 139 -- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
 140 -- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
 141 -- parseFormat CsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
 142 -- parseFormat RisPresse bs = do
 143 --   docs <- mapM (toDoc RIS)
 144 --           <$> snd
 145 --           <$> enrichWith RisPresse
 146 --           $ partitionEithers
 147 --           $ [runParser'  RisPresse bs]
 148 --   pure $ Right docs
 149 -- parseFormat WOS bs = do
 150 --   docs <- mapM (toDoc WOS)
 151 --           <$> snd
 152 --           <$> enrichWith WOS
 153 --           $ partitionEithers
 154 --           $ [runParser'  WOS bs]
 155 --   pure $ Right docs
 156 -- parseFormat ZIP bs = do
 157 --   path <- emptySystemTempFile "parsed-zip"
 158 --   DB.writeFile path bs
 159 --   parsedZip <- withArchive path $ do
 160 --     DM.keys <$> getEntries
 161 --   pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
 162 -- parseFormat _ _ = undefined
 163
 164 -- | Parse file into documents
 165 -- TODO manage errors here
 166 -- TODO: to debug maybe add the filepath in error message
 167 parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
 168 parseFile CsvHal    Plain p = parseHal p
 169 parseFile CsvGargV3 Plain p = parseCsv p
 170 parseFile RisPresse Plain p = do
 171   docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
 172   pure $ Right docs
 173 parseFile WOS       Plain p = do
 174   docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS       <$> readFileWith WOS p
 175   pure $ Right docs
 176 parseFile ff        _ p = do
 177   docs <- join $ mapM (toDoc ff)  <$> snd <$> enrichWith ff        <$> readFileWith ff  p
 178   pure $ Right docs
 179
 180 toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
 181 -- TODO use language for RIS
 182 toDoc ff d = do
 183       -- let abstract = lookup "abstract" d
 184       let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
 185
 186       let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
 187
 188       (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang  dateToParse
 189
 190       pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
 191                                , _hd_doi = lookup "doi" d
 192                                , _hd_url = lookup "URL" d
 193                                , _hd_uniqId = Nothing
 194                                , _hd_uniqIdBdd = Nothing
 195                                , _hd_page = Nothing
 196                                , _hd_title = lookup "title" d
 197                                , _hd_authors = Nothing
 198                                , _hd_institutes = lookup "authors" d
 199                                , _hd_source = lookup "source" d
 200                                , _hd_abstract = lookup "abstract" d
 201                                , _hd_publication_date = fmap (DT.pack . show) utcTime
 202                                , _hd_publication_year = pub_year
 203                                , _hd_publication_month = pub_month
 204                                , _hd_publication_day = pub_day
 205                                , _hd_publication_hour = Nothing
 206                                , _hd_publication_minute = Nothing
 207                                , _hd_publication_second = Nothing
 208                                , _hd_language_iso2 = Just $ (DT.pack . show) lang }
 209
 210 enrichWith :: FileType
 211            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 212 enrichWith RisPresse = enrichWith' presseEnrich
 213 enrichWith WOS       = enrichWith' (map (first WOS.keys))
 214 enrichWith _         = enrichWith' identity
 215
 216
 217 enrichWith' ::      ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
 218            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 219 enrichWith' f = second (map both' . map f . concat)
 220   where
 221     both'   = map (both decodeUtf8)
 222
 223
 224
 225 readFileWith :: FileType -> FilePath
 226        -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
 227 readFileWith format path = do
 228     files <- case takeExtension path of
 229               ".zip" -> openZip              path
 230               _      -> pure <$> clean <$> DB.readFile path
 231     partitionEithers <$> mapConcurrently (runParser format) files
 232
 233
 234 -- | withParser:
 235 -- According to the format of the text, choose the right parser.
 236 -- TODO  withParser :: FileType -> Parser [Document]
 237 withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
 238 withParser WOS = WOS.parser
 239 withParser RIS = RIS.parser
 240 --withParser ODT = odtParser
 241 --withParser XML = xmlParser
 242 withParser _   = panic "[ERROR] Parser not implemented yet"
 243
 244 runParser :: FileType -> DB.ByteString
 245           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
 246 runParser format text = pure $ runParser' format text
 247
 248 runParser' :: FileType -> DB.ByteString
 249           -> (Either String [[(DB.ByteString, DB.ByteString)]])
 250 runParser' format text = parseOnly (withParser format) text
 251
 252 openZip :: FilePath -> IO [DB.ByteString]
 253 openZip fp = do
 254     entries <- withArchive fp (DM.keys <$> getEntries)
 255     bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
 256     pure bs
 257
 258 cleanText :: Text -> Text
 259 cleanText = cs . clean . cs
 260
 261 clean :: DB.ByteString -> DB.ByteString
 262 clean txt = DBC.map clean' txt
 263   where
 264     clean' '’' = '\''
 265     clean' '\r' = ' '
 266     clean' '\t' = ' '
 267     clean' ';' = '.'
 268     clean' c  = c