src/Gargantext/Core/Text/Corpus/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Corpus.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describe how to add a new parser to Gargantext,
  18 please follow the types.
  19 -}
  20
  21 {-# LANGUAGE PackageImports    #-}
  22
  23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC)
  24     where
  25
  26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
  27 import Conduit
  28 import Control.Concurrent.Async as CCA (mapConcurrently)
  29 import Control.Monad.Trans.Control (MonadBaseControl)
  30 import Control.Monad (join)
  31 import Data.Attoparsec.ByteString (parseOnly, Parser)
  32 import Data.Either(Either(..))
  33 import Data.Either.Extra (partitionEithers)
  34 import Data.List (concat, lookup)
  35 import Data.Ord()
  36 import Data.String (String())
  37 import Data.String()
  38 import Data.Text (Text, intercalate, pack, unpack)
  39 import Data.Text.Encoding (decodeUtf8)
  40 import Data.Tuple.Extra (both, first, second)
  41 import System.FilePath (FilePath(), takeExtension)
  42 import qualified Data.ByteString       as DB
  43 import qualified Data.ByteString.Char8 as DBC
  44 import qualified Data.ByteString.Lazy  as DBL
  45 import qualified Data.Map              as DM
  46 import qualified Data.Text             as DT
  47 import qualified Prelude
  48 import System.IO.Temp (emptySystemTempFile)
  49
  50 import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
  51 import Gargantext.Core (Lang(..))
  52 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
  53 import Gargantext.Prelude
  54 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
  55 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
  56 -- import Gargantext.Core.Text.Learn (detectLangDefault)
  57 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
  58 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS  as RIS
  59 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS  as WOS
  60 ------------------------------------------------------------------------
  61
  62 type ParseError = String
  63 --type Field      = Text
  64 --type Document   = DM.Map Field Text
  65 --type FilesParsed = DM.Map FilePath FileParsed
  66 --data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  67 --                              , _fileParsed_result :: [Document]
  68 --                              } deriving (Show)
  69
  70
  71 -- | According to the format of Input file,
  72 -- different parser are available.
  73 data FileType = WOS | RIS | RisPresse | CsvGargV3 | CsvHal
  74   deriving (Show)
  75
  76 -- Implemented (ISI Format)
  77 --                | DOC        -- Not Implemented / import Pandoc
  78 --                | ODT        -- Not Implemented / import Pandoc
  79 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  80 --                | XML        -- Not Implemented / see :
  81
  82 parseFormatC :: MonadBaseControl IO m
  83              => FileType
  84              -> FileFormat
  85              -> DB.ByteString
  86              -> m (Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument IO ()))
  87 parseFormatC CsvGargV3 Plain bs = do
  88   let eParsedC = parseCsvC $ DBL.fromStrict bs
  89   case eParsedC of
  90     Left err -> pure $ Left err
  91     Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
  92 parseFormatC CsvHal    Plain bs = do
  93   let eParsedC = parseCsvC $ DBL.fromStrict bs
  94   case eParsedC of
  95     Left err -> pure $ Left err
  96     Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
  97 parseFormatC RisPresse Plain bs = do
  98   --docs <- enrichWith RisPresse
  99   let eDocs = runParser' RisPresse bs
 100   pure $ (\docs ->
 101             ( Just $ fromIntegral $ length docs
 102             , yieldMany docs
 103               .| mapC presseEnrich
 104               .| mapC (map $ both decodeUtf8)
 105               .| mapMC (toDoc RIS)) ) <$> eDocs
 106 parseFormatC WOS Plain bs = do
 107   let eDocs = runParser' WOS bs
 108   pure $ (\docs ->
 109             ( Just $ fromIntegral $ length docs
 110             , yieldMany docs
 111               .| mapC (map $ first WOS.keys)
 112               .| mapC (map $ both decodeUtf8)
 113               .| mapMC (toDoc WOS)) ) <$> eDocs
 114 parseFormatC ft ZIP bs = do
 115   path <- liftBase $ emptySystemTempFile "parsed-zip"
 116   liftBase $ DB.writeFile path bs
 117   fileContents <- liftBase $ withArchive path $ do
 118     files <- DM.keys <$> getEntries
 119     mapM getEntry files
 120   --printDebug "[parseFormatC] fileContents" fileContents
 121   eContents <- mapM (parseFormatC ft Plain) fileContents
 122   --printDebug "[parseFormatC] contents" contents
 123   --pure $ Left $ "Not implemented for ZIP"
 124   let (errs, contents) = partitionEithers eContents
 125   case errs of
 126     [] ->
 127       case contents of
 128         [] -> pure $ Left "No files in zip"
 129         _  -> do
 130           let lenghts = fst <$> contents
 131           let contents' = snd <$> contents
 132           let totalLength = sum $ sum <$> lenghts  -- Trick: sum (Just 1) = 1, sum Nothing = 0
 133           pure $ Right ( Just totalLength
 134                        , sequenceConduits contents' >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
 135     _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
 136
 137 parseFormatC _ _ _ = undefined
 138
 139 -- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
 140 -- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
 141 -- parseFormat CsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
 142 -- parseFormat RisPresse bs = do
 143 --   docs <- mapM (toDoc RIS)
 144 --           <$> snd
 145 --           <$> enrichWith RisPresse
 146 --           $ partitionEithers
 147 --           $ [runParser'  RisPresse bs]
 148 --   pure $ Right docs
 149 -- parseFormat WOS bs = do
 150 --   docs <- mapM (toDoc WOS)
 151 --           <$> snd
 152 --           <$> enrichWith WOS
 153 --           $ partitionEithers
 154 --           $ [runParser'  WOS bs]
 155 --   pure $ Right docs
 156 -- parseFormat ZIP bs = do
 157 --   path <- emptySystemTempFile "parsed-zip"
 158 --   DB.writeFile path bs
 159 --   parsedZip <- withArchive path $ do
 160 --     DM.keys <$> getEntries
 161 --   pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
 162 -- parseFormat _ _ = undefined
 163
 164 -- | Parse file into documents
 165 -- TODO manage errors here
 166 -- TODO: to debug maybe add the filepath in error message
 167
 168 parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
 169 parseFile CsvHal    Plain p = parseHal p
 170 parseFile CsvGargV3 Plain p = parseCsv p
 171 parseFile RisPresse Plain p = do
 172   docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
 173   pure $ Right docs
 174 parseFile WOS       Plain p = do
 175   docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS       <$> readFileWith WOS p
 176   pure $ Right docs
 177 parseFile ff        _ p = do
 178   docs <- join $ mapM (toDoc ff)  <$> snd <$> enrichWith ff        <$> readFileWith ff  p
 179   pure $ Right docs
 180
 181 toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
 182 -- TODO use language for RIS
 183 toDoc ff d = do
 184       -- let abstract = lookup "abstract" d
 185       let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
 186
 187       let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
 188
 189       (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
 190
 191       pure HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
 192                              , _hd_doi = lookup "doi" d
 193                              , _hd_url = lookup "URL" d
 194                              , _hd_uniqId = Nothing
 195                              , _hd_uniqIdBdd = Nothing
 196                              , _hd_page = Nothing
 197                              , _hd_title = lookup "title" d
 198                              , _hd_authors = Nothing
 199                              , _hd_institutes = lookup "authors" d
 200                              , _hd_source = lookup "source" d
 201                              , _hd_abstract = lookup "abstract" d
 202                              , _hd_publication_date = fmap (DT.pack . show) utcTime
 203                              , _hd_publication_year = pub_year
 204                              , _hd_publication_month = pub_month
 205                              , _hd_publication_day = pub_day
 206                              , _hd_publication_hour = Nothing
 207                              , _hd_publication_minute = Nothing
 208                              , _hd_publication_second = Nothing
 209                              , _hd_language_iso2 = Just $ (DT.pack . show) lang }
 210
 211 enrichWith :: FileType
 212            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 213 enrichWith RisPresse = enrichWith' presseEnrich
 214 enrichWith WOS       = enrichWith' (map (first WOS.keys))
 215 enrichWith _         = enrichWith' identity
 216
 217
 218 enrichWith' ::      ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
 219            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 220 enrichWith' f = second (map both' . map f . concat)
 221   where
 222     both'   = map (both decodeUtf8)
 223
 224
 225
 226 readFileWith :: FileType -> FilePath
 227        -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
 228 readFileWith format path = do
 229     files <- case takeExtension path of
 230               ".zip" -> openZip              path
 231               _      -> pure <$> clean <$> DB.readFile path
 232     partitionEithers <$> mapConcurrently (runParser format) files
 233
 234
 235 -- | withParser:
 236 -- According to the format of the text, choose the right parser.
 237 -- TODO  withParser :: FileType -> Parser [Document]
 238 withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
 239 withParser WOS = WOS.parser
 240 withParser RIS = RIS.parser
 241 --withParser ODT = odtParser
 242 --withParser XML = xmlParser
 243 withParser _   = panic "[ERROR] Parser not implemented yet"
 244
 245 runParser :: FileType -> DB.ByteString
 246           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
 247 runParser format text = pure $ runParser' format text
 248
 249 runParser' :: FileType -> DB.ByteString
 250           -> (Either String [[(DB.ByteString, DB.ByteString)]])
 251 runParser' format text = parseOnly (withParser format) text
 252
 253 openZip :: FilePath -> IO [DB.ByteString]
 254 openZip fp = do
 255     entries <- withArchive fp (DM.keys <$> getEntries)
 256     bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
 257     pure bs
 258
 259 cleanText :: Text -> Text
 260 cleanText = cs . clean . cs
 261
 262 clean :: DB.ByteString -> DB.ByteString
 263 clean txt = DBC.map clean' txt
 264   where
 265     clean' '’' = '\''
 266     clean' '\r' = ' '
 267     clean' '\t' = ' '
 268     clean' ';' = '.'
 269     clean' c  = c