{-|
Module      : Gargantext.Core.Text.Corpus.Parsers
Description : All parsers of Gargantext in one file.
Copyright   : (c) CNRS, 2017
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Gargantext enables the analysis of semi-structured text, which must first
be parsed.

The parsers assume that the format of the text is known (the FileType data
type), so that the right parser can be chosen from the list of available
parsers.

This module mainly describes how to add a new parser to Gargantext;
please follow the types.
-}

{-# LANGUAGE PackageImports #-}

module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC)
  where
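
-- A minimal usage sketch (the file path is a made-up example): pick the
-- 'FileType' matching the input and hand it to 'parseFile'.
--
-- @
-- do eDocs <- parseFile WOS Plain "/tmp/corpus.isi"
--    case eDocs of
--      Left err   -> putStrLn err
--      Right docs -> print (length docs)
-- @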

import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Conduit
import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Monad.Trans.Control (MonadBaseControl)
import Control.Monad (join)
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Either(Either(..))
import Data.Either.Extra (partitionEithers)
import Data.List (concat, lookup)
import Data.Ord()
import Data.String (String())
import Data.String()
import Data.Text (Text, intercalate, pack, unpack)
import Data.Text.Encoding (decodeUtf8)
import Data.Tuple.Extra (both, first, second)
import System.FilePath (FilePath(), takeExtension)
import qualified Data.ByteString as DB
import qualified Data.ByteString.Char8 as DBC
import qualified Data.ByteString.Lazy as DBL
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Prelude
import System.IO.Temp (emptySystemTempFile)

import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
------------------------------------------------------------------------

type ParseError = String
--type Field       = Text
--type Document    = DM.Map Field Text
--type FilesParsed = DM.Map FilePath FileParsed
--data FileParsed  = FileParsed { _fileParsed_errors :: Maybe ParseError
--                              , _fileParsed_result :: [Document]
--                              } deriving (Show)


-- | According to the format of the input file,
-- different parsers are available.
data FileType = WOS | RIS | RisPresse | CsvGargV3 | CsvHal
  deriving (Show)

  -- WOS: Implemented (ISI Format)
  -- | DOC  -- Not Implemented / import Pandoc
  -- | ODT  -- Not Implemented / import Pandoc
  -- | PDF  -- Not Implemented / pdftotext and import Pandoc ?
  -- | XML  -- Not Implemented / see :

parseFormatC :: MonadBaseControl IO m => FileType -> FileFormat -> DB.ByteString -> m (Either Prelude.String (ConduitT () HyperdataDocument IO ()))
parseFormatC CsvGargV3 Plain bs = pure $ transPipe (pure . runIdentity) <$> (parseCsvC $ DBL.fromStrict bs)
parseFormatC CsvHal    Plain bs = pure $ transPipe (pure . runIdentity) <$> (parseCsvC $ DBL.fromStrict bs)
parseFormatC RisPresse Plain bs = do
  --docs <- enrichWith RisPresse
  let eDocs = runParser' RisPresse bs
  pure $ (\docs -> yieldMany docs
                .| mapC presseEnrich
                .| mapC (map $ both decodeUtf8)
                .| mapMC (toDoc RIS)) <$> eDocs
parseFormatC WOS Plain bs = do
  let eDocs = runParser' WOS bs
  pure $ (\docs -> yieldMany docs
                .| mapC (map $ first WOS.keys)
                .| mapC (map $ both decodeUtf8)
                .| mapMC (toDoc WOS)) <$> eDocs
parseFormatC ft ZIP bs = do
  path <- liftBase $ emptySystemTempFile "parsed-zip"
  liftBase $ DB.writeFile path bs
  fileContents <- liftBase $ withArchive path $ do
    files <- DM.keys <$> getEntries
    mapM getEntry files
  --printDebug "[parseFormatC] fileContents" fileContents
  eContents <- mapM (parseFormatC ft Plain) fileContents
  --printDebug "[parseFormatC] contents" contents
  --pure $ Left $ "Not implemented for ZIP"
  let (errs, contents) = partitionEithers eContents
  case errs of
    [] ->
      case contents of
        [] -> pure $ Left "No files in zip"
        _  -> pure $ Right $ ( sequenceConduits contents >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
    _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs

parseFormatC _ _ _ = undefined
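
-- A minimal usage sketch for the streaming variant (the file path is a
-- made-up example): run the returned conduit and collect the documents.
--
-- @
-- do bs <- DB.readFile "/tmp/corpus.ris"
--    eC <- parseFormatC RisPresse Plain bs
--    case eC of
--      Left err -> putStrLn err
--      Right c  -> runConduit (c .| sinkList) >>= print . length
-- @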

-- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
-- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
-- parseFormat CsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
-- parseFormat RisPresse bs = do
--   docs <- mapM (toDoc RIS)
--           <$> snd
--           <$> enrichWith RisPresse
--           $ partitionEithers
--           $ [runParser' RisPresse bs]
--   pure $ Right docs
-- parseFormat WOS bs = do
--   docs <- mapM (toDoc WOS)
--           <$> snd
--           <$> enrichWith WOS
--           $ partitionEithers
--           $ [runParser' WOS bs]
--   pure $ Right docs
-- parseFormat ZIP bs = do
--   path <- emptySystemTempFile "parsed-zip"
--   DB.writeFile path bs
--   parsedZip <- withArchive path $ do
--     DM.keys <$> getEntries
--   pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
-- parseFormat _ _ = undefined

-- | Parse a file into documents
-- TODO manage errors here
-- TODO: for debugging, maybe add the file path to the error message
parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseFile CsvHal    Plain p = parseHal p
parseFile CsvGargV3 Plain p = parseCsv p
parseFile RisPresse Plain p = do
  docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
  pure $ Right docs
parseFile WOS Plain p = do
  docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
  pure $ Right docs
parseFile ff _ p = do
  docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
  pure $ Right docs

toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
-- TODO use language for RIS
toDoc ff d = do
  -- let abstract = lookup "abstract" d
  let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))

  let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d

  (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse

  pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
                           , _hd_doi = lookup "doi" d
                           , _hd_url = lookup "URL" d
                           , _hd_uniqId = Nothing
                           , _hd_uniqIdBdd = Nothing
                           , _hd_page = Nothing
                           , _hd_title = lookup "title" d
                           , _hd_authors = Nothing
                           , _hd_institutes = lookup "authors" d
                           , _hd_source = lookup "source" d
                           , _hd_abstract = lookup "abstract" d
                           , _hd_publication_date = fmap (DT.pack . show) utcTime
                           , _hd_publication_year = pub_year
                           , _hd_publication_month = pub_month
                           , _hd_publication_day = pub_day
                           , _hd_publication_hour = Nothing
                           , _hd_publication_minute = Nothing
                           , _hd_publication_second = Nothing
                           , _hd_language_iso2 = Just $ (DT.pack . show) lang }

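-- A sketch of the association list 'toDoc' consumes (keys as looked up
-- above; the values are made up):
--
-- @
-- doc <- toDoc RIS [ ("title", "A title")
--                  , ("abstract", "An abstract")
--                  , ("PY", "2017") ]
-- @
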
enrichWith :: FileType
           -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith RisPresse = enrichWith' presseEnrich
enrichWith WOS       = enrichWith' (map (first WOS.keys))
enrichWith _         = enrichWith' identity


enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
            -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' f = second (map both' . map f . concat)
  where
    both' = map (both decodeUtf8)

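-- To illustrate the shape change (illustrative values; "TI" is just an
-- example key): the per-file document lists are merged by 'concat' and the
-- bytestring pairs are decoded to 'Text'.
--
-- @
-- enrichWith' identity ([], [[[("TI", "A title")]]])
--   == ([], [[("TI", "A title")]])
-- @
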


readFileWith :: FileType -> FilePath
             -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
readFileWith format path = do
  files <- case takeExtension path of
    ".zip" -> openZip path
    _      -> pure <$> clean <$> DB.readFile path
  partitionEithers <$> mapConcurrently (runParser format) files


-- | withParser:
-- According to the format of the text, choose the right parser.
-- TODO withParser :: FileType -> Parser [Document]
withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS = WOS.parser
withParser RIS = RIS.parser
--withParser ODT = odtParser
--withParser XML = xmlParser
withParser _   = panic "[ERROR] Parser not implemented yet"

runParser :: FileType -> DB.ByteString
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ runParser' format text

runParser' :: FileType -> DB.ByteString
           -> (Either String [[(DB.ByteString, DB.ByteString)]])
runParser' format text = parseOnly (withParser format) text

openZip :: FilePath -> IO [DB.ByteString]
openZip fp = do
  entries <- withArchive fp (DM.keys <$> getEntries)
  bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
  pure bs

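-- A quick illustration of the cleaning below (a sketch, not a doctest from
-- the codebase): semicolons become dots, tabs and carriage returns become
-- spaces.
--
-- @
-- cleanText "a;b\tc" == "a.b c"
-- @
--
-- Note that 'clean' maps over individual bytes ('DBC.map'), so the
-- multi-byte UTF-8 '’' is not actually rewritten; it passes through
-- unchanged.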
cleanText :: Text -> Text
cleanText = cs . clean . cs

clean :: DB.ByteString -> DB.ByteString
clean txt = DBC.map clean' txt
  where
    clean' '’'  = '\''
    clean' '\r' = ' '
    clean' '\t' = ' '
    clean' ';'  = '.'
    clean' c    = c