src/Gargantext/Core/Text/Corpus/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Corpus.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describe how to add a new parser to Gargantext,
  18 please follow the types.
  19 -}
  20
  21 {-# LANGUAGE PackageImports    #-}
  22
  23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
  24     where
  25
  26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
  27 import Conduit
  28 import Control.Concurrent.Async as CCA (mapConcurrently)
  29 import Control.Monad.Identity (runIdentity)
  30 import Data.Attoparsec.ByteString (parseOnly, Parser)
  31 import Control.Monad (join)
  32 import Data.Either(Either(..))
  33 import Data.Either.Extra (partitionEithers)
  34 import Data.List (concat, lookup)
  35 import Data.Ord()
  36 import Data.String (String())
  37 import Data.String()
  38 import Data.Text (Text)
  39 import Data.Text.Encoding (decodeUtf8)
  40 import Data.Tuple.Extra (both, first, second)
  41 import System.FilePath (FilePath(), takeExtension)
  42 import qualified Data.ByteString       as DB
  43 import qualified Data.ByteString.Char8 as DBC
  44 import qualified Data.ByteString.Lazy  as DBL
  45 import qualified Data.Map              as DM
  46 import qualified Data.Text             as DT
  47 import qualified Prelude as Prelude
  48 import System.IO.Temp (emptySystemTempFile)
  49
  50 import Gargantext.Core (Lang(..))
  51 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
  52 import Gargantext.Prelude
  53 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv', parseCsvC)
  54 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
  55 -- import Gargantext.Core.Text.Learn (detectLangDefault)
  56 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
  57 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS  as RIS
  58 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS  as WOS
  59 ------------------------------------------------------------------------
  60
  61 type ParseError = String
  62 --type Field      = Text
  63 --type Document   = DM.Map Field Text
  64 --type FilesParsed = DM.Map FilePath FileParsed
  65 --data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  66 --                              , _fileParsed_result :: [Document]
  67 --                              } deriving (Show)
  68
  69
  70 -- | According to the format of Input file,
  71 -- different parser are available.
  72 data FileFormat = WOS | RIS | RisPresse
  73                 | CsvGargV3 | CsvHal
  74                 | ZIP
  75   deriving (Show)
  76
  77 -- Implemented (ISI Format)
  78 --                | DOC        -- Not Implemented / import Pandoc
  79 --                | ODT        -- Not Implemented / import Pandoc
  80 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  81 --                | XML        -- Not Implemented / see :
  82
  83 parseFormatC :: FileFormat -> DB.ByteString -> IO (Either Prelude.String (ConduitT () HyperdataDocument IO ()))
  84 parseFormatC CsvGargV3 bs = pure $ transPipe (pure . runIdentity) <$> (parseCsvC $ DBL.fromStrict bs)
  85 parseFormatC CsvHal    bs = pure $ transPipe (pure . runIdentity) <$> (parseCsvC $ DBL.fromStrict bs)
  86 parseFormatC RisPresse bs = do
  87   docs <- snd
  88           <$> enrichWith RisPresse
  89           $ partitionEithers
  90           $ [runParser'  RisPresse bs]
  91   pure $ (\docs' -> yieldMany docs' .| mapMC (toDoc RIS)) <$> docs
  92 parseFormatC WOS bs = do
  93   docs <- snd
  94           <$> enrichWith WOS
  95           $ partitionEithers
  96           $ [runParser'  WOS bs]
  97   pure $ (\docs' -> yieldMany docs' .| mapMC (toDoc WOS)) <$> docs
  98 parseFormatC ZIP bs = do
  99   path <- emptySystemTempFile "parsed-zip"
 100   DB.writeFile path bs
 101   parsedZip <- withArchive path $ do
 102     DM.keys <$> getEntries
 103   pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
 104 parseFormatC _ _ = undefined
 105
 106 parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
 107 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
 108 parseFormat CsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
 109 parseFormat RisPresse bs = do
 110   docs <- mapM (toDoc RIS)
 111           <$> snd
 112           <$> enrichWith RisPresse
 113           $ partitionEithers
 114           $ [runParser'  RisPresse bs]
 115   pure $ Right docs
 116 parseFormat WOS bs = do
 117   docs <- mapM (toDoc WOS)
 118           <$> snd
 119           <$> enrichWith WOS
 120           $ partitionEithers
 121           $ [runParser'  WOS bs]
 122   pure $ Right docs
 123 parseFormat ZIP bs = do
 124   path <- emptySystemTempFile "parsed-zip"
 125   DB.writeFile path bs
 126   parsedZip <- withArchive path $ do
 127     DM.keys <$> getEntries
 128   pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
 129 parseFormat _ _ = undefined
 130
 131 -- | Parse file into documents
 132 -- TODO manage errors here
 133 -- TODO: to debug maybe add the filepath in error message
 134 parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
 135 parseFile CsvHal    p = parseHal p
 136 parseFile CsvGargV3 p = parseCsv p
 137 parseFile RisPresse p = do
 138   docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
 139   pure $ Right docs
 140 parseFile WOS       p = do
 141   docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS       <$> readFileWith WOS p
 142   pure $ Right docs
 143 parseFile ff        p = do
 144   docs <- join $ mapM (toDoc ff)  <$> snd <$> enrichWith ff        <$> readFileWith ff  p
 145   pure $ Right docs
 146
 147 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
 148 -- TODO use language for RIS
 149 toDoc ff d = do
 150       -- let abstract = lookup "abstract" d
 151       let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
 152
 153       let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
 154
 155       (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang  dateToParse
 156
 157       pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
 158                                , _hd_doi = lookup "doi" d
 159                                , _hd_url = lookup "URL" d
 160                                , _hd_uniqId = Nothing
 161                                , _hd_uniqIdBdd = Nothing
 162                                , _hd_page = Nothing
 163                                , _hd_title = lookup "title" d
 164                                , _hd_authors = Nothing
 165                                , _hd_institutes = lookup "authors" d
 166                                , _hd_source = lookup "source" d
 167                                , _hd_abstract = lookup "abstract" d
 168                                , _hd_publication_date = fmap (DT.pack . show) utcTime
 169                                , _hd_publication_year = pub_year
 170                                , _hd_publication_month = pub_month
 171                                , _hd_publication_day = pub_day
 172                                , _hd_publication_hour = Nothing
 173                                , _hd_publication_minute = Nothing
 174                                , _hd_publication_second = Nothing
 175                                , _hd_language_iso2 = Just $ (DT.pack . show) lang }
 176
 177 enrichWith :: FileFormat
 178            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 179 enrichWith RisPresse = enrichWith' presseEnrich
 180 enrichWith WOS       = enrichWith' (map (first WOS.keys))
 181 enrichWith _         = enrichWith' identity
 182
 183
 184 enrichWith' ::      ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
 185            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 186 enrichWith' f = second (map both' . map f . concat)
 187   where
 188     both'   = map (both decodeUtf8)
 189
 190
 191
 192 readFileWith :: FileFormat -> FilePath
 193        -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
 194 readFileWith format path = do
 195     files <- case takeExtension path of
 196               ".zip" -> openZip              path
 197               _      -> pure <$> clean <$> DB.readFile path
 198     partitionEithers <$> mapConcurrently (runParser format) files
 199
 200
 201 -- | withParser:
 202 -- According to the format of the text, choose the right parser.
 203 -- TODO  withParser :: FileFormat -> Parser [Document]
 204 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
 205 withParser WOS = WOS.parser
 206 withParser RIS = RIS.parser
 207 --withParser ODT = odtParser
 208 --withParser XML = xmlParser
 209 withParser _   = panic "[ERROR] Parser not implemented yet"
 210
 211 runParser :: FileFormat -> DB.ByteString
 212           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
 213 runParser format text = pure $ runParser' format text
 214
 215 runParser' :: FileFormat -> DB.ByteString
 216           -> (Either String [[(DB.ByteString, DB.ByteString)]])
 217 runParser' format text = parseOnly (withParser format) text
 218
 219 openZip :: FilePath -> IO [DB.ByteString]
 220 openZip fp = do
 221     entries <- withArchive fp (DM.keys <$> getEntries)
 222     bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
 223     pure bs
 224
 225 cleanText :: Text -> Text
 226 cleanText = cs . clean . cs
 227
 228 clean :: DB.ByteString -> DB.ByteString
 229 clean txt = DBC.map clean' txt
 230   where
 231     clean' '’' = '\''
 232     clean' '\r' = ' '
 233     clean' '\t' = ' '
 234     clean' ';' = '.'
 235     clean' c  = c