src/Gargantext/Core/Text/Corpus/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Corpus.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describe how to add a new parser to Gargantext,
  18 please follow the types.
  19 -}
  20
  21 {-# LANGUAGE PackageImports    #-}
  22
  23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
  24     where
  25
  26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
  27 import Control.Concurrent.Async as CCA (mapConcurrently)
  28 import Control.Monad (join, sequence)
  29 import Control.Monad.IO.Class (liftIO)
  30 import Data.Attoparsec.ByteString (parseOnly, Parser)
  31 import Data.Either(Either(..))
  32 import Data.Either.Extra (partitionEithers)
  33 import Data.List (concat, lookup)
  34 import Data.Ord()
  35 import Data.String (String())
  36 import Data.String()
  37 import Data.Text (Text)
  38 import Data.Text.Encoding (decodeUtf8)
  39 import Data.Tuple.Extra (both, first, second)
  40 import System.FilePath (FilePath(), takeExtension)
  41 import qualified Data.ByteString       as DB
  42 import qualified Data.ByteString.Char8 as DBC
  43 import qualified Data.ByteString.Lazy  as DBL
  44 import qualified Data.Map              as DM
  45 import qualified Data.Text             as DT
  46 import qualified Prelude as Prelude
  47 import System.IO.Temp (emptySystemTempFile)
  48
  49 import Gargantext.Core (Lang(..))
  50 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
  51 import Gargantext.Prelude
  52 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
  53 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
  54 -- import Gargantext.Core.Text.Learn (detectLangDefault)
  55 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
  56 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS  as RIS
  57 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS  as WOS
  58 ------------------------------------------------------------------------
  59
  60 type ParseError = String
  61 --type Field      = Text
  62 --type Document   = DM.Map Field Text
  63 --type FilesParsed = DM.Map FilePath FileParsed
  64 --data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  65 --                              , _fileParsed_result :: [Document]
  66 --                              } deriving (Show)
  67
  68
  69 -- | According to the format of Input file,
  70 -- different parser are available.
  71 data FileFormat = WOS | RIS | RisPresse
  72                 | CsvGargV3 | CsvHal
  73                 | ZIP
  74   deriving (Show)
  75
  76 -- Implemented (ISI Format)
  77 --                | DOC        -- Not Implemented / import Pandoc
  78 --                | ODT        -- Not Implemented / import Pandoc
  79 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  80 --                | XML        -- Not Implemented / see :
  81
  82
  83 parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
  84 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
  85 parseFormat CsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
  86 parseFormat RisPresse bs = do
  87   docs <- mapM (toDoc RIS)
  88           <$> snd
  89           <$> enrichWith RisPresse
  90           $ partitionEithers
  91           $ [runParser'  RisPresse bs]
  92   pure $ Right docs
  93 parseFormat WOS bs = do
  94   docs <- mapM (toDoc WOS)
  95           <$> snd
  96           <$> enrichWith WOS
  97           $ partitionEithers
  98           $ [runParser'  WOS bs]
  99   pure $ Right docs
 100 parseFormat ZIP bs = do
 101   path <- emptySystemTempFile "parsed.zip"
 102   DB.writeFile path bs
 103   withArchive path $ do
 104     files <- DM.keys <$> getEntries
 105     filesContents <- mapM getEntry files
 106     ddocs <- liftIO $ mapM (parseFormat CsvGargV3) filesContents
 107     pure $ concat <$> sequence ddocs
 108 parseFormat _ _ = undefined
 109
 110 -- | Parse file into documents
 111 -- TODO manage errors here
 112 -- TODO: to debug maybe add the filepath in error message
 113 parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
 114 parseFile CsvHal    p = parseHal p
 115 parseFile CsvGargV3 p = parseCsv p
 116 parseFile RisPresse p = do
 117   docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
 118   pure $ Right docs
 119 parseFile WOS       p = do
 120   docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS       <$> readFileWith WOS p
 121   pure $ Right docs
 122 parseFile ff        p = do
 123   docs <- join $ mapM (toDoc ff)  <$> snd <$> enrichWith ff        <$> readFileWith ff  p
 124   pure $ Right docs
 125
 126 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
 127 -- TODO use language for RIS
 128 toDoc ff d = do
 129       -- let abstract = lookup "abstract" d
 130       let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
 131
 132       let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
 133
 134       (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang  dateToParse
 135
 136       pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
 137                                , _hd_doi = lookup "doi" d
 138                                , _hd_url = lookup "URL" d
 139                                , _hd_uniqId = Nothing
 140                                , _hd_uniqIdBdd = Nothing
 141                                , _hd_page = Nothing
 142                                , _hd_title = lookup "title" d
 143                                , _hd_authors = Nothing
 144                                , _hd_institutes = lookup "authors" d
 145                                , _hd_source = lookup "source" d
 146                                , _hd_abstract = lookup "abstract" d
 147                                , _hd_publication_date = fmap (DT.pack . show) utcTime
 148                                , _hd_publication_year = pub_year
 149                                , _hd_publication_month = pub_month
 150                                , _hd_publication_day = pub_day
 151                                , _hd_publication_hour = Nothing
 152                                , _hd_publication_minute = Nothing
 153                                , _hd_publication_second = Nothing
 154                                , _hd_language_iso2 = Just $ (DT.pack . show) lang }
 155
 156 enrichWith :: FileFormat
 157            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 158 enrichWith RisPresse = enrichWith' presseEnrich
 159 enrichWith WOS       = enrichWith' (map (first WOS.keys))
 160 enrichWith _         = enrichWith' identity
 161
 162
 163 enrichWith' ::      ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
 164            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 165 enrichWith' f = second (map both' . map f . concat)
 166   where
 167     both'   = map (both decodeUtf8)
 168
 169
 170
 171 readFileWith :: FileFormat -> FilePath
 172        -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
 173 readFileWith format path = do
 174     files <- case takeExtension path of
 175               ".zip" -> openZip              path
 176               _      -> pure <$> clean <$> DB.readFile path
 177     partitionEithers <$> mapConcurrently (runParser format) files
 178
 179
 180 -- | withParser:
 181 -- According to the format of the text, choose the right parser.
 182 -- TODO  withParser :: FileFormat -> Parser [Document]
 183 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
 184 withParser WOS = WOS.parser
 185 withParser RIS = RIS.parser
 186 --withParser ODT = odtParser
 187 --withParser XML = xmlParser
 188 withParser _   = panic "[ERROR] Parser not implemented yet"
 189
 190 runParser :: FileFormat -> DB.ByteString
 191           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
 192 runParser format text = pure $ runParser' format text
 193
 194 runParser' :: FileFormat -> DB.ByteString
 195           -> (Either String [[(DB.ByteString, DB.ByteString)]])
 196 runParser' format text = parseOnly (withParser format) text
 197
 198 openZip :: FilePath -> IO [DB.ByteString]
 199 openZip fp = do
 200     entries <- withArchive fp (DM.keys <$> getEntries)
 201     bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
 202     pure bs
 203
 204 cleanText :: Text -> Text
 205 cleanText = cs . clean . cs
 206
 207 clean :: DB.ByteString -> DB.ByteString
 208 clean txt = DBC.map clean' txt
 209   where
 210     clean' '’' = '\''
 211     clean' '\r' = ' '
 212     clean' '\t' = ' '
 213     clean' ';' = '.'
 214     clean' c  = c