src/Gargantext/Core/Text/Corpus/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Corpus.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describe how to add a new parser to Gargantext,
  18 please follow the types.
  19 -}
  20
  21 {-# LANGUAGE PackageImports    #-}
  22
  23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
  24     where
  25
  26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
  27 import Control.Concurrent.Async as CCA (mapConcurrently)
  28 import Data.Attoparsec.ByteString (parseOnly, Parser)
  29 import Control.Monad (join)
  30 import Data.Either(Either(..))
  31 import Data.Either.Extra (partitionEithers)
  32 import Data.List (concat, lookup)
  33 import Data.Ord()
  34 import Data.String (String())
  35 import Data.String()
  36 import Data.Text (Text)
  37 import Data.Text.Encoding (decodeUtf8)
  38 import Data.Tuple.Extra (both, first, second)
  39 import System.FilePath (FilePath(), takeExtension)
  40 import qualified Data.ByteString       as DB
  41 import qualified Data.ByteString.Char8 as DBC
  42 import qualified Data.ByteString.Lazy  as DBL
  43 import qualified Data.Map              as DM
  44 import qualified Data.Text             as DT
  45 import qualified Prelude as Prelude
  46 import System.IO.Temp (emptySystemTempFile)
  47
  48 import Gargantext.Core (Lang(..))
  49 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
  50 import Gargantext.Prelude
  51 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
  52 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
  53 -- import Gargantext.Core.Text.Learn (detectLangDefault)
  54 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
  55 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS  as RIS
  56 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS  as WOS
  57 ------------------------------------------------------------------------
  58
  59 type ParseError = String
  60 --type Field      = Text
  61 --type Document   = DM.Map Field Text
  62 --type FilesParsed = DM.Map FilePath FileParsed
  63 --data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  64 --                              , _fileParsed_result :: [Document]
  65 --                              } deriving (Show)
  66
  67
  68 -- | According to the format of Input file,
  69 -- different parser are available.
  70 data FileFormat = WOS | RIS | RisPresse
  71                 | CsvGargV3 | CsvHal
  72                 | ZIP
  73   deriving (Show)
  74
  75 -- Implemented (ISI Format)
  76 --                | DOC        -- Not Implemented / import Pandoc
  77 --                | ODT        -- Not Implemented / import Pandoc
  78 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  79 --                | XML        -- Not Implemented / see :
  80
  81
  82 parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
  83 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
  84 parseFormat CsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
  85 parseFormat RisPresse bs = do
  86   docs <- mapM (toDoc RIS)
  87           <$> snd
  88           <$> enrichWith RisPresse
  89           $ partitionEithers
  90           $ [runParser'  RisPresse bs]
  91   pure $ Right docs
  92 parseFormat WOS bs = do
  93   docs <- mapM (toDoc WOS)
  94           <$> snd
  95           <$> enrichWith WOS
  96           $ partitionEithers
  97           $ [runParser'  WOS bs]
  98   pure $ Right docs
  99 parseFormat ZIP bs = do
 100   path <- emptySystemTempFile "parsed-zip"
 101   DB.writeFile path bs
 102   parsedZip <- withArchive path $ do
 103     DM.keys <$> getEntries
 104   pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
 105 parseFormat _ _ = undefined
 106
 107 -- | Parse file into documents
 108 -- TODO manage errors here
 109 -- TODO: to debug maybe add the filepath in error message
 110 parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
 111 parseFile CsvHal    p = parseHal p
 112 parseFile CsvGargV3 p = parseCsv p
 113 parseFile RisPresse p = do
 114   docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
 115   pure $ Right docs
 116 parseFile WOS       p = do
 117   docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS       <$> readFileWith WOS p
 118   pure $ Right docs
 119 parseFile ff        p = do
 120   docs <- join $ mapM (toDoc ff)  <$> snd <$> enrichWith ff        <$> readFileWith ff  p
 121   pure $ Right docs
 122
 123 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
 124 -- TODO use language for RIS
 125 toDoc ff d = do
 126       -- let abstract = lookup "abstract" d
 127       let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
 128
 129       let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
 130
 131       (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang  dateToParse
 132
 133       pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
 134                                , _hd_doi = lookup "doi" d
 135                                , _hd_url = lookup "URL" d
 136                                , _hd_uniqId = Nothing
 137                                , _hd_uniqIdBdd = Nothing
 138                                , _hd_page = Nothing
 139                                , _hd_title = lookup "title" d
 140                                , _hd_authors = Nothing
 141                                , _hd_institutes = lookup "authors" d
 142                                , _hd_source = lookup "source" d
 143                                , _hd_abstract = lookup "abstract" d
 144                                , _hd_publication_date = fmap (DT.pack . show) utcTime
 145                                , _hd_publication_year = pub_year
 146                                , _hd_publication_month = pub_month
 147                                , _hd_publication_day = pub_day
 148                                , _hd_publication_hour = Nothing
 149                                , _hd_publication_minute = Nothing
 150                                , _hd_publication_second = Nothing
 151                                , _hd_language_iso2 = Just $ (DT.pack . show) lang }
 152
 153 enrichWith :: FileFormat
 154            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 155 enrichWith RisPresse = enrichWith' presseEnrich
 156 enrichWith WOS       = enrichWith' (map (first WOS.keys))
 157 enrichWith _         = enrichWith' identity
 158
 159
 160 enrichWith' ::      ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
 161            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 162 enrichWith' f = second (map both' . map f . concat)
 163   where
 164     both'   = map (both decodeUtf8)
 165
 166
 167
 168 readFileWith :: FileFormat -> FilePath
 169        -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
 170 readFileWith format path = do
 171     files <- case takeExtension path of
 172               ".zip" -> openZip              path
 173               _      -> pure <$> clean <$> DB.readFile path
 174     partitionEithers <$> mapConcurrently (runParser format) files
 175
 176
 177 -- | withParser:
 178 -- According to the format of the text, choose the right parser.
 179 -- TODO  withParser :: FileFormat -> Parser [Document]
 180 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
 181 withParser WOS = WOS.parser
 182 withParser RIS = RIS.parser
 183 --withParser ODT = odtParser
 184 --withParser XML = xmlParser
 185 withParser _   = panic "[ERROR] Parser not implemented yet"
 186
 187 runParser :: FileFormat -> DB.ByteString
 188           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
 189 runParser format text = pure $ runParser' format text
 190
 191 runParser' :: FileFormat -> DB.ByteString
 192           -> (Either String [[(DB.ByteString, DB.ByteString)]])
 193 runParser' format text = parseOnly (withParser format) text
 194
 195 openZip :: FilePath -> IO [DB.ByteString]
 196 openZip fp = do
 197     entries <- withArchive fp (DM.keys <$> getEntries)
 198     bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
 199     pure bs
 200
 201 cleanText :: Text -> Text
 202 cleanText = cs . clean . cs
 203
 204 clean :: DB.ByteString -> DB.ByteString
 205 clean txt = DBC.map clean' txt
 206   where
 207     clean' '’' = '\''
 208     clean' '\r' = ' '
 209     clean' '\t' = ' '
 210     clean' ';' = '.'
 211     clean' c  = c