src/Gargantext/Core/Text/Corpus/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Corpus.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describe how to add a new parser to Gargantext,
  18 please follow the types.
  19 -}
  20
  21 {-# LANGUAGE PackageImports    #-}
  22
  23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
  24     where
  25
  26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
  27 import Control.Concurrent.Async as CCA (mapConcurrently)
  28 import Control.Monad (join, sequence)
  29 import Control.Monad.IO.Class (liftIO)
  30 import Data.Attoparsec.ByteString (parseOnly, Parser)
  31 import Data.Either(Either(..))
  32 import Data.Either.Extra (partitionEithers)
  33 import Data.List (concat, lookup)
  34 import Data.Ord()
  35 import Data.String (String())
  36 import Data.String()
  37 import Data.Text (Text)
  38 import Data.Text.Encoding (decodeUtf8)
  39 import Data.Tuple.Extra (both, first, second)
  40 import System.FilePath (FilePath(), takeExtension)
  41 import qualified Data.ByteString       as DB
  42 import qualified Data.ByteString.Char8 as DBC
  43 import qualified Data.ByteString.Lazy  as DBL
  44 import qualified Data.Map              as DM
  45 import qualified Data.Text             as DT
  46 import qualified Prelude as Prelude
  47 import System.IO.Temp (emptySystemTempFile)
  48
  49 import Gargantext.Core (Lang(..))
  50 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
  51 import Gargantext.Prelude
  52 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
  53 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
  54 -- import Gargantext.Core.Text.Learn (detectLangDefault)
  55 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
  56 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS  as RIS
  57 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS  as WOS
  58 ------------------------------------------------------------------------
  59
  60 type ParseError = String
  61 --type Field      = Text
  62 --type Document   = DM.Map Field Text
  63 --type FilesParsed = DM.Map FilePath FileParsed
  64 --data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  65 --                              , _fileParsed_result :: [Document]
  66 --                              } deriving (Show)
  67
  68
  69 -- | According to the format of Input file,
  70 -- different parser are available.
  71 data FileFormat = WOS | RIS | RisPresse
  72                 | CsvGargV3 | CsvHal
  73                 | ZIP
  74   deriving (Show)
  75
  76 -- Implemented (ISI Format)
  77 --                | DOC        -- Not Implemented / import Pandoc
  78 --                | ODT        -- Not Implemented / import Pandoc
  79 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  80 --                | XML        -- Not Implemented / see :
  81
  82
  83 parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
  84 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
  85 parseFormat CsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
  86 parseFormat RisPresse bs = do
  87   docs <- mapM (toDoc RIS)
  88           <$> snd
  89           <$> enrichWith RisPresse
  90           $ partitionEithers
  91           $ [runParser'  RisPresse bs]
  92   pure $ Right docs
  93 parseFormat WOS bs = do
  94   docs <- mapM (toDoc WOS)
  95           <$> snd
  96           <$> enrichWith WOS
  97           $ partitionEithers
  98           $ [runParser'  WOS bs]
  99   pure $ Right docs
 100 parseFormat ZIP bs = do
 101   path <- emptySystemTempFile "parsed-zip"
 102   DB.writeFile path bs
 103   parsedZip <- withArchive path $ do
 104     DM.keys <$> getEntries
 105   pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
 106 parseFormat _ _ = undefined
 107
 108 -- | Parse file into documents
 109 -- TODO manage errors here
 110 -- TODO: to debug maybe add the filepath in error message
 111 parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
 112 parseFile CsvHal    p = parseHal p
 113 parseFile CsvGargV3 p = parseCsv p
 114 parseFile RisPresse p = do
 115   docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
 116   pure $ Right docs
 117 parseFile WOS       p = do
 118   docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS       <$> readFileWith WOS p
 119   pure $ Right docs
 120 parseFile ff        p = do
 121   docs <- join $ mapM (toDoc ff)  <$> snd <$> enrichWith ff        <$> readFileWith ff  p
 122   pure $ Right docs
 123
 124 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
 125 -- TODO use language for RIS
 126 toDoc ff d = do
 127       -- let abstract = lookup "abstract" d
 128       let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
 129
 130       let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
 131
 132       (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang  dateToParse
 133
 134       pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
 135                                , _hd_doi = lookup "doi" d
 136                                , _hd_url = lookup "URL" d
 137                                , _hd_uniqId = Nothing
 138                                , _hd_uniqIdBdd = Nothing
 139                                , _hd_page = Nothing
 140                                , _hd_title = lookup "title" d
 141                                , _hd_authors = Nothing
 142                                , _hd_institutes = lookup "authors" d
 143                                , _hd_source = lookup "source" d
 144                                , _hd_abstract = lookup "abstract" d
 145                                , _hd_publication_date = fmap (DT.pack . show) utcTime
 146                                , _hd_publication_year = pub_year
 147                                , _hd_publication_month = pub_month
 148                                , _hd_publication_day = pub_day
 149                                , _hd_publication_hour = Nothing
 150                                , _hd_publication_minute = Nothing
 151                                , _hd_publication_second = Nothing
 152                                , _hd_language_iso2 = Just $ (DT.pack . show) lang }
 153
 154 enrichWith :: FileFormat
 155            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 156 enrichWith RisPresse = enrichWith' presseEnrich
 157 enrichWith WOS       = enrichWith' (map (first WOS.keys))
 158 enrichWith _         = enrichWith' identity
 159
 160
 161 enrichWith' ::      ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
 162            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 163 enrichWith' f = second (map both' . map f . concat)
 164   where
 165     both'   = map (both decodeUtf8)
 166
 167
 168
 169 readFileWith :: FileFormat -> FilePath
 170        -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
 171 readFileWith format path = do
 172     files <- case takeExtension path of
 173               ".zip" -> openZip              path
 174               _      -> pure <$> clean <$> DB.readFile path
 175     partitionEithers <$> mapConcurrently (runParser format) files
 176
 177
 178 -- | withParser:
 179 -- According to the format of the text, choose the right parser.
 180 -- TODO  withParser :: FileFormat -> Parser [Document]
 181 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
 182 withParser WOS = WOS.parser
 183 withParser RIS = RIS.parser
 184 --withParser ODT = odtParser
 185 --withParser XML = xmlParser
 186 withParser _   = panic "[ERROR] Parser not implemented yet"
 187
 188 runParser :: FileFormat -> DB.ByteString
 189           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
 190 runParser format text = pure $ runParser' format text
 191
 192 runParser' :: FileFormat -> DB.ByteString
 193           -> (Either String [[(DB.ByteString, DB.ByteString)]])
 194 runParser' format text = parseOnly (withParser format) text
 195
 196 openZip :: FilePath -> IO [DB.ByteString]
 197 openZip fp = do
 198     entries <- withArchive fp (DM.keys <$> getEntries)
 199     bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
 200     pure bs
 201
 202 cleanText :: Text -> Text
 203 cleanText = cs . clean . cs
 204
 205 clean :: DB.ByteString -> DB.ByteString
 206 clean txt = DBC.map clean' txt
 207   where
 208     clean' '’' = '\''
 209     clean' '\r' = ' '
 210     clean' '\t' = ' '
 211     clean' ';' = '.'
 212     clean' c  = c