src/Gargantext/Core/Text/Corpus/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Corpus.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describe how to add a new parser to Gargantext,
  18 please follow the types.
  19 -}
  20
  21 {-# LANGUAGE PackageImports    #-}
  22
  23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
  24     where
  25
  26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
  27 import Control.Concurrent.Async as CCA (mapConcurrently)
  28 import Control.Monad (join)
  29 import Data.Attoparsec.ByteString (parseOnly, Parser)
  30 import Data.Either(Either(..))
  31 import Data.Either.Extra (partitionEithers)
  32 import Data.List (concat, lookup)
  33 import Data.Ord()
  34 import Data.String (String())
  35 import Data.String()
  36 import Data.Text (Text)
  37 import Data.Text.Encoding (decodeUtf8)
  38 import Data.Tuple.Extra (both, first, second)
  39 import System.FilePath (FilePath(), takeExtension)
  40 import qualified Data.ByteString       as DB
  41 import qualified Data.ByteString.Char8 as DBC
  42 import qualified Data.ByteString.Lazy  as DBL
  43 import qualified Data.Map              as DM
  44 import qualified Data.Text             as DT
  45 import qualified Prelude as Prelude
  46
  47 import Gargantext.Core (Lang(..))
  48 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
  49 import Gargantext.Prelude
  50 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
  51 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
  52 -- import Gargantext.Core.Text.Learn (detectLangDefault)
  53 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
  54 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS  as RIS
  55 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS  as WOS
  56 ------------------------------------------------------------------------
  57
  58 type ParseError = String
  59 --type Field      = Text
  60 --type Document   = DM.Map Field Text
  61 --type FilesParsed = DM.Map FilePath FileParsed
  62 --data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  63 --                              , _fileParsed_result :: [Document]
  64 --                              } deriving (Show)
  65
  66
  67 -- | According to the format of Input file,
  68 -- different parser are available.
  69 data FileFormat = WOS | RIS | RisPresse
  70                 | CsvGargV3 | CsvHal
  71   deriving (Show)
  72
  73 -- Implemented (ISI Format)
  74 --                | DOC        -- Not Implemented / import Pandoc
  75 --                | ODT        -- Not Implemented / import Pandoc
  76 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  77 --                | XML        -- Not Implemented / see :
  78
  79
  80 parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
  81 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
  82 parseFormat CsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
  83 parseFormat RisPresse bs = do
  84   docs <- mapM (toDoc RIS)
  85           <$> snd
  86           <$> enrichWith RisPresse
  87           $ partitionEithers
  88           $ [runParser'  RisPresse bs]
  89   pure $ Right docs
  90 parseFormat WOS bs = do
  91   docs <- mapM (toDoc WOS)
  92           <$> snd
  93           <$> enrichWith WOS
  94           $ partitionEithers
  95           $ [runParser'  WOS bs]
  96   pure $ Right docs
  97 parseFormat _ _ = undefined
  98
  99 -- | Parse file into documents
 100 -- TODO manage errors here
 101 -- TODO: to debug maybe add the filepath in error message
 102 parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
 103 parseFile CsvHal    p = parseHal p
 104 parseFile CsvGargV3 p = parseCsv p
 105 parseFile RisPresse p = do
 106   docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
 107   pure $ Right docs
 108 parseFile WOS       p = do
 109   docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS       <$> readFileWith WOS p
 110   pure $ Right docs
 111 parseFile ff        p = do
 112   docs <- join $ mapM (toDoc ff)  <$> snd <$> enrichWith ff        <$> readFileWith ff  p
 113   pure $ Right docs
 114
 115 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
 116 -- TODO use language for RIS
 117 toDoc ff d = do
 118       -- let abstract = lookup "abstract" d
 119       let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
 120
 121       let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
 122
 123       (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang  dateToParse
 124
 125       pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
 126                                , _hd_doi = lookup "doi" d
 127                                , _hd_url = lookup "URL" d
 128                                , _hd_uniqId = Nothing
 129                                , _hd_uniqIdBdd = Nothing
 130                                , _hd_page = Nothing
 131                                , _hd_title = lookup "title" d
 132                                , _hd_authors = Nothing
 133                                , _hd_institutes = lookup "authors" d
 134                                , _hd_source = lookup "source" d
 135                                , _hd_abstract = lookup "abstract" d
 136                                , _hd_publication_date = fmap (DT.pack . show) utcTime
 137                                , _hd_publication_year = pub_year
 138                                , _hd_publication_month = pub_month
 139                                , _hd_publication_day = pub_day
 140                                , _hd_publication_hour = Nothing
 141                                , _hd_publication_minute = Nothing
 142                                , _hd_publication_second = Nothing
 143                                , _hd_language_iso2 = Just $ (DT.pack . show) lang }
 144
 145 enrichWith :: FileFormat
 146            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 147 enrichWith RisPresse = enrichWith' presseEnrich
 148 enrichWith WOS       = enrichWith' (map (first WOS.keys))
 149 enrichWith _         = enrichWith' identity
 150
 151
 152 enrichWith' ::      ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
 153            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 154 enrichWith' f = second (map both' . map f . concat)
 155   where
 156     both'   = map (both decodeUtf8)
 157
 158
 159
 160 readFileWith :: FileFormat -> FilePath
 161        -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
 162 readFileWith format path = do
 163     files <- case takeExtension path of
 164               ".zip" -> openZip              path
 165               _      -> pure <$> clean <$> DB.readFile path
 166     partitionEithers <$> mapConcurrently (runParser format) files
 167
 168
 169 -- | withParser:
 170 -- According to the format of the text, choose the right parser.
 171 -- TODO  withParser :: FileFormat -> Parser [Document]
 172 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
 173 withParser WOS = WOS.parser
 174 withParser RIS = RIS.parser
 175 --withParser ODT = odtParser
 176 --withParser XML = xmlParser
 177 withParser _   = panic "[ERROR] Parser not implemented yet"
 178
 179 runParser :: FileFormat -> DB.ByteString
 180           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
 181 runParser format text = pure $ runParser' format text
 182
 183 runParser' :: FileFormat -> DB.ByteString
 184           -> (Either String [[(DB.ByteString, DB.ByteString)]])
 185 runParser' format text = parseOnly (withParser format) text
 186
 187 openZip :: FilePath -> IO [DB.ByteString]
 188 openZip fp = do
 189     entries <- withArchive fp (DM.keys <$> getEntries)
 190     bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
 191     pure bs
 192
 193 cleanText :: Text -> Text
 194 cleanText = cs . clean . cs
 195
 196 clean :: DB.ByteString -> DB.ByteString
 197 clean txt = DBC.map clean' txt
 198   where
 199     clean' '’' = '\''
 200     clean' '\r' = ' '
 201     clean' '\t' = ' '
 202     clean' ';' = '.'
 203     clean' c  = c