{-|
Module      : Gargantext.Core.Text.Corpus.Parsers
Description : All parsers of Gargantext in one file.
Copyright   : (c) CNRS, 2017
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Gargantext enables the analysis of semi-structured text, which must be
parsed before it can be analyzed.

The parsers assume the format of the text is known (the FileFormat data
type); the right parser is then chosen accordingly from the list of
available parsers.

This module mainly describes how to add a new parser to Gargantext;
please follow the types.
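
A minimal usage sketch (the corpus path below is hypothetical):

> eDocs <- parseFile WOS "/path/to/corpus.txt"
> case eDocs of
>   Left  err  -> putStrLn err
>   Right docs -> print (length docs)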
-}

{-# LANGUAGE PackageImports #-}

module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
    where

26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
27 import Control.Concurrent.Async as CCA (mapConcurrently)
28 import Control.Monad (join)
29 import Data.Attoparsec.ByteString (parseOnly, Parser)
30 import Data.Either(Either(..))
31 import Data.Either.Extra (partitionEithers)
32 import Data.List (concat, lookup)
33 import Data.Ord()
34 import Data.String (String())
35 import Data.String()
36 import Data.Text (Text)
37 import Data.Text.Encoding (decodeUtf8)
38 import Data.Tuple.Extra (both, first, second)
39 import System.FilePath (FilePath(), takeExtension)
40 import qualified Data.ByteString as DB
41 import qualified Data.ByteString.Char8 as DBC
42 import qualified Data.ByteString.Lazy as DBL
43 import qualified Data.Map as DM
44 import qualified Data.Text as DT
45 import qualified Prelude as Prelude
46
47 import Gargantext.Core (Lang(..))
48 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
49 import Gargantext.Prelude
50 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
51 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
52 -- import Gargantext.Core.Text.Learn (detectLangDefault)
53 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
54 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
55 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
56 ------------------------------------------------------------------------

type ParseError = String
--type Field       = Text
--type Document    = DM.Map Field Text
--type FilesParsed = DM.Map FilePath FileParsed
--data FileParsed  = FileParsed { _fileParsed_errors :: Maybe ParseError
--                              , _fileParsed_result :: [Document]
--                              } deriving (Show)


-- | According to the format of the input file,
-- different parsers are available.
data FileFormat = WOS | RIS | RisPresse
                | CsvGargV3 | CsvHal
                | ZIP
  deriving (Show)

-- Implemented (ISI Format)
--                | DOC        -- Not Implemented / import Pandoc
--                | ODT        -- Not Implemented / import Pandoc
--                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
--                | XML        -- Not Implemented / see :


parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
parseFormat CsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
parseFormat RisPresse bs = do
  docs <- mapM (toDoc RIS)
          <$> snd
          <$> enrichWith RisPresse
          $   partitionEithers
          $   [runParser' RisPresse bs]
  pure $ Right docs
parseFormat WOS bs = do
  docs <- mapM (toDoc WOS)
          <$> snd
          <$> enrichWith WOS
          $   partitionEithers
          $   [runParser' WOS bs]
  pure $ Right docs
parseFormat ZIP _bs = do
  printDebug "[parseFormat]" ZIP
  pure $ Left "Not implemented for ZIP"
parseFormat _ _ = undefined

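-- A usage sketch for 'parseFormat' (the file name is hypothetical): read the
-- raw bytes yourself, then hand them to the parser matching the expected format.
--
-- > bs    <- DB.readFile "/path/to/europresse-export.ris"
-- > eDocs <- parseFormat RisPresse bs
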
-- | Parse a file into documents.
-- TODO manage errors here
-- TODO: to debug maybe add the filepath in error message
parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseFile CsvHal    p = parseHal p
parseFile CsvGargV3 p = parseCsv p
parseFile RisPresse p = do
  docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
  pure $ Right docs
parseFile WOS p = do
  docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
  pure $ Right docs
parseFile ff p = do
  docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
  pure $ Right docs

toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
-- TODO use language for RIS
toDoc ff d = do
  -- let abstract = lookup "abstract" d
  let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))

  let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d

  (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse

  pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
                           , _hd_doi = lookup "doi" d
                           , _hd_url = lookup "URL" d
                           , _hd_uniqId = Nothing
                           , _hd_uniqIdBdd = Nothing
                           , _hd_page = Nothing
                           , _hd_title = lookup "title" d
                           , _hd_authors = Nothing
                           , _hd_institutes = lookup "authors" d
                           , _hd_source = lookup "source" d
                           , _hd_abstract = lookup "abstract" d
                           , _hd_publication_date = fmap (DT.pack . show) utcTime
                           , _hd_publication_year = pub_year
                           , _hd_publication_month = pub_month
                           , _hd_publication_day = pub_day
                           , _hd_publication_hour = Nothing
                           , _hd_publication_minute = Nothing
                           , _hd_publication_second = Nothing
                           , _hd_language_iso2 = Just $ (DT.pack . show) lang }

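-- A sketch of the association list 'toDoc' expects (keys taken from the
-- lookups above, values invented); a missing key simply yields 'Nothing':
--
-- > [ ("title",    "A title")
-- > , ("authors",  "Doe, J.")
-- > , ("source",   "A journal")
-- > , ("abstract", "An abstract")
-- > , ("PY",       "2017")
-- > ]
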
enrichWith :: FileFormat
           ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith RisPresse = enrichWith' presseEnrich
enrichWith WOS       = enrichWith' (map (first WOS.keys))
enrichWith _         = enrichWith' identity


enrichWith' ::       ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' f = second (map both' . map f . concat)
  where
    both' = map (both decodeUtf8)
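
-- An illustration of 'enrichWith'' (values invented): the per-file nesting is
-- flattened with 'concat', each document's fields go through the enrichment
-- function, and the ByteString pairs are decoded to 'Text'.
--
-- > enrichWith' identity (errs, [ [ [("TI", "A title")]     ]
-- >                             , [ [("AB", "An abstract")] ] ])
-- >   == (errs, [ [("TI", "A title")], [("AB", "An abstract")] ])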


readFileWith :: FileFormat -> FilePath
             -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
readFileWith format path = do
  files <- case takeExtension path of
    ".zip" -> openZip path
    _      -> pure <$> clean <$> DB.readFile path
  partitionEithers <$> mapConcurrently (runParser format) files
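
-- Note on 'readFileWith': dispatch is on the file extension, so (with a
-- hypothetical archive name) @readFileWith RIS "corpus.zip"@ parses every zip
-- entry concurrently, while any other extension is read and cleaned as a
-- single file.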


-- | withParser:
-- According to the format of the text, choose the right parser.
-- TODO withParser :: FileFormat -> Parser [Document]
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS = WOS.parser
withParser RIS = RIS.parser
--withParser ODT = odtParser
--withParser XML = xmlParser
withParser _   = panic "[ERROR] Parser not implemented yet"

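-- To add a new parser, as the module header suggests, a sketch of the steps
-- for a hypothetical @MyFormat@ with an attoparsec parser @MyFormat.parser@ of
-- the same shape as 'WOS.parser' and 'RIS.parser':
--
-- >   1. add a constructor:            data FileFormat = ... | MyFormat
-- >   2. route it here:                withParser MyFormat = MyFormat.parser
-- >   3. optionally enrich its fields: enrichWith MyFormat = enrichWith' (map (first MyFormat.keys))
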
runParser :: FileFormat -> DB.ByteString
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ runParser' format text

runParser' :: FileFormat -> DB.ByteString
           -> (Either String [[(DB.ByteString, DB.ByteString)]])
runParser' format text = parseOnly (withParser format) text

openZip :: FilePath -> IO [DB.ByteString]
openZip fp = do
  entries <- withArchive fp (DM.keys <$> getEntries)
  bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
  pure bs

cleanText :: Text -> Text
cleanText = cs . clean . cs

clean :: DB.ByteString -> DB.ByteString
clean txt = DBC.map clean' txt
  where
    clean' '’'  = '\''
    clean' '\r' = ' '
    clean' '\t' = ' '
    clean' ';'  = '.'
    clean' c    = c
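
-- A small illustration of 'clean' (input invented, assuming OverloadedStrings):
-- carriage returns and tabs become spaces, semicolons become full stops.
--
-- > clean "title;\tsubtitle\r" == "title. subtitle "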