{-|
Module      : Gargantext.Core.Text.Corpus.Parsers
Description : All parsers of Gargantext in one file.
Copyright   : (c) CNRS, 2017
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Gargantext analyzes semi-structured texts, which have to be parsed
before they can be analyzed.

The parsers assume the format of the text is known (the 'FileFormat' data
type); the right parser is then chosen accordingly from the list of
available parsers.

This module mainly describes how to add a new parser to Gargantext:
please follow the types.
-}

{-# LANGUAGE PackageImports #-}

module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
    where

import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Either (Either(..))
import Data.Either.Extra (partitionEithers)
import Data.List (concat, lookup)
import Data.Ord()
import Data.String (String())
import Data.String()
import Data.Text (Text)
import Data.Text.Encoding (decodeUtf8)
import Data.Tuple.Extra (both, first, second)
import System.FilePath (FilePath(), takeExtension)
import System.IO.Temp (emptySystemTempFile)
import qualified Data.ByteString       as DB
import qualified Data.ByteString.Char8 as DBC
import qualified Data.ByteString.Lazy  as DBL
import qualified Data.Map              as DM
import qualified Data.Text             as DT
import qualified Prelude               as Prelude

import Gargantext.Core (Lang(..))
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified Gargantext.Core.Text.Corpus.Parsers.RIS  as RIS
import qualified Gargantext.Core.Text.Corpus.Parsers.WOS  as WOS
------------------------------------------------------------------------

type ParseError = String
--type Field       = Text
--type Document    = DM.Map Field Text
--type FilesParsed = DM.Map FilePath FileParsed
--data FileParsed  = FileParsed { _fileParsed_errors :: Maybe ParseError
--                              , _fileParsed_result :: [Document]
--                              } deriving (Show)


-- | According to the format of the input file,
-- different parsers are available.
data FileFormat = WOS | RIS | RisPresse   -- Implemented (WOS is the ISI Format)
                | CsvGargV3 | CsvHal
                | ZIP
  deriving (Show)

-- | DOC -- Not Implemented / import Pandoc
-- | ODT -- Not Implemented / import Pandoc
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | XML -- Not Implemented / see :

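-- Adding a new format (a sketch, following the types; the XML constructor and
-- xmlParser are hypothetical, see the commented-out cases in 'withParser'):
--
-- > data FileFormat = ... | XML
-- > withParser XML = xmlParser   -- a Parser [[(DB.ByteString, DB.ByteString)]]
--
-- plus a corresponding clause in 'parseFormat' and/or 'parseFile', which can
-- reuse 'readFileWith', 'enrichWith' and 'toDoc'.
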
parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
parseFormat CsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
parseFormat RisPresse bs = do
  let docs = map (toDoc RIS)
             <$> snd
             <$> enrichWith RisPresse
             $   partitionEithers
             $   [runParser' RisPresse bs]
  pure $ Right docs
parseFormat WOS bs = do
  let docs = map (toDoc WOS)
             <$> snd
             <$> enrichWith WOS
             $   partitionEithers
             $   [runParser' WOS bs]
  pure $ Right docs
parseFormat ZIP bs = do
  path <- emptySystemTempFile "parsed-zip"
  DB.writeFile path bs
  parsedZip <- withArchive path $ do
    DM.keys <$> getEntries
  pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
parseFormat _ _ = undefined
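
-- Example use of 'parseFormat' (a sketch; the file name is hypothetical):
--
-- > do bs    <- DB.readFile "corpus.ris"
-- >    edocs <- parseFormat RisPresse bs   -- Either Prelude.String [HyperdataDocument]
-- >    pure edocs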

-- | Parse file into documents
-- TODO manage errors here
-- TODO: to debug maybe add the filepath in error message
parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseFile CsvHal    p = parseHal p
parseFile CsvGargV3 p = parseCsv p
parseFile RisPresse p = do
  docs <- map (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
  pure $ Right docs
parseFile WOS p = do
  docs <- map (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
  pure $ Right docs
parseFile ff p = do
  docs <- map (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
  pure $ Right docs

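-- Example use of 'parseFile' (a sketch; the paths are hypothetical). A plain
-- WOS export and a ZIP of such exports are both accepted, because
-- 'readFileWith' dispatches on the ".zip" extension:
--
-- > parseFile WOS "export_wos.txt"
-- > parseFile WOS "exports_wos.zip"
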
toDoc :: FileFormat -> [(Text, Text)] -> HyperdataDocument
-- TODO use language for RIS
toDoc ff d = do
  -- let abstract = lookup "abstract" d
  let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))

  let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d

  let (utcTime, (pub_year, pub_month, pub_day)) = Date.dateSplit lang dateToParse

  HyperdataDocument { _hd_bdd                = Just $ DT.pack $ show ff
                    , _hd_doi                = lookup "doi" d
                    , _hd_url                = lookup "URL" d
                    , _hd_uniqId             = Nothing
                    , _hd_uniqIdBdd          = Nothing
                    , _hd_page               = Nothing
                    , _hd_title              = lookup "title" d
                    , _hd_authors            = Nothing
                    , _hd_institutes         = lookup "authors" d
                    , _hd_source             = lookup "source" d
                    , _hd_abstract           = lookup "abstract" d
                    , _hd_publication_date   = fmap (DT.pack . show) utcTime
                    , _hd_publication_year   = pub_year
                    , _hd_publication_month  = pub_month
                    , _hd_publication_day    = pub_day
                    , _hd_publication_hour   = Nothing
                    , _hd_publication_minute = Nothing
                    , _hd_publication_second = Nothing
                    , _hd_language_iso2      = Just $ (DT.pack . show) lang }

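-- How 'toDoc' assembles the date (a sketch): with e.g. PY = "2019" and
-- publication_date = "2019-05-10", the two Maybe fields are concatenated
-- through the Maybe Semigroup and dashes are replaced by spaces, so
--
-- > dateToParse == Just "2019 2019 05 10"
--
-- which 'Date.dateSplit' then splits into the publication date and its
-- year/month/day parts.
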
enrichWith :: FileFormat
           -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith RisPresse = enrichWith' presseEnrich
enrichWith WOS       = enrichWith' (map (first WOS.keys))
enrichWith _         = enrichWith' identity

enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
            -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' f = second (map both' . map f . concat)
  where
    both' = map (both decodeUtf8)

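-- Shape of 'enrichWith'' (a sketch, with a hypothetical field name): errors
-- are left untouched, the per-file nesting is flattened by 'concat' and every
-- key/value pair is decoded from UTF-8, e.g.
--
-- > enrichWith' identity (errs, [[[("TI", "Some title")]]])
-- >   == (errs, [[("TI", "Some title")]])
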
readFileWith :: FileFormat -> FilePath
             -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
readFileWith format path = do
  files <- case takeExtension path of
    ".zip" -> openZip path
    _      -> pure <$> clean <$> DB.readFile path
  partitionEithers <$> mapConcurrently (runParser format) files


-- | withParser:
-- According to the format of the text, choose the right parser.
-- TODO withParser :: FileFormat -> Parser [Document]
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS = WOS.parser
withParser RIS = RIS.parser
--withParser ODT = odtParser
--withParser XML = xmlParser
withParser _   = panic "[ERROR] Parser not implemented yet"

runParser :: FileFormat -> DB.ByteString
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ runParser' format text

runParser' :: FileFormat -> DB.ByteString
           -> Either String [[(DB.ByteString, DB.ByteString)]]
runParser' format text = parseOnly (withParser format) text

openZip :: FilePath -> IO [DB.ByteString]
openZip fp = do
  entries <- withArchive fp (DM.keys <$> getEntries)
  bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
  pure bs

cleanText :: Text -> Text
cleanText = cs . clean . cs

clean :: DB.ByteString -> DB.ByteString
clean txt = DBC.map clean' txt
  where
    clean' '’'  = '\''
    clean' '\r' = ' '
    clean' '\t' = ' '
    clean' ';'  = '.'
    clean' c    = c
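
-- Example (a sketch): carriage returns and tabs become spaces and semicolons
-- become full stops, e.g.
--
-- > cleanText "one\ttwo;three\r" == "one two.three "
--
-- Note that 'DBC.map' maps over single bytes, so the '’' equation only
-- applies to single-byte (Latin-1 style) input, not to UTF-8 encoded text.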