src/Gargantext/Text/Corpus/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Text.Corpus.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describe how to add a new parser to Gargantext,
  18 please follow the types.
  19 -}
  20
  21 {-# LANGUAGE PackageImports    #-}
  22
  23 module Gargantext.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
  24     where
  25
  26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
  27 import Control.Concurrent.Async as CCA (mapConcurrently)
  28 import Control.Monad (join)
  29 import Data.Attoparsec.ByteString (parseOnly, Parser)
  30 import Data.Either(Either(..))
  31 import Data.Either.Extra (partitionEithers)
  32 import Data.List (concat, lookup)
  33 import Data.Ord()
  34 import Data.String (String())
  35 import Data.String()
  36 import Data.Text (Text)
  37 import Data.Text.Encoding (decodeUtf8)
  38 import Data.Tuple.Extra (both, first, second)
  39 import Gargantext.Core (Lang(..))
  40 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
  41 import Gargantext.Prelude
  42 import Gargantext.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
  43 import Gargantext.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
  44 import Gargantext.Text.Learn (detectLangDefault)
  45 import System.FilePath (FilePath(), takeExtension)
  46 import qualified Data.ByteString       as DB
  47 import qualified Data.ByteString.Char8 as DBC
  48 import qualified Data.ByteString.Lazy  as DBL
  49 import qualified Data.Map              as DM
  50 import qualified Data.Text             as DT
  51 import qualified Gargantext.Text.Corpus.Parsers.Date as Date
  52 import qualified Gargantext.Text.Corpus.Parsers.RIS  as RIS
  53 import qualified Gargantext.Text.Corpus.Parsers.WOS  as WOS
  54 ------------------------------------------------------------------------
  55
  56 type ParseError = String
  57 --type Field      = Text
  58 --type Document   = DM.Map Field Text
  59 --type FilesParsed = DM.Map FilePath FileParsed
  60 --data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  61 --                              , _fileParsed_result :: [Document]
  62 --                              } deriving (Show)
  63
  64
  65 -- | According to the format of Input file,
  66 -- different parser are available.
  67 data FileFormat = WOS | RIS | RisPresse
  68                 | CsvGargV3 | CsvHal
  69   deriving (Show)
  70
  71 -- Implemented (ISI Format)
  72 --                | DOC        -- Not Implemented / import Pandoc
  73 --                | ODT        -- Not Implemented / import Pandoc
  74 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  75 --                | XML        -- Not Implemented / see :
  76
  77
  78 parseFormat :: FileFormat -> DB.ByteString -> IO [HyperdataDocument]
  79 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
  80 parseFormat CsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
  81 parseFormat RisPresse bs = mapM (toDoc RIS)
  82                         <$> snd
  83                         <$> enrichWith RisPresse
  84                          $ partitionEithers
  85                          $ [runParser'  RisPresse bs]
  86 parseFormat WOS bs = mapM (toDoc WOS)
  87                         <$> snd
  88                         <$> enrichWith WOS
  89                          $ partitionEithers
  90                          $ [runParser'  WOS bs]
  91 parseFormat _ _ = undefined
  92
  93 -- | Parse file into documents
  94 -- TODO manage errors here
  95 -- TODO: to debug maybe add the filepath in error message
  96 parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
  97 parseFile CsvHal    p = parseHal p
  98 parseFile CsvGargV3 p = parseCsv p
  99 parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
 100 parseFile WOS       p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS       <$> readFileWith WOS p
 101 parseFile ff        p = join $ mapM (toDoc ff)  <$> snd <$> enrichWith ff        <$> readFileWith ff  p
 102
 103 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
 104 -- TODO use language for RIS
 105 toDoc ff d = do
 106       let abstract = lookup "abstract" d
 107       let lang = maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
 108
 109       let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
 110
 111       (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang  dateToParse
 112
 113       pure $ HyperdataDocument (Just $ DT.pack $ show ff)
 114                                (lookup "doi" d)
 115                                (lookup "URL" d)
 116                                 Nothing
 117                                 Nothing
 118                                 Nothing
 119                                (lookup "title" d)
 120                                 Nothing
 121                                (lookup "authors" d)
 122                                (lookup "source" d)
 123                                (lookup "abstract" d)
 124                                (fmap (DT.pack . show) utcTime)
 125                                (pub_year)
 126                                (pub_month)
 127                                (pub_day)
 128                                Nothing
 129                                Nothing
 130                                Nothing
 131                                (Just $ (DT.pack . show) lang)
 132
 133 enrichWith :: FileFormat
 134            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 135 enrichWith RisPresse = enrichWith' presseEnrich
 136 enrichWith WOS       = enrichWith' (map (first WOS.keys))
 137 enrichWith _         = enrichWith' identity
 138
 139
 140 enrichWith' ::      ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
 141            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 142 enrichWith' f = second (map both' . map f . concat)
 143   where
 144     both'   = map (both decodeUtf8)
 145
 146
 147
 148 readFileWith :: FileFormat -> FilePath
 149        -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
 150 readFileWith format path = do
 151     files <- case takeExtension path of
 152               ".zip" -> openZip              path
 153               _      -> pure <$> clean <$> DB.readFile path
 154     partitionEithers <$> mapConcurrently (runParser format) files
 155
 156
 157 -- | withParser:
 158 -- According to the format of the text, choose the right parser.
 159 -- TODO  withParser :: FileFormat -> Parser [Document]
 160 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
 161 withParser WOS = WOS.parser
 162 withParser RIS = RIS.parser
 163 --withParser ODT = odtParser
 164 --withParser XML = xmlParser
 165 withParser _   = panic "[ERROR] Parser not implemented yet"
 166
 167 runParser :: FileFormat -> DB.ByteString
 168           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
 169 runParser format text = pure $ runParser' format text
 170
 171 runParser' :: FileFormat -> DB.ByteString
 172           -> (Either String [[(DB.ByteString, DB.ByteString)]])
 173 runParser' format text = parseOnly (withParser format) text
 174
 175 openZip :: FilePath -> IO [DB.ByteString]
 176 openZip fp = do
 177     entries <- withArchive fp (DM.keys <$> getEntries)
 178     bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
 179     pure bs
 180
 181 cleanText :: Text -> Text
 182 cleanText = cs . clean . cs
 183
 184 clean :: DB.ByteString -> DB.ByteString
 185 clean txt = DBC.map clean' txt
 186   where
 187     clean' '’' = '\''
 188     clean' '\r' = ' '
 189     clean' '\t' = ' '
 190     clean' ';' = '.'
 191     clean' c  = c