src/Gargantext/Text/Corpus/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Text.Corpus.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describe how to add a new parser to Gargantext,
  18 please follow the types.
  19 -}
  20
  21 {-# LANGUAGE NoImplicitPrelude #-}
  22 {-# LANGUAGE PackageImports    #-}
  23 {-# LANGUAGE OverloadedStrings #-}
  24
  25 module Gargantext.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
  26     where
  27
  28 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
  29 import Control.Concurrent.Async as CCA (mapConcurrently)
  30 import Control.Monad (join)
  31 import Data.Attoparsec.ByteString (parseOnly, Parser)
  32 import Data.Either(Either(..))
  33 import Data.Either.Extra (partitionEithers)
  34 import Data.List (concat, lookup)
  35 import Data.Ord()
  36 import Data.String (String())
  37 import Data.String()
  38 import Data.Text (Text)
  39 import Data.Text.Encoding (decodeUtf8)
  40 import Data.Tuple.Extra (both, first, second)
  41 import Gargantext.Core (Lang(..))
  42 import Gargantext.Database.Admin.Types.Node (HyperdataDocument(..))
  43 import Gargantext.Prelude
  44 import Gargantext.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
  45 import Gargantext.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
  46 import Gargantext.Text.Learn (detectLangDefault)
  47 import System.FilePath (FilePath(), takeExtension)
  48 import qualified Data.ByteString       as DB
  49 import qualified Data.ByteString.Char8 as DBC
  50 import qualified Data.ByteString.Lazy  as DBL
  51 import qualified Data.Map              as DM
  52 import qualified Data.Text             as DT
  53 import qualified Gargantext.Text.Corpus.Parsers.Date as Date
  54 import qualified Gargantext.Text.Corpus.Parsers.RIS  as RIS
  55 import qualified Gargantext.Text.Corpus.Parsers.WOS  as WOS
  56 ------------------------------------------------------------------------
  57
  58 type ParseError = String
  59 --type Field      = Text
  60 --type Document   = DM.Map Field Text
  61 --type FilesParsed = DM.Map FilePath FileParsed
  62 --data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  63 --                              , _fileParsed_result :: [Document]
  64 --                              } deriving (Show)
  65
  66
  67 -- | According to the format of Input file,
  68 -- different parser are available.
  69 data FileFormat = WOS | RIS | RisPresse
  70                 | CsvGargV3 | CsvHal
  71   deriving (Show)
  72
  73 -- Implemented (ISI Format)
  74 --                | DOC        -- Not Implemented / import Pandoc
  75 --                | ODT        -- Not Implemented / import Pandoc
  76 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  77 --                | XML        -- Not Implemented / see :
  78
  79
  80 parseFormat :: FileFormat -> DB.ByteString -> IO [HyperdataDocument]
  81 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
  82 parseFormat CsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
  83 parseFormat RisPresse bs = mapM (toDoc RIS)
  84                         <$> snd
  85                         <$> enrichWith RisPresse
  86                          $ partitionEithers
  87                          $ [runParser'  RisPresse bs]
  88 parseFormat WOS bs = mapM (toDoc WOS)
  89                         <$> snd
  90                         <$> enrichWith WOS
  91                          $ partitionEithers
  92                          $ [runParser'  WOS bs]
  93 parseFormat _ _ = undefined
  94
  95 -- | Parse file into documents
  96 -- TODO manage errors here
  97 -- TODO: to debug maybe add the filepath in error message
  98 parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
  99 parseFile CsvHal    p = parseHal p
 100 parseFile CsvGargV3 p = parseCsv p
 101 parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
 102 parseFile WOS       p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS       <$> readFileWith WOS p
 103 parseFile ff        p = join $ mapM (toDoc ff)  <$> snd <$> enrichWith ff        <$> readFileWith ff  p
 104
 105 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
 106 -- TODO use language for RIS
 107 toDoc ff d = do
 108       let abstract = lookup "abstract" d
 109       let lang = maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
 110
 111       let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
 112
 113       (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang  dateToParse
 114
 115       pure $ HyperdataDocument (Just $ DT.pack $ show ff)
 116                                (lookup "doi" d)
 117                                (lookup "URL" d)
 118                                 Nothing
 119                                 Nothing
 120                                 Nothing
 121                                (lookup "title" d)
 122                                 Nothing
 123                                (lookup "authors" d)
 124                                (lookup "source" d)
 125                                (lookup "abstract" d)
 126                                (fmap (DT.pack . show) utcTime)
 127                                (pub_year)
 128                                (pub_month)
 129                                (pub_day)
 130                                Nothing
 131                                Nothing
 132                                Nothing
 133                                (Just $ (DT.pack . show) lang)
 134
 135 enrichWith :: FileFormat
 136            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 137 enrichWith RisPresse = enrichWith' presseEnrich
 138 enrichWith WOS       = enrichWith' (map (first WOS.keys))
 139 enrichWith _         = enrichWith' identity
 140
 141
 142 enrichWith' ::      ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
 143            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 144 enrichWith' f = second (map both' . map f . concat)
 145   where
 146     both'   = map (both decodeUtf8)
 147
 148
 149
 150 readFileWith :: FileFormat -> FilePath
 151        -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
 152 readFileWith format path = do
 153     files <- case takeExtension path of
 154               ".zip" -> openZip              path
 155               _      -> pure <$> clean <$> DB.readFile path
 156     partitionEithers <$> mapConcurrently (runParser format) files
 157
 158
 159 -- | withParser:
 160 -- According to the format of the text, choose the right parser.
 161 -- TODO  withParser :: FileFormat -> Parser [Document]
 162 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
 163 withParser WOS = WOS.parser
 164 withParser RIS = RIS.parser
 165 --withParser ODT = odtParser
 166 --withParser XML = xmlParser
 167 withParser _   = panic "[ERROR] Parser not implemented yet"
 168
 169 runParser :: FileFormat -> DB.ByteString
 170           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
 171 runParser format text = pure $ runParser' format text
 172
 173 runParser' :: FileFormat -> DB.ByteString
 174           -> (Either String [[(DB.ByteString, DB.ByteString)]])
 175 runParser' format text = parseOnly (withParser format) text
 176
 177 openZip :: FilePath -> IO [DB.ByteString]
 178 openZip fp = do
 179     entries <- withArchive fp (DM.keys <$> getEntries)
 180     bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
 181     pure bs
 182
 183 cleanText :: Text -> Text
 184 cleanText = cs . clean . cs
 185
 186 clean :: DB.ByteString -> DB.ByteString
 187 clean txt = DBC.map clean' txt
 188   where
 189     clean' '’' = '\''
 190     clean' '\r' = ' '
 191     clean' '\t' = ' '
 192     clean' ';' = '.'
 193     clean' c  = c