src/Gargantext/Text/Corpus/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Text.Corpus.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describes how to add a new parser to Gargantext,
  18 please follow the types.
  19 -}
  20
  21 {-# LANGUAGE NoImplicitPrelude #-}
  22 {-# LANGUAGE PackageImports    #-}
  23 {-# LANGUAGE OverloadedStrings #-}
  24
  25 module Gargantext.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText)
  26     where
  27
  28 --import Data.ByteString (ByteString)
  29 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
  30 import Control.Concurrent.Async as CCA (mapConcurrently)
  31 import Control.Monad (join)
  32 import qualified Data.ByteString.Char8 as DBC
  33 import Data.Attoparsec.ByteString (parseOnly, Parser)
  34 import Data.Either(Either(..))
  35 import Data.Either.Extra (partitionEithers)
  36 import Data.List (concat)
  37 import Data.List (lookup)
  38 import Data.Ord()
  39 import Data.String (String())
  40 import Data.String()
  41 import Data.Text (Text)
  42 import Data.Text.Encoding (decodeUtf8)
  43 import Data.Tuple.Extra (both, first, second)
  44 import System.FilePath (FilePath(), takeExtension)
  45 import qualified Data.ByteString as DB
  46 import qualified Data.Map        as DM
  47 import qualified Data.Text as DT
  48 import Gargantext.Core (Lang(..))
  49 import Gargantext.Prelude
  50 import Gargantext.Database.Types.Node (HyperdataDocument(..))
  51 import qualified Gargantext.Text.Corpus.Parsers.WOS as WOS
  52 import qualified Gargantext.Text.Corpus.Parsers.RIS as RIS
  53 import Gargantext.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
  54 import qualified Gargantext.Text.Corpus.Parsers.Date as Date
  55 import Gargantext.Text.Corpus.Parsers.CSV (parseHal)
  56 import Gargantext.Text.Terms.Stop (detectLang)
  57 ------------------------------------------------------------------------
  58
-- | Error message describing why one input could not be parsed: the 'Left'
-- side of 'runParser', collected into the first component of the result of
-- 'readFileWith'.
type ParseError = String
  60 --type Field      = Text
  61 --type Document   = DM.Map Field Text
  62 --type FilesParsed = DM.Map FilePath FileParsed
  63 --data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  64 --                              , _fileParsed_result :: [Document]
  65 --                              } deriving (Show)
  66
  67
  68 -- | According to the format of Input file,
  69 -- different parser are available.
  70 data FileFormat = WOS | RIS | RisPresse
  71                 | CsvGargV3 | CsvHalFormat
  72   deriving (Show)
  73
  74 -- Implemented (ISI Format)
  75 --                | DOC        -- Not Implemented / import Pandoc
  76 --                | ODT        -- Not Implemented / import Pandoc
  77 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  78 --                | XML        -- Not Implemented / see :
  79
  80
  81 {-
  82 parseFormat :: FileFormat -> ByteString -> [HyperdataDocument]
  83 parseFormat = undefined
  84 -}
  85
  86 -- | Parse file into documents
  87 -- TODO manage errors here
  88 -- TODO: to debug maybe add the filepath in error message
  89 parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
  90 parseFile CsvHalFormat p = parseHal p
  91 parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
  92 parseFile WOS       p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS       <$> readFileWith WOS p
  93 parseFile ff        p = join $ mapM (toDoc ff)  <$> snd <$> enrichWith ff        <$> readFileWith ff p
  94
  95 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
  96 -- TODO use language for RIS
  97 toDoc ff d = do
  98       let abstract = lookup "abstract" d
  99       let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
 100
 101       let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
 102
 103       (utcTime, (pub_year, pub_month, pub_day)) <- Date.split lang  dateToParse
 104
 105       pure $ HyperdataDocument (Just $ DT.pack $ show ff)
 106                                (lookup "doi" d)
 107                                (lookup "URL" d)
 108                                 Nothing
 109                                 Nothing
 110                                 Nothing
 111                                (lookup "title" d)
 112                                 Nothing
 113                                (lookup "authors" d)
 114                                (lookup "source" d)
 115                                (lookup "abstract" d)
 116                                (fmap (DT.pack . show) utcTime)
 117                                (pub_year)
 118                                (pub_month)
 119                                (pub_day)
 120                                Nothing
 121                                Nothing
 122                                Nothing
 123                                (Just $ (DT.pack . show) lang)
 124
 125 enrichWith :: FileFormat
 126            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 127 enrichWith RisPresse = enrichWith' presseEnrich
 128 enrichWith WOS       = enrichWith' (map (first WOS.keys))
 129 enrichWith _         = enrichWith' identity
 130
 131
 132 enrichWith' ::       ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
 133            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 134 enrichWith' f = second (map both' . map f . concat)
 135   where
 136     both'   = map (both decodeUtf8)
 137
 138 readFileWith :: FileFormat -> FilePath
 139        -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
 140 readFileWith format path = do
 141     files <- case takeExtension path of
 142               ".zip" -> openZip              path
 143               _      -> pure <$> clean <$> DB.readFile path
 144     partitionEithers <$> mapConcurrently (runParser format) files
 145
 146
 147 -- | withParser:
 148 -- According to the format of the text, choose the right parser.
 149 -- TODO  withParser :: FileFormat -> Parser [Document]
 150 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
 151 withParser WOS = WOS.parser
 152 withParser RIS = RIS.parser
 153 --withParser ODT = odtParser
 154 --withParser XML = xmlParser
 155 withParser _   = panic "[ERROR] Parser not implemented yet"
 156
 157 runParser :: FileFormat -> DB.ByteString
 158           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
 159 runParser format text = pure $ parseOnly (withParser format) text
 160
 161 openZip :: FilePath -> IO [DB.ByteString]
 162 openZip fp = do
 163     entries <- withArchive fp (DM.keys <$> getEntries)
 164     bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
 165     pure bs
 166
 167 cleanText :: Text -> Text
 168 cleanText = cs . clean . cs
 169
 170 clean :: DB.ByteString -> DB.ByteString
 171 clean txt = DBC.map clean' txt
 172   where
 173     clean' '’' = '\''
 174     clean' '\r' = ' '
 175     clean' '\t' = ' '
 176     clean' ';' = '.'
 177     clean' c  = c