src/Gargantext/Text/Corpus/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Text.Corpus.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describe how to add a new parser to Gargantext,
  18 please follow the types.
  19 -}
  20
  21 {-# LANGUAGE NoImplicitPrelude #-}
  22 {-# LANGUAGE PackageImports    #-}
  23 {-# LANGUAGE OverloadedStrings #-}
  24
  25 module Gargantext.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText)
  26     where
  27
  28 --import Data.ByteString (ByteString)
  29 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
  30 import Control.Concurrent.Async as CCA (mapConcurrently)
  31 import Control.Monad (join)
  32 import qualified Data.ByteString.Char8 as DBC
  33 import Data.Attoparsec.ByteString (parseOnly, Parser)
  34 import Data.Either(Either(..))
  35 import Data.Either.Extra (partitionEithers)
  36 import Data.List (concat)
  37 import Data.List (lookup)
  38 import Data.Ord()
  39 import Data.String (String())
  40 import Data.String()
  41 import Data.Text (Text)
  42 import Data.Text.Encoding (decodeUtf8)
  43 import Data.Tuple.Extra (both, first, second)
  44 import System.FilePath (FilePath(), takeExtension)
  45 import qualified Data.ByteString as DB
  46 import qualified Data.Map        as DM
  47 import qualified Data.Text as DT
  48 import Gargantext.Core (Lang(..))
  49 import Gargantext.Prelude
  50 import Gargantext.Database.Types.Node (HyperdataDocument(..))
  51 import qualified Gargantext.Text.Corpus.Parsers.WOS as WOS
  52 import qualified Gargantext.Text.Corpus.Parsers.RIS as RIS
  53 import Gargantext.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
  54 import qualified Gargantext.Text.Corpus.Parsers.Date as Date
  55 import Gargantext.Text.Corpus.Parsers.CSV (parseHal, parseCsv)
  56 import Gargantext.Text.Terms.Stop (detectLang)
  57 ------------------------------------------------------------------------
  58
  59 type ParseError = String
  60 --type Field      = Text
  61 --type Document   = DM.Map Field Text
  62 --type FilesParsed = DM.Map FilePath FileParsed
  63 --data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  64 --                              , _fileParsed_result :: [Document]
  65 --                              } deriving (Show)
  66
  67
  68 -- | According to the format of Input file,
  69 -- different parser are available.
  70 data FileFormat = WOS | RIS | RisPresse
  71                 | CsvGargV3 | CsvHalFormat
  72   deriving (Show)
  73
  74 -- Implemented (ISI Format)
  75 --                | DOC        -- Not Implemented / import Pandoc
  76 --                | ODT        -- Not Implemented / import Pandoc
  77 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  78 --                | XML        -- Not Implemented / see :
  79
  80
  81 {-
  82 parseFormat :: FileFormat -> ByteString -> [HyperdataDocument]
  83 parseFormat = undefined
  84 -}
  85
  86 -- | Parse file into documents
  87 -- TODO manage errors here
  88 -- TODO: to debug maybe add the filepath in error message
  89 parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
  90 parseFile CsvHalFormat p = parseHal p
  91 parseFile CsvGargV3 p = parseCsv p
  92 parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
  93 parseFile WOS       p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS       <$> readFileWith WOS p
  94 parseFile ff        p = join $ mapM (toDoc ff)  <$> snd <$> enrichWith ff        <$> readFileWith ff p
  95
  96 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
  97 -- TODO use language for RIS
  98 toDoc ff d = do
  99       let abstract = lookup "abstract" d
 100       let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
 101
 102       let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
 103
 104       (utcTime, (pub_year, pub_month, pub_day)) <- Date.split lang  dateToParse
 105
 106       pure $ HyperdataDocument (Just $ DT.pack $ show ff)
 107                                (lookup "doi" d)
 108                                (lookup "URL" d)
 109                                 Nothing
 110                                 Nothing
 111                                 Nothing
 112                                (lookup "title" d)
 113                                 Nothing
 114                                (lookup "authors" d)
 115                                (lookup "source" d)
 116                                (lookup "abstract" d)
 117                                (fmap (DT.pack . show) utcTime)
 118                                (pub_year)
 119                                (pub_month)
 120                                (pub_day)
 121                                Nothing
 122                                Nothing
 123                                Nothing
 124                                (Just $ (DT.pack . show) lang)
 125
 126 enrichWith :: FileFormat
 127            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 128 enrichWith RisPresse = enrichWith' presseEnrich
 129 enrichWith WOS       = enrichWith' (map (first WOS.keys))
 130 enrichWith _         = enrichWith' identity
 131
 132
 133 enrichWith' ::       ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
 134            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 135 enrichWith' f = second (map both' . map f . concat)
 136   where
 137     both'   = map (both decodeUtf8)
 138
 139 readFileWith :: FileFormat -> FilePath
 140        -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
 141 readFileWith format path = do
 142     files <- case takeExtension path of
 143               ".zip" -> openZip              path
 144               _      -> pure <$> clean <$> DB.readFile path
 145     partitionEithers <$> mapConcurrently (runParser format) files
 146
 147
 148 -- | withParser:
 149 -- According to the format of the text, choose the right parser.
 150 -- TODO  withParser :: FileFormat -> Parser [Document]
 151 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
 152 withParser WOS = WOS.parser
 153 withParser RIS = RIS.parser
 154 --withParser ODT = odtParser
 155 --withParser XML = xmlParser
 156 withParser _   = panic "[ERROR] Parser not implemented yet"
 157
 158 runParser :: FileFormat -> DB.ByteString
 159           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
 160 runParser format text = pure $ parseOnly (withParser format) text
 161
 162 openZip :: FilePath -> IO [DB.ByteString]
 163 openZip fp = do
 164     entries <- withArchive fp (DM.keys <$> getEntries)
 165     bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
 166     pure bs
 167
 168 cleanText :: Text -> Text
 169 cleanText = cs . clean . cs
 170
 171 clean :: DB.ByteString -> DB.ByteString
 172 clean txt = DBC.map clean' txt
 173   where
 174     clean' '’' = '\''
 175     clean' '\r' = ' '
 176     clean' '\t' = ' '
 177     clean' ';' = '.'
 178     clean' c  = c