src/Gargantext/Text/Corpus/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Text.Corpus.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describe how to add a new parser to Gargantext,
  18 please follow the types.
  19 -}
  20
  21 {-# LANGUAGE NoImplicitPrelude #-}
  22 {-# LANGUAGE PackageImports    #-}
  23 {-# LANGUAGE OverloadedStrings #-}
  24
  25 module Gargantext.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
  26     where
  27
  28 --import Data.ByteString (ByteString)
  29 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
  30 import Control.Concurrent.Async as CCA (mapConcurrently)
  31 import Control.Monad (join)
  32 import Data.Attoparsec.ByteString (parseOnly, Parser)
  33 import Data.Either(Either(..))
  34 import Data.Either.Extra (partitionEithers)
  35 import Data.List (concat, lookup)
  36 import Data.Ord()
  37 import Data.String (String())
  38 import Data.String()
  39 import Data.Text (Text)
  40 import Data.Text.Encoding (decodeUtf8)
  41 import Data.Tuple.Extra (both, first, second)
  42 import Gargantext.Core (Lang(..))
  43 import Gargantext.Database.Types.Node (HyperdataDocument(..))
  44 import Gargantext.Prelude
  45 import Gargantext.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
  46 import Gargantext.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
  47 import Gargantext.Text.Learn (detectLangDefault)
  48 import System.FilePath (FilePath(), takeExtension)
  49 import qualified Data.ByteString       as DB
  50 import qualified Data.ByteString.Lazy  as DBL
  51 import qualified Data.ByteString.Char8 as DBC
  52 import qualified Data.Map              as DM
  53 import qualified Data.Text             as DT
  54 import qualified Gargantext.Text.Corpus.Parsers.Date as Date
  55 import qualified Gargantext.Text.Corpus.Parsers.RIS  as RIS
  56 import qualified Gargantext.Text.Corpus.Parsers.WOS  as WOS
  57 ------------------------------------------------------------------------
  58
  59 type ParseError = String
  60 --type Field      = Text
  61 --type Document   = DM.Map Field Text
  62 --type FilesParsed = DM.Map FilePath FileParsed
  63 --data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  64 --                              , _fileParsed_result :: [Document]
  65 --                              } deriving (Show)
  66
  67
  68 -- | According to the format of Input file,
  69 -- different parser are available.
  70 data FileFormat = WOS | RIS | RisPresse
  71                 | CsvGargV3 | CsvHal
  72   deriving (Show)
  73
  74 -- Implemented (ISI Format)
  75 --                | DOC        -- Not Implemented / import Pandoc
  76 --                | ODT        -- Not Implemented / import Pandoc
  77 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  78 --                | XML        -- Not Implemented / see :
  79
  80
  81 parseFormat :: FileFormat -> DB.ByteString -> IO [HyperdataDocument]
  82 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
  83 parseFormat CsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
  84 parseFormat RisPresse bs = mapM (toDoc RIS)
  85                         <$> snd
  86                         <$> enrichWith RisPresse
  87                          $ partitionEithers
  88                          $ [runParser'  RisPresse bs]
  89 parseFormat WOS bs = mapM (toDoc WOS)
  90                         <$> snd
  91                         <$> enrichWith WOS
  92                          $ partitionEithers
  93                          $ [runParser'  WOS bs]
  94 parseFormat _ _ = undefined
  95
  96 -- | Parse file into documents
  97 -- TODO manage errors here
  98 -- TODO: to debug maybe add the filepath in error message
  99 parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
 100 parseFile CsvHal    p = parseHal p
 101 parseFile CsvGargV3 p = parseCsv p
 102 parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
 103 parseFile WOS       p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS       <$> readFileWith WOS p
 104 parseFile ff        p = join $ mapM (toDoc ff)  <$> snd <$> enrichWith ff        <$> readFileWith ff  p
 105
 106 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
 107 -- TODO use language for RIS
 108 toDoc ff d = do
 109       let abstract = lookup "abstract" d
 110       let lang = maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
 111
 112       let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
 113
 114       (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang  dateToParse
 115
 116       pure $ HyperdataDocument (Just $ DT.pack $ show ff)
 117                                (lookup "doi" d)
 118                                (lookup "URL" d)
 119                                 Nothing
 120                                 Nothing
 121                                 Nothing
 122                                (lookup "title" d)
 123                                 Nothing
 124                                (lookup "authors" d)
 125                                (lookup "source" d)
 126                                (lookup "abstract" d)
 127                                (fmap (DT.pack . show) utcTime)
 128                                (pub_year)
 129                                (pub_month)
 130                                (pub_day)
 131                                Nothing
 132                                Nothing
 133                                Nothing
 134                                (Just $ (DT.pack . show) lang)
 135
 136 enrichWith :: FileFormat
 137            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 138 enrichWith RisPresse = enrichWith' presseEnrich
 139 enrichWith WOS       = enrichWith' (map (first WOS.keys))
 140 enrichWith _         = enrichWith' identity
 141
 142
 143 enrichWith' ::      ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
 144            ->  (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
 145 enrichWith' f = second (map both' . map f . concat)
 146   where
 147     both'   = map (both decodeUtf8)
 148
 149
 150
 151 readFileWith :: FileFormat -> FilePath
 152        -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
 153 readFileWith format path = do
 154     files <- case takeExtension path of
 155               ".zip" -> openZip              path
 156               _      -> pure <$> clean <$> DB.readFile path
 157     partitionEithers <$> mapConcurrently (runParser format) files
 158
 159
 160 -- | withParser:
 161 -- According to the format of the text, choose the right parser.
 162 -- TODO  withParser :: FileFormat -> Parser [Document]
 163 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
 164 withParser WOS = WOS.parser
 165 withParser RIS = RIS.parser
 166 --withParser ODT = odtParser
 167 --withParser XML = xmlParser
 168 withParser _   = panic "[ERROR] Parser not implemented yet"
 169
 170 runParser :: FileFormat -> DB.ByteString
 171           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
 172 runParser format text = pure $ runParser' format text
 173
 174 runParser' :: FileFormat -> DB.ByteString
 175           -> (Either String [[(DB.ByteString, DB.ByteString)]])
 176 runParser' format text = parseOnly (withParser format) text
 177
 178 openZip :: FilePath -> IO [DB.ByteString]
 179 openZip fp = do
 180     entries <- withArchive fp (DM.keys <$> getEntries)
 181     bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
 182     pure bs
 183
 184 cleanText :: Text -> Text
 185 cleanText = cs . clean . cs
 186
 187 clean :: DB.ByteString -> DB.ByteString
 188 clean txt = DBC.map clean' txt
 189   where
 190     clean' '’' = '\''
 191     clean' '\r' = ' '
 192     clean' '\t' = ' '
 193     clean' ';' = '.'
 194     clean' c  = c