src/Gargantext/Text/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Text.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers suppose we know the format of the Text (TextFormat data
  14 type) according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describe how to add a new parser to Gargantext,
  18 please follow the types.
  19 -}
  20
  21 {-# LANGUAGE NoImplicitPrelude #-}
  22
  23 module Gargantext.Text.Parsers -- (parse, FileFormat(..))
  24     where
  25
  26 import System.FilePath (FilePath(), takeExtension)
  27 import Codec.Archive.Zip (withArchive, getEntry, getEntries)
  28
  29 import Data.Either.Extra (partitionEithers)
  30 import Data.List (concat)
  31 import qualified Data.Map        as DM
  32 import qualified Data.ByteString as DB
  33 import Data.Ord()
  34 import Data.String()
  35 import Data.Either(Either(..))
  36 import Data.Attoparsec.ByteString (parseOnly, Parser)
  37
  38 import Data.Text (Text)
  39 import qualified Data.Text as DT
  40 -- | Activate Async for to parse in parallel
  41 import Control.Concurrent.Async as CCA (mapConcurrently)
  42
  43 import Data.Text.Encoding (decodeUtf8)
  44 import Data.String (String())
  45
  46 ------------------------------------------------------------------------
  47 import Gargantext.Prelude
  48 import Gargantext.Text.Parsers.WOS (wosParser)
  49 ------------------------------------------------------------------------
  50
  51
  52 type ParseError = String
  53 type Field      = Text
  54 type Document   = DM.Map Field Text
  55
  56 type FilesParsed = DM.Map FilePath FileParsed
  57 data FileParsed  = FileParsed { _fileParsed_errors ::  Maybe ParseError
  58                               , _fileParsed_result :: [Document]
  59                               } deriving (Show)
  60
  61
  62 -- | According to the format of Input file,
  63 -- different parser are available.
  64 data FileFormat = WOS        -- Implemented (ISI Format)
  65 --                | DOC        -- Not Implemented / import Pandoc
  66 --                | ODT        -- Not Implemented / import Pandoc
  67 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  68 --                | XML        -- Not Implemented / see :
  69 --                             -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
  70
  71 -- TODO: to debug maybe add the filepath in error message
  72
  73
  74 parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
  75 parse format path = do
  76     files <- case takeExtension path of
  77               ".zip" -> openZip              path
  78               _      -> pure <$> DB.readFile path
  79     (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
  80     pure (as, map toText $ concat bs)
  81       where
  82         -- TODO : decode with bayesian inference on encodings
  83         toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
  84
  85
  86 -- | withParser:
  87 -- According the format of the text, choosing the right parser.
  88 -- TODO  withParser :: FileFormat -> Parser [Document]
  89 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
  90 withParser WOS = wosParser
  91 --withParser DOC = docParser
  92 --withParser ODT = odtParser
  93 --withParser XML = xmlParser
  94 --withParser _   = error "[ERROR] Parser not implemented yet"
  95
  96 runParser :: FileFormat -> DB.ByteString
  97           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
  98 runParser format text = pure $ parseOnly (withParser format) text
  99
 100 openZip :: FilePath -> IO [DB.ByteString]
 101 openZip fp = do
 102     entries <- withArchive fp (DM.keys <$> getEntries)
 103     bs      <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
 104     pure bs
 105
 106 clean :: Text -> Text
 107 clean txt = DT.map clean' txt
 108   where
 109     clean' '’' = '\''
 110     clean' c  = c
 111
 112