src/Data/Gargantext/Parsers.hs

   1 {-|
   2 Module      : Data.Gargantext.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers assume the format of the text is known (FileFormat data
  14 type), according to which the right parser is chosen among the list of
  15 available parsers.
  16
  17 This module mainly describes how to add a new parser to Gargantext;
  18 please follow the types.
  19 -}
  20
  21 module Data.Gargantext.Parsers -- (parse, FileFormat(..))
  22     where
  23
  24 import System.FilePath (takeExtension)
  25 import Data.Attoparsec.ByteString (parseOnly, Parser)
  26 import Data.ByteString as DB
  27 import Data.Map                    as DM
  28 ----import Data.Either.Extra(Either(..))
  29 ----
  30 --import Control.Monad (join)
  31 import Codec.Archive.Zip
  32 import Path.IO (resolveFile')
  33 ------ import qualified Data.ByteString.Lazy as B
  34 --import Control.Applicative ( (<$>) )
  35 import Control.Concurrent.Async as CCA (mapConcurrently)
  36
  37
  38 import Data.Gargantext.Parsers.WOS (wosParser)
  39 ---- import Data.Gargantext.Parsers.XML (xmlParser)
  40 ---- import Data.Gargantext.Parsers.DOC (docParser)
  41 ---- import Data.Gargantext.Parsers.ODT (odtParser)
  42
  43 --import Data.Gargantext.Prelude (pm)
  44 --import Data.Gargantext.Types.Main (ErrorMessage(), Corpus)
  45
  46
  47 -- | According to the format of Input file,
  48 -- different parser are available.
  49 data FileFormat = WOS        -- Implemented (ISI Format)
  50 --                | DOC        -- Not Implemented / import Pandoc
  51 --                | ODT        -- Not Implemented / import Pandoc
  52 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  53 --                | XML        -- Not Implemented / see :
  54 --                             -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
  55
  56 ---- | withParser:
  57 ---- According the format of the text, choosing the right parser.
  58
  59 --withParser :: FileFormat -> ByteString -> IO Corpus
  60 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
  61 withParser WOS = wosParser
  62 --withParser DOC = docParser
  63 --withParser ODT = odtParser
  64 --withParser XML = xmlParser
  65 --withParser _   = error "[ERROR] Parser not implemented yet"
  66
  67 runParser :: FileFormat -> DB.ByteString
  68           -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
  69 runParser format text = pure $ parseOnly (withParser format) text
  70
  71 openZip :: FilePath -> IO [DB.ByteString]
  72 openZip fp = do
  73     path    <- resolveFile' fp
  74     entries <- withArchive path (DM.keys <$> getEntries)
  75     bs      <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
  76     pure bs
  77
  78 parse :: FileFormat -> FilePath
  79       -> IO [Either String [[(DB.ByteString, DB.ByteString)]]]
  80 parse format path = do
  81     files <- case takeExtension path of
  82               ".zip" -> openZip              path
  83               _      -> pure <$> DB.readFile path
  84     mapConcurrently (runParser format) files
  85
  86