src/Gargantext/Parsers.hs

   1 {-|
   2 Module      : Gargantext.Parsers
   3 Description : All parsers of Gargantext in one file.
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Gargantext enables analyzing semi-structured text that should be parsed
  11 in order to be analyzed.
  12
  13 The parsers assume that the format of the text is known (FileFormat data
  14 type); the right parser is chosen accordingly among the list of
  15 available parsers.
  16
  17 This module mainly describes how to add a new parser to Gargantext;
  18 please follow the types.
  19 -}
  20
  21 module Gargantext.Parsers -- (parse, FileFormat(..))
  22     where
  23
  24 import Gargantext.Prelude
  25
  26 import System.FilePath (takeExtension, FilePath())
  27 import Data.Attoparsec.ByteString (parseOnly, Parser)
  28 import Data.ByteString as DB
  29 import Data.Map        as DM
  30 import Data.Ord()
  31 import Data.String()
  32 import Data.Either.Extra(Either())
  33 ----
  34 --import Control.Monad (join)
  35 import Codec.Archive.Zip (withArchive, getEntry, getEntries)
  36 import Path.IO (resolveFile')
  37 ------ import qualified Data.ByteString.Lazy as B
  38 --import Control.Applicative ( (<$>) )
  39 import Control.Concurrent.Async as CCA (mapConcurrently)
  40
  41 import Data.String (String())
  42 import Gargantext.Parsers.WOS (wosParser)
  43 ---- import Gargantext.Parsers.XML (xmlParser)
  44 ---- import Gargantext.Parsers.DOC (docParser)
  45 ---- import Gargantext.Parsers.ODT (odtParser)
  46
  47 --import Gargantext.Prelude (pm)
  48 --import Gargantext.Types.Main (ErrorMessage(), Corpus)
  49
  50
  51 -- | Format of the input file, which determines the parser to use
  52 -- (see 'withParser'). Only 'WOS' is implemented so far.
  53 data FileFormat = WOS        -- Implemented (ISI Format)
  54 --                | DOC        -- Not Implemented / import Pandoc
  55 --                | ODT        -- Not Implemented / import Pandoc
  56 --                | PDF        -- Not Implemented / pdftotext and import Pandoc ?
  57 --                | XML        -- Not Implemented / see :
  58 --                             -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
  59
  60 -- | Parse the file at @path@ with the parser for @format@. A @.zip@ file is
  61 -- expanded into its entries; every content is then parsed concurrently.
  62 parse :: FileFormat -> FilePath -> IO [Either String [[(DB.ByteString, DB.ByteString)]]]
  63 parse format path = do
  64     contents <- loadFrom (takeExtension path)
  65     mapConcurrently (runParser format) contents
  66   where loadFrom ".zip" = openZip path
  67         loadFrom _      = pure <$> DB.readFile path
  68
  69 -- | withParser: select the parser that matches the format of the text.
  70 -- TODO  withParser :: FileFormat -> Parser [Document]
  71 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
  72 withParser format = case format of
  73     WOS -> wosParser
  74 --  DOC -> docParser
  75 --  ODT -> odtParser
  76 --  XML -> xmlParser
  77 --  _   -> error "[ERROR] Parser not implemented yet"
  78
  79 -- | Apply the parser chosen by 'withParser' to the input; the result is only lifted with pure.
  80 runParser :: FileFormat -> DB.ByteString -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
  81 runParser fmt input = pure (parseOnly (withParser fmt) input)
  82
  83 -- | Read the content of every entry of the zip archive at @fp@. The archive
  84 -- is reopened once per entry, and the entries are read concurrently.
  85 openZip :: FilePath -> IO [DB.ByteString]
  86 openZip fp = do
  87     archive   <- resolveFile' fp
  88     selectors <- DM.keys <$> withArchive archive getEntries
  89     mapConcurrently (\sel -> withArchive archive (getEntry sel)) selectors
  90