{-|
Module      : Data.Gargantext.Parsers
Description : All parsers of Gargantext in one file.
Copyright   : (c) CNRS, 2017
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental

Gargantext enables analyzing semi-structured text that should be parsed
in order to be analyzed.

The parsers suppose we know the format of the text ('FileFormat' data
type), according to which the right parser is chosen among the list of
available parsers.

This module mainly describes how to add a new parser to Gargantext;
please follow the types.
-}
21 module Data.Gargantext.Parsers -- (parse, FileFormat(..))
24 import System.FilePath (takeExtension)
25 import Data.Attoparsec.ByteString (parseOnly, Parser)
26 import Data.ByteString as DB
28 ----import Data.Either.Extra(Either(..))
30 --import Control.Monad (join)
31 import Codec.Archive.Zip (withArchive, getEntry, getEntries)
32 import Path.IO (resolveFile')
33 ------ import qualified Data.ByteString.Lazy as B
34 --import Control.Applicative ( (<$>) )
35 import Control.Concurrent.Async as CCA (mapConcurrently)
38 import Data.Gargantext.Parsers.WOS (wosParser)
39 ---- import Data.Gargantext.Parsers.XML (xmlParser)
40 ---- import Data.Gargantext.Parsers.DOC (docParser)
41 ---- import Data.Gargantext.Parsers.ODT (odtParser)
43 --import Data.Gargantext.Prelude (pm)
44 --import Data.Gargantext.Types.Main (ErrorMessage(), Corpus)
-- | Format of the input file; the parser to run is chosen from it.
--
-- Only 'WOS' is implemented. Formats considered but not implemented yet:
--
--   * DOC: import Pandoc
--   * ODT: import Pandoc
--   * PDF: pdftotext and import Pandoc ?
--   * XML: see <http://chrisdone.com/posts/fast-haskell-c-parsing-xml>
data FileFormat
  = WOS -- ^ Implemented (ISI Format)
-- | Read the file at @path@ and parse its content according to @format@.
--
-- When the extension of @path@ is exactly @.zip@ the file is handed to
-- 'openZip' and each 'DB.ByteString' it returns is parsed on its own;
-- any other path is read in one go with 'DB.readFile' and parsed as a
-- single input. The inputs are processed with 'mapConcurrently', so the
-- result holds one 'Either' (as produced by 'runParser') per input.
parse :: FileFormat
      -> FilePath
      -> IO [Either String [[(DB.ByteString, DB.ByteString)]]]
parse format path = loadContents >>= mapConcurrently (runParser format)
  where
    -- Raw contents to parse: every archive entry, or the lone file.
    loadContents
      | takeExtension path == ".zip" = openZip path
      | otherwise                    = pure <$> DB.readFile path
-- TODO withParser :: FileFormat -> Parser [Document]

-- | Select the 'Parser' that matches the format of the text.
--
-- Only 'WOS' is handled for now; it maps to 'wosParser'.
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser format = case format of
  WOS -> wosParser
--  DOC -> docParser
--  ODT -> odtParser
--  XML -> xmlParser
--  _   -> error "[ERROR] Parser not implemented yet"
-- | Run the parser chosen by 'withParser' for @format@ over the given
-- input.
--
-- The parse itself is done by 'parseOnly'; its 'Either' result is simply
-- lifted into 'IO' with 'pure'.
runParser :: FileFormat
          -> DB.ByteString
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format = pure . parseOnly (withParser format)
79 openZip :: FilePath -> IO [DB.ByteString]
81 path <- resolveFile' fp
82 entries <- withArchive path (DM.keys <$> getEntries)
83 bs <- mapConcurrently (\s -> withArchive path (getEntry s)) entries