2 Module : Gargantext.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
The parsers assume we know the format of the text (FileFormat data
type), according to which the right parser is chosen among the list of
available parsers.
This module mainly describes how to add a new parser to Gargantext;
please follow the types.
21 module Gargantext.Parsers -- (parse, FileFormat(..))
import Gargantext.Prelude

import Control.Exception (evaluate)
import Data.Foldable (concat)
import Data.String (String())
--import Control.Applicative ( (<$>) )
--import Control.Monad (join)

import Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Control.Concurrent.Async as CCA (mapConcurrently)
import Data.Attoparsec.ByteString (parseOnly, Parser)
import qualified Data.ByteString as DB
------ import qualified Data.ByteString.Lazy as B
import Data.Either.Extra (partitionEithers)
import Data.Either.Extra(Either())
import qualified Data.Map as DM
import Data.Text (Text)
import Data.Text.Encoding (decodeUtf8)
import Data.Text.Encoding (decodeUtf8With)
import Data.Text.Encoding.Error (lenientDecode)
import Path.IO (resolveFile')
import System.FilePath (takeExtension, FilePath())

import Gargantext.Parsers.WOS (wosParser)
---- import Gargantext.Parsers.XML (xmlParser)
---- import Gargantext.Parsers.DOC (docParser)
---- import Gargantext.Parsers.ODT (odtParser)
--import Gargantext.Prelude (pm)
--import Gargantext.Types.Main (ErrorMessage(), Corpus)
-- | Format of an input file; the format decides which parser is used.
data FileFormat
  = WOS -- ^ Implemented (ISI format).

-- Formats that are not implemented yet:
--   DOC : import Pandoc
--   ODT : import Pandoc
--   PDF : pdftotext and import Pandoc ?
--   XML : see http://chrisdone.com/posts/fast-haskell-c-parsing-xml

-- | Message describing why an input could not be parsed.
-- TODO: to debug maybe add the filepath in error message
type ParseError = String
-- | Parse the file at @path@ with the parser selected by @format@.
--
-- A path whose extension is exactly @.zip@ is handed to 'openZip' and every
-- buffer it returns is parsed; any other path is read as one single buffer.
-- The buffers are passed to 'runParser' through 'mapConcurrently'.
--
-- The result pairs the error messages of the buffers that failed to parse
-- with the documents of the buffers that parsed, converted by 'toText'.
parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
parse format path = do
  buffers <- loadBuffers
  (failures, parsed) <- partitionEithers <$> mapConcurrently (runParser format) buffers
  pure (failures, map toText (concat parsed))
  where
    -- The extension alone decides between the archive and plain-file paths.
    loadBuffers = case takeExtension path of
      ".zip" -> openZip path
      _      -> pure <$> DB.readFile path
-- | Decode both components of every pair from UTF-8.
--
-- Decoding is lenient: an invalid byte sequence is replaced by U+FFFD.
-- The strict @decodeUtf8@ throws a @UnicodeException@ on invalid input, and
-- because this function is pure and lazy that exception would only surface
-- when a consumer of 'parse' forced the offending value.
--
-- TODO : decode with bayesian inference on encodings
toText :: [(DB.ByteString, DB.ByteString)] -> [(Text, Text)]
toText = map (\(key, value) -> (decode key, decode value))
  where
    decode = decodeUtf8With lenientDecode
-- | Choose the parser that corresponds to the given file format.
-- TODO withParser :: FileFormat -> Parser [Document]
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser format = case format of
  WOS -> wosParser
  -- Alternatives kept for when the other parsers exist:
  -- DOC -> docParser
  -- ODT -> odtParser
  -- XML -> xmlParser
  -- _   -> error "[ERROR] Parser not implemented yet"
-- | Run the parser selected by @format@ over one input buffer.
--
-- Returns @Left@ with the parser's error message when parsing fails and
-- @Right@ with the parsed documents otherwise.
--
-- The result is forced with 'evaluate' so that the parse runs as part of
-- this action, in the thread that executes it. Returning the result with a
-- bare @pure@ would hand back an unevaluated thunk: callers running this
-- action through 'mapConcurrently' would see every thread finish at once
-- and the parsing would happen later, sequentially, in whoever inspects
-- the results.
runParser :: FileFormat -> DB.ByteString
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = evaluate (parseOnly (withParser format) text)
95 openZip :: FilePath -> IO [DB.ByteString]
97 path <- resolveFile' fp
98 entries <- withArchive path (DM.keys <$> getEntries)
99 bs <- mapConcurrently (\s -> withArchive path (getEntry s)) entries