]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Parsers.hs
[CLEAN] fix gitignore on cabal files in order to minimize merge/error risks.
[gargantext.git] / src / Gargantext / Parsers.hs
1 {-|
2 Module : Gargantext.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describes how to add a new parser to Gargantext:
18 please follow the types.
19 -}
20
21 module Gargantext.Parsers -- (parse, FileFormat(..))
22 where
23
import Gargantext.Prelude

import System.FilePath (takeExtension, FilePath())
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.ByteString as DB
import Data.Map as DM
import Data.Ord()
import Data.String()
import Data.Either.Extra(Either())
----
--import Control.Monad (join)
import Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Path.IO (resolveFile')
------ import qualified Data.ByteString.Lazy as B
--import Control.Applicative ( (<$>) )
import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Exception (evaluate)

import Data.String (String())
import Gargantext.Parsers.WOS (wosParser)
43 ---- import Gargantext.Parsers.XML (xmlParser)
44 ---- import Gargantext.Parsers.DOC (docParser)
45 ---- import Gargantext.Parsers.ODT (odtParser)
46
47 --import Gargantext.Prelude (pm)
48 --import Gargantext.Types.Main (ErrorMessage(), Corpus)
49
50
-- | Format of an input file. The constructor given to 'parse' decides
-- which parser is run on the file's contents.
--
-- Only 'WOS' is available for now; the entries listed as TODO below are
-- formats that have no parser yet.
data FileFormat
  = WOS         -- Implemented (ISI Format)
  -- TODO | DOC -- Not Implemented / import Pandoc
  -- TODO | ODT -- Not Implemented / import Pandoc
  -- TODO | PDF -- Not Implemented / pdftotext and import Pandoc ?
  -- TODO | XML -- Not Implemented / see :
  --            -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
59
-- | Parse the file at the given path with the parser of the given format.
--
-- A path whose extension is exactly @.zip@ is opened with 'openZip', which
-- yields one 'DB.ByteString' per archive entry; any other path is read as
-- a single 'DB.ByteString'. Each of them is then handed to 'runParser'
-- through 'mapConcurrently', so the result holds one 'Either' per input.
parse :: FileFormat -> FilePath
      -> IO [Either String [[(DB.ByteString, DB.ByteString)]]]
parse format path = do
  contents <- readContents (takeExtension path)
  mapConcurrently (runParser format) contents
  where
    -- Dispatch on the file extension: archive entries versus one plain file.
    readContents :: FilePath -> IO [DB.ByteString]
    readContents ".zip" = openZip path
    readContents _      = pure <$> DB.readFile path
67
68
-- | Pick the parser that corresponds to a 'FileFormat'.
--
-- TODO: the intended type is @FileFormat -> Parser [Document]@.
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser format = case format of
  WOS -> wosParser
  -- Alternatives waiting for their parsers:
  -- DOC -> docParser
  -- ODT -> odtParser
  -- XML -> xmlParser
  -- _   -> error "[ERROR] Parser not implemented yet"
78
-- | Run the parser selected by the 'FileFormat' on one input.
--
-- The result is whatever 'parseOnly' returns for @withParser format@
-- applied to @text@.
--
-- The result is forced to weak head normal form with 'evaluate' before
-- the action returns. With a bare @pure@ the action would only hand back
-- an unevaluated thunk, so callers running several of these actions
-- concurrently (as 'parse' does) would get no parallelism: the parsing
-- would happen later, in whichever thread first inspects the result.
runParser :: FileFormat -> DB.ByteString
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = evaluate (parseOnly (withParser format) text)
82
-- | Read every entry of the zip archive at the given path.
--
-- The path is first resolved with @resolveFile'@. The archive is opened
-- once to list the entry keys, then opened again for each entry (via
-- 'mapConcurrently') to fetch that entry with 'getEntry'. The result
-- holds one 'DB.ByteString' per entry key.
openZip :: FilePath -> IO [DB.ByteString]
openZip fp = do
  archive   <- resolveFile' fp
  selectors <- withArchive archive (DM.keys <$> getEntries)
  mapConcurrently (\selector -> withArchive archive (getEntry selector)) selectors
89
90