]> Git — Sourcephile - gargantext.git/blob - src/Data/Gargantext/Parsers.hs
[DRAFT] Parser main functions, for meeting.
[gargantext.git] / src / Data / Gargantext / Parsers.hs
1 {-|
2 Module : Data.Gargantext.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : alexandre.delanoe@iscpif.fr
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers assume that the format of the input is known ('FileFormat'
14 data type); the right parser is chosen accordingly among the list of
15 available parsers.
16
17 This module mainly describes how to add a new parser to Gargantext;
18 please follow the types.
19 -}
20
21
22 module Data.Gargantext.Parsers ( module Data.Gargantext.Parsers.WOS
23 --, module Data.Gargantext.Parsers.XML
24 --, module Data.Gargantext.Parsers.DOC
25 --, module Data.Gargantext.Parsers.ODS
26 )
27 where
28
29
30 import Data.Attoparsec.ByteString
31 import Data.ByteString (ByteString)
32 import Data.Map as DM
33 import Data.Either.Extra(Either(..))
34
35 import Control.Monad (join)
36 import Codec.Archive.Zip
37 import Path.IO (resolveFile')
38 -- import qualified Data.ByteString.Lazy as B
39 import Control.Applicative ( (<$>) )
40
41
42
43 import Control.Concurrent.Async as CCA (mapConcurrently)
44
45
46 import Data.Gargantext.Parsers.WOS (wosParser)
47 -- import Data.Gargantext.Parsers.XML (xmlParser)
48 -- import Data.Gargantext.Parsers.DOC (docParser)
49 -- import Data.Gargantext.Parsers.ODS (odsParser)
50
51 import Data.Gargantext.Prelude
52 import Data.Gargantext.Types.Main (ErrorMessage(), GargParser(), Corpus)
53
54
-- | Formats an input file may come in. The format determines which parser
-- is used (see 'withParser'); only 'WOS' has a parser so far.
data FileFormat
  = WOS -- ^ Implemented (ISI format).
  | XML -- ^ Not implemented; see
        --   <http://chrisdone.com/posts/fast-haskell-c-parsing-xml>.
  | DOC -- ^ Not implemented; would import Pandoc.
  | ODS -- ^ Not implemented; would import Pandoc.
  | PDF -- ^ Not implemented; pdftotext and import Pandoc?
63
64
65
-- | Select the parser matching the format of the input.
--
-- Only 'WOS' is backed by a real parser. Every other format evaluates to a
-- call to 'error', so forcing the returned parser for those formats throws.
-- Once they exist, the placeholders are meant to be replaced by
-- @xmlParser@, @docParser@ and @odsParser@ respectively.
withParser :: FileFormat -> GargParser
withParser format = case format of
  WOS -> wosParser
  XML -> notImplemented -- future: xmlParser
  DOC -> notImplemented -- future: docParser
  ODS -> notImplemented -- future: odsParser
  PDF -> notImplemented
  where
    -- Shared failure for every format that has no parser yet.
    notImplemented = error "[ERROR] Parser not implemented yet"
74
75
-- | Run the parser selected for @format@ over the whole input @text@,
-- using attoparsec's 'parseOnly'.
--
-- A parse failure is returned as 'Left'. Note that 'withParser' calls
-- 'error' for formats without a parser, so for those formats this throws
-- instead of returning 'Left'.
runParser :: FileFormat -> ByteString -> Either ErrorMessage (IO (Maybe Corpus))
runParser format text = parseOnly parser text
  where
    -- The parser is chosen purely from the declared format.
    parser = withParser format
78
79
-- | Placeholder: not implemented yet.
--
-- Presumably meant to parse the content of a zip archive into a 'Corpus'
-- (TODO confirm the role of the 'ByteString' argument once implemented).
-- Fails with an explicit message rather than a bare 'undefined', so a
-- caller hitting this stub can tell which function is missing.
parseZip :: FilePath -> ByteString -> IO Corpus
parseZip = error "[ERROR] parseZip not implemented yet"
82
-- | Parse one file's content with the parser for the given format and
-- return the 'length' of the parse result.
--
-- A parse failure is not reported: it yields @0@, which cannot be told
-- apart from a successful parse of length zero.
--
-- NOTE(review): 'length' is applied directly to the value produced by
-- 'runParser'; given that function's declared result type, confirm that
-- this typechecks and that 'Corpus' is the intended return type.
parseFile :: FileFormat -> ByteString -> IO Corpus
parseFile format contents = summarize (runParser format contents)
  where
    -- Matching here keeps the parse forced when the action is built,
    -- exactly as a top-level @case@ would.
    summarize (Right parsed) = pure $ length parsed
    summarize (Left _)       = pure 0
87
88
-- | Read the content of every entry of the zip archive located at @fp@.
--
-- The path is first resolved with 'resolveFile''. The entry selectors are
-- listed in one pass over the archive, then each entry is read in its own
-- 'withArchive' call, all of them running concurrently. The contents are
-- returned in the order of the selectors ('DM.keys' of the entry map).
openZipFiles :: FilePath -> IO [ByteString]
openZipFiles fp = do
  archive   <- resolveFile' fp
  selectors <- withArchive archive (DM.keys <$> getEntries)
  mapConcurrently (\selector -> withArchive archive (getEntry selector)) selectors
95
96
-- | Parse every file of the zip archive at @fp@ as 'WOS', concurrently,
-- and collect one result per file (see 'parseFile' for what is counted).
wosParserTest :: FilePath -> IO [Int]
wosParserTest fp = do
  contents <- openZipFiles fp
  mapConcurrently (parseFile WOS) contents
99
100