]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Parsers.hs
[CODE/READ] with NP.
[gargantext.git] / src / Gargantext / Parsers.hs
1 {-|
2 Module : Gargantext.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describes how to add a new parser to Gargantext;
18 please follow the types.
19 -}
20
21 module Gargantext.Parsers -- (parse, FileFormat(..))
22 where
23
import Gargantext.Prelude

import System.FilePath (takeExtension, FilePath())
import Data.Attoparsec.ByteString (parseOnly, Parser)
import qualified Data.ByteString as DB
import qualified Data.Map as DM
import Data.Either.Extra (partitionEithers)
import Data.Ord()
import Data.Foldable (concat)
import Data.String()
import Data.Either.Extra(Either())

import Data.Text (Text)
import Data.Text.Encoding (decodeUtf8, decodeUtf8With)
import Data.Text.Encoding.Error (lenientDecode)
----
--import Control.Monad (join)
import Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Path.IO (resolveFile')
------ import qualified Data.ByteString.Lazy as B
--import Control.Applicative ( (<$>) )
import Control.Concurrent.Async as CCA (mapConcurrently)

import Data.String (String())
import Gargantext.Parsers.WOS (wosParser)
48 ---- import Gargantext.Parsers.XML (xmlParser)
49 ---- import Gargantext.Parsers.DOC (docParser)
50 ---- import Gargantext.Parsers.ODT (odtParser)
51
52 --import Gargantext.Prelude (pm)
53 --import Gargantext.Types.Main (ErrorMessage(), Corpus)
54
55 -- FIXME
56 --type Field = Text
-- | Message describing why an input could not be parsed.
-- It is a plain 'String' alias; 'parse' returns a list of these alongside
-- the successfully parsed documents.
type ParseError = String
58 --
59 --data Corpus = Corpus { _corpusErrors :: [ParseError]
60 -- , _corpusMap :: Map FilePath (Map Field Text)
61 -- }
62
63
-- | Formats of input file for which a parser can be selected.
--
-- 'WOS' is the only constructor; the other formats listed below are
-- commented out and not implemented.
data FileFormat
  = WOS     -- Implemented (ISI Format)
--  | DOC   -- Not Implemented / import Pandoc
--  | ODT   -- Not Implemented / import Pandoc
--  | PDF   -- Not Implemented / pdftotext and import Pandoc ?
--  | XML   -- Not Implemented / see :
--          -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
72
73 -- TODO: to ease debugging, maybe add the file path to the error messages
74
75
-- | Parse the file at @path@ with the parser selected by @format@.
--
-- When the extension of @path@ is exactly @.zip@ the file is opened with
-- 'openZip' and every entry is handed to 'runParser'; any other path is
-- read as a single input.
--
-- The result pairs the error messages of the inputs that failed to parse
-- with the concatenated results of the inputs that succeeded (presumably
-- one inner list of @(field, value)@ pairs per document -- confirm against
-- 'wosParser').
--
-- Fields are decoded as UTF-8. Invalid byte sequences are replaced with
-- U+FFFD rather than raising an exception: the strict 'decodeUtf8' used
-- previously is partial and would throw from pure code, lazily, long after
-- this action had returned and outside of the 'ParseError' channel.
parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
parse format path = do
  files <- case takeExtension path of
    ".zip" -> openZip path
    _      -> pure <$> DB.readFile path
  (errs, docs) <- partitionEithers <$> mapConcurrently (runParser format) files
  pure (errs, map toText $ concat docs)
  where
    -- TODO : decode with bayesian inference on encodings
    toText = map (\(k, v) -> (decodeField k, decodeField v))

    -- Total UTF-8 decoding: malformed input yields U+FFFD, never an exception.
    decodeField :: DB.ByteString -> Text
    decodeField = decodeUtf8With lenientDecode
86
87
-- TODO withParser :: FileFormat -> Parser [Document]

-- | Choose the parser that corresponds to the given 'FileFormat'.
--
-- Only 'WOS' is handled, by 'wosParser'; the alternatives kept in comments
-- below belong to formats that are not implemented.
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser format = case format of
  WOS -> wosParser
  -- DOC -> docParser
  -- ODT -> odtParser
  -- XML -> xmlParser
  -- _   -> error "[ERROR] Parser not implemented yet"
97
-- | Apply the parser selected by 'withParser' to @input@ using 'parseOnly'.
--
-- The 'Either' produced by 'parseOnly' is returned unchanged, wrapped in
-- 'IO' with 'pure'; no other effect is performed here.
runParser :: FileFormat -> DB.ByteString
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format input = pure (parseOnly parser input)
  where
    parser = withParser format
101
-- | Read the contents of every entry of the zip archive at @fp@.
--
-- The archive is opened once to list its entry selectors (the keys of the
-- map returned by 'getEntries'), then once more per entry, through
-- 'mapConcurrently', to fetch that entry's bytes with 'getEntry'.
openZip :: FilePath -> IO [DB.ByteString]
openZip fp = do
  archive   <- resolveFile' fp
  selectors <- withArchive archive (DM.keys <$> getEntries)
  mapConcurrently (\sel -> withArchive archive (getEntry sel)) selectors
108
109