]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Parsers.hs
Integrate the orchestrator with a route to query scrapyd
[gargantext.git] / src / Gargantext / Parsers.hs
1 {-|
2 Module : Gargantext.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
19 -}
20
21 {-# LANGUAGE NoImplicitPrelude #-}
22
23 module Gargantext.Parsers -- (parse, FileFormat(..))
24 where
25
26 import Gargantext.Prelude
27
28 import System.FilePath (takeExtension, FilePath())
29 import Data.Attoparsec.ByteString (parseOnly, Parser)
30 import qualified Data.ByteString as DB
31 import qualified Data.Map as DM
32 import Data.Either.Extra (partitionEithers)
33 import Data.Ord()
34 import Data.Foldable (concat)
35 import Data.String()
36 import Data.Either.Extra(Either())
37
38 import Data.Text (Text)
39 import Data.Text.Encoding (decodeUtf8)
40 ----
41 --import Control.Monad (join)
42 import Codec.Archive.Zip (withArchive, getEntry, getEntries)
43 import Path.IO (resolveFile')
44 ------ import qualified Data.ByteString.Lazy as B
45 --import Control.Applicative ( (<$>) )
46 import Control.Concurrent.Async as CCA (mapConcurrently)
47
48 import Data.String (String())
49 import Gargantext.Parsers.WOS (wosParser)
50 ---- import Gargantext.Parsers.XML (xmlParser)
51 ---- import Gargantext.Parsers.DOC (docParser)
52 ---- import Gargantext.Parsers.ODT (odtParser)
53
54 --import Gargantext.Prelude (pm)
55 --import Gargantext.Types.Main (ErrorMessage(), Corpus)
56
57 -- FIXME
58 --type Field = Text
59 type ParseError = String
60 --
61 --data Corpus = Corpus { _corpusErrors :: [ParseError]
62 -- , _corpusMap :: Map FilePath (Map Field Text)
63 -- }
64
65
66 -- | According to the format of Input file,
67 -- different parser are available.
68 data FileFormat = WOS -- Implemented (ISI Format)
69 -- | DOC -- Not Implemented / import Pandoc
70 -- | ODT -- Not Implemented / import Pandoc
71 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
72 -- | XML -- Not Implemented / see :
73 -- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
74
75 -- TODO: to debug maybe add the filepath in error message
76
77
78 parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
79 parse format path = do
80 files <- case takeExtension path of
81 ".zip" -> openZip path
82 _ -> pure <$> DB.readFile path
83 (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
84 pure (as, map toText $ concat bs)
85 where
86 -- TODO : decode with bayesian inference on encodings
87 toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
88
89
90 -- | withParser:
91 -- According the format of the text, choosing the right parser.
92 -- TODO withParser :: FileFormat -> Parser [Document]
93 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
94 withParser WOS = wosParser
95 --withParser DOC = docParser
96 --withParser ODT = odtParser
97 --withParser XML = xmlParser
98 --withParser _ = error "[ERROR] Parser not implemented yet"
99
100 runParser :: FileFormat -> DB.ByteString
101 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
102 runParser format text = pure $ parseOnly (withParser format) text
103
104 openZip :: FilePath -> IO [DB.ByteString]
105 openZip fp = do
106 path <- resolveFile' fp
107 entries <- withArchive path (DM.keys <$> getEntries)
108 bs <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
109 pure bs
110
111