]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers.hs
[DATE] parser -> UTCTime
[gargantext.git] / src / Gargantext / Text / Parsers.hs
1 {-|
2 Module : Gargantext.Text.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describes how to add a new parser to Gargantext;
18 please follow the types.
19 -}
20
21 {-# LANGUAGE NoImplicitPrelude #-}
22 {-# LANGUAGE PackageImports #-}
23
24 module Gargantext.Text.Parsers (parse, FileFormat(..), clean)
25 where
26
import System.FilePath (FilePath(), takeExtension)
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)

import Data.Either.Extra (partitionEithers)
import Data.List (concat)
import qualified Data.Map as DM
import qualified Data.ByteString as DB
import Data.Ord()
import Data.String()
import Data.Either(Either(..))
import Data.Attoparsec.ByteString (parseOnly, Parser)

import Data.Text (Text)
import qualified Data.Text as DT

-- Activate Async for to parse in parallel
import Control.Concurrent.Async as CCA (mapConcurrently)

import Data.Text.Encoding (decodeUtf8, decodeUtf8With)
import Data.Text.Encoding.Error (lenientDecode)
import Data.String (String())

------------------------------------------------------------------------
import Gargantext.Prelude
import Gargantext.Text.Parsers.WOS (wosParser)
51 ------------------------------------------------------------------------
52
-- | Message describing why a file could not be parsed; a plain 'String'
-- alias, used for the error half of the result of 'parse'.
type ParseError = String
54 --type Field = Text
55 --type Document = DM.Map Field Text
56 --type FilesParsed = DM.Map FilePath FileParsed
57 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
58 -- , _fileParsed_result :: [Document]
59 -- } deriving (Show)
60
61
-- | Format of the input file; the parser to run is chosen according to it.
-- 'WOS' is the only format available so far.
data FileFormat
  = WOS -- ^ Implemented (ISI Format)

-- Formats considered but not implemented yet:
--   DOC -- Not Implemented / import Pandoc
--   ODT -- Not Implemented / import Pandoc
--   PDF -- Not Implemented / pdftotext and import Pandoc ?
--   XML -- Not Implemented / see :
--       -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
70
71 -- TODO: to debug maybe add the filepath in error message
72
73
-- | Parse the file at @path@ with the parser selected by @format@.
--
-- When the extension of @path@ is @.zip@ the archive is opened with
-- 'openZip' and each of its entries is parsed as a separate file; any
-- other path is read as one single file.  The files are handed to
-- 'runParser' through 'mapConcurrently'.
--
-- The result pairs the error messages of the files that failed to parse
-- with the concatenated records of the files that succeeded.  Each record
-- is the list of pairs produced by the parser (presumably field name and
-- field value), decoded from UTF-8.
--
-- Decoding is lenient: an invalid UTF-8 byte sequence is replaced by
-- U+FFFD.  Plain 'decodeUtf8' throws on such input, and since the decoding
-- is lazy the exception would escape from whichever caller first forces
-- the offending field, aborting the whole import for one bad byte.
parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
parse format path = do
  files <- case takeExtension path of
    ".zip" -> openZip path
    _      -> pure <$> DB.readFile path
  (errs, docs) <- partitionEithers <$> mapConcurrently (runParser format) files
  pure (errs, map toText $ concat docs)
  where
    -- TODO : decode with bayesian inference on encodings
    toText = map (\(field, value) -> (decode field, decode value))
    -- Total replacement for 'decodeUtf8': never throws on malformed input.
    decode = decodeUtf8With lenientDecode
84
85
-- | Choose the parser that corresponds to the format of the text.
--
-- TODO withParser :: FileFormat -> Parser [Document]
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser format = case format of
  WOS -> wosParser
  -- Alternatives kept for the formats that are not implemented yet:
  -- DOC -> docParser
  -- ODT -> odtParser
  -- XML -> xmlParser
  -- _   -> error "[ERROR] Parser not implemented yet"
95
-- | Run the parser selected by @format@ on the raw contents of one file.
--
-- Returns 'Left' with the parser's error message when the input does not
-- parse, and 'Right' with the parsed records otherwise.
--
-- The result of 'parseOnly' is scrutinised before the action returns.
-- A bare @pure $ parseOnly ...@ only returns an unevaluated thunk, so when
-- this function is mapped with 'mapConcurrently' every worker would finish
-- at once and the parsing itself would be performed later, sequentially,
-- by whoever consumes the results.  Matching on the 'Either' makes the
-- parse run as part of this action, i.e. in the thread that executes it.
runParser :: FileFormat -> DB.ByteString
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text =
  case parseOnly (withParser format) text of
    Left  err  -> pure (Left err)
    Right docs -> pure (Right docs)
99
-- | Read the contents of every entry of the zip archive at the given path.
--
-- The archive is opened a first time to list its entries, then once per
-- entry (through 'mapConcurrently') to fetch that entry's bytes.
openZip :: FilePath -> IO [DB.ByteString]
openZip archivePath = do
  selectors <- withArchive archivePath (DM.keys <$> getEntries)
  mapConcurrently readEntry selectors
  where
    -- Fetch the bytes of one entry, re-opening the archive to do so.
    readEntry selector = withArchive archivePath (getEntry selector)
105
-- | Replace every right single quotation mark (@’@) of a text by a plain
-- ASCII apostrophe; all other characters are left unchanged.
clean :: Text -> Text
clean = DT.map straighten
  where
    straighten c = case c of
      '’' -> '\''
      _   -> c
111
112