]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers.hs
Parse and convert text and title of wikipedia dump, #4
[gargantext.git] / src / Gargantext / Text / Parsers.hs
1 {-|
2 Module : Gargantext.Text.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
19 -}
20
21 {-# LANGUAGE NoImplicitPrelude #-}
22
23 module Gargantext.Text.Parsers -- (parse, FileFormat(..))
24 where
25
26 import System.FilePath (FilePath(), takeExtension)
27 import Codec.Archive.Zip (withArchive, getEntry, getEntries)
28
29 import Data.Either.Extra (partitionEithers)
30 import Data.List (concat)
31 import qualified Data.Map as DM
32 import qualified Data.ByteString as DB
33 import Data.Ord()
34 import Data.String()
35 import Data.Either(Either(..))
36 import Data.Attoparsec.ByteString (parseOnly, Parser)
37
38 import Data.Text (Text)
39 import qualified Data.Text as DT
40
41 -- Activate Async for to parse in parallel
42 import Control.Concurrent.Async as CCA (mapConcurrently)
43
44 import Data.Text.Encoding (decodeUtf8)
45 import Data.String (String())
46
47 ------------------------------------------------------------------------
48 import Gargantext.Prelude
49 import Gargantext.Text.Parsers.WOS (wosParser)
50 ------------------------------------------------------------------------
51
52
53 type ParseError = String
54 type Field = Text
55 type Document = DM.Map Field Text
56
57 type FilesParsed = DM.Map FilePath FileParsed
58 data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
59 , _fileParsed_result :: [Document]
60 } deriving (Show)
61
62
63 -- | According to the format of Input file,
64 -- different parser are available.
65 data FileFormat = WOS -- Implemented (ISI Format)
66 -- | DOC -- Not Implemented / import Pandoc
67 -- | ODT -- Not Implemented / import Pandoc
68 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
69 -- | XML -- Not Implemented / see :
70 -- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
71
72 -- TODO: to debug maybe add the filepath in error message
73
74
75 parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
76 parse format path = do
77 files <- case takeExtension path of
78 ".zip" -> openZip path
79 _ -> pure <$> DB.readFile path
80 (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
81 pure (as, map toText $ concat bs)
82 where
83 -- TODO : decode with bayesian inference on encodings
84 toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
85
86
87 -- | withParser:
88 -- According the format of the text, choosing the right parser.
89 -- TODO withParser :: FileFormat -> Parser [Document]
90 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
91 withParser WOS = wosParser
92 --withParser DOC = docParser
93 --withParser ODT = odtParser
94 --withParser XML = xmlParser
95 --withParser _ = error "[ERROR] Parser not implemented yet"
96
97 runParser :: FileFormat -> DB.ByteString
98 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
99 runParser format text = pure $ parseOnly (withParser format) text
100
101 openZip :: FilePath -> IO [DB.ByteString]
102 openZip fp = do
103 entries <- withArchive fp (DM.keys <$> getEntries)
104 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
105 pure bs
106
107 clean :: Text -> Text
108 clean txt = DT.map clean' txt
109 where
110 clean' '’' = '\''
111 clean' c = c
112
113