]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers.hs
[FEAT] Adding External module for IMT community manager
[gargantext.git] / src / Gargantext / Text / Parsers.hs
1 {-|
2 Module : Gargantext.Text.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describes how to add a new parser to Gargantext;
18 please follow the types.
19 -}
20
21 {-# LANGUAGE NoImplicitPrelude #-}
22
23 module Gargantext.Text.Parsers -- (parse, FileFormat(..))
24 where
25
26 import Gargantext.Prelude
27
28 import System.FilePath (FilePath())
29 import qualified Data.Map as DM
30 import Data.Ord()
31 import Data.String()
32
33 import Data.Text (Text)
34 import qualified Data.Text as DT
35 -- Activate Async to parse in parallel
36 --import Control.Concurrent.Async as CCA (mapConcurrently)
37
38 import Data.String (String())
39
40
-- | A parse error, carried as a plain 'String'.
type ParseError = String

-- | Key type of a 'Document'.
type Field = Text

-- | A document: a map from each 'Field' to its 'Text' value.
type Document = DM.Map Field Text

-- | A map from each 'FilePath' to its 'FileParsed' value.
type FilesParsed = DM.Map FilePath FileParsed

-- | An optional 'ParseError' together with a list of 'Document's.
data FileParsed = FileParsed
  { _fileParsed_errors :: Maybe ParseError -- ^ 'Nothing' when there is no error to report
  , _fileParsed_result :: [Document]       -- ^ the list of documents
  }
  deriving (Show)
49
50
-- | Format of the input file. Depending on the format of the input file,
-- different parsers are available.
data FileFormat
  = WOS -- ^ Implemented (ISI Format)
  -- Formats that are not implemented yet. They are kept as ordinary comments
  -- (without a leading "|" marker) so that Haddock does not try to parse
  -- them as documentation attached to a declaration:
  --   DOC : Not Implemented / import Pandoc
  --   ODT : Not Implemented / import Pandoc
  --   PDF : Not Implemented / pdftotext and import Pandoc ?
  --   XML : Not Implemented / see:
  --         http://chrisdone.com/posts/fast-haskell-c-parsing-xml
  deriving (Show)
59
60 -- TODO: to ease debugging, maybe add the file path to the error message
61
62
63 --parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
64 --parse format path = do
65 -- files <- case takeExtension path of
66 -- ".zip" -> openZip path
67 -- _ -> pure <$> DB.readFile path
68 -- (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
69 -- pure (as, map toText $ concat bs)
70 -- where
71 -- -- TODO : decode with bayesian inference on encodings
72 -- toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
73 --
74 --
75 ---- | withParser:
76 ---- According to the format of the text, choose the right parser.
77 ---- TODO withParser :: FileFormat -> Parser [Document]
78 --withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
79 --withParser WOS = wosParser
80 ----withParser DOC = docParser
81 ----withParser ODT = odtParser
82 ----withParser XML = xmlParser
83 ----withParser _ = error "[ERROR] Parser not implemented yet"
84 --
85 --runParser :: FileFormat -> DB.ByteString
86 -- -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
87 --runParser format text = pure $ parseOnly (withParser format) text
88 --
89 --openZip :: FilePath -> IO [DB.ByteString]
90 --openZip fp = do
91 -- path <- resolveFile' fp
92 -- entries <- withArchive path (DM.keys <$> getEntries)
93 -- bs <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
94 -- pure bs
95
-- | Replace every typographic right single quotation mark (U+2019) in the
-- input with the plain ASCII apostrophe; all other characters are kept
-- unchanged.
clean :: Text -> Text
clean = DT.map straighten
  where
    -- Character-level substitution applied to the whole text.
    straighten ch = case ch of
      '’' -> '\''
      _   -> ch
101
102