]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers.hs
[FIX] multiterms FR extraction.
[gargantext.git] / src / Gargantext / Text / Parsers.hs
1 {-|
2 Module : Gargantext.Text.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
19 -}
20
21 {-# LANGUAGE NoImplicitPrelude #-}
22
23 module Gargantext.Text.Parsers -- (parse, FileFormat(..))
24 where
25
26 import Gargantext.Prelude
27
28 import System.FilePath (takeExtension, FilePath())
29 import Data.Attoparsec.ByteString (parseOnly, Parser)
30 import qualified Data.ByteString as DB
31 import qualified Data.Map as DM
32 import Data.Either.Extra (partitionEithers)
33 import Data.Ord()
34 import Data.Foldable (concat)
35 import Data.String()
36 import Data.Either.Extra(Either())
37
38 import Data.Text (Text)
39 import Data.Text.Encoding (decodeUtf8)
40 import qualified Data.Text as DT
41 ----
42 --import Control.Monad (join)
43 import Codec.Archive.Zip (withArchive, getEntry, getEntries)
44 import Path.IO (resolveFile')
45 ------ import qualified Data.ByteString.Lazy as B
46 --import Control.Applicative ( (<$>) )
47 import Control.Concurrent.Async as CCA (mapConcurrently)
48
49 import Data.String (String())
50 import Gargantext.Text.Parsers.WOS (wosParser)
51 ---- import Gargantext.Parsers.XML (xmlParser)
52 ---- import Gargantext.Parsers.DOC (docParser)
53 ---- import Gargantext.Parsers.ODT (odtParser)
54
55 --import Gargantext.Types.Main (ErrorMessage(), Corpus)
56
57 type ParseError = String
58 type Field = Text
59 type Document = DM.Map Field Text
60
61 type FilesParsed = DM.Map FilePath FileParsed
62 data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
63 , _fileParsed_result :: [Document]
64 } deriving (Show)
65
66
67 -- | According to the format of Input file,
68 -- different parser are available.
69 data FileFormat = WOS -- Implemented (ISI Format)
70 -- | DOC -- Not Implemented / import Pandoc
71 -- | ODT -- Not Implemented / import Pandoc
72 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
73 -- | XML -- Not Implemented / see :
74 -- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
75
76 -- TODO: to debug maybe add the filepath in error message
77
78
79 --parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
80 --parse format path = do
81 -- files <- case takeExtension path of
82 -- ".zip" -> openZip path
83 -- _ -> pure <$> DB.readFile path
84 -- (as, bs) <- partitionEithers <$> mapConcurrently (runParser format) files
85 -- pure (as, map toText $ concat bs)
86 -- where
87 -- -- TODO : decode with bayesian inference on encodings
88 -- toText = map (\(a,b) -> (decodeUtf8 a, decodeUtf8 b))
89 --
90 --
91 ---- | withParser:
92 ---- According the format of the text, choosing the right parser.
93 ---- TODO withParser :: FileFormat -> Parser [Document]
94 --withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
95 --withParser WOS = wosParser
96 ----withParser DOC = docParser
97 ----withParser ODT = odtParser
98 ----withParser XML = xmlParser
99 ----withParser _ = error "[ERROR] Parser not implemented yet"
100 --
101 --runParser :: FileFormat -> DB.ByteString
102 -- -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
103 --runParser format text = pure $ parseOnly (withParser format) text
104 --
105 --openZip :: FilePath -> IO [DB.ByteString]
106 --openZip fp = do
107 -- path <- resolveFile' fp
108 -- entries <- withArchive path (DM.keys <$> getEntries)
109 -- bs <- mapConcurrently (\s -> withArchive path (getEntry s)) entries
110 -- pure bs
111
112 clean :: Text -> Text
113 clean txt = DT.map clean' txt
114 where
115 clean' '’' = '\''
116 clean' c = c
117
118