2 Module : Gargantext.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
21 {-# LANGUAGE NoImplicitPrelude #-}
22 {-# LANGUAGE PackageImports #-}
23 {-# LANGUAGE OverloadedStrings #-}
25 module Gargantext.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText)
28 --import Data.ByteString (ByteString)
29 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
30 import Control.Concurrent.Async as CCA (mapConcurrently)
31 import Control.Monad (join)
32 import qualified Data.ByteString.Char8 as DBC
33 import Data.Attoparsec.ByteString (parseOnly, Parser)
34 import Data.Either(Either(..))
35 import Data.Either.Extra (partitionEithers)
36 import Data.List (concat)
37 import Data.List (lookup)
39 import Data.String (String())
41 import Data.Text (Text)
42 import Data.Text.Encoding (decodeUtf8)
43 import Data.Tuple.Extra (both, first, second)
44 import System.FilePath (FilePath(), takeExtension)
45 import qualified Data.ByteString as DB
46 import qualified Data.Map as DM
47 import qualified Data.Text as DT
48 import Gargantext.Core (Lang(..))
49 import Gargantext.Prelude
50 import Gargantext.Database.Types.Node (HyperdataDocument(..))
51 import qualified Gargantext.Text.Corpus.Parsers.WOS as WOS
52 import qualified Gargantext.Text.Corpus.Parsers.RIS as RIS
53 import Gargantext.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
54 import qualified Gargantext.Text.Corpus.Parsers.Date as Date
55 import Gargantext.Text.Corpus.Parsers.CSV (parseHal)
56 import Gargantext.Text.Terms.Stop (detectLang)
57 ------------------------------------------------------------------------
59 type ParseError = String
61 --type Document = DM.Map Field Text
62 --type FilesParsed = DM.Map FilePath FileParsed
63 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
64 -- , _fileParsed_result :: [Document]
68 -- | According to the format of Input file,
69 -- different parser are available.
70 data FileFormat = WOS | RIS | RisPresse
71 | CsvGargV3 | CsvHalFormat
74 -- Implemented (ISI Format)
75 -- | DOC -- Not Implemented / import Pandoc
76 -- | ODT -- Not Implemented / import Pandoc
77 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
78 -- | XML -- Not Implemented / see :
82 parseFormat :: FileFormat -> ByteString -> [HyperdataDocument]
83 parseFormat = undefined
86 -- | Parse file into documents
87 -- TODO manage errors here
88 -- TODO: to debug maybe add the filepath in error message
89 parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
90 parseFile CsvHalFormat p = parseHal p
91 parseFile RisPresse p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
92 parseFile WOS p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
93 parseFile ff p = join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
95 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
96 -- TODO use language for RIS
98 let abstract = lookup "abstract" d
99 let lang = maybe EN identity (join $ detectLang <$> (fmap (DT.take 50) abstract))
101 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
103 (utcTime, (pub_year, pub_month, pub_day)) <- Date.split lang dateToParse
105 pure $ HyperdataDocument (Just $ DT.pack $ show ff)
115 (lookup "abstract" d)
116 (fmap (DT.pack . show) utcTime)
123 (Just $ (DT.pack . show) lang)
125 enrichWith :: FileFormat
126 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
127 enrichWith RisPresse = enrichWith' presseEnrich
128 enrichWith WOS = enrichWith' (map (first WOS.keys))
129 enrichWith _ = enrichWith' identity
132 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
133 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
134 enrichWith' f = second (map both' . map f . concat)
136 both' = map (both decodeUtf8)
138 readFileWith :: FileFormat -> FilePath
139 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
140 readFileWith format path = do
141 files <- case takeExtension path of
142 ".zip" -> openZip path
143 _ -> pure <$> clean <$> DB.readFile path
144 partitionEithers <$> mapConcurrently (runParser format) files
148 -- According to the format of the text, choose the right parser.
149 -- TODO withParser :: FileFormat -> Parser [Document]
150 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
151 withParser WOS = WOS.parser
152 withParser RIS = RIS.parser
153 --withParser ODT = odtParser
154 --withParser XML = xmlParser
155 withParser _ = panic "[ERROR] Parser not implemented yet"
157 runParser :: FileFormat -> DB.ByteString
158 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
159 runParser format text = pure $ parseOnly (withParser format) text
161 openZip :: FilePath -> IO [DB.ByteString]
163 entries <- withArchive fp (DM.keys <$> getEntries)
164 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
167 cleanText :: Text -> Text
168 cleanText = cs . clean . cs
170 clean :: DB.ByteString -> DB.ByteString
171 clean txt = DBC.map clean' txt