2 Module : Gargantext.Core.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
21 {-# LANGUAGE PackageImports #-}
23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
27 import Control.Concurrent.Async as CCA (mapConcurrently)
28 import Control.Monad (join)
29 import Data.Attoparsec.ByteString (parseOnly, Parser)
30 import Data.Either(Either(..))
31 import Data.Either.Extra (partitionEithers)
32 import Data.List (concat, lookup)
34 import Data.String (String())
36 import Data.Text (Text)
37 import Data.Text.Encoding (decodeUtf8)
38 import Data.Tuple.Extra (both, first, second)
39 import System.FilePath (FilePath(), takeExtension)
40 import qualified Data.ByteString as DB
41 import qualified Data.ByteString.Char8 as DBC
42 import qualified Data.ByteString.Lazy as DBL
43 import qualified Data.Map as DM
44 import qualified Data.Text as DT
45 import qualified Prelude as Prelude
47 import Gargantext.Core (Lang(..))
48 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
49 import Gargantext.Prelude
50 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
51 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
52 -- import Gargantext.Core.Text.Learn (detectLangDefault)
53 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
54 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
55 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
56 ------------------------------------------------------------------------
58 type ParseError = String
60 --type Document = DM.Map Field Text
61 --type FilesParsed = DM.Map FilePath FileParsed
62 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
63 -- , _fileParsed_result :: [Document]
67 -- | According to the format of Input file,
68 -- different parser are available.
69 data FileFormat = WOS | RIS | RisPresse
74 -- Implemented (ISI Format)
75 -- | DOC -- Not Implemented / import Pandoc
76 -- | ODT -- Not Implemented / import Pandoc
77 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
78 -- | XML -- Not Implemented / see :
81 parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
82 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
83 parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
84 parseFormat RisPresse bs = do
85 docs <- mapM (toDoc RIS)
87 <$> enrichWith RisPresse
89 $ [runParser' RisPresse bs]
91 parseFormat WOS bs = do
92 docs <- mapM (toDoc WOS)
98 parseFormat ZIP _bs = do
99 printDebug "[parseFormat]" ZIP
100 pure $ Left "Not implemented for ZIP"
101 parseFormat _ _ = undefined
103 -- | Parse file into documents
104 -- TODO manage errors here
105 -- TODO: to debug maybe add the filepath in error message
106 parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
107 parseFile CsvHal p = parseHal p
108 parseFile CsvGargV3 p = parseCsv p
109 parseFile RisPresse p = do
110 docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
113 docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
116 docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
119 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
120 -- TODO use language for RIS
122 -- let abstract = lookup "abstract" d
123 let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
125 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
127 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
129 pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
130 , _hd_doi = lookup "doi" d
131 , _hd_url = lookup "URL" d
132 , _hd_uniqId = Nothing
133 , _hd_uniqIdBdd = Nothing
135 , _hd_title = lookup "title" d
136 , _hd_authors = Nothing
137 , _hd_institutes = lookup "authors" d
138 , _hd_source = lookup "source" d
139 , _hd_abstract = lookup "abstract" d
140 , _hd_publication_date = fmap (DT.pack . show) utcTime
141 , _hd_publication_year = pub_year
142 , _hd_publication_month = pub_month
143 , _hd_publication_day = pub_day
144 , _hd_publication_hour = Nothing
145 , _hd_publication_minute = Nothing
146 , _hd_publication_second = Nothing
147 , _hd_language_iso2 = Just $ (DT.pack . show) lang }
149 enrichWith :: FileFormat
150 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
151 enrichWith RisPresse = enrichWith' presseEnrich
152 enrichWith WOS = enrichWith' (map (first WOS.keys))
153 enrichWith _ = enrichWith' identity
156 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
157 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
158 enrichWith' f = second (map both' . map f . concat)
160 both' = map (both decodeUtf8)
164 readFileWith :: FileFormat -> FilePath
165 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
166 readFileWith format path = do
167 files <- case takeExtension path of
168 ".zip" -> openZip path
169 _ -> pure <$> clean <$> DB.readFile path
170 partitionEithers <$> mapConcurrently (runParser format) files
174 -- According to the format of the text, choose the right parser.
175 -- TODO withParser :: FileFormat -> Parser [Document]
176 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
177 withParser WOS = WOS.parser
178 withParser RIS = RIS.parser
179 --withParser ODT = odtParser
180 --withParser XML = xmlParser
181 withParser _ = panic "[ERROR] Parser not implemented yet"
183 runParser :: FileFormat -> DB.ByteString
184 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
185 runParser format text = pure $ runParser' format text
187 runParser' :: FileFormat -> DB.ByteString
188 -> (Either String [[(DB.ByteString, DB.ByteString)]])
189 runParser' format text = parseOnly (withParser format) text
191 openZip :: FilePath -> IO [DB.ByteString]
193 entries <- withArchive fp (DM.keys <$> getEntries)
194 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
197 cleanText :: Text -> Text
198 cleanText = cs . clean . cs
200 clean :: DB.ByteString -> DB.ByteString
201 clean txt = DBC.map clean' txt