2 Module : Gargantext.Core.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
13 The parsers assume that the format of the text is known (FileFormat data
14 type); the right parser is chosen according to that format.
17 This module mainly describes how to add a new parser to Gargantext;
18 please follow the types.
21 {-# LANGUAGE PackageImports #-}
23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
28 import Control.Concurrent.Async as CCA (mapConcurrently)
29 import Control.Monad.Identity (runIdentity)
30 import Data.Attoparsec.ByteString (parseOnly, Parser)
31 import Control.Monad (join)
32 import Data.Either(Either(..))
33 import Data.Either.Extra (partitionEithers)
34 import Data.List (concat, lookup)
36 import Data.String (String())
38 import Data.Text (Text)
39 import Data.Text.Encoding (decodeUtf8)
40 import Data.Tuple.Extra (both, first, second)
41 import System.FilePath (FilePath(), takeExtension)
42 import qualified Data.ByteString as DB
43 import qualified Data.ByteString.Char8 as DBC
44 import qualified Data.ByteString.Lazy as DBL
45 import qualified Data.Map as DM
46 import qualified Data.Text as DT
47 import qualified Prelude as Prelude
48 import System.IO.Temp (emptySystemTempFile)
50 import Gargantext.Core (Lang(..))
51 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
52 import Gargantext.Prelude
53 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv', parseCsvC)
54 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
55 -- import Gargantext.Core.Text.Learn (detectLangDefault)
56 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
57 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
58 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
59 ------------------------------------------------------------------------
-- | Error message of a failed parse; 'readFileWith' collects these from the
-- @Left@ results returned by the parser.
61 type ParseError = String
63 --type Document = DM.Map Field Text
64 --type FilesParsed = DM.Map FilePath FileParsed
65 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
66 -- , _fileParsed_result :: [Document]
70 -- | According to the format of the input file,
71 -- different parsers are available.
72 data FileFormat = WOS | RIS | RisPresse
-- NOTE(review): this module also matches on CsvGargV3, CsvHal and ZIP, so the
-- declaration presumably continues with more constructors — confirm.
77 -- Implemented (ISI Format)
78 -- | DOC -- Not Implemented / import Pandoc
79 -- | ODT -- Not Implemented / import Pandoc
80 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
81 -- | XML -- Not Implemented / see :
-- | Parse the strict bytes of one input of the given format into a conduit
-- that yields 'HyperdataDocument's; a @Left@ carries an error message.
--
-- NOTE(review): 'CsvHal' goes through 'parseCsvC' here, whereas 'parseFormat'
-- uses 'parseHal'' for the same format — confirm this is intended.
-- NOTE(review): any format not matched by an earlier equation reaches
-- 'undefined' and crashes when evaluated instead of returning a @Left@.
83 parseFormatC :: FileFormat -> DB.ByteString -> IO (Either Prelude.String (ConduitT () HyperdataDocument IO ()))
-- Both CSV variants lift the pure conduit produced by 'parseCsvC' into IO.
84 parseFormatC CsvGargV3 bs = pure $ transPipe (pure . runIdentity) <$> (parseCsvC $ DBL.fromStrict bs)
85 parseFormatC CsvHal bs = pure $ transPipe (pure . runIdentity) <$> (parseCsvC $ DBL.fromStrict bs)
86 parseFormatC RisPresse bs = do
88 <$> enrichWith RisPresse
90 $ [runParser' RisPresse bs]
-- Each parsed record is converted with 'toDoc' while streaming; RisPresse
-- records are tagged as RIS.
91 pure $ (\docs' -> yieldMany docs' .| mapMC (toDoc RIS)) <$> docs
92 parseFormatC WOS bs = do
97 pure $ (\docs' -> yieldMany docs' .| mapMC (toDoc WOS)) <$> docs
98 parseFormatC ZIP bs = do
-- A temporary file is created so the archive can be opened by path.
-- NOTE(review): no removal of this temporary file is visible here — confirm.
99 path <- emptySystemTempFile "parsed-zip"
101 parsedZip <- withArchive path $ do
102 DM.keys <$> getEntries
-- ZIP input is not supported: the entry list is only reported in the error.
103 pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
104 parseFormatC _ _ = undefined
-- | Parse the strict bytes of one input of the given format into a list of
-- 'HyperdataDocument's; a @Left@ carries an error message.
--
-- NOTE(review): any format not matched by an earlier equation reaches
-- 'undefined' and crashes when evaluated instead of returning a @Left@.
106 parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
-- The CSV variants are parsed purely, from a lazy copy of the input.
107 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
108 parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
109 parseFormat RisPresse bs = do
-- RisPresse records are converted with the RIS tag.
110 docs <- mapM (toDoc RIS)
112 <$> enrichWith RisPresse
114 $ [runParser' RisPresse bs]
116 parseFormat WOS bs = do
117 docs <- mapM (toDoc WOS)
121 $ [runParser' WOS bs]
123 parseFormat ZIP bs = do
-- A temporary file is created so the archive can be opened by path.
-- NOTE(review): no removal of this temporary file is visible here — confirm.
124 path <- emptySystemTempFile "parsed-zip"
126 parsedZip <- withArchive path $ do
127 DM.keys <$> getEntries
-- ZIP input is not supported: the entry list is only reported in the error.
128 pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
129 parseFormat _ _ = undefined
131 -- | Parse file into documents
132 -- TODO manage errors here
133 -- TODO: to debug maybe add the filepath in error message
-- The CSV formats are delegated to 'parseHal' and 'parseCsv'. The other
-- visible cases read the file with 'readFileWith', enrich the parsed fields
-- with 'enrichWith' and convert every record with 'toDoc'.
-- NOTE(review): only the second component ('snd') of the 'readFileWith'
-- result is kept, so the collected parse errors appear to be dropped — confirm.
134 parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
135 parseFile CsvHal p = parseHal p
136 parseFile CsvGargV3 p = parseCsv p
137 parseFile RisPresse p = do
-- RisPresse files are read and converted as RIS; only the enrichment step
-- is specific to RisPresse.
138 docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
141 docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
144 docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
-- | Build a 'HyperdataDocument' from the key/value fields of one parsed
-- record. In the visible lines the format is only used to fill '_hd_bdd'.
-- The publication date is parsed by 'Date.dateSplit', hence the 'IO' result.
147 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
148 -- TODO use language for RIS
150 -- let abstract = lookup "abstract" d
-- The language is hard-coded to EN; the detection code is commented out.
151 let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
-- Date text: the "PY" field, a space and the "publication_date" field are
-- concatenated (an absent field contributes nothing), then every "-" is
-- replaced by a space.
153 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
155 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
157 pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
158 , _hd_doi = lookup "doi" d
159 , _hd_url = lookup "URL" d
160 , _hd_uniqId = Nothing
161 , _hd_uniqIdBdd = Nothing
163 , _hd_title = lookup "title" d
-- NOTE(review): the "authors" field is stored in '_hd_institutes' while
-- '_hd_authors' is always Nothing — confirm this is intended.
164 , _hd_authors = Nothing
165 , _hd_institutes = lookup "authors" d
166 , _hd_source = lookup "source" d
167 , _hd_abstract = lookup "abstract" d
168 , _hd_publication_date = fmap (DT.pack . show) utcTime
169 , _hd_publication_year = pub_year
170 , _hd_publication_month = pub_month
171 , _hd_publication_day = pub_day
-- Only year, month and day are filled from the parsed date.
172 , _hd_publication_hour = Nothing
173 , _hd_publication_minute = Nothing
174 , _hd_publication_second = Nothing
175 , _hd_language_iso2 = Just $ (DT.pack . show) lang }
-- | Pick the enrichment step for a file format and apply it, through
-- 'enrichWith'', to the parsed records held in the second component of
-- the pair.
--
-- * 'RisPresse' records go through 'presseEnrich'.
-- * 'WOS' records have the first element of every pair rewritten by 'WOS.keys'.
-- * Records of any other format are passed on without enrichment.
enrichWith :: FileFormat
           -> (a, [[[(DB.ByteString, DB.ByteString)]]])
           -> (a, [[(Text, Text)]])
enrichWith format = case format of
  RisPresse -> enrichWith' presseEnrich
  WOS       -> enrichWith' (map (first WOS.keys))
  _         -> enrichWith' identity
-- | Apply an enrichment function to parsed records and decode them to 'Text'.
--
-- Only the second component of the pair is touched: its outer list level is
-- concatenated away, @f@ is run on every record, and each element of every
-- pair is decoded with 'decodeUtf8'.
enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
            -> (a, [[[(DB.ByteString, DB.ByteString)]]])
            -> (a, [[(Text, Text)]])
enrichWith' f = second (map (decodePairs . f) . concat)
  where
    -- Decode both elements of every pair of a record.
    decodePairs = map (both decodeUtf8)
-- | Read @path@ and run the parser selected by @format@ on its contents.
--
-- A path whose extension is exactly @.zip@ is expanded with 'openZip' and
-- every entry becomes one input. Any other file is read whole, passed
-- through 'clean' and treated as a single input. The inputs are parsed
-- concurrently, and the outcomes are split into the parse errors and the
-- successfully parsed inputs.
readFileWith :: FileFormat -> FilePath
             -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
readFileWith format path = do
  inputs <- case takeExtension path of
    ".zip" -> openZip path
    _      -> (pure . clean) <$> DB.readFile path
  outcomes <- mapConcurrently (runParser format) inputs
  pure $ partitionEithers outcomes
-- | According to the format of the text, choose the right parser.
--
-- 'RisPresse' shares the RIS parser: 'parseFile' already reads RisPresse
-- files with the RIS parser, and 'parseFormat' / 'parseFormatC' call
-- 'runParser'' with 'RisPresse' directly, which previously fell through to
-- the 'panic' clause below.
--
-- Any other format still panics, since no parser exists for it.
-- TODO withParser :: FileFormat -> Parser [Document]
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS       = WOS.parser
withParser RIS       = RIS.parser
withParser RisPresse = RIS.parser
--withParser ODT = odtParser
--withParser XML = xmlParser
withParser _         = panic "[ERROR] Parser not implemented yet"
-- | 'IO' variant of 'runParser'': the pure parse result is simply lifted
-- with 'pure', which gives the parser the shape 'mapConcurrently' expects.
runParser :: FileFormat -> DB.ByteString
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format = pure . runParser' format
-- | Run the parser chosen by 'withParser' for @format@ on the given bytes
-- with 'parseOnly'. A @Left@ holds the parser's error message, a @Right@
-- the parsed records.
runParser' :: FileFormat -> DB.ByteString
           -> (Either String [[(DB.ByteString, DB.ByteString)]])
runParser' format = parseOnly (withParser format)
-- | Read the contents of the entries of the zip archive at the given path.
219 openZip :: FilePath -> IO [DB.ByteString]
-- First list the entry selectors of the archive.
221 entries <- withArchive fp (DM.keys <$> getEntries)
-- Then fetch the entries concurrently; the archive is opened once per entry.
222 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
-- | 'Text' version of 'clean': the text is converted to a strict
-- 'DB.ByteString' with 'cs', cleaned, and converted back.
cleanText :: Text -> Text
cleanText txt = cs (clean (cs txt))
228 clean :: DB.ByteString -> DB.ByteString
229 clean txt = DBC.map clean' txt