2 Module : Gargantext.Core.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
21 {-# LANGUAGE PackageImports #-}
23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
27 import Control.Concurrent.Async as CCA (mapConcurrently)
28 import Control.Monad (join, sequence)
29 import Control.Monad.IO.Class (liftIO)
30 import Data.Attoparsec.ByteString (parseOnly, Parser)
31 import Data.Either(Either(..))
32 import Data.Either.Extra (partitionEithers)
33 import Data.List (concat, lookup)
35 import Data.String (String())
37 import Data.Text (Text)
38 import Data.Text.Encoding (decodeUtf8)
39 import Data.Tuple.Extra (both, first, second)
40 import System.FilePath (FilePath(), takeExtension)
41 import qualified Data.ByteString as DB
42 import qualified Data.ByteString.Char8 as DBC
43 import qualified Data.ByteString.Lazy as DBL
44 import qualified Data.Map as DM
45 import qualified Data.Text as DT
46 import qualified Prelude as Prelude
47 import System.IO.Temp (emptySystemTempFile)
49 import Gargantext.Core (Lang(..))
50 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
51 import Gargantext.Prelude
52 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
53 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
54 -- import Gargantext.Core.Text.Learn (detectLangDefault)
55 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
56 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
57 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
58 ------------------------------------------------------------------------
60 type ParseError = String
62 --type Document = DM.Map Field Text
63 --type FilesParsed = DM.Map FilePath FileParsed
64 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
65 -- , _fileParsed_result :: [Document]
69 -- | According to the format of Input file,
70 -- different parser are available.
71 data FileFormat = WOS | RIS | RisPresse
76 -- Implemented (ISI Format)
77 -- | DOC -- Not Implemented / import Pandoc
78 -- | ODT -- Not Implemented / import Pandoc
79 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
80 -- | XML -- Not Implemented / see :
83 parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
84 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
85 parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
86 parseFormat RisPresse bs = do
87 docs <- mapM (toDoc RIS)
89 <$> enrichWith RisPresse
91 $ [runParser' RisPresse bs]
93 parseFormat WOS bs = do
94 docs <- mapM (toDoc WOS)
100 parseFormat ZIP bs = do
101 path <- emptySystemTempFile "parsed.zip"
103 withArchive path $ do
104 files <- DM.keys <$> getEntries
105 filesContents <- mapM getEntry files
106 ddocs <- liftIO $ mapM (parseFormat CsvGargV3) filesContents
107 pure $ concat <$> sequence ddocs
108 parseFormat _ _ = undefined
110 -- | Parse file into documents
111 -- TODO manage errors here
112 -- TODO: to debug maybe add the filepath in error message
113 parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
114 parseFile CsvHal p = parseHal p
115 parseFile CsvGargV3 p = parseCsv p
116 parseFile RisPresse p = do
117 docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
120 docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
123 docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
126 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
127 -- TODO use language for RIS
129 -- let abstract = lookup "abstract" d
130 let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
132 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
134 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
136 pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
137 , _hd_doi = lookup "doi" d
138 , _hd_url = lookup "URL" d
139 , _hd_uniqId = Nothing
140 , _hd_uniqIdBdd = Nothing
142 , _hd_title = lookup "title" d
143 , _hd_authors = Nothing
144 , _hd_institutes = lookup "authors" d
145 , _hd_source = lookup "source" d
146 , _hd_abstract = lookup "abstract" d
147 , _hd_publication_date = fmap (DT.pack . show) utcTime
148 , _hd_publication_year = pub_year
149 , _hd_publication_month = pub_month
150 , _hd_publication_day = pub_day
151 , _hd_publication_hour = Nothing
152 , _hd_publication_minute = Nothing
153 , _hd_publication_second = Nothing
154 , _hd_language_iso2 = Just $ (DT.pack . show) lang }
156 enrichWith :: FileFormat
157 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
158 enrichWith RisPresse = enrichWith' presseEnrich
159 enrichWith WOS = enrichWith' (map (first WOS.keys))
160 enrichWith _ = enrichWith' identity
163 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
164 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
165 enrichWith' f = second (map both' . map f . concat)
167 both' = map (both decodeUtf8)
171 readFileWith :: FileFormat -> FilePath
172 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
173 readFileWith format path = do
174 files <- case takeExtension path of
175 ".zip" -> openZip path
176 _ -> pure <$> clean <$> DB.readFile path
177 partitionEithers <$> mapConcurrently (runParser format) files
181 -- According to the format of the text, choose the right parser.
182 -- TODO withParser :: FileFormat -> Parser [Document]
183 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
184 withParser WOS = WOS.parser
185 withParser RIS = RIS.parser
186 --withParser ODT = odtParser
187 --withParser XML = xmlParser
188 withParser _ = panic "[ERROR] Parser not implemented yet"
190 runParser :: FileFormat -> DB.ByteString
191 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
192 runParser format text = pure $ runParser' format text
194 runParser' :: FileFormat -> DB.ByteString
195 -> (Either String [[(DB.ByteString, DB.ByteString)]])
196 runParser' format text = parseOnly (withParser format) text
198 openZip :: FilePath -> IO [DB.ByteString]
200 entries <- withArchive fp (DM.keys <$> getEntries)
201 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
204 cleanText :: Text -> Text
205 cleanText = cs . clean . cs
207 clean :: DB.ByteString -> DB.ByteString
208 clean txt = DBC.map clean' txt