2 Module : Gargantext.Core.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
21 {-# LANGUAGE PackageImports #-}
23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
27 import Control.Concurrent.Async as CCA (mapConcurrently)
28 import Data.Attoparsec.ByteString (parseOnly, Parser)
29 import Data.Either(Either(..))
30 import Data.Either.Extra (partitionEithers)
31 import Data.List (concat, lookup)
33 import Data.String (String())
35 import Data.Text (Text)
36 import Data.Text.Encoding (decodeUtf8)
37 import Data.Tuple.Extra (both, first, second)
38 import System.FilePath (FilePath(), takeExtension)
39 import qualified Data.ByteString as DB
40 import qualified Data.ByteString.Char8 as DBC
41 import qualified Data.ByteString.Lazy as DBL
42 import qualified Data.Map as DM
43 import qualified Data.Text as DT
44 import qualified Prelude as Prelude
45 import System.IO.Temp (emptySystemTempFile)
47 import Gargantext.Core (Lang(..))
48 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
49 import Gargantext.Prelude
50 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
51 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
52 -- import Gargantext.Core.Text.Learn (detectLangDefault)
53 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
54 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
55 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
56 ------------------------------------------------------------------------
58 type ParseError = String
60 --type Document = DM.Map Field Text
61 --type FilesParsed = DM.Map FilePath FileParsed
62 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
63 -- , _fileParsed_result :: [Document]
67 -- | According to the format of Input file,
68 -- different parser are available.
69 data FileFormat = WOS | RIS | RisPresse
74 -- Implemented (ISI Format)
75 -- | DOC -- Not Implemented / import Pandoc
76 -- | ODT -- Not Implemented / import Pandoc
77 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
78 -- | XML -- Not Implemented / see :
81 parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
82 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
83 parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
84 parseFormat RisPresse bs = do
85 let docs = map (toDoc RIS)
87 <$> enrichWith RisPresse
89 $ [runParser' RisPresse bs]
91 parseFormat WOS bs = do
92 let docs = map (toDoc WOS)
98 parseFormat ZIP bs = do
99 path <- emptySystemTempFile "parsed-zip"
101 parsedZip <- withArchive path $ do
102 DM.keys <$> getEntries
103 pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
104 parseFormat _ _ = undefined
106 -- | Parse file into documents
107 -- TODO manage errors here
108 -- TODO: to debug maybe add the filepath in error message
109 parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
110 parseFile CsvHal p = parseHal p
111 parseFile CsvGargV3 p = parseCsv p
112 parseFile RisPresse p = do
113 docs <- map (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
116 docs <- map (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
119 docs <- map (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
122 toDoc :: FileFormat -> [(Text, Text)] -> HyperdataDocument
123 -- TODO use language for RIS
125 -- let abstract = lookup "abstract" d
126 let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
128 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
130 let (utcTime, (pub_year, pub_month, pub_day)) = Date.dateSplit lang dateToParse
132 HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
133 , _hd_doi = lookup "doi" d
134 , _hd_url = lookup "URL" d
135 , _hd_uniqId = Nothing
136 , _hd_uniqIdBdd = Nothing
138 , _hd_title = lookup "title" d
139 , _hd_authors = Nothing
140 , _hd_institutes = lookup "authors" d
141 , _hd_source = lookup "source" d
142 , _hd_abstract = lookup "abstract" d
143 , _hd_publication_date = fmap (DT.pack . show) utcTime
144 , _hd_publication_year = pub_year
145 , _hd_publication_month = pub_month
146 , _hd_publication_day = pub_day
147 , _hd_publication_hour = Nothing
148 , _hd_publication_minute = Nothing
149 , _hd_publication_second = Nothing
150 , _hd_language_iso2 = Just $ (DT.pack . show) lang }
152 enrichWith :: FileFormat
153 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
154 enrichWith RisPresse = enrichWith' presseEnrich
155 enrichWith WOS = enrichWith' (map (first WOS.keys))
156 enrichWith _ = enrichWith' identity
159 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
160 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
161 enrichWith' f = second (map both' . map f . concat)
163 both' = map (both decodeUtf8)
167 readFileWith :: FileFormat -> FilePath
168 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
169 readFileWith format path = do
170 files <- case takeExtension path of
171 ".zip" -> openZip path
172 _ -> pure <$> clean <$> DB.readFile path
173 partitionEithers <$> mapConcurrently (runParser format) files
177 -- According to the format of the text, choose the right parser.
178 -- TODO withParser :: FileFormat -> Parser [Document]
179 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
180 withParser WOS = WOS.parser
181 withParser RIS = RIS.parser
182 --withParser ODT = odtParser
183 --withParser XML = xmlParser
184 withParser _ = panic "[ERROR] Parser not implemented yet"
186 runParser :: FileFormat -> DB.ByteString
187 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
188 runParser format text = pure $ runParser' format text
190 runParser' :: FileFormat -> DB.ByteString
191 -> (Either String [[(DB.ByteString, DB.ByteString)]])
192 runParser' format text = parseOnly (withParser format) text
194 openZip :: FilePath -> IO [DB.ByteString]
196 entries <- withArchive fp (DM.keys <$> getEntries)
197 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
200 cleanText :: Text -> Text
201 cleanText = cs . clean . cs
203 clean :: DB.ByteString -> DB.ByteString
204 clean txt = DBC.map clean' txt