2 Module : Gargantext.Core.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
21 {-# LANGUAGE PackageImports #-}
23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
27 import Control.Concurrent.Async as CCA (mapConcurrently)
28 import Control.Monad (join, sequence)
29 import Control.Monad.IO.Class (liftIO)
30 import Data.Attoparsec.ByteString (parseOnly, Parser)
31 import Data.Either(Either(..))
32 import Data.Either.Extra (partitionEithers)
33 import Data.List (concat, lookup)
35 import Data.String (String())
37 import Data.Text (Text)
38 import Data.Text.Encoding (decodeUtf8)
39 import Data.Tuple.Extra (both, first, second)
40 import System.FilePath (FilePath(), takeExtension)
41 import qualified Data.ByteString as DB
42 import qualified Data.ByteString.Char8 as DBC
43 import qualified Data.ByteString.Lazy as DBL
44 import qualified Data.Map as DM
45 import qualified Data.Text as DT
46 import qualified Prelude as Prelude
47 import System.IO.Temp (emptySystemTempFile)
49 import Gargantext.Core (Lang(..))
50 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
51 import Gargantext.Prelude
52 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
53 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
54 -- import Gargantext.Core.Text.Learn (detectLangDefault)
55 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
56 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
57 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
58 ------------------------------------------------------------------------
60 type ParseError = String
62 --type Document = DM.Map Field Text
63 --type FilesParsed = DM.Map FilePath FileParsed
64 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
65 -- , _fileParsed_result :: [Document]
69 -- | According to the format of Input file,
70 -- different parser are available.
71 data FileFormat = WOS | RIS | RisPresse
76 -- Implemented (ISI Format)
77 -- | DOC -- Not Implemented / import Pandoc
78 -- | ODT -- Not Implemented / import Pandoc
79 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
80 -- | XML -- Not Implemented / see :
83 parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
84 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
85 parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
86 parseFormat RisPresse bs = do
87 docs <- mapM (toDoc RIS)
89 <$> enrichWith RisPresse
91 $ [runParser' RisPresse bs]
93 parseFormat WOS bs = do
94 docs <- mapM (toDoc WOS)
100 parseFormat ZIP bs = do
101 path <- emptySystemTempFile "parsed-zip"
103 parsedZip <- withArchive path $ do
104 DM.keys <$> getEntries
105 pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
106 parseFormat _ _ = undefined
108 -- | Parse file into documents
109 -- TODO manage errors here
110 -- TODO: to debug maybe add the filepath in error message
111 parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
112 parseFile CsvHal p = parseHal p
113 parseFile CsvGargV3 p = parseCsv p
114 parseFile RisPresse p = do
115 docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
118 docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
121 docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
124 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
125 -- TODO use language for RIS
127 -- let abstract = lookup "abstract" d
128 let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
130 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
132 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
134 pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
135 , _hd_doi = lookup "doi" d
136 , _hd_url = lookup "URL" d
137 , _hd_uniqId = Nothing
138 , _hd_uniqIdBdd = Nothing
140 , _hd_title = lookup "title" d
141 , _hd_authors = Nothing
142 , _hd_institutes = lookup "authors" d
143 , _hd_source = lookup "source" d
144 , _hd_abstract = lookup "abstract" d
145 , _hd_publication_date = fmap (DT.pack . show) utcTime
146 , _hd_publication_year = pub_year
147 , _hd_publication_month = pub_month
148 , _hd_publication_day = pub_day
149 , _hd_publication_hour = Nothing
150 , _hd_publication_minute = Nothing
151 , _hd_publication_second = Nothing
152 , _hd_language_iso2 = Just $ (DT.pack . show) lang }
154 enrichWith :: FileFormat
155 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
156 enrichWith RisPresse = enrichWith' presseEnrich
157 enrichWith WOS = enrichWith' (map (first WOS.keys))
158 enrichWith _ = enrichWith' identity
161 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
162 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
163 enrichWith' f = second (map both' . map f . concat)
165 both' = map (both decodeUtf8)
169 readFileWith :: FileFormat -> FilePath
170 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
171 readFileWith format path = do
172 files <- case takeExtension path of
173 ".zip" -> openZip path
174 _ -> pure <$> clean <$> DB.readFile path
175 partitionEithers <$> mapConcurrently (runParser format) files
179 -- According to the format of the text, choose the right parser.
180 -- TODO withParser :: FileFormat -> Parser [Document]
181 withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
182 withParser WOS = WOS.parser
183 withParser RIS = RIS.parser
184 --withParser ODT = odtParser
185 --withParser XML = xmlParser
186 withParser _ = panic "[ERROR] Parser not implemented yet"
188 runParser :: FileFormat -> DB.ByteString
189 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
190 runParser format text = pure $ runParser' format text
192 runParser' :: FileFormat -> DB.ByteString
193 -> (Either String [[(DB.ByteString, DB.ByteString)]])
194 runParser' format text = parseOnly (withParser format) text
196 openZip :: FilePath -> IO [DB.ByteString]
198 entries <- withArchive fp (DM.keys <$> getEntries)
199 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
202 cleanText :: Text -> Text
203 cleanText = cs . clean . cs
205 clean :: DB.ByteString -> DB.ByteString
206 clean txt = DBC.map clean' txt