2 Module : Gargantext.Core.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
17 This module mainly describes how to add a new parser to Gargantext,
18 please follow the types.
21 {-# LANGUAGE PackageImports #-}
23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), clean, parseFile, cleanText, parseFormat)
26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
27 import Control.Concurrent.Async as CCA (mapConcurrently)
28 import Control.Monad (join)
29 import Data.Attoparsec.ByteString (parseOnly, Parser)
30 import Data.Either(Either(..))
31 import Data.Either.Extra (partitionEithers)
32 import Data.List (concat, lookup)
34 import Data.String (String())
36 import Data.Text (Text)
37 import Data.Text.Encoding (decodeUtf8)
38 import Data.Tuple.Extra (both, first, second)
39 import System.FilePath (FilePath(), takeExtension)
40 import qualified Data.ByteString as DB
41 import qualified Data.ByteString.Char8 as DBC
42 import qualified Data.ByteString.Lazy as DBL
43 import qualified Data.Map as DM
44 import qualified Data.Text as DT
45 import qualified Prelude as Prelude
47 import Gargantext.Core (Lang(..))
48 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
49 import Gargantext.Prelude
50 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseHal', parseCsv, parseCsv')
51 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
52 -- import Gargantext.Core.Text.Learn (detectLangDefault)
53 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
54 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
55 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
56 ------------------------------------------------------------------------
-- | Error message reported when a parser fails on an input; 'readFileWith'
-- returns the collected messages as the first component of its result.
58 type ParseError = String
60 --type Document = DM.Map Field Text
61 --type FilesParsed = DM.Map FilePath FileParsed
62 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
63 -- , _fileParsed_result :: [Document]
67 -- | According to the format of the input file,
68 -- different parsers are available.
-- 'parseFormat' and 'parseFile' dispatch on this type.
69 data FileFormat = WOS | RIS | RisPresse
73 -- Implemented (ISI Format)
74 -- | DOC -- Not Implemented / import Pandoc
75 -- | ODT -- Not Implemented / import Pandoc
76 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
77 -- | XML -- Not Implemented / see :
-- | Parse the raw contents of a document, already loaded in memory, according
-- to the given 'FileFormat'.
--
-- The CSV formats are handed to @parseCsv'@ / @parseHal'@ after converting the
-- strict 'DB.ByteString' into a lazy one. 'RisPresse' and 'WOS' contents go
-- through @runParser'@, 'enrichWith' and 'toDoc'.
80 parseFormat :: FileFormat -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
81 parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
82 parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
-- Presse documents are enriched as 'RisPresse' but converted with @toDoc RIS@.
83 parseFormat RisPresse bs = do
84 docs <- mapM (toDoc RIS)
86 <$> enrichWith RisPresse
88 $ [runParser' RisPresse bs]
90 parseFormat WOS bs = do
91 docs <- mapM (toDoc WOS)
-- NOTE(review): partial equation — any remaining format evaluates 'undefined'
-- and crashes at runtime instead of returning a 'Left'.
97 parseFormat _ _ = undefined
99 -- | Parse file into documents
--
-- The CSV formats are delegated to 'parseHal' / 'parseCsv'. For the other
-- formats the file is read and parsed by 'readFileWith', the parsed fields are
-- enriched by 'enrichWith', and every record is converted with 'toDoc'.
--
-- 'readFileWith' returns both the parse errors and the parsed documents; the
-- @snd@ in each pipeline keeps only the documents, so parse errors are dropped.
100 -- TODO manage errors here
101 -- TODO: to debug maybe add the filepath in error message
102 parseFile :: FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
103 parseFile CsvHal p = parseHal p
104 parseFile CsvGargV3 p = parseCsv p
105 parseFile RisPresse p = do
-- Presse files are read with the 'RIS' parser and converted with @toDoc RIS@;
-- only the enrichment step is specific to 'RisPresse'.
106 docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
109 docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
-- Generic pipeline: the same format value selects the parser, the enrichment
-- and the document conversion.
112 docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
-- | Build a 'HyperdataDocument' from the key/value fields of one parsed record.
--
-- The fields are looked up by name in the association list; a field that is
-- absent yields 'Nothing'. The format is stored, through 'show', in '_hd_bdd'.
115 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
116 -- TODO use language for RIS
118 -- let abstract = lookup "abstract" d
-- The language is currently fixed to 'EN'; detection is commented out.
119 let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
-- Join the optional "PY" and "publication_date" fields around a single space,
-- then replace every dash by a space before handing the text to the date parser.
121 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
-- 'Date.dateSplit' yields the parsed time together with year, month and day.
123 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
125 pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
126 , _hd_doi = lookup "doi" d
127 , _hd_url = lookup "URL" d
128 , _hd_uniqId = Nothing
129 , _hd_uniqIdBdd = Nothing
131 , _hd_title = lookup "title" d
-- NOTE(review): '_hd_authors' is always 'Nothing' while the "authors" field is
-- stored in '_hd_institutes' — this looks swapped; confirm the intended mapping.
132 , _hd_authors = Nothing
133 , _hd_institutes = lookup "authors" d
134 , _hd_source = lookup "source" d
135 , _hd_abstract = lookup "abstract" d
136 , _hd_publication_date = fmap (DT.pack . show) utcTime
137 , _hd_publication_year = pub_year
138 , _hd_publication_month = pub_month
139 , _hd_publication_day = pub_day
-- Hour, minute and second are not filled in by this function.
140 , _hd_publication_hour = Nothing
141 , _hd_publication_minute = Nothing
142 , _hd_publication_second = Nothing
143 , _hd_language_iso2 = Just $ (DT.pack . show) lang }
-- | Select the enrichment step that matches the file format and apply it,
-- through @enrichWith'@, to the parsed documents found in the second component
-- of the pair.
--
-- * 'RisPresse': documents go through 'presseEnrich';
-- * 'WOS': the first element of every pair is mapped through 'WOS.keys';
-- * any other format: no format-specific enrichment.
enrichWith :: FileFormat
           -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith format = enrichWith' enrich
  where
    -- Per-document transformation chosen from the format.
    enrich = case format of
      RisPresse -> presseEnrich
      WOS       -> map (first WOS.keys)
      _         -> identity
-- | Apply an enrichment function to every parsed document and decode the
-- result to 'Text'.
--
-- Only the second component of the pair is touched: the nested lists are
-- flattened with 'concat' into one list of documents, the given function is
-- applied to each document (a list of pairs), and both elements of every pair
-- are then decoded.
152 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
153 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
154 enrichWith' f = second (map both' . map f . concat)
-- Decode both elements of every pair from UTF-8 bytes.
-- NOTE(review): 'decodeUtf8' throws on input that is not valid UTF-8 (see the
-- Data.Text.Encoding documentation) — confirm inputs are always UTF-8, or
-- consider a lenient decoder.
156 both' = map (both decodeUtf8)
-- | Read the file at the given path and parse its contents with the parser of
-- the given format.
--
-- A path whose extension is @.zip@ is opened with 'openZip', which yields one
-- 'DB.ByteString' per input; any other file is read whole and passed through
-- 'clean' (the inputs coming from 'openZip' are not). Every input is then
-- handed to 'runParser' via 'mapConcurrently'.
--
-- The result separates the outcomes: parse errors first, parsed inputs second.
readFileWith :: FileFormat -> FilePath
             -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
readFileWith format path = do
  files <- case takeExtension path of
    ".zip" -> openZip path
    _      -> do
      raw <- DB.readFile path
      pure [clean raw]
  parsed <- mapConcurrently (runParser format) files
  pure $ partitionEithers parsed
-- TODO withParser :: FileFormat -> Parser [Document]

-- | According to the format of the text, choose the right parser.
--
-- 'RisPresse' input is parsed with 'RIS.parser': 'parseFile' already reads
-- 'RisPresse' files through the RIS parser, and 'parseFormat' calls
-- @runParser' RisPresse@, which without a dedicated equation would reach the
-- 'panic' equation below.
--
-- Any format that has no parser here still ends in 'panic'.
withParser :: FileFormat -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS       = WOS.parser
withParser RIS       = RIS.parser
withParser RisPresse = RIS.parser
--withParser ODT = odtParser
--withParser XML = xmlParser
withParser _         = panic "[ERROR] Parser not implemented yet"
-- | 'IO' wrapper around @runParser'@: the parse result is simply lifted with
-- 'pure', which lets 'readFileWith' run it through 'mapConcurrently'.
runParser :: FileFormat -> DB.ByteString
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format = pure . runParser' format
-- | Run the parser chosen by 'withParser' for the given format over the input
-- with 'parseOnly': 'Left' carries the error message, 'Right' the parsed
-- documents.
runParser' :: FileFormat -> DB.ByteString
           -> Either String [[(DB.ByteString, DB.ByteString)]]
runParser' = parseOnly . withParser
-- | Read the contents of the entries of the zip archive at the given path,
-- one 'DB.ByteString' per entry.
187 openZip :: FilePath -> IO [DB.ByteString]
-- List the entry selectors stored in the archive (the keys of the entries map).
189 entries <- withArchive fp (DM.keys <$> getEntries)
-- Fetch every entry concurrently; note that the archive is opened again with
-- 'withArchive' for each entry.
190 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
-- | 'clean' for 'Text' values: the text is converted with 'cs', cleaned as a
-- 'DB.ByteString' by 'clean', and converted back with 'cs'.
cleanText :: Text -> Text
cleanText txt = cs (clean (cs txt))
196 clean :: DB.ByteString -> DB.ByteString
197 clean txt = DBC.map clean' txt