2 Module : Gargantext.Core.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
21 {-# LANGUAGE PackageImports #-}
23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC)
26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
28 import Control.Concurrent.Async as CCA (mapConcurrently)
29 import Control.Monad.Trans.Control (MonadBaseControl)
30 import Control.Monad (join)
31 import Data.Attoparsec.ByteString (parseOnly, Parser)
32 import Data.Either(Either(..))
33 import Data.Either.Extra (partitionEithers)
34 import Data.List (concat, lookup)
36 import Data.String (String())
38 import Data.Text (Text, intercalate, pack, unpack)
39 import Data.Text.Encoding (decodeUtf8)
40 import Data.Tuple.Extra (both, first, second)
41 import System.FilePath (FilePath(), takeExtension)
42 import qualified Data.ByteString as DB
43 import qualified Data.ByteString.Char8 as DBC
44 import qualified Data.ByteString.Lazy as DBL
45 import qualified Data.Map as DM
46 import qualified Data.Text as DT
47 import qualified Prelude
48 import System.IO.Temp (emptySystemTempFile)
50 import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
51 import Gargantext.Core (Lang(..))
52 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
53 import Gargantext.Prelude
54 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
55 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
56 -- import Gargantext.Core.Text.Learn (detectLangDefault)
57 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
58 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
59 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
60 ------------------------------------------------------------------------
62 type ParseError = String
64 --type Document = DM.Map Field Text
65 --type FilesParsed = DM.Map FilePath FileParsed
66 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
67 -- , _fileParsed_result :: [Document]
71 -- | According to the format of Input file,
72 -- different parser are available.
73 data FileType = WOS | RIS | RisPresse | CsvGargV3 | CsvHal
76 -- Implemented (ISI Format)
77 -- | DOC -- Not Implemented / import Pandoc
78 -- | ODT -- Not Implemented / import Pandoc
79 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
80 -- | XML -- Not Implemented / see :
82 parseFormatC :: MonadBaseControl IO m
86 -> m (Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument IO ()))
87 parseFormatC CsvGargV3 Plain bs = do
88 let eParsedC = parseCsvC $ DBL.fromStrict bs
90 Left err -> pure $ Left err
91 Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
92 parseFormatC CsvHal Plain bs = do
93 let eParsedC = parseCsvC $ DBL.fromStrict bs
95 Left err -> pure $ Left err
96 Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
97 parseFormatC RisPresse Plain bs = do
98 --docs <- enrichWith RisPresse
99 let eDocs = runParser' RisPresse bs
101 ( Just $ fromIntegral $ length docs
104 .| mapC (map $ both decodeUtf8)
105 .| mapMC (toDoc RIS)) ) <$> eDocs
106 parseFormatC WOS Plain bs = do
107 let eDocs = runParser' WOS bs
109 ( Just $ fromIntegral $ length docs
111 .| mapC (map $ first WOS.keys)
112 .| mapC (map $ both decodeUtf8)
113 .| mapMC (toDoc WOS)) ) <$> eDocs
114 parseFormatC ft ZIP bs = do
115 path <- liftBase $ emptySystemTempFile "parsed-zip"
116 liftBase $ DB.writeFile path bs
117 fileContents <- liftBase $ withArchive path $ do
118 files <- DM.keys <$> getEntries
120 --printDebug "[parseFormatC] fileContents" fileContents
121 eContents <- mapM (parseFormatC ft Plain) fileContents
122 --printDebug "[parseFormatC] contents" contents
123 --pure $ Left $ "Not implemented for ZIP"
124 let (errs, contents) = partitionEithers eContents
128 [] -> pure $ Left "No files in zip"
130 let lenghts = fst <$> contents
131 let contents' = snd <$> contents
132 let totalLength = sum $ sum <$> lenghts -- Trick: sum (Just 1) = 1, sum Nothing = 0
133 pure $ Right ( Just totalLength
134 , sequenceConduits contents' >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
135 _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
137 parseFormatC _ _ _ = undefined
139 -- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
140 -- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
141 -- parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
142 -- parseFormat RisPresse bs = do
143 -- docs <- mapM (toDoc RIS)
145 -- <$> enrichWith RisPresse
146 -- $ partitionEithers
147 -- $ [runParser' RisPresse bs]
149 -- parseFormat WOS bs = do
150 -- docs <- mapM (toDoc WOS)
152 -- <$> enrichWith WOS
153 -- $ partitionEithers
154 -- $ [runParser' WOS bs]
156 -- parseFormat ZIP bs = do
157 -- path <- emptySystemTempFile "parsed-zip"
158 -- DB.writeFile path bs
159 -- parsedZip <- withArchive path $ do
160 -- DM.keys <$> getEntries
161 -- pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
162 -- parseFormat _ _ = undefined
164 -- | Parse file into documents
165 -- TODO manage errors here
166 -- TODO: to debug maybe add the filepath in error message
168 parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
169 parseFile CsvHal Plain p = parseHal p
170 parseFile CsvGargV3 Plain p = parseCsv p
171 parseFile RisPresse Plain p = do
172 docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
174 parseFile WOS Plain p = do
175 docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
177 parseFile ff _ p = do
178 docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
181 toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
182 -- TODO use language for RIS
184 -- let abstract = lookup "abstract" d
185 let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
187 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
189 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
191 pure HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
192 , _hd_doi = lookup "doi" d
193 , _hd_url = lookup "URL" d
194 , _hd_uniqId = Nothing
195 , _hd_uniqIdBdd = Nothing
197 , _hd_title = lookup "title" d
198 , _hd_authors = Nothing
199 , _hd_institutes = lookup "authors" d
200 , _hd_source = lookup "source" d
201 , _hd_abstract = lookup "abstract" d
202 , _hd_publication_date = fmap (DT.pack . show) utcTime
203 , _hd_publication_year = pub_year
204 , _hd_publication_month = pub_month
205 , _hd_publication_day = pub_day
206 , _hd_publication_hour = Nothing
207 , _hd_publication_minute = Nothing
208 , _hd_publication_second = Nothing
209 , _hd_language_iso2 = Just $ (DT.pack . show) lang }
211 enrichWith :: FileType
212 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
213 enrichWith RisPresse = enrichWith' presseEnrich
214 enrichWith WOS = enrichWith' (map (first WOS.keys))
215 enrichWith _ = enrichWith' identity
218 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
219 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
220 enrichWith' f = second (map both' . map f . concat)
222 both' = map (both decodeUtf8)
226 readFileWith :: FileType -> FilePath
227 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
228 readFileWith format path = do
229 files <- case takeExtension path of
230 ".zip" -> openZip path
231 _ -> pure <$> clean <$> DB.readFile path
232 partitionEithers <$> mapConcurrently (runParser format) files
236 -- According to the format of the text, choose the right parser.
237 -- TODO withParser :: FileType -> Parser [Document]
238 withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
239 withParser WOS = WOS.parser
240 withParser RIS = RIS.parser
241 --withParser ODT = odtParser
242 --withParser XML = xmlParser
243 withParser _ = panic "[ERROR] Parser not implemented yet"
245 runParser :: FileType -> DB.ByteString
246 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
247 runParser format text = pure $ runParser' format text
249 runParser' :: FileType -> DB.ByteString
250 -> (Either String [[(DB.ByteString, DB.ByteString)]])
251 runParser' format text = parseOnly (withParser format) text
253 openZip :: FilePath -> IO [DB.ByteString]
255 entries <- withArchive fp (DM.keys <$> getEntries)
256 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
259 cleanText :: Text -> Text
260 cleanText = cs . clean . cs
262 clean :: DB.ByteString -> DB.ByteString
263 clean txt = DBC.map clean' txt