2 Module : Gargantext.Core.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
21 {-# LANGUAGE PackageImports #-}
23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC, splitOn, etale)
26 -- import Gargantext.Core.Text.Learn (detectLangDefault)
27 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
29 import Control.Concurrent.Async as CCA (mapConcurrently)
30 import Control.Monad (join)
31 import Control.Monad.Trans.Control (MonadBaseControl)
32 import Data.Attoparsec.ByteString (parseOnly, Parser)
33 import Data.Either(Either(..))
34 import Data.Either.Extra (partitionEithers)
35 import Data.List (concat, lookup)
37 import Data.String (String())
39 import Data.Text (Text, intercalate, pack, unpack)
40 import Data.Text.Encoding (decodeUtf8)
41 import Data.Tuple.Extra (both, first, second)
42 import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
43 import Gargantext.Core (Lang(..))
44 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
45 import Gargantext.Core.Text.Corpus.Parsers.JSON (parseJSONC)
46 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
47 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
48 import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
49 import Gargantext.Prelude
50 import System.FilePath (FilePath(), takeExtension)
51 import System.IO.Temp (emptySystemTempFile)
52 import Gargantext.Core.Text.Corpus.Parsers.FrameWrite (text2titleParagraphs)
53 import qualified Data.ByteString as DB
54 import qualified Data.ByteString.Char8 as DBC
55 import qualified Data.ByteString.Lazy as DBL
56 import qualified Data.Map as DM
57 import qualified Data.Text as DT
58 import qualified Data.Text as Text
59 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
60 import qualified Gargantext.Core.Text.Corpus.Parsers.Iramuteq as Iramuteq
61 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
62 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
63 import qualified Prelude
64 ------------------------------------------------------------------------
66 type ParseError = String
68 --type Document = DM.Map Field Text
69 --type FilesParsed = DM.Map FilePath FileParsed
70 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
71 -- , _fileParsed_result :: [Document]
75 -- | According to the format of Input file,
76 -- different parser are available.
86 -- Implemented (ISI Format)
87 -- | DOC -- Not Implemented / import Pandoc
88 -- | ODT -- Not Implemented / import Pandoc
89 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
90 -- | XML -- Not Implemented / see :
92 parseFormatC :: MonadBaseControl IO m
96 -> m (Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument IO ()))
97 parseFormatC CsvGargV3 Plain bs = do
98 let eParsedC = parseCsvC $ DBL.fromStrict bs
100 Left err -> pure $ Left err
101 Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
102 parseFormatC CsvHal Plain bs = do
103 let eParsedC = parseCsvC $ DBL.fromStrict bs
105 Left err -> pure $ Left err
106 Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
107 parseFormatC RisPresse Plain bs = do
108 --docs <- enrichWith RisPresse
109 let eDocs = runParser' RisPresse bs
111 ( Just $ fromIntegral $ length docs
114 .| mapC (map $ both decodeUtf8)
115 .| mapMC (toDoc RIS)) ) <$> eDocs
116 parseFormatC WOS Plain bs = do
117 let eDocs = runParser' WOS bs
119 ( Just $ fromIntegral $ length docs
121 .| mapC (map $ first WOS.keys)
122 .| mapC (map $ both decodeUtf8)
123 .| mapMC (toDoc WOS)) ) <$> eDocs
125 parseFormatC Iramuteq Plain bs = do
126 let eDocs = runParser' Iramuteq bs
128 ( Just $ fromIntegral $ length docs
130 .| mapC (map $ first Iramuteq.keys)
131 .| mapC (map $ both decodeUtf8)
132 .| mapMC ((toDoc Iramuteq) . (map (second (Text.replace "_" " "))))
137 parseFormatC JSON Plain bs = do
138 let eParsedC = parseJSONC $ DBL.fromStrict bs
140 Left err -> pure $ Left err
141 Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
143 parseFormatC ft ZIP bs = do
144 path <- liftBase $ emptySystemTempFile "parsed-zip"
145 liftBase $ DB.writeFile path bs
146 fileContents <- liftBase $ withArchive path $ do
147 files <- DM.keys <$> getEntries
149 --printDebug "[parseFormatC] fileContents" fileContents
150 eContents <- mapM (parseFormatC ft Plain) fileContents
151 --printDebug "[parseFormatC] contents" contents
152 --pure $ Left $ "Not implemented for ZIP"
153 let (errs, contents) = partitionEithers eContents
157 [] -> pure $ Left "No files in zip"
159 let lenghts = fst <$> contents
160 let contents' = snd <$> contents
161 let totalLength = sum $ sum <$> lenghts -- Trick: sum (Just 1) = 1, sum Nothing = 0
162 pure $ Right ( Just totalLength
163 , sequenceConduits contents' >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
164 _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
166 parseFormatC _ _ _ = undefined
169 etale :: [HyperdataDocument] -> [HyperdataDocument]
170 etale = concat . (map etale')
172 etale' :: HyperdataDocument -> [HyperdataDocument]
173 etale' h = map (\t -> h { _hd_abstract = Just t })
175 $ text2titleParagraphs 7 (maybe "" identity $ _hd_abstract h)
178 -- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
179 -- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
180 -- parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
181 -- parseFormat RisPresse bs = do
182 -- docs <- mapM (toDoc RIS)
184 -- <$> enrichWith RisPresse
185 -- $ partitionEithers
186 -- $ [runParser' RisPresse bs]
188 -- parseFormat WOS bs = do
189 -- docs <- mapM (toDoc WOS)
191 -- <$> enrichWith WOS
192 -- $ partitionEithers
193 -- $ [runParser' WOS bs]
195 -- parseFormat ZIP bs = do
196 -- path <- emptySystemTempFile "parsed-zip"
197 -- DB.writeFile path bs
198 -- parsedZip <- withArchive path $ do
199 -- DM.keys <$> getEntries
200 -- pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
201 -- parseFormat _ _ = undefined
203 -- | Parse file into documents
204 -- TODO manage errors here
205 -- TODO: to debug maybe add the filepath in error message
207 parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
208 parseFile CsvHal Plain p = parseHal p
209 parseFile CsvGargV3 Plain p = parseCsv p
211 parseFile RisPresse Plain p = do
212 docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
215 parseFile WOS Plain p = do
216 docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
219 parseFile Iramuteq Plain p = do
220 docs <- join $ mapM ((toDoc Iramuteq) . (map (second (Text.replace "_" " "))))
222 <$> enrichWith Iramuteq
223 <$> readFileWith Iramuteq p
227 parseFile ff _ p = do
228 docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
231 toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
232 -- TODO use language for RIS
234 -- let abstract = lookup "abstract" d
235 let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
237 let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d
238 -- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
239 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
241 let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
242 , _hd_doi = lookup "doi" d
243 , _hd_url = lookup "URL" d
244 , _hd_uniqId = Nothing
245 , _hd_uniqIdBdd = Nothing
247 , _hd_title = lookup "title" d
248 , _hd_authors = lookup "authors" d
249 , _hd_institutes = lookup "institutes" d
250 , _hd_source = lookup "source" d
251 , _hd_abstract = lookup "abstract" d
252 , _hd_publication_date = fmap (DT.pack . show) utcTime
253 , _hd_publication_year = pub_year
254 , _hd_publication_month = pub_month
255 , _hd_publication_day = pub_day
256 , _hd_publication_hour = Nothing
257 , _hd_publication_minute = Nothing
258 , _hd_publication_second = Nothing
259 , _hd_language_iso2 = Just $ (DT.pack . show) lang }
260 -- printDebug "[G.C.T.C.Parsers] HyperdataDocument" hd
263 enrichWith :: FileType
264 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
265 enrichWith RisPresse = enrichWith' presseEnrich
266 enrichWith WOS = enrichWith' (map (first WOS.keys))
267 enrichWith Iramuteq = enrichWith' (map (first Iramuteq.keys))
268 enrichWith _ = enrichWith' identity
271 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
272 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
273 enrichWith' f = second (map both' . map f . concat)
275 both' = map (both decodeUtf8)
279 readFileWith :: FileType -> FilePath
280 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
281 readFileWith format path = do
282 files <- case takeExtension path of
283 ".zip" -> openZip path
284 _ -> pure <$> clean <$> DB.readFile path
285 partitionEithers <$> mapConcurrently (runParser format) files
289 -- According to the format of the text, choose the right parser.
290 -- TODO withParser :: FileType -> Parser [Document]
291 withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
292 withParser WOS = WOS.parser
293 withParser RIS = RIS.parser
294 withParser Iramuteq = Iramuteq.parser
295 --withParser ODT = odtParser
296 --withParser XML = xmlParser
297 withParser _ = panic "[ERROR] Parser not implemented yet"
299 runParser :: FileType -> DB.ByteString
300 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
301 runParser format text = pure $ runParser' format text
303 runParser' :: FileType -> DB.ByteString
304 -> (Either String [[(DB.ByteString, DB.ByteString)]])
305 runParser' format text = parseOnly (withParser format) text
307 openZip :: FilePath -> IO [DB.ByteString]
309 entries <- withArchive fp (DM.keys <$> getEntries)
310 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
313 cleanText :: Text -> Text
314 cleanText = cs . clean . cs
316 clean :: DB.ByteString -> DB.ByteString
317 clean txt = DBC.map clean' txt
327 splitOn :: NgramsType -> Maybe Text -> Text -> [Text]
328 splitOn Authors (Just "WOS") = (DT.splitOn "; ")
329 splitOn _ _ = (DT.splitOn ", ")