{-|
Module      : Gargantext.Core.Text.Corpus.Parsers
Description : All parsers of Gargantext in one file.
Copyright   : (c) CNRS, 2017
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Gargantext enables analyzing semi-structured text, which must first be parsed
before it can be analyzed.

The parsers assume we know the format of the text (the 'FileType' data
type), according to which the right parser is chosen among the list of
available parsers.

This module mainly describes how to add a new parser to Gargantext:
please follow the types.
-}
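-- A rough sketch of what adding a (hypothetical) new format involves, by
-- following the existing types; the names below are assumptions, not actual code:
--
-- >   data FileType = ... | MyFormat                -- extend the sum type
-- >   withParser MyFormat = MyFormat.parser         -- Parser [[(ByteString, ByteString)]]
-- >   enrichWith MyFormat = enrichWith' identity    -- or a format-specific enrichment
--
-- 'parseFile' and 'parseFormatC' then dispatch on the new constructor.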
{-# LANGUAGE PackageImports #-}
module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC, splitOn, etale)
  where
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Conduit
import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Monad (join)
import Control.Monad.Trans.Control (MonadBaseControl)
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Either (Either(..))
import Data.Either.Extra (partitionEithers)
import Data.List (concat, lookup)
import Data.String (String())
import Data.Text (Text, intercalate, pack, unpack)
import Data.Text.Encoding (decodeUtf8)
import Data.Tuple.Extra (both, first, second)
import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
import Gargantext.Prelude
import System.FilePath (FilePath(), takeExtension)
import System.IO.Temp (emptySystemTempFile)
import Gargantext.Core.Text.Corpus.Parsers.FrameWrite (text2titleParagraphs)
import qualified Data.ByteString as DB
import qualified Data.ByteString.Char8 as DBC
import qualified Data.ByteString.Lazy as DBL
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Data.Text as Text
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified Gargantext.Core.Text.Corpus.Parsers.Iramuteq as Iramuteq
import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
import qualified Prelude
------------------------------------------------------------------------

type ParseError = String

--type Document = DM.Map Field Text
--type FilesParsed = DM.Map FilePath FileParsed
--data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
--                             , _fileParsed_result :: [Document]
-- | According to the format of Input file,
-- different parsers are available.
data FileType = WOS
              | RIS
              | RisPresse
              | CsvGargV3
              | CsvHal
              | Iramuteq
  deriving (Show)

-- Implemented (ISI Format)
-- | DOC -- Not Implemented / import Pandoc
-- | ODT -- Not Implemented / import Pandoc
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | XML -- Not Implemented / see :
parseFormatC :: MonadBaseControl IO m
             => FileType
             -> FileFormat
             -> DB.ByteString
             -> m (Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument IO ()))
parseFormatC CsvGargV3 Plain bs = do
  let eParsedC = parseCsvC $ DBL.fromStrict bs
  case eParsedC of
    Left err -> pure $ Left err
    Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
parseFormatC CsvHal Plain bs = do
  let eParsedC = parseCsvC $ DBL.fromStrict bs
  case eParsedC of
    Left err -> pure $ Left err
    Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
parseFormatC RisPresse Plain bs = do
  --docs <- enrichWith RisPresse
  let eDocs = runParser' RisPresse bs
  pure $ (\docs ->
            ( Just $ fromIntegral $ length docs
            , yieldMany docs
              .| mapC presseEnrich
              .| mapC (map $ both decodeUtf8)
              .| mapMC (toDoc RIS)) ) <$> eDocs
parseFormatC WOS Plain bs = do
  let eDocs = runParser' WOS bs
  pure $ (\docs ->
            ( Just $ fromIntegral $ length docs
            , yieldMany docs
              .| mapC (map $ first WOS.keys)
              .| mapC (map $ both decodeUtf8)
              .| mapMC (toDoc WOS)) ) <$> eDocs
parseFormatC Iramuteq Plain bs = do
  let eDocs = runParser' Iramuteq bs
  pure $ (\docs ->
            ( Just $ fromIntegral $ length docs
            , yieldMany docs
              .| mapC (map $ first Iramuteq.keys)
              .| mapC (map $ both decodeUtf8)
              .| mapMC ((toDoc Iramuteq) . (map (second (Text.replace "_" " "))))
            )
         ) <$> eDocs
parseFormatC ft ZIP bs = do
  path <- liftBase $ emptySystemTempFile "parsed-zip"
  liftBase $ DB.writeFile path bs
  fileContents <- liftBase $ withArchive path $ do
    files <- DM.keys <$> getEntries
    mapM getEntry files
  --printDebug "[parseFormatC] fileContents" fileContents
  eContents <- mapM (parseFormatC ft Plain) fileContents
  --printDebug "[parseFormatC] contents" contents
  --pure $ Left $ "Not implemented for ZIP"
  let (errs, contents) = partitionEithers eContents
  case errs of
    [] ->
      case contents of
        [] -> pure $ Left "No files in zip"
        _  -> do
          let lengths = fst <$> contents
          let contents' = snd <$> contents
          let totalLength = sum $ sum <$> lengths -- Trick: sum (Just 1) = 1, sum Nothing = 0
          pure $ Right ( Just totalLength
                       , sequenceConduits contents' >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
    _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
parseFormatC _ _ _ = undefined
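-- Example (sketch, not part of the API): consuming the conduit returned by
-- 'parseFormatC', assuming @bs@ is a strict ByteString read from a WOS export
-- and that 'HyperdataDocument' has a usable 'Show' instance:
--
-- >   eRes <- parseFormatC WOS Plain bs
-- >   case eRes of
-- >     Left err          -> putStrLn err
-- >     Right (mLen, src) -> runConduit (src .| mapM_C print)  -- mLen is a length hint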
etale :: [HyperdataDocument] -> [HyperdataDocument]
etale = concat . (map etale')
  where
    etale' :: HyperdataDocument -> [HyperdataDocument]
    etale' h = map (\t -> h { _hd_abstract = Just t })
             $ map snd
             $ text2titleParagraphs 7 (maybe "" identity $ _hd_abstract h)
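-- Example (sketch): 'etale' "spreads out" each document into several documents,
-- one per group of paragraphs of its abstract:
--
-- >   etale [doc]  -- one HyperdataDocument per paragraph group of the abstract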
-- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
-- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
-- parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
-- parseFormat RisPresse bs = do
--   docs <- mapM (toDoc RIS)
--           <$> enrichWith RisPresse
--           $ partitionEithers
--           $ [runParser' RisPresse bs]
-- parseFormat WOS bs = do
--   docs <- mapM (toDoc WOS)
--           <$> enrichWith WOS
--           $ partitionEithers
--           $ [runParser' WOS bs]
-- parseFormat ZIP bs = do
--   path <- emptySystemTempFile "parsed-zip"
--   DB.writeFile path bs
--   parsedZip <- withArchive path $ do
--     DM.keys <$> getEntries
--   pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
-- parseFormat _ _ = undefined
-- | Parse file into documents
-- TODO manage errors here
-- TODO: to debug maybe add the filepath in error message
parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseFile CsvHal Plain p = parseHal p
parseFile CsvGargV3 Plain p = parseCsv p
parseFile RisPresse Plain p = do
  docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
  pure $ Right docs
parseFile WOS Plain p = do
  docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
  pure $ Right docs
parseFile Iramuteq Plain p = do
  docs <- join $ mapM ((toDoc Iramuteq) . (map (second (Text.replace "_" " "))))
               <$> snd
               <$> enrichWith Iramuteq
               <$> readFileWith Iramuteq p
  pure $ Right docs
parseFile ff _ p = do
  docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
  pure $ Right docs
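-- Example (sketch): parsing a RIS Presse export from disk; the path below is
-- only illustrative:
--
-- >   eDocs <- parseFile RisPresse Plain "corpus/presse.ris"
-- >   either putStrLn (mapM_ (print . _hd_title)) eDocs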
toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
-- TODO use language for RIS
toDoc ff d = do
      -- let abstract = lookup "abstract" d
      let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))

      let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d
      -- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
      (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse

      let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
                                 , _hd_doi = lookup "doi" d
                                 , _hd_url = lookup "URL" d
                                 , _hd_uniqId = Nothing
                                 , _hd_uniqIdBdd = Nothing
                                 , _hd_page = Nothing
                                 , _hd_title = lookup "title" d
                                 , _hd_authors = lookup "authors" d
                                 , _hd_institutes = lookup "institutes" d
                                 , _hd_source = lookup "source" d
                                 , _hd_abstract = lookup "abstract" d
                                 , _hd_publication_date = fmap (DT.pack . show) utcTime
                                 , _hd_publication_year = pub_year
                                 , _hd_publication_month = pub_month
                                 , _hd_publication_day = pub_day
                                 , _hd_publication_hour = Nothing
                                 , _hd_publication_minute = Nothing
                                 , _hd_publication_second = Nothing
                                 , _hd_language_iso2 = Just $ (DT.pack . show) lang }
      -- printDebug "[G.C.T.C.Parsers] HyperdataDocument" hd
      pure hd
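-- | Decode the raw key/value pairs produced by a parser into 'Text' and apply
-- a format-specific enrichment (key normalisation for WOS and Iramuteq,
-- 'presseEnrich' for RIS Presse).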
enrichWith :: FileType
           -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith RisPresse = enrichWith' presseEnrich
enrichWith WOS = enrichWith' (map (first WOS.keys))
enrichWith Iramuteq = enrichWith' (map (first Iramuteq.keys))
enrichWith _ = enrichWith' identity
enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
            -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' f = second (map both' . map f . concat)
  where
    both' = map (both decodeUtf8)
readFileWith :: FileType -> FilePath
             -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
readFileWith format path = do
  files <- case takeExtension path of
    ".zip" -> openZip path
    _      -> pure <$> clean <$> DB.readFile path
  partitionEithers <$> mapConcurrently (runParser format) files
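-- Example (sketch): 'readFileWith' accepts either a single file or a @.zip@
-- archive of files in the given format; the path is only illustrative:
--
-- >   (errs, parsed) <- readFileWith WOS "corpus/records.zip"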
-- According to the format of the text, choose the right parser.
-- TODO withParser :: FileType -> Parser [Document]
withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS = WOS.parser
withParser RIS = RIS.parser
withParser Iramuteq = Iramuteq.parser
--withParser ODT = odtParser
--withParser XML = xmlParser
withParser _ = panic "[ERROR] Parser not implemented yet"
runParser :: FileType -> DB.ByteString
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ runParser' format text

runParser' :: FileType -> DB.ByteString
           -> (Either String [[(DB.ByteString, DB.ByteString)]])
runParser' format text = parseOnly (withParser format) text
openZip :: FilePath -> IO [DB.ByteString]
openZip fp = do
  entries <- withArchive fp (DM.keys <$> getEntries)
  bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
  pure bs
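-- | 'clean' lifted to 'Text' (converting back and forth with 'cs').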
cleanText :: Text -> Text
cleanText = cs . clean . cs
clean :: DB.ByteString -> DB.ByteString
clean txt = DBC.map clean' txt
splitOn :: NgramsType -> Maybe Text -> Text -> [Text]
splitOn Authors (Just "WOS") = (DT.splitOn "; ")
splitOn _ _ = (DT.splitOn ", ")
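-- Example (sketch): WOS author fields are separated by semicolons, every
-- other field by commas:
--
-- >   splitOn Authors (Just "WOS") "Doe, J.; Smith, A." == ["Doe, J.", "Smith, A."]
-- >   splitOn NgramsTerms Nothing "one, two, three" == ["one", "two", "three"]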