2 Module : Gargantext.Core.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
21 {-# LANGUAGE PackageImports #-}
23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC, splitOn)
26 -- import Gargantext.Core.Text.Learn (detectLangDefault)
27 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
29 import Control.Concurrent.Async as CCA (mapConcurrently)
30 import Control.Monad (join)
31 import Control.Monad.Trans.Control (MonadBaseControl)
32 import Data.Attoparsec.ByteString (parseOnly, Parser)
33 import Data.Either(Either(..))
34 import Data.Either.Extra (partitionEithers)
35 import Data.List (concat, lookup)
37 import Data.String (String())
39 import Data.Text (Text, intercalate, pack, unpack)
40 import Data.Text.Encoding (decodeUtf8)
41 import Data.Tuple.Extra (both, first, second)
42 import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
43 import Gargantext.Core (Lang(..))
44 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
45 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
46 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
47 import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
48 import Gargantext.Prelude
49 import System.FilePath (FilePath(), takeExtension)
50 import System.IO.Temp (emptySystemTempFile)
51 import qualified Data.ByteString as DB
52 import qualified Data.ByteString.Char8 as DBC
53 import qualified Data.ByteString.Lazy as DBL
54 import qualified Data.Map as DM
55 import qualified Data.Text as DT
56 import qualified Data.Text as Text
57 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
58 import qualified Gargantext.Core.Text.Corpus.Parsers.Iramuteq as Iramuteq
59 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
60 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
61 import qualified Prelude
62 ------------------------------------------------------------------------
64 type ParseError = String
66 --type Document = DM.Map Field Text
67 --type FilesParsed = DM.Map FilePath FileParsed
68 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
69 -- , _fileParsed_result :: [Document]
73 -- | According to the format of Input file,
74 -- different parser are available.
83 -- Implemented (ISI Format)
84 -- | DOC -- Not Implemented / import Pandoc
85 -- | ODT -- Not Implemented / import Pandoc
86 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
87 -- | XML -- Not Implemented / see :
89 parseFormatC :: MonadBaseControl IO m
93 -> m (Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument IO ()))
94 parseFormatC CsvGargV3 Plain bs = do
95 let eParsedC = parseCsvC $ DBL.fromStrict bs
97 Left err -> pure $ Left err
98 Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
99 parseFormatC CsvHal Plain bs = do
100 let eParsedC = parseCsvC $ DBL.fromStrict bs
102 Left err -> pure $ Left err
103 Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
104 parseFormatC RisPresse Plain bs = do
105 --docs <- enrichWith RisPresse
106 let eDocs = runParser' RisPresse bs
108 ( Just $ fromIntegral $ length docs
111 .| mapC (map $ both decodeUtf8)
112 .| mapMC (toDoc RIS)) ) <$> eDocs
113 parseFormatC WOS Plain bs = do
114 let eDocs = runParser' WOS bs
116 ( Just $ fromIntegral $ length docs
118 .| mapC (map $ first WOS.keys)
119 .| mapC (map $ both decodeUtf8)
120 .| mapMC (toDoc WOS)) ) <$> eDocs
122 parseFormatC Iramuteq Plain bs = do
123 let eDocs = runParser' Iramuteq bs
125 ( Just $ fromIntegral $ length docs
127 .| mapC (map $ first Iramuteq.keys)
128 .| mapC (map $ both decodeUtf8)
129 .| mapMC ((toDoc Iramuteq) . (map (second (Text.replace "_" " ")))) ))<$> eDocs
131 parseFormatC ft ZIP bs = do
132 path <- liftBase $ emptySystemTempFile "parsed-zip"
133 liftBase $ DB.writeFile path bs
134 fileContents <- liftBase $ withArchive path $ do
135 files <- DM.keys <$> getEntries
137 --printDebug "[parseFormatC] fileContents" fileContents
138 eContents <- mapM (parseFormatC ft Plain) fileContents
139 --printDebug "[parseFormatC] contents" contents
140 --pure $ Left $ "Not implemented for ZIP"
141 let (errs, contents) = partitionEithers eContents
145 [] -> pure $ Left "No files in zip"
147 let lenghts = fst <$> contents
148 let contents' = snd <$> contents
149 let totalLength = sum $ sum <$> lenghts -- Trick: sum (Just 1) = 1, sum Nothing = 0
150 pure $ Right ( Just totalLength
151 , sequenceConduits contents' >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
152 _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
154 parseFormatC _ _ _ = undefined
156 -- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
157 -- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
158 -- parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
159 -- parseFormat RisPresse bs = do
160 -- docs <- mapM (toDoc RIS)
162 -- <$> enrichWith RisPresse
163 -- $ partitionEithers
164 -- $ [runParser' RisPresse bs]
166 -- parseFormat WOS bs = do
167 -- docs <- mapM (toDoc WOS)
169 -- <$> enrichWith WOS
170 -- $ partitionEithers
171 -- $ [runParser' WOS bs]
173 -- parseFormat ZIP bs = do
174 -- path <- emptySystemTempFile "parsed-zip"
175 -- DB.writeFile path bs
176 -- parsedZip <- withArchive path $ do
177 -- DM.keys <$> getEntries
178 -- pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
179 -- parseFormat _ _ = undefined
181 -- | Parse file into documents
182 -- TODO manage errors here
183 -- TODO: to debug maybe add the filepath in error message
185 parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
186 parseFile CsvHal Plain p = parseHal p
187 parseFile CsvGargV3 Plain p = parseCsv p
189 parseFile RisPresse Plain p = do
190 docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
193 parseFile WOS Plain p = do
194 docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
197 parseFile Iramuteq Plain p = do
198 docs <- join $ mapM ((toDoc Iramuteq) . (map (second (Text.replace "_" " "))))
200 <$> enrichWith Iramuteq
201 <$> readFileWith Iramuteq p
205 parseFile ff _ p = do
206 docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
209 toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
210 -- TODO use language for RIS
212 -- let abstract = lookup "abstract" d
213 let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
215 let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d
216 -- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
217 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
219 let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
220 , _hd_doi = lookup "doi" d
221 , _hd_url = lookup "URL" d
222 , _hd_uniqId = Nothing
223 , _hd_uniqIdBdd = Nothing
225 , _hd_title = lookup "title" d
226 , _hd_authors = lookup "authors" d
227 , _hd_institutes = lookup "institutes" d
228 , _hd_source = lookup "source" d
229 , _hd_abstract = lookup "abstract" d
230 , _hd_publication_date = fmap (DT.pack . show) utcTime
231 , _hd_publication_year = pub_year
232 , _hd_publication_month = pub_month
233 , _hd_publication_day = pub_day
234 , _hd_publication_hour = Nothing
235 , _hd_publication_minute = Nothing
236 , _hd_publication_second = Nothing
237 , _hd_language_iso2 = Just $ (DT.pack . show) lang }
238 -- printDebug "[G.C.T.C.Parsers] HyperdataDocument" hd
241 enrichWith :: FileType
242 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
243 enrichWith RisPresse = enrichWith' presseEnrich
244 enrichWith WOS = enrichWith' (map (first WOS.keys))
245 enrichWith Iramuteq = enrichWith' (map (first Iramuteq.keys))
246 enrichWith _ = enrichWith' identity
249 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
250 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
251 enrichWith' f = second (map both' . map f . concat)
253 both' = map (both decodeUtf8)
257 readFileWith :: FileType -> FilePath
258 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
259 readFileWith format path = do
260 files <- case takeExtension path of
261 ".zip" -> openZip path
262 _ -> pure <$> clean <$> DB.readFile path
263 partitionEithers <$> mapConcurrently (runParser format) files
267 -- According to the format of the text, choose the right parser.
268 -- TODO withParser :: FileType -> Parser [Document]
269 withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
270 withParser WOS = WOS.parser
271 withParser RIS = RIS.parser
272 withParser Iramuteq = Iramuteq.parser
273 --withParser ODT = odtParser
274 --withParser XML = xmlParser
275 withParser _ = panic "[ERROR] Parser not implemented yet"
277 runParser :: FileType -> DB.ByteString
278 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
279 runParser format text = pure $ runParser' format text
281 runParser' :: FileType -> DB.ByteString
282 -> (Either String [[(DB.ByteString, DB.ByteString)]])
283 runParser' format text = parseOnly (withParser format) text
285 openZip :: FilePath -> IO [DB.ByteString]
287 entries <- withArchive fp (DM.keys <$> getEntries)
288 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
291 cleanText :: Text -> Text
292 cleanText = cs . clean . cs
294 clean :: DB.ByteString -> DB.ByteString
295 clean txt = DBC.map clean' txt
305 splitOn :: NgramsType -> Maybe Text -> Text -> [Text]
306 splitOn Authors (Just "WOS") = (DT.splitOn "; ")
307 splitOn _ _ = (DT.splitOn ", ")