2 Module : Gargantext.Core.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
21 {-# LANGUAGE PackageImports #-}
23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC, splitOn)
26 -- import Gargantext.Core.Text.Learn (detectLangDefault)
27 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
29 import Control.Concurrent.Async as CCA (mapConcurrently)
30 import Control.Monad (join)
31 import Control.Monad.Trans.Control (MonadBaseControl)
32 import Data.Attoparsec.ByteString (parseOnly, Parser)
33 import Data.Either(Either(..))
34 import Data.Either.Extra (partitionEithers)
35 import Data.List (concat, lookup)
37 import Data.String (String())
39 import Data.Text (Text, intercalate, pack, unpack)
40 import Data.Text.Encoding (decodeUtf8)
41 import Data.Tuple.Extra (both, first, second)
42 import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
43 import Gargantext.Core (Lang(..))
44 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
45 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
46 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
47 import Gargantext.Prelude
48 import System.FilePath (FilePath(), takeExtension)
49 import System.IO.Temp (emptySystemTempFile)
50 import qualified Data.ByteString as DB
51 import qualified Data.ByteString.Char8 as DBC
52 import qualified Data.ByteString.Lazy as DBL
53 import qualified Data.Map as DM
54 import qualified Data.Text as DT
55 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
56 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
57 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
58 import qualified Prelude
59 import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
60 ------------------------------------------------------------------------
62 type ParseError = String
64 --type Document = DM.Map Field Text
65 --type FilesParsed = DM.Map FilePath FileParsed
66 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
67 -- , _fileParsed_result :: [Document]
71 -- | According to the format of Input file,
72 -- different parser are available.
73 data FileType = WOS | RIS | RisPresse | CsvGargV3 | CsvHal
76 -- Implemented (ISI Format)
77 -- | DOC -- Not Implemented / import Pandoc
78 -- | ODT -- Not Implemented / import Pandoc
79 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
80 -- | XML -- Not Implemented / see :
82 parseFormatC :: MonadBaseControl IO m
86 -> m (Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument IO ()))
87 parseFormatC CsvGargV3 Plain bs = do
88 let eParsedC = parseCsvC $ DBL.fromStrict bs
90 Left err -> pure $ Left err
91 Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
92 parseFormatC CsvHal Plain bs = do
93 let eParsedC = parseCsvC $ DBL.fromStrict bs
95 Left err -> pure $ Left err
96 Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
97 parseFormatC RisPresse Plain bs = do
98 --docs <- enrichWith RisPresse
99 let eDocs = runParser' RisPresse bs
101 ( Just $ fromIntegral $ length docs
104 .| mapC (map $ both decodeUtf8)
105 .| mapMC (toDoc RIS)) ) <$> eDocs
106 parseFormatC WOS Plain bs = do
107 let eDocs = runParser' WOS bs
109 ( Just $ fromIntegral $ length docs
111 .| mapC (map $ first WOS.keys)
112 .| mapC (map $ both decodeUtf8)
113 .| mapMC (toDoc WOS)) ) <$> eDocs
114 parseFormatC ft ZIP bs = do
115 path <- liftBase $ emptySystemTempFile "parsed-zip"
116 liftBase $ DB.writeFile path bs
117 fileContents <- liftBase $ withArchive path $ do
118 files <- DM.keys <$> getEntries
120 --printDebug "[parseFormatC] fileContents" fileContents
121 eContents <- mapM (parseFormatC ft Plain) fileContents
122 --printDebug "[parseFormatC] contents" contents
123 --pure $ Left $ "Not implemented for ZIP"
124 let (errs, contents) = partitionEithers eContents
128 [] -> pure $ Left "No files in zip"
130 let lenghts = fst <$> contents
131 let contents' = snd <$> contents
132 let totalLength = sum $ sum <$> lenghts -- Trick: sum (Just 1) = 1, sum Nothing = 0
133 pure $ Right ( Just totalLength
134 , sequenceConduits contents' >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
135 _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
137 parseFormatC _ _ _ = undefined
139 -- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
140 -- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
141 -- parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
142 -- parseFormat RisPresse bs = do
143 -- docs <- mapM (toDoc RIS)
145 -- <$> enrichWith RisPresse
146 -- $ partitionEithers
147 -- $ [runParser' RisPresse bs]
149 -- parseFormat WOS bs = do
150 -- docs <- mapM (toDoc WOS)
152 -- <$> enrichWith WOS
153 -- $ partitionEithers
154 -- $ [runParser' WOS bs]
156 -- parseFormat ZIP bs = do
157 -- path <- emptySystemTempFile "parsed-zip"
158 -- DB.writeFile path bs
159 -- parsedZip <- withArchive path $ do
160 -- DM.keys <$> getEntries
161 -- pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
162 -- parseFormat _ _ = undefined
164 -- | Parse file into documents
165 -- TODO manage errors here
166 -- TODO: to debug maybe add the filepath in error message
168 parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
169 parseFile CsvHal Plain p = parseHal p
170 parseFile CsvGargV3 Plain p = parseCsv p
172 parseFile RisPresse Plain p = do
173 docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
176 parseFile WOS Plain p = do
177 docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
180 parseFile ff _ p = do
181 docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
184 toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
185 -- TODO use language for RIS
187 -- let abstract = lookup "abstract" d
188 let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
190 let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d
191 -- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
192 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
194 let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
195 , _hd_doi = lookup "doi" d
196 , _hd_url = lookup "URL" d
197 , _hd_uniqId = Nothing
198 , _hd_uniqIdBdd = Nothing
200 , _hd_title = lookup "title" d
201 , _hd_authors = lookup "authors" d
202 , _hd_institutes = lookup "institutes" d
203 , _hd_source = lookup "source" d
204 , _hd_abstract = lookup "abstract" d
205 , _hd_publication_date = fmap (DT.pack . show) utcTime
206 , _hd_publication_year = pub_year
207 , _hd_publication_month = pub_month
208 , _hd_publication_day = pub_day
209 , _hd_publication_hour = Nothing
210 , _hd_publication_minute = Nothing
211 , _hd_publication_second = Nothing
212 , _hd_language_iso2 = Just $ (DT.pack . show) lang }
213 -- printDebug "[G.C.T.C.Parsers] HyperdataDocument" hd
216 enrichWith :: FileType
217 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
218 enrichWith RisPresse = enrichWith' presseEnrich
219 enrichWith WOS = enrichWith' (map (first WOS.keys))
220 enrichWith _ = enrichWith' identity
223 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
224 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
225 enrichWith' f = second (map both' . map f . concat)
227 both' = map (both decodeUtf8)
231 readFileWith :: FileType -> FilePath
232 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
233 readFileWith format path = do
234 files <- case takeExtension path of
235 ".zip" -> openZip path
236 _ -> pure <$> clean <$> DB.readFile path
237 partitionEithers <$> mapConcurrently (runParser format) files
241 -- According to the format of the text, choose the right parser.
242 -- TODO withParser :: FileType -> Parser [Document]
243 withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
244 withParser WOS = WOS.parser
245 withParser RIS = RIS.parser
246 --withParser ODT = odtParser
247 --withParser XML = xmlParser
248 withParser _ = panic "[ERROR] Parser not implemented yet"
250 runParser :: FileType -> DB.ByteString
251 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
252 runParser format text = pure $ runParser' format text
254 runParser' :: FileType -> DB.ByteString
255 -> (Either String [[(DB.ByteString, DB.ByteString)]])
256 runParser' format text = parseOnly (withParser format) text
258 openZip :: FilePath -> IO [DB.ByteString]
260 entries <- withArchive fp (DM.keys <$> getEntries)
261 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
264 cleanText :: Text -> Text
265 cleanText = cs . clean . cs
267 clean :: DB.ByteString -> DB.ByteString
268 clean txt = DBC.map clean' txt
278 splitOn :: NgramsType -> Maybe Text -> Text -> [Text]
279 splitOn Authors (Just "WOS") = (DT.splitOn "; ")
280 splitOn _ _ = (DT.splitOn ", ")