2 Module : Gargantext.Core.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
21 {-# LANGUAGE PackageImports #-}
23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC)
26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
28 import Control.Concurrent.Async as CCA (mapConcurrently)
29 import Control.Monad.Trans.Control (MonadBaseControl)
30 import Control.Monad (join)
31 import Data.Attoparsec.ByteString (parseOnly, Parser)
32 import Data.Either(Either(..))
33 import Data.Either.Extra (partitionEithers)
34 import Data.List (concat, lookup)
36 import Data.String (String())
38 import Data.Text (Text, intercalate, pack, unpack)
39 import Data.Text.Encoding (decodeUtf8)
40 import Data.Tuple.Extra (both, first, second)
41 import System.FilePath (FilePath(), takeExtension)
42 import qualified Data.ByteString as DB
43 import qualified Data.ByteString.Char8 as DBC
44 import qualified Data.ByteString.Lazy as DBL
45 import qualified Data.Map as DM
46 import qualified Data.Text as DT
47 import qualified Prelude
48 import System.IO.Temp (emptySystemTempFile)
50 import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
51 import Gargantext.Core (Lang(..))
52 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
53 import Gargantext.Prelude
54 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
55 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
56 -- import Gargantext.Core.Text.Learn (detectLangDefault)
57 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
58 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
59 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
60 ------------------------------------------------------------------------
62 type ParseError = String
64 --type Document = DM.Map Field Text
65 --type FilesParsed = DM.Map FilePath FileParsed
66 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
67 -- , _fileParsed_result :: [Document]
71 -- | According to the format of Input file,
72 -- different parser are available.
73 data FileType = WOS | RIS | RisPresse | CsvGargV3 | CsvHal
76 -- Implemented (ISI Format)
77 -- | DOC -- Not Implemented / import Pandoc
78 -- | ODT -- Not Implemented / import Pandoc
79 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
80 -- | XML -- Not Implemented / see :
82 parseFormatC :: MonadBaseControl IO m => FileType -> FileFormat -> DB.ByteString -> m (Either Prelude.String (ConduitT () HyperdataDocument IO ()))
83 parseFormatC CsvGargV3 Plain bs = pure $ transPipe (pure . runIdentity) <$> (parseCsvC $ DBL.fromStrict bs)
84 parseFormatC CsvHal Plain bs = pure $ transPipe (pure . runIdentity) <$> (parseCsvC $ DBL.fromStrict bs)
85 parseFormatC RisPresse Plain bs = do
86 --docs <- enrichWith RisPresse
87 let eDocs = runParser' RisPresse bs
88 pure $ (\docs -> yieldMany docs
90 .| mapC (map $ both decodeUtf8)
91 .| mapMC (toDoc RIS)) <$> eDocs
92 parseFormatC WOS Plain bs = do
93 let eDocs = runParser' WOS bs
94 pure $ (\docs -> yieldMany docs
95 .| mapC (map $ first WOS.keys)
96 .| mapC (map $ both decodeUtf8)
97 .| mapMC (toDoc WOS)) <$> eDocs
98 parseFormatC ft ZIP bs = do
99 path <- liftBase $ emptySystemTempFile "parsed-zip"
100 liftBase $ DB.writeFile path bs
101 fileContents <- liftBase $ withArchive path $ do
102 files <- DM.keys <$> getEntries
104 --printDebug "[parseFormatC] fileContents" fileContents
105 eContents <- mapM (parseFormatC ft Plain) fileContents
106 --printDebug "[parseFormatC] contents" contents
107 --pure $ Left $ "Not implemented for ZIP"
108 let (errs, contents) = partitionEithers eContents
112 [] -> pure $ Left "No files in zip"
113 _ -> pure $ Right $ ( sequenceConduits contents >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
114 _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
116 parseFormatC _ _ _ = undefined
118 -- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
119 -- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
120 -- parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
121 -- parseFormat RisPresse bs = do
122 -- docs <- mapM (toDoc RIS)
124 -- <$> enrichWith RisPresse
125 -- $ partitionEithers
126 -- $ [runParser' RisPresse bs]
128 -- parseFormat WOS bs = do
129 -- docs <- mapM (toDoc WOS)
131 -- <$> enrichWith WOS
132 -- $ partitionEithers
133 -- $ [runParser' WOS bs]
135 -- parseFormat ZIP bs = do
136 -- path <- emptySystemTempFile "parsed-zip"
137 -- DB.writeFile path bs
138 -- parsedZip <- withArchive path $ do
139 -- DM.keys <$> getEntries
140 -- pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
141 -- parseFormat _ _ = undefined
143 -- | Parse file into documents
144 -- TODO manage errors here
145 -- TODO: to debug maybe add the filepath in error message
146 parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
147 parseFile CsvHal Plain p = parseHal p
148 parseFile CsvGargV3 Plain p = parseCsv p
149 parseFile RisPresse Plain p = do
150 docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
152 parseFile WOS Plain p = do
153 docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
155 parseFile ff _ p = do
156 docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
159 toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
160 -- TODO use language for RIS
162 -- let abstract = lookup "abstract" d
163 let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
165 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
167 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
169 pure $ HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
170 , _hd_doi = lookup "doi" d
171 , _hd_url = lookup "URL" d
172 , _hd_uniqId = Nothing
173 , _hd_uniqIdBdd = Nothing
175 , _hd_title = lookup "title" d
176 , _hd_authors = Nothing
177 , _hd_institutes = lookup "authors" d
178 , _hd_source = lookup "source" d
179 , _hd_abstract = lookup "abstract" d
180 , _hd_publication_date = fmap (DT.pack . show) utcTime
181 , _hd_publication_year = pub_year
182 , _hd_publication_month = pub_month
183 , _hd_publication_day = pub_day
184 , _hd_publication_hour = Nothing
185 , _hd_publication_minute = Nothing
186 , _hd_publication_second = Nothing
187 , _hd_language_iso2 = Just $ (DT.pack . show) lang }
189 enrichWith :: FileType
190 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
191 enrichWith RisPresse = enrichWith' presseEnrich
192 enrichWith WOS = enrichWith' (map (first WOS.keys))
193 enrichWith _ = enrichWith' identity
196 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
197 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
198 enrichWith' f = second (map both' . map f . concat)
200 both' = map (both decodeUtf8)
204 readFileWith :: FileType -> FilePath
205 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
206 readFileWith format path = do
207 files <- case takeExtension path of
208 ".zip" -> openZip path
209 _ -> pure <$> clean <$> DB.readFile path
210 partitionEithers <$> mapConcurrently (runParser format) files
214 -- According to the format of the text, choose the right parser.
215 -- TODO withParser :: FileType -> Parser [Document]
216 withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
217 withParser WOS = WOS.parser
218 withParser RIS = RIS.parser
219 --withParser ODT = odtParser
220 --withParser XML = xmlParser
221 withParser _ = panic "[ERROR] Parser not implemented yet"
223 runParser :: FileType -> DB.ByteString
224 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
225 runParser format text = pure $ runParser' format text
227 runParser' :: FileType -> DB.ByteString
228 -> (Either String [[(DB.ByteString, DB.ByteString)]])
229 runParser' format text = parseOnly (withParser format) text
231 openZip :: FilePath -> IO [DB.ByteString]
233 entries <- withArchive fp (DM.keys <$> getEntries)
234 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
237 cleanText :: Text -> Text
238 cleanText = cs . clean . cs
240 clean :: DB.ByteString -> DB.ByteString
241 clean txt = DBC.map clean' txt