]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers.hs
use gargantext-graph's confluence implementation in graph construction
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
19 -}
20
21 {-# LANGUAGE PackageImports #-}
22
23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC)
24 where
25
26 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
27 import Conduit
28 import Control.Concurrent.Async as CCA (mapConcurrently)
29 import Control.Monad.Trans.Control (MonadBaseControl)
30 import Control.Monad (join)
31 import Data.Attoparsec.ByteString (parseOnly, Parser)
32 import Data.Either(Either(..))
33 import Data.Either.Extra (partitionEithers)
34 import Data.List (concat, lookup)
35 import Data.Ord()
36 import Data.String (String())
37 import Data.String()
38 import Data.Text (Text, intercalate, pack, unpack)
39 import Data.Text.Encoding (decodeUtf8)
40 import Data.Tuple.Extra (both, first, second)
41 import System.FilePath (FilePath(), takeExtension)
42 import qualified Data.ByteString as DB
43 import qualified Data.ByteString.Char8 as DBC
44 import qualified Data.ByteString.Lazy as DBL
45 import qualified Data.Map as DM
46 import qualified Data.Text as DT
47 import qualified Prelude
48 import System.IO.Temp (emptySystemTempFile)
49
50 import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
51 import Gargantext.Core (Lang(..))
52 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
53 import Gargantext.Prelude
54 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
55 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
56 -- import Gargantext.Core.Text.Learn (detectLangDefault)
57 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
58 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
59 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
60 ------------------------------------------------------------------------
61
62 type ParseError = String
63 --type Field = Text
64 --type Document = DM.Map Field Text
65 --type FilesParsed = DM.Map FilePath FileParsed
66 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
67 -- , _fileParsed_result :: [Document]
68 -- } deriving (Show)
69
70
71 -- | According to the format of Input file,
72 -- different parser are available.
73 data FileType = WOS | RIS | RisPresse | CsvGargV3 | CsvHal
74 deriving (Show)
75
76 -- Implemented (ISI Format)
77 -- | DOC -- Not Implemented / import Pandoc
78 -- | ODT -- Not Implemented / import Pandoc
79 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
80 -- | XML -- Not Implemented / see :
81
82 parseFormatC :: MonadBaseControl IO m
83 => FileType
84 -> FileFormat
85 -> DB.ByteString
86 -> m (Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument IO ()))
87 parseFormatC CsvGargV3 Plain bs = do
88 let eParsedC = parseCsvC $ DBL.fromStrict bs
89 case eParsedC of
90 Left err -> pure $ Left err
91 Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
92 parseFormatC CsvHal Plain bs = do
93 let eParsedC = parseCsvC $ DBL.fromStrict bs
94 case eParsedC of
95 Left err -> pure $ Left err
96 Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
97 parseFormatC RisPresse Plain bs = do
98 --docs <- enrichWith RisPresse
99 let eDocs = runParser' RisPresse bs
100 pure $ (\docs ->
101 ( Just $ fromIntegral $ length docs
102 , yieldMany docs
103 .| mapC presseEnrich
104 .| mapC (map $ both decodeUtf8)
105 .| mapMC (toDoc RIS)) ) <$> eDocs
106 parseFormatC WOS Plain bs = do
107 let eDocs = runParser' WOS bs
108 pure $ (\docs ->
109 ( Just $ fromIntegral $ length docs
110 , yieldMany docs
111 .| mapC (map $ first WOS.keys)
112 .| mapC (map $ both decodeUtf8)
113 .| mapMC (toDoc WOS)) ) <$> eDocs
114 parseFormatC ft ZIP bs = do
115 path <- liftBase $ emptySystemTempFile "parsed-zip"
116 liftBase $ DB.writeFile path bs
117 fileContents <- liftBase $ withArchive path $ do
118 files <- DM.keys <$> getEntries
119 mapM getEntry files
120 --printDebug "[parseFormatC] fileContents" fileContents
121 eContents <- mapM (parseFormatC ft Plain) fileContents
122 --printDebug "[parseFormatC] contents" contents
123 --pure $ Left $ "Not implemented for ZIP"
124 let (errs, contents) = partitionEithers eContents
125 case errs of
126 [] ->
127 case contents of
128 [] -> pure $ Left "No files in zip"
129 _ -> do
130 let lenghts = fst <$> contents
131 let contents' = snd <$> contents
132 let totalLength = sum $ sum <$> lenghts -- Trick: sum (Just 1) = 1, sum Nothing = 0
133 pure $ Right ( Just totalLength
134 , sequenceConduits contents' >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
135 _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
136
137 parseFormatC _ _ _ = undefined
138
139 -- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
140 -- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
141 -- parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
142 -- parseFormat RisPresse bs = do
143 -- docs <- mapM (toDoc RIS)
144 -- <$> snd
145 -- <$> enrichWith RisPresse
146 -- $ partitionEithers
147 -- $ [runParser' RisPresse bs]
148 -- pure $ Right docs
149 -- parseFormat WOS bs = do
150 -- docs <- mapM (toDoc WOS)
151 -- <$> snd
152 -- <$> enrichWith WOS
153 -- $ partitionEithers
154 -- $ [runParser' WOS bs]
155 -- pure $ Right docs
156 -- parseFormat ZIP bs = do
157 -- path <- emptySystemTempFile "parsed-zip"
158 -- DB.writeFile path bs
159 -- parsedZip <- withArchive path $ do
160 -- DM.keys <$> getEntries
161 -- pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
162 -- parseFormat _ _ = undefined
163
164 -- | Parse file into documents
165 -- TODO manage errors here
166 -- TODO: to debug maybe add the filepath in error message
167
168 parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
169 parseFile CsvHal Plain p = parseHal p
170 parseFile CsvGargV3 Plain p = parseCsv p
171 parseFile RisPresse Plain p = do
172 docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
173 pure $ Right docs
174 parseFile WOS Plain p = do
175 docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
176 pure $ Right docs
177 parseFile ff _ p = do
178 docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
179 pure $ Right docs
180
181 toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
182 -- TODO use language for RIS
183 toDoc ff d = do
184 -- let abstract = lookup "abstract" d
185 let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
186
187 let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d
188
189 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
190
191 pure HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
192 , _hd_doi = lookup "doi" d
193 , _hd_url = lookup "URL" d
194 , _hd_uniqId = Nothing
195 , _hd_uniqIdBdd = Nothing
196 , _hd_page = Nothing
197 , _hd_title = lookup "title" d
198 , _hd_authors = Nothing
199 , _hd_institutes = lookup "authors" d
200 , _hd_source = lookup "source" d
201 , _hd_abstract = lookup "abstract" d
202 , _hd_publication_date = fmap (DT.pack . show) utcTime
203 , _hd_publication_year = pub_year
204 , _hd_publication_month = pub_month
205 , _hd_publication_day = pub_day
206 , _hd_publication_hour = Nothing
207 , _hd_publication_minute = Nothing
208 , _hd_publication_second = Nothing
209 , _hd_language_iso2 = Just $ (DT.pack . show) lang }
210
211 enrichWith :: FileType
212 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
213 enrichWith RisPresse = enrichWith' presseEnrich
214 enrichWith WOS = enrichWith' (map (first WOS.keys))
215 enrichWith _ = enrichWith' identity
216
217
218 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
219 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
220 enrichWith' f = second (map both' . map f . concat)
221 where
222 both' = map (both decodeUtf8)
223
224
225
226 readFileWith :: FileType -> FilePath
227 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
228 readFileWith format path = do
229 files <- case takeExtension path of
230 ".zip" -> openZip path
231 _ -> pure <$> clean <$> DB.readFile path
232 partitionEithers <$> mapConcurrently (runParser format) files
233
234
235 -- | withParser:
236 -- According to the format of the text, choose the right parser.
237 -- TODO withParser :: FileType -> Parser [Document]
238 withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
239 withParser WOS = WOS.parser
240 withParser RIS = RIS.parser
241 --withParser ODT = odtParser
242 --withParser XML = xmlParser
243 withParser _ = panic "[ERROR] Parser not implemented yet"
244
245 runParser :: FileType -> DB.ByteString
246 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
247 runParser format text = pure $ runParser' format text
248
249 runParser' :: FileType -> DB.ByteString
250 -> (Either String [[(DB.ByteString, DB.ByteString)]])
251 runParser' format text = parseOnly (withParser format) text
252
253 openZip :: FilePath -> IO [DB.ByteString]
254 openZip fp = do
255 entries <- withArchive fp (DM.keys <$> getEntries)
256 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
257 pure bs
258
259 cleanText :: Text -> Text
260 cleanText = cs . clean . cs
261
262 clean :: DB.ByteString -> DB.ByteString
263 clean txt = DBC.map clean' txt
264 where
265 clean' '’' = '\''
266 clean' '\r' = ' '
267 clean' '\t' = ' '
268 clean' ';' = '.'
269 clean' c = c