]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers.hs
[VERSION] +1 to 0.0.6.9.9.4.3
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers
3 Description : All parsers of Gargantext in one file.
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Gargantext enables analyzing semi-structured text that should be parsed
11 in order to be analyzed.
12
13 The parsers suppose we know the format of the Text (TextFormat data
14 type) according to which the right parser is chosen among the list of
15 available parsers.
16
17 This module mainly describe how to add a new parser to Gargantext,
18 please follow the types.
19 -}
20
21 {-# LANGUAGE PackageImports #-}
22
23 module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC, splitOn, etale)
24 where
25
26 -- import Gargantext.Core.Text.Learn (detectLangDefault)
27 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
28 import Conduit
29 import Control.Concurrent.Async as CCA (mapConcurrently)
30 import Control.Monad (join)
31 import Control.Monad.Trans.Control (MonadBaseControl)
32 import Data.Attoparsec.ByteString (parseOnly, Parser)
33 import Data.Either(Either(..))
34 import Data.Either.Extra (partitionEithers)
35 import Data.List (concat, lookup)
36 import Data.Ord()
37 import Data.String (String())
38 import Data.String()
39 import Data.Text (Text, intercalate, pack, unpack)
40 import Data.Text.Encoding (decodeUtf8)
41 import Data.Tuple.Extra (both, first, second)
42 import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
43 import Gargantext.Core (Lang(..))
44 import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
45 import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
46 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
47 import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
48 import Gargantext.Prelude
49 import System.FilePath (FilePath(), takeExtension)
50 import System.IO.Temp (emptySystemTempFile)
51 import Gargantext.Core.Text.Corpus.Parsers.FrameWrite (text2titleParagraphs)
52 import qualified Data.ByteString as DB
53 import qualified Data.ByteString.Char8 as DBC
54 import qualified Data.ByteString.Lazy as DBL
55 import qualified Data.Map as DM
56 import qualified Data.Text as DT
57 import qualified Data.Text as Text
58 import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
59 import qualified Gargantext.Core.Text.Corpus.Parsers.Iramuteq as Iramuteq
60 import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
61 import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
62 import qualified Prelude
63 ------------------------------------------------------------------------
64
65 type ParseError = String
66 --type Field = Text
67 --type Document = DM.Map Field Text
68 --type FilesParsed = DM.Map FilePath FileParsed
69 --data FileParsed = FileParsed { _fileParsed_errors :: Maybe ParseError
70 -- , _fileParsed_result :: [Document]
71 -- } deriving (Show)
72
73
74 -- | According to the format of Input file,
75 -- different parser are available.
76 data FileType = WOS
77 | RIS
78 | RisPresse
79 | CsvGargV3
80 | CsvHal
81 | Iramuteq
82 deriving (Show)
83
84 -- Implemented (ISI Format)
85 -- | DOC -- Not Implemented / import Pandoc
86 -- | ODT -- Not Implemented / import Pandoc
87 -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
88 -- | XML -- Not Implemented / see :
89
90 parseFormatC :: MonadBaseControl IO m
91 => FileType
92 -> FileFormat
93 -> DB.ByteString
94 -> m (Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument IO ()))
95 parseFormatC CsvGargV3 Plain bs = do
96 let eParsedC = parseCsvC $ DBL.fromStrict bs
97 case eParsedC of
98 Left err -> pure $ Left err
99 Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
100 parseFormatC CsvHal Plain bs = do
101 let eParsedC = parseCsvC $ DBL.fromStrict bs
102 case eParsedC of
103 Left err -> pure $ Left err
104 Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
105 parseFormatC RisPresse Plain bs = do
106 --docs <- enrichWith RisPresse
107 let eDocs = runParser' RisPresse bs
108 pure $ (\docs ->
109 ( Just $ fromIntegral $ length docs
110 , yieldMany docs
111 .| mapC presseEnrich
112 .| mapC (map $ both decodeUtf8)
113 .| mapMC (toDoc RIS)) ) <$> eDocs
114 parseFormatC WOS Plain bs = do
115 let eDocs = runParser' WOS bs
116 pure $ (\docs ->
117 ( Just $ fromIntegral $ length docs
118 , yieldMany docs
119 .| mapC (map $ first WOS.keys)
120 .| mapC (map $ both decodeUtf8)
121 .| mapMC (toDoc WOS)) ) <$> eDocs
122
123 parseFormatC Iramuteq Plain bs = do
124 let eDocs = runParser' Iramuteq bs
125 pure $ (\docs ->
126 ( Just $ fromIntegral $ length docs
127 , yieldMany docs
128 .| mapC (map $ first Iramuteq.keys)
129 .| mapC (map $ both decodeUtf8)
130 .| mapMC ((toDoc Iramuteq) . (map (second (Text.replace "_" " "))))
131 )
132 )
133 <$> eDocs
134
135 parseFormatC ft ZIP bs = do
136 path <- liftBase $ emptySystemTempFile "parsed-zip"
137 liftBase $ DB.writeFile path bs
138 fileContents <- liftBase $ withArchive path $ do
139 files <- DM.keys <$> getEntries
140 mapM getEntry files
141 --printDebug "[parseFormatC] fileContents" fileContents
142 eContents <- mapM (parseFormatC ft Plain) fileContents
143 --printDebug "[parseFormatC] contents" contents
144 --pure $ Left $ "Not implemented for ZIP"
145 let (errs, contents) = partitionEithers eContents
146 case errs of
147 [] ->
148 case contents of
149 [] -> pure $ Left "No files in zip"
150 _ -> do
151 let lenghts = fst <$> contents
152 let contents' = snd <$> contents
153 let totalLength = sum $ sum <$> lenghts -- Trick: sum (Just 1) = 1, sum Nothing = 0
154 pure $ Right ( Just totalLength
155 , sequenceConduits contents' >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
156 _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs
157
158 parseFormatC _ _ _ = undefined
159
160
161 etale :: [HyperdataDocument] -> [HyperdataDocument]
162 etale = concat . (map etale')
163 where
164 etale' :: HyperdataDocument -> [HyperdataDocument]
165 etale' h = map (\t -> h { _hd_abstract = Just t })
166 $ map snd
167 $ text2titleParagraphs 7 (maybe "" identity $ _hd_abstract h)
168
169
170 -- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
171 -- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
172 -- parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
173 -- parseFormat RisPresse bs = do
174 -- docs <- mapM (toDoc RIS)
175 -- <$> snd
176 -- <$> enrichWith RisPresse
177 -- $ partitionEithers
178 -- $ [runParser' RisPresse bs]
179 -- pure $ Right docs
180 -- parseFormat WOS bs = do
181 -- docs <- mapM (toDoc WOS)
182 -- <$> snd
183 -- <$> enrichWith WOS
184 -- $ partitionEithers
185 -- $ [runParser' WOS bs]
186 -- pure $ Right docs
187 -- parseFormat ZIP bs = do
188 -- path <- emptySystemTempFile "parsed-zip"
189 -- DB.writeFile path bs
190 -- parsedZip <- withArchive path $ do
191 -- DM.keys <$> getEntries
192 -- pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
193 -- parseFormat _ _ = undefined
194
195 -- | Parse file into documents
196 -- TODO manage errors here
197 -- TODO: to debug maybe add the filepath in error message
198
199 parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
200 parseFile CsvHal Plain p = parseHal p
201 parseFile CsvGargV3 Plain p = parseCsv p
202
203 parseFile RisPresse Plain p = do
204 docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
205 pure $ Right docs
206
207 parseFile WOS Plain p = do
208 docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
209 pure $ Right docs
210
211 parseFile Iramuteq Plain p = do
212 docs <- join $ mapM ((toDoc Iramuteq) . (map (second (Text.replace "_" " "))))
213 <$> snd
214 <$> enrichWith Iramuteq
215 <$> readFileWith Iramuteq p
216 pure $ Right docs
217
218
219 parseFile ff _ p = do
220 docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
221 pure $ Right docs
222
223 toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
224 -- TODO use language for RIS
225 toDoc ff d = do
226 -- let abstract = lookup "abstract" d
227 let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))
228
229 let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d
230 -- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
231 (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse
232
233 let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
234 , _hd_doi = lookup "doi" d
235 , _hd_url = lookup "URL" d
236 , _hd_uniqId = Nothing
237 , _hd_uniqIdBdd = Nothing
238 , _hd_page = Nothing
239 , _hd_title = lookup "title" d
240 , _hd_authors = lookup "authors" d
241 , _hd_institutes = lookup "institutes" d
242 , _hd_source = lookup "source" d
243 , _hd_abstract = lookup "abstract" d
244 , _hd_publication_date = fmap (DT.pack . show) utcTime
245 , _hd_publication_year = pub_year
246 , _hd_publication_month = pub_month
247 , _hd_publication_day = pub_day
248 , _hd_publication_hour = Nothing
249 , _hd_publication_minute = Nothing
250 , _hd_publication_second = Nothing
251 , _hd_language_iso2 = Just $ (DT.pack . show) lang }
252 -- printDebug "[G.C.T.C.Parsers] HyperdataDocument" hd
253 pure hd
254
255 enrichWith :: FileType
256 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
257 enrichWith RisPresse = enrichWith' presseEnrich
258 enrichWith WOS = enrichWith' (map (first WOS.keys))
259 enrichWith Iramuteq = enrichWith' (map (first Iramuteq.keys))
260 enrichWith _ = enrichWith' identity
261
262
263 enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
264 -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
265 enrichWith' f = second (map both' . map f . concat)
266 where
267 both' = map (both decodeUtf8)
268
269
270
271 readFileWith :: FileType -> FilePath
272 -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
273 readFileWith format path = do
274 files <- case takeExtension path of
275 ".zip" -> openZip path
276 _ -> pure <$> clean <$> DB.readFile path
277 partitionEithers <$> mapConcurrently (runParser format) files
278
279
280 -- | withParser:
281 -- According to the format of the text, choose the right parser.
282 -- TODO withParser :: FileType -> Parser [Document]
283 withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
284 withParser WOS = WOS.parser
285 withParser RIS = RIS.parser
286 withParser Iramuteq = Iramuteq.parser
287 --withParser ODT = odtParser
288 --withParser XML = xmlParser
289 withParser _ = panic "[ERROR] Parser not implemented yet"
290
291 runParser :: FileType -> DB.ByteString
292 -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
293 runParser format text = pure $ runParser' format text
294
295 runParser' :: FileType -> DB.ByteString
296 -> (Either String [[(DB.ByteString, DB.ByteString)]])
297 runParser' format text = parseOnly (withParser format) text
298
299 openZip :: FilePath -> IO [DB.ByteString]
300 openZip fp = do
301 entries <- withArchive fp (DM.keys <$> getEntries)
302 bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
303 pure bs
304
305 cleanText :: Text -> Text
306 cleanText = cs . clean . cs
307
308 clean :: DB.ByteString -> DB.ByteString
309 clean txt = DBC.map clean' txt
310 where
311 clean' '’' = '\''
312 clean' '\r' = ' '
313 clean' '\t' = ' '
314 clean' ';' = '.'
315 clean' c = c
316
317 --
318
319 splitOn :: NgramsType -> Maybe Text -> Text -> [Text]
320 splitOn Authors (Just "WOS") = (DT.splitOn "; ")
321 splitOn _ _ = (DT.splitOn ", ")
322