{-|
Module      : Gargantext.Core.Text.Corpus.Parsers
Description : All parsers of Gargantext in one file.
Copyright   : (c) CNRS, 2017
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Gargantext analyzes semi-structured text, which has to be parsed before it
can be analyzed.

The parsers assume the format of the text is known (the FileType data
type); according to it, the right parser is chosen from the list of
available parsers.

This module mainly describes how to add a new parser to Gargantext;
please follow the types (a short sketch is given below, after the
'FileType' declaration).
-}

{-# LANGUAGE PackageImports #-}

module Gargantext.Core.Text.Corpus.Parsers (FileFormat(..), FileType(..), clean, parseFile, cleanText, parseFormatC, splitOn, etale)
  where

-- import Gargantext.Core.Text.Learn (detectLangDefault)
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Conduit
import Control.Concurrent.Async as CCA (mapConcurrently)
import Control.Monad (join)
import Control.Monad.Trans.Control (MonadBaseControl)
import Data.Attoparsec.ByteString (parseOnly, Parser)
import Data.Either (Either(..))
import Data.Either.Extra (partitionEithers)
import Data.List (concat, lookup)
import Data.Ord()
import Data.String (String())
import Data.String()
import Data.Text (Text, intercalate, pack, unpack)
import Data.Text.Encoding (decodeUtf8)
import Data.Tuple.Extra (both, first, second)
import Gargantext.API.Node.Corpus.New.Types (FileFormat(..))
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.CSV (parseHal, parseCsv, parseCsvC)
import Gargantext.Core.Text.Corpus.Parsers.JSON (parseJSONC)
import Gargantext.Core.Text.Corpus.Parsers.RIS.Presse (presseEnrich)
import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Database.Query.Table.Ngrams (NgramsType(..))
import Gargantext.Prelude
import System.FilePath (FilePath(), takeExtension)
import System.IO.Temp (emptySystemTempFile)
import Gargantext.Core.Text.Corpus.Parsers.FrameWrite (text2titleParagraphs)
import qualified Data.ByteString as DB
import qualified Data.ByteString.Char8 as DBC
import qualified Data.ByteString.Lazy as DBL
import qualified Data.Map as DM
import qualified Data.Text as DT
import qualified Data.Text as Text
import qualified Gargantext.Core.Text.Corpus.Parsers.Date as Date
import qualified Gargantext.Core.Text.Corpus.Parsers.Iramuteq as Iramuteq
import qualified Gargantext.Core.Text.Corpus.Parsers.RIS as RIS
import qualified Gargantext.Core.Text.Corpus.Parsers.WOS as WOS
import qualified Prelude
------------------------------------------------------------------------

type ParseError = String
--type Field       = Text
--type Document    = DM.Map Field Text
--type FilesParsed = DM.Map FilePath FileParsed
--data FileParsed  = FileParsed { _fileParsed_errors :: Maybe ParseError
--                              , _fileParsed_result :: [Document]
--                              } deriving (Show)


-- | According to the format of the input file,
-- different parsers are available.
data FileType = WOS
              | RIS
              | RisPresse
              | CsvGargV3
              | CsvHal
              | Iramuteq
              | JSON
  deriving (Show, Eq)

-- Implemented (ISI Format)
-- | DOC -- Not Implemented / import Pandoc
-- | ODT -- Not Implemented / import Pandoc
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | XML -- Not Implemented / see :

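-- A minimal sketch of what adding a new parser involves (the names below,
-- e.g. a hypothetical @MyFormat@, are illustrative and not part of the code):
--
--   * add a constructor to 'FileType' (e.g. @| MyFormat@)
--   * give it an attoparsec parser in 'withParser'
--     (e.g. @withParser MyFormat = MyFormat.parser@)
--   * optionally map its raw keys in 'enrichWith'
--   * handle it in 'parseFormatC' / 'parseFile' if it needs a dedicated
--     pipeline (as CSV and JSON do below)
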
parseFormatC :: MonadBaseControl IO m
             => FileType
             -> FileFormat
             -> DB.ByteString
             -> m (Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument IO ()))
parseFormatC CsvGargV3 Plain bs = do
  let eParsedC = parseCsvC $ DBL.fromStrict bs
  case eParsedC of
    Left err -> pure $ Left err
    Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
parseFormatC CsvHal Plain bs = do
  let eParsedC = parseCsvC $ DBL.fromStrict bs
  case eParsedC of
    Left err -> pure $ Left err
    Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)
parseFormatC RisPresse Plain bs = do
  --docs <- enrichWith RisPresse
  let eDocs = runParser' RisPresse bs
  pure $ (\docs ->
            ( Just $ fromIntegral $ length docs
            , yieldMany docs
              .| mapC presseEnrich
              .| mapC (map $ both decodeUtf8)
              .| mapMC (toDoc RIS)) ) <$> eDocs
parseFormatC WOS Plain bs = do
  let eDocs = runParser' WOS bs
  pure $ (\docs ->
            ( Just $ fromIntegral $ length docs
            , yieldMany docs
              .| mapC (map $ first WOS.keys)
              .| mapC (map $ both decodeUtf8)
              .| mapMC (toDoc WOS)) ) <$> eDocs

parseFormatC Iramuteq Plain bs = do
  let eDocs = runParser' Iramuteq bs
  pure $ (\docs ->
            ( Just $ fromIntegral $ length docs
            , yieldMany docs
              .| mapC (map $ first Iramuteq.keys)
              .| mapC (map $ both decodeUtf8)
              .| mapMC ((toDoc Iramuteq) . (map (second (Text.replace "_" " "))))
            )
         )
    <$> eDocs

parseFormatC JSON Plain bs = do
  let eParsedC = parseJSONC $ DBL.fromStrict bs
  case eParsedC of
    Left err -> pure $ Left err
    Right (mLen, parsedC) -> pure $ Right (mLen, transPipe (pure . runIdentity) parsedC)

parseFormatC ft ZIP bs = do
  path <- liftBase $ emptySystemTempFile "parsed-zip"
  liftBase $ DB.writeFile path bs
  fileContents <- liftBase $ withArchive path $ do
    files <- DM.keys <$> getEntries
    mapM getEntry files
  --printDebug "[parseFormatC] fileContents" fileContents
  eContents <- mapM (parseFormatC ft Plain) fileContents
  --printDebug "[parseFormatC] contents" contents
  --pure $ Left $ "Not implemented for ZIP"
  let (errs, contents) = partitionEithers eContents
  case errs of
    [] ->
      case contents of
        [] -> pure $ Left "No files in zip"
        _  -> do
          let lengths = fst <$> contents
          let contents' = snd <$> contents
          let totalLength = sum $ sum <$> lengths -- Trick: sum (Just 1) = 1, sum Nothing = 0
          pure $ Right ( Just totalLength
                       , sequenceConduits contents' >> pure () ) -- .| mapM_C (printDebug "[parseFormatC] doc")
    _ -> pure $ Left $ unpack $ intercalate "\n" $ pack <$> errs

parseFormatC _ _ _ = undefined

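-- A rough usage sketch for 'parseFormatC' (the @bs :: DB.ByteString@ input is
-- assumed to hold a Web of Science export; error handling kept minimal):
--
-- > eRes <- parseFormatC WOS Plain bs
-- > case eRes of
-- >   Left err          -> putStrLn err
-- >   Right (mLen, src) -> do
-- >     docs <- runConduit $ src .| sinkList
-- >     print (mLen, length docs)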
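-- | "Spread out" a list of documents: each document becomes one document per
-- chunk of its abstract, as cut by 'text2titleParagraphs'.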
etale :: [HyperdataDocument] -> [HyperdataDocument]
etale = concat . (map etale')
  where
    etale' :: HyperdataDocument -> [HyperdataDocument]
    etale' h = map (\t -> h { _hd_abstract = Just t })
             $ map snd
             $ text2titleParagraphs 7 (maybe "" identity $ _hd_abstract h)


-- parseFormat :: FileType -> DB.ByteString -> IO (Either Prelude.String [HyperdataDocument])
-- parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
-- parseFormat CsvHal bs = pure $ parseHal' $ DBL.fromStrict bs
-- parseFormat RisPresse bs = do
--   docs <- mapM (toDoc RIS)
--           <$> snd
--           <$> enrichWith RisPresse
--           $ partitionEithers
--           $ [runParser' RisPresse bs]
--   pure $ Right docs
-- parseFormat WOS bs = do
--   docs <- mapM (toDoc WOS)
--           <$> snd
--           <$> enrichWith WOS
--           $ partitionEithers
--           $ [runParser' WOS bs]
--   pure $ Right docs
-- parseFormat ZIP bs = do
--   path <- emptySystemTempFile "parsed-zip"
--   DB.writeFile path bs
--   parsedZip <- withArchive path $ do
--     DM.keys <$> getEntries
--   pure $ Left $ "Not implemented for ZIP, parsedZip" <> show parsedZip
-- parseFormat _ _ = undefined

-- | Parse file into documents
-- TODO manage errors here
-- TODO: to debug maybe add the filepath in error message

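-- A small usage sketch (the path below is hypothetical):
--
-- > eDocs <- parseFile WOS Plain "/tmp/savedrecs.txt"
-- > either putStrLn (print . length) eDocs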
parseFile :: FileType -> FileFormat -> FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseFile CsvHal    Plain p = parseHal p
parseFile CsvGargV3 Plain p = parseCsv p

parseFile RisPresse Plain p = do
  docs <- join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
  pure $ Right docs

parseFile WOS Plain p = do
  docs <- join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS <$> readFileWith WOS p
  pure $ Right docs

parseFile Iramuteq Plain p = do
  docs <- join $ mapM ((toDoc Iramuteq) . (map (second (Text.replace "_" " "))))
               <$> snd
               <$> enrichWith Iramuteq
               <$> readFileWith Iramuteq p
  pure $ Right docs


parseFile ff _ p = do
  docs <- join $ mapM (toDoc ff) <$> snd <$> enrichWith ff <$> readFileWith ff p
  pure $ Right docs

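-- A sketch of the key/value input 'toDoc' expects (keys come from the
-- format-specific key mappings, e.g. 'WOS.keys'; the values are illustrative):
--
-- > toDoc WOS [ ("title", "Some title")
-- >           , ("authors", "Doe, J.")
-- >           , ("abstract", "Some abstract.")
-- >           , ("PY", "2017") ]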
toDoc :: FileType -> [(Text, Text)] -> IO HyperdataDocument
-- TODO use language for RIS
toDoc ff d = do
  -- let abstract = lookup "abstract" d
  let lang = EN -- maybe EN identity (join $ detectLangDefault <$> (fmap (DT.take 50) abstract))

  let dateToParse = DT.replace " " "" <$> lookup "PY" d -- <> Just " " <> lookup "publication_date" d
  -- printDebug "[G.C.T.C.Parsers] dateToParse" dateToParse
  (utcTime, (pub_year, pub_month, pub_day)) <- Date.dateSplit lang dateToParse

  let hd = HyperdataDocument { _hd_bdd = Just $ DT.pack $ show ff
                             , _hd_doi = lookup "doi" d
                             , _hd_url = lookup "URL" d
                             , _hd_uniqId = Nothing
                             , _hd_uniqIdBdd = Nothing
                             , _hd_page = Nothing
                             , _hd_title = lookup "title" d
                             , _hd_authors = lookup "authors" d
                             , _hd_institutes = lookup "institutes" d
                             , _hd_source = lookup "source" d
                             , _hd_abstract = lookup "abstract" d
                             , _hd_publication_date = fmap (DT.pack . show) utcTime
                             , _hd_publication_year = pub_year
                             , _hd_publication_month = pub_month
                             , _hd_publication_day = pub_day
                             , _hd_publication_hour = Nothing
                             , _hd_publication_minute = Nothing
                             , _hd_publication_second = Nothing
                             , _hd_language_iso2 = Just $ (DT.pack . show) lang }
  -- printDebug "[G.C.T.C.Parsers] HyperdataDocument" hd
  pure hd

enrichWith :: FileType
           -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith RisPresse = enrichWith' presseEnrich
enrichWith WOS       = enrichWith' (map (first WOS.keys))
enrichWith Iramuteq  = enrichWith' (map (first Iramuteq.keys))
enrichWith _         = enrichWith' identity


enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
            -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
enrichWith' f = second (map both' . map f . concat)
  where
    both' = map (both decodeUtf8)



readFileWith :: FileType -> FilePath
             -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
readFileWith format path = do
  files <- case takeExtension path of
    ".zip" -> openZip path
    _      -> pure <$> clean <$> DB.readFile path
  partitionEithers <$> mapConcurrently (runParser format) files


-- | withParser:
-- According to the format of the text, choose the right parser.
-- TODO withParser :: FileType -> Parser [Document]
withParser :: FileType -> Parser [[(DB.ByteString, DB.ByteString)]]
withParser WOS      = WOS.parser
withParser RIS      = RIS.parser
withParser Iramuteq = Iramuteq.parser
--withParser ODT = odtParser
--withParser XML = xmlParser
withParser _        = panic "[ERROR] Parser not implemented yet"

runParser :: FileType -> DB.ByteString
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ runParser' format text

runParser' :: FileType -> DB.ByteString
           -> (Either String [[(DB.ByteString, DB.ByteString)]])
runParser' format text = parseOnly (withParser format) text

openZip :: FilePath -> IO [DB.ByteString]
openZip fp = do
  entries <- withArchive fp (DM.keys <$> getEntries)
  bs <- mapConcurrently (\s -> withArchive fp (getEntry s)) entries
  pure bs

cleanText :: Text -> Text
cleanText = cs . clean . cs

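-- A quick example of what 'clean' does, derived from the cases below:
--
-- > clean "a;b\tc" == "a.b c"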
clean :: DB.ByteString -> DB.ByteString
clean txt = DBC.map clean' txt
  where
    clean' '’'  = '\''
    clean' '\r' = ' '
    clean' '\t' = ' '
    clean' ';'  = '.'
    clean' c    = c

--

splitOn :: NgramsType -> Maybe Text -> Text -> [Text]
splitOn Authors (Just "WOS") = (DT.splitOn "; ")
splitOn _ _                  = (DT.splitOn ", ")
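
-- For instance, from the two cases above:
--
-- > splitOn Authors (Just "WOS") "Doe, J.; Smith, A." == ["Doe, J.", "Smith, A."]
-- > splitOn Authors Nothing      "Doe J., Smith A."   == ["Doe J.", "Smith A."]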