{-|
Module      : Gargantext.Core.Text.Corpus.Parsers.CSV
Description : CSV parser for Gargantext corpus files.
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

CSV parser for Gargantext corpus files.

-}

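-- Main entry points, as defined below: 'parseCsv' / 'parseCsv'' for Gargantext
-- CSV/TSV exports, 'parseCsvC' for the conduit-based variant that also returns
-- a row count, 'parseHal' for HAL exports, and 'writeDocs2Csv' to write
-- documents back to disk.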

module Gargantext.Core.Text.Corpus.Parsers.CSV where

import Conduit
import Control.Applicative
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as BL
import Data.Char (ord)
import Data.Csv
import Data.Either (Either(..))
import Data.Maybe (fromMaybe)
import Data.Text (Text, pack, length, intercalate)
import Data.Time.Segment (jour)
import qualified Data.Vector as V
import Data.Vector (Vector)
import GHC.IO (FilePath)
import GHC.Word (Word8)

import qualified Prelude

import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
import Gargantext.Prelude hiding (length)
import Gargantext.Core.Text
import Gargantext.Core.Text.Context

---------------------------------------------------------------
headerCsvGargV3 :: Header
headerCsvGargV3 =
  header [ "title"
         , "source"
         , "publication_year"
         , "publication_month"
         , "publication_day"
         , "abstract"
         , "authors"
         ]
---------------------------------------------------------------
data CsvGargV3 = CsvGargV3
  { d_docId             :: !Int
  , d_title             :: !Text
  , d_source            :: !Text
  , d_publication_year  :: !Int
  , d_publication_month :: !Int
  , d_publication_day   :: !Int
  , d_abstract          :: !Text
  , d_authors           :: !Text
  }
  deriving (Show)
---------------------------------------------------------------
-- | Convert a 'CsvGargV3' row to a 'HyperdataDocument'
toDoc :: CsvGargV3 -> HyperdataDocument
toDoc (CsvGargV3 did dt ds dpy dpm dpd dab dau) =
  HyperdataDocument { _hd_bdd = Just "CSV"
                    , _hd_doi = Just . pack . show $ did
                    , _hd_url = Nothing
                    , _hd_uniqId = Nothing
                    , _hd_uniqIdBdd = Nothing
                    , _hd_page = Nothing
                    , _hd_title = Just dt
                    , _hd_authors = Just dau
                    , _hd_institutes = Nothing
                    , _hd_source = Just ds
                    , _hd_abstract = Just dab
                    , _hd_publication_date = Nothing
                    , _hd_publication_year = Just dpy
                    , _hd_publication_month = Just dpm
                    , _hd_publication_day = Just dpd
                    , _hd_publication_hour = Nothing
                    , _hd_publication_minute = Nothing
                    , _hd_publication_second = Nothing
                    , _hd_language_iso2 = Nothing }

---------------------------------------------------------------
-- | Types Conversions
toDocs :: Vector CsvDoc -> [CsvGargV3]
toDocs v = V.toList
         $ V.zipWith (\nId (CsvDoc { .. }) -- (CsvDoc t s mPy pm pd abst auth)
                       -> CsvGargV3 { d_docId = nId
                                    , d_title = csv_title
                                    , d_source = csv_source
                                    , d_publication_year = fromMIntOrDec defaultYear csv_publication_year
                                    , d_publication_month = fromMaybe defaultMonth csv_publication_month
                                    , d_publication_day = fromMaybe defaultDay csv_publication_day
                                    , d_abstract = csv_abstract
                                    , d_authors = csv_authors })
                     (V.enumFromN 1 (V.length v'')) v''
  where
    v''  = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
    seps = V.fromList [Paragraphs 1, Sentences 3, Chars 3]

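-- Note on the conversion above: long documents are first split into smaller
-- pieces (see 'splitDoc' below, applied successively per paragraph, sentence
-- and character context), and the resulting rows are then numbered
-- consecutively from 1.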
---------------------------------------------------------------
fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
fromDocs docs = V.map fromDocs' docs
  where
    fromDocs' (CsvGargV3 { .. }) = CsvDoc { csv_title             = d_title
                                          , csv_source            = d_source
                                          , csv_publication_year  = Just $ IntOrDec d_publication_year
                                          , csv_publication_month = Just d_publication_month
                                          , csv_publication_day   = Just d_publication_day
                                          , csv_abstract          = d_abstract
                                          , csv_authors           = d_authors }

---------------------------------------------------------------
-- | Split a document into its contexts
-- TODO adapt the size of the paragraph according to the corpus average
splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc m splt doc =
  let docSize = length $ csv_abstract doc
  in if docSize > 1000 && mod (round m) docSize >= 10
       then splitDoc' splt doc
       else V.fromList [doc]
  where
    splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
    splitDoc' contextSize (CsvDoc { .. }) = V.fromList $ [firstDoc] <> nextDocs
      where
        firstDoc      = CsvDoc { csv_abstract = firstAbstract, .. }
        firstAbstract = head' "splitDoc'1" abstracts

        nextDocs = map (\txt -> CsvDoc { csv_title    = head' "splitDoc'2" $ sentences txt
                                       , csv_abstract = unsentences $ tail' "splitDoc'1" $ sentences txt
                                       , .. }
                       ) (tail' "splitDoc'2" abstracts)

        abstracts = splitBy contextSize csv_abstract

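-- Behaviour of the split (derived from the definition above): the first chunk
-- keeps the original title and receives the first piece of the abstract; every
-- following chunk uses its own first sentence as title and the remaining
-- sentences as abstract.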
---------------------------------------------------------------
---------------------------------------------------------------
type Mean = Double

docsSize :: Vector CsvDoc -> Mean
docsSize csvDoc = mean ls
  where
    ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc

---------------------------------------------------------------
newtype IntOrDec = IntOrDec Int
  deriving (Show, Eq, Read)

unIntOrDec :: IntOrDec -> Int
unIntOrDec (IntOrDec i) = i

instance FromField IntOrDec where
  parseField s = case runParser (parseField s :: Parser Int) of
    Left _err -> IntOrDec . Prelude.floor <$> (parseField s :: Parser Double)
    Right n   -> pure $ IntOrDec n

instance ToField IntOrDec where
  toField (IntOrDec i) = toField i

fromMIntOrDec :: Int -> Maybe IntOrDec -> Int
fromMIntOrDec default' mVal = unIntOrDec $ fromMaybe (IntOrDec default') mVal

defaultYear :: Int
defaultYear = 1973

defaultMonth :: Int
defaultMonth = 1

defaultDay :: Int
defaultDay = 1

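-- A minimal sketch of the fallback behaviour above: a "publication_year" cell
-- holding either "2003" or "2003.0" parses to 'IntOrDec 2003', and a missing
-- value falls back to the default via 'fromMIntOrDec'.
--
-- >>> fromMIntOrDec defaultYear Nothing
-- 1973
-- >>> fromMIntOrDec defaultYear (Just (IntOrDec 2003))
-- 2003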
data CsvDoc = CsvDoc
  { csv_title             :: !Text
  , csv_source            :: !Text
  , csv_publication_year  :: !(Maybe IntOrDec)
  , csv_publication_month :: !(Maybe Int)
  , csv_publication_day   :: !(Maybe Int)
  , csv_abstract          :: !Text
  , csv_authors           :: !Text
  }
  deriving (Show)

instance FromNamedRecord CsvDoc where
  parseNamedRecord r = do
    csv_title             <- r .: "title"             <|> r .: "Title"
    csv_source            <- r .: "source"            <|> r .: "Source"
    csv_publication_year  <- r .: "publication_year"  <|> r .: "Publication Year"
    csv_publication_month <- r .: "publication_month" <|> r .: "Publication Month"
    csv_publication_day   <- r .: "publication_day"   <|> r .: "Publication Day"
    csv_abstract          <- r .: "abstract"          <|> r .: "Abstract"
    csv_authors           <- r .: "authors"           <|> r .: "Authors"
    pure $ CsvDoc { .. }

instance ToNamedRecord CsvDoc where
  toNamedRecord (CsvDoc { .. }) =
    namedRecord [ "title"             .= csv_title
                , "source"            .= csv_source
                , "publication_year"  .= csv_publication_year
                , "publication_month" .= csv_publication_month
                , "publication_day"   .= csv_publication_day
                , "abstract"          .= csv_abstract
                , "authors"           .= csv_authors
                ]

hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
hyperdataDocument2csvDoc h = CsvDoc { csv_title             = m $ _hd_title h
                                    , csv_source            = m $ _hd_source h
                                    , csv_publication_year  = Just $ IntOrDec $ mI $ _hd_publication_year h
                                    , csv_publication_month = Just $ mI $ _hd_publication_month h
                                    , csv_publication_day   = Just $ mI $ _hd_publication_day h
                                    , csv_abstract          = m $ _hd_abstract h
                                    , csv_authors           = m $ _hd_authors h }
  where
    m  = maybe "" identity
    mI = maybe 0  identity


data Delimiter = Tab | Comma

csvDecodeOptions :: Delimiter -> DecodeOptions
csvDecodeOptions d = defaultDecodeOptions { decDelimiter = delimiter d }

csvEncodeOptions :: Delimiter -> EncodeOptions
csvEncodeOptions d = defaultEncodeOptions { encDelimiter = delimiter d }

delimiter :: Delimiter -> Word8
delimiter Tab   = fromIntegral $ ord '\t'
delimiter Comma = fromIntegral $ ord ','
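-- The options above plug straight into cassava, e.g.
-- 'decodeByNameWith (csvDecodeOptions Tab)' for tab-separated exports
-- (see 'readCsvLazyBS' below). The delimiters are plain ASCII codes:
--
-- >>> delimiter Tab
-- 9
-- >>> delimiter Comma
-- 44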
------------------------------------------------------------------------
------------------------------------------------------------------------
readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO (Either Prelude.String [Text])
readCsvOn' fields fp = do
  r <- readFile fp
  pure $ ( V.toList
         . V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
         . snd ) <$> r

------------------------------------------------------------------------

readFileLazy :: (FromNamedRecord a)
             => proxy a
             -> Delimiter
             -> FilePath
             -> IO (Either Prelude.String (Header, Vector a))
readFileLazy p d = fmap (readByteStringLazy p d) . BL.readFile

readFileStrict :: (FromNamedRecord a)
               => proxy a
               -> Delimiter
               -> FilePath
               -> IO (Either Prelude.String (Header, Vector a))
readFileStrict p d = fmap (readByteStringStrict p d) . BS.readFile

readByteStringLazy :: (FromNamedRecord a)
                   => proxy a
                   -> Delimiter
                   -> BL.ByteString
                   -> Either Prelude.String (Header, Vector a)
readByteStringLazy _p d bs = decodeByNameWith (csvDecodeOptions d) bs

readByteStringStrict :: (FromNamedRecord a)
                     => proxy a
                     -> Delimiter
                     -> BS.ByteString
                     -> Either Prelude.String (Header, Vector a)
readByteStringStrict p d = readByteStringLazy p d . BL.fromStrict

------------------------------------------------------------------------
-- | TODO use readFileLazy
readFile :: FilePath -> IO (Either Prelude.String (Header, Vector CsvDoc))
readFile fp = do
  result <- fmap (readCsvLazyBS Comma) $ BL.readFile fp
  case result of
    Left _err -> fmap (readCsvLazyBS Tab) $ BL.readFile fp
    Right res -> pure $ Right res

-- | TODO use readByteStringLazy
readCsvLazyBS :: Delimiter -> BL.ByteString -> Either Prelude.String (Header, Vector CsvDoc)
readCsvLazyBS d bs = decodeByNameWith (csvDecodeOptions d) bs

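-- A minimal usage sketch for 'readFile' (the file name "docs.csv" is only an
-- example, not part of this module):
--
-- @
-- main :: IO ()
-- main = do
--   edocs <- readFile "docs.csv"
--   case edocs of
--     Left err         -> Prelude.putStrLn err
--     Right (_h, docs) -> print (V.length docs)
-- @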
------------------------------------------------------------------------
-- | TODO use readFileLazy
readCsvHal :: FilePath -> IO (Either Prelude.String (Header, Vector CsvHal))
readCsvHal = fmap readCsvHalLazyBS . BL.readFile

-- | TODO use readByteStringLazy
readCsvHalLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvHal)
readCsvHalLazyBS bs = decodeByNameWith (csvDecodeOptions Tab) bs

readCsvHalBSStrict :: BS.ByteString -> Either Prelude.String (Header, Vector CsvHal)
readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict

------------------------------------------------------------------------
writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeFile fp (h, vs) = BL.writeFile fp $
  encodeByNameWith (csvEncodeOptions Tab) h (V.toList vs)

writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs

hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
hyperdataDocument2csv hs = encodeByNameWith (csvEncodeOptions Tab) headerCsvGargV3 (map hyperdataDocument2csvDoc hs)

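-- Round-trip sketch (file names are only illustrative): documents parsed with
-- 'parseCsv' can be re-exported with 'writeDocs2Csv', which always writes a
-- tab-delimited file under the 'headerCsvGargV3' header.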
------------------------------------------------------------------------
-- Hal Format
data CsvHal = CsvHal
  { csvHal_title  :: !Text
  , csvHal_source :: !Text
  , csvHal_publication_year  :: !Integer
  , csvHal_publication_month :: !Int
  , csvHal_publication_day   :: !Int
  , csvHal_abstract :: !Text
  , csvHal_authors  :: !Text

  , csvHal_url                :: !Text
  , csvHal_isbn_s             :: !Text
  , csvHal_issue_s            :: !Text
  , csvHal_journalPublisher_s :: !Text
  , csvHal_language_s         :: !Text

  , csvHal_doiId_s        :: !Text
  , csvHal_authId_i       :: !Text
  , csvHal_instStructId_i :: !Text
  , csvHal_deptStructId_i :: !Text
  , csvHal_labStructId_i  :: !Text

  , csvHal_rteamStructId_i :: !Text
  , csvHal_docType_s       :: !Text
  }
  deriving (Show)

instance FromNamedRecord CsvHal where
  parseNamedRecord r = do
    csvHal_title  <- r .: "title"
    csvHal_source <- r .: "source"
    csvHal_publication_year  <- r .: "publication_year"
    csvHal_publication_month <- r .: "publication_month"
    csvHal_publication_day   <- r .: "publication_day"
    csvHal_abstract <- r .: "abstract"
    csvHal_authors  <- r .: "authors"
    csvHal_url      <- r .: "url"
    csvHal_isbn_s   <- r .: "isbn_s"
    csvHal_issue_s  <- r .: "issue_s"
    csvHal_journalPublisher_s <- r .: "journalPublisher_s"
    csvHal_language_s         <- r .: "language_s"
    csvHal_doiId_s            <- r .: "doiId_s"
    csvHal_authId_i           <- r .: "authId_i"
    csvHal_instStructId_i     <- r .: "instStructId_i"
    csvHal_deptStructId_i     <- r .: "deptStructId_i"
    csvHal_labStructId_i      <- r .: "labStructId_i"
    csvHal_rteamStructId_i    <- r .: "rteamStructId_i"
    csvHal_docType_s          <- r .: "docType_s"
    pure $ CsvHal { .. }

instance ToNamedRecord CsvHal where
  --toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
  toNamedRecord (CsvHal { .. }) =
    namedRecord [ "title"  .= csvHal_title
                , "source" .= csvHal_source

                , "publication_year"  .= csvHal_publication_year
                , "publication_month" .= csvHal_publication_month
                , "publication_day"   .= csvHal_publication_day

                , "abstract" .= csvHal_abstract
                , "authors"  .= csvHal_authors

                , "url"                .= csvHal_url
                , "isbn_s"             .= csvHal_isbn_s
                , "issue_s"            .= csvHal_issue_s
                , "journalPublisher_s" .= csvHal_journalPublisher_s
                , "language_s"         .= csvHal_language_s

                , "doiId_s"        .= csvHal_doiId_s
                , "authId_i"       .= csvHal_authId_i
                , "instStructId_i" .= csvHal_instStructId_i
                , "deptStructId_i" .= csvHal_deptStructId_i
                , "labStructId_i"  .= csvHal_labStructId_i

                , "rteamStructId_i" .= csvHal_rteamStructId_i
                , "docType_s"       .= csvHal_docType_s
                ]

csvHal2doc :: CsvHal -> HyperdataDocument
csvHal2doc (CsvHal { .. }) =
  HyperdataDocument { _hd_bdd = Just "CsvHal"
                    , _hd_doi = Just csvHal_doiId_s
                    , _hd_url = Just csvHal_url
                    , _hd_uniqId = Nothing
                    , _hd_uniqIdBdd = Nothing
                    , _hd_page = Nothing
                    , _hd_title = Just csvHal_title
                    , _hd_authors = Just csvHal_authors
                    , _hd_institutes = Just csvHal_instStructId_i
                    , _hd_source = Just csvHal_source
                    , _hd_abstract = Just csvHal_abstract
                    , _hd_publication_date = Just $ pack . show $ jour csvHal_publication_year
                                                                       csvHal_publication_month
                                                                       csvHal_publication_day
                    , _hd_publication_year = Just $ fromIntegral csvHal_publication_year
                    , _hd_publication_month = Just csvHal_publication_month
                    , _hd_publication_day = Just csvHal_publication_day
                    , _hd_publication_hour = Nothing
                    , _hd_publication_minute = Nothing
                    , _hd_publication_second = Nothing
                    , _hd_language_iso2 = Nothing }


csv2doc :: CsvDoc -> HyperdataDocument
csv2doc (CsvDoc { .. }) =
  HyperdataDocument { _hd_bdd = Just "CSV"
                    , _hd_doi = Nothing
                    , _hd_url = Nothing
                    , _hd_uniqId = Nothing
                    , _hd_uniqIdBdd = Nothing
                    , _hd_page = Nothing
                    , _hd_title = Just csv_title
                    , _hd_authors = Just csv_authors
                    , _hd_institutes = Nothing
                    , _hd_source = Just csv_source
                    , _hd_abstract = Just csv_abstract
                    , _hd_publication_date = Just $ pack . show $ jour (fromIntegral pubYear)
                                                                       pubMonth
                                                                       pubDay
                    , _hd_publication_year = Just pubYear
                    , _hd_publication_month = Just pubMonth
                    , _hd_publication_day = Just pubDay
                    , _hd_publication_hour = Nothing
                    , _hd_publication_minute = Nothing
                    , _hd_publication_second = Nothing
                    , _hd_language_iso2 = Nothing }
  where
    pubYear  = fromMIntOrDec defaultYear csv_publication_year
    pubMonth = fromMaybe defaultMonth csv_publication_month
    pubDay   = fromMaybe defaultDay csv_publication_day

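-- Note on defaults in 'csv2doc': when year, month or day are missing in the
-- CSV row, the document falls back to 'defaultYear' (1973), 'defaultMonth' (1)
-- and 'defaultDay' (1) rather than failing.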
------------------------------------------------------------------------
parseHal :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseHal fp = do
  r <- readCsvHal fp
  pure $ (V.toList . V.map csvHal2doc . snd) <$> r

parseHal' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
parseHal' bs = (V.toList . V.map csvHal2doc . snd) <$> readCsvHalLazyBS bs

------------------------------------------------------------------------

parseCsv :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseCsv fp = fmap (V.toList . V.map csv2doc . snd) <$> readFile fp

{-
parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
parseCsv' bs = (V.toList . V.map csv2doc . snd) <$> readCsvLazyBS Comma bs
-}

parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
parseCsv' bs = do
  let
    result = case readCsvLazyBS Comma bs of
      Left _err -> readCsvLazyBS Tab bs
      Right res -> Right res
  (V.toList . V.map csv2doc . snd) <$> result

parseCsvC :: BL.ByteString
          -> Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument Identity ())
parseCsvC bs = do
  let
    result = case readCsvLazyBS Comma bs of
      Left _err -> readCsvLazyBS Tab bs
      Right res -> Right res
  case result of
    Left err -> Left err
    Right r  -> Right ( Just $ Prelude.fromIntegral $ Prelude.length $ snd r
                      , yieldMany (snd r) .| mapC csv2doc )

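-- A minimal sketch of consuming the conduit returned by 'parseCsvC' (the
-- bytestring @bs@ is assumed to have been read elsewhere):
--
-- @
-- case parseCsvC bs of
--   Left err              -> Prelude.putStrLn err
--   Right (mTotal, docsC) ->
--     -- mTotal is the row count, handy for progress reporting;
--     -- runConduitPure collects the converted documents.
--     print (mTotal, Prelude.length (runConduitPure (docsC .| sinkList)))
-- @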
------------------------------------------------------------------------
-- Csv v3 weighted for phylo

data Csv' = Csv'
  { csv'_title             :: !Text
  , csv'_source            :: !Text
  , csv'_publication_year  :: !Int
  , csv'_publication_month :: !Int
  , csv'_publication_day   :: !Int
  , csv'_abstract          :: !Text
  , csv'_authors           :: !Text
  , csv'_weight            :: !Double
  } deriving (Show)

instance FromNamedRecord Csv' where
  parseNamedRecord r = do
    csv'_title             <- r .: "title"
    csv'_source            <- r .: "source"
    csv'_publication_year  <- r .: "publication_year"
    csv'_publication_month <- r .: "publication_month"
    csv'_publication_day   <- r .: "publication_day"
    csv'_abstract          <- r .: "abstract"
    csv'_authors           <- r .: "authors"
    csv'_weight            <- r .: "weight"
    pure $ Csv' { .. }

readWeightedCsv :: FilePath -> IO (Header, Vector Csv')
readWeightedCsv fp =
  fmap (\bs ->
         case decodeByNameWith (csvDecodeOptions Tab) bs of
           Left e       -> panic (pack e)
           Right corpus -> corpus
       ) $ BL.readFile fp
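-- A minimal usage sketch (the file name is only illustrative; the input is
-- expected to be tab-separated and to carry a "weight" column):
--
-- @
-- (_header, rows) <- readWeightedCsv "corpus_weighted.tsv"
-- print (V.length rows)
-- @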