2 Module : Gargantext.Core.Text.Corpus.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
15 module Gargantext.Core.Text.Corpus.Parsers.CSV where
18 import Control.Applicative
19 import qualified Data.ByteString as BS
20 import qualified Data.ByteString.Lazy as BL
21 import Data.Char (ord)
23 import Data.Either (Either(..))
24 import Data.Maybe (fromMaybe)
25 import Data.Text (Text, pack, length, intercalate)
26 import Data.Time.Segment (jour)
27 import qualified Data.Vector as V
28 import Data.Vector (Vector)
29 import GHC.IO (FilePath)
30 import GHC.Word (Word8)
32 import qualified Prelude
34 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
35 import Gargantext.Prelude hiding (length)
36 import Gargantext.Core.Text
37 import Gargantext.Core.Text.Context
39 ---------------------------------------------------------------
40 headerCsvGargV3 :: Header
50 ---------------------------------------------------------------
51 data CsvGargV3 = CsvGargV3
55 , d_publication_year :: !Int
56 , d_publication_month :: !Int
57 , d_publication_day :: !Int
62 ---------------------------------------------------------------
63 -- | Doc 2 HyperdataDocument
-- Builds the 'HyperdataDocument' for one 'CsvGargV3' row: the bdd is tagged
-- "CSV" and the row's document id is rendered with 'show' into the DOI field.
-- Hour, minute, second, publication date and language are left as Nothing.
-- NOTE(review): judging by their names, 'dab' / 'dau' look like the abstract
-- and the authors, yet they are stored in '_hd_source' / '_hd_institutes'
-- while '_hd_abstract' and '_hd_authors' are set to Nothing -- confirm that
-- this mapping is intended.
64 toDoc :: CsvGargV3 -> HyperdataDocument
65 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
66 HyperdataDocument { _hd_bdd = Just "CSV"
67 , _hd_doi = Just . pack . show $ did
69 , _hd_uniqId = Nothing
70 , _hd_uniqIdBdd = Nothing
73 , _hd_authors = Nothing
74 , _hd_institutes = Just dau
75 , _hd_source = Just dab
76 , _hd_abstract = Nothing
77 , _hd_publication_date = Nothing
78 , _hd_publication_year = Just dpy
79 , _hd_publication_month = Just dpm
80 , _hd_publication_day = Just dpd
81 , _hd_publication_hour = Nothing
82 , _hd_publication_minute = Nothing
83 , _hd_publication_second = Nothing
84 , _hd_language_iso2 = Nothing }
86 ---------------------------------------------------------------
87 -- | Types Conversions
-- Turns 'CsvDoc' rows into 'CsvGargV3' records, pairing each (possibly split)
-- document with a sequential id.
88 toDocs :: Vector CsvDoc -> [CsvGargV3]
90 $ V.zipWith (\nId (CsvDoc { .. }) -- (CsvDoc t s mPy pm pd abst auth)
91 -> CsvGargV3 { d_docId = nId
93 , d_source = csv_source
-- A missing year / month / day falls back to defaultYear / defaultMonth / defaultDay.
94 , d_publication_year = fromMIntOrDec defaultYear csv_publication_year
95 , d_publication_month = fromMaybe defaultMonth csv_publication_month
96 , d_publication_day = fromMaybe defaultDay csv_publication_day
97 , d_abstract = csv_abstract
98 , d_authors = csv_authors })
-- Ids are generated with V.enumFromN 1, so numbering starts at 1.
99 (V.enumFromN 1 (V.length v'')) v''
-- Each separator is applied in turn with 'splitDoc'; 'docsSize' is recomputed
-- on the vector produced by the previous step.
-- ('v' is bound on a line not visible here -- presumably the input vector.)
101 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
102 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
104 ---------------------------------------------------------------
-- | Convert 'CsvGargV3' records back into 'CsvDoc' rows.
--
-- The date components are always present in the result: year, month and day
-- are each wrapped in 'Just' (the year additionally in 'IntOrDec').
fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
fromDocs = V.map fromDocs'
  where
    fromDocs' CsvGargV3{..} =
      CsvDoc
        { csv_title             = d_title
        , csv_source            = d_source
        , csv_publication_year  = Just (IntOrDec d_publication_year)
        , csv_publication_month = Just d_publication_month
        , csv_publication_day   = Just d_publication_day
        , csv_abstract          = d_abstract
        , csv_authors           = d_authors
        }
116 ---------------------------------------------------------------
117 -- | Split a document in its context
118 -- TODO adapt the size of the paragraph according to the corpus average
119 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
120 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
123 if (mod (round m) docSize) >= 10
-- | Split one document into several according to the given 'SplitContext'.
-- The abstract is cut with 'splitBy'; the first piece replaces the abstract of
-- the original document, and every further piece becomes its own document.
131 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
132 splitDoc' contextSize (CsvDoc { .. }) = V.fromList $ [firstDoc] <> nextDocs
-- The first document keeps every original field except the abstract.
134 firstDoc = CsvDoc { csv_abstract = firstAbstract, .. }
-- head' / tail' take a label string -- presumably used in the error raised on
-- empty input; verify against their definitions.
135 firstAbstract = head' "splitDoc'1" abstracts
-- For each remaining piece: the first sentence becomes the title, the other
-- sentences are joined back together as the abstract.
137 nextDocs = map (\txt -> CsvDoc { csv_title = head' "splitDoc'2" $ sentences txt
138 , csv_abstract = unsentences $ tail' "splitDoc'1" $ sentences txt
140 ) (tail' "splitDoc'2" abstracts)
142 abstracts = (splitBy $ contextSize) csv_abstract
144 ---------------------------------------------------------------
145 ---------------------------------------------------------------
-- | Mean length of the abstracts over a collection of documents, where the
-- length of one abstract is its 'Data.Text.length'.
docsSize :: Vector CsvDoc -> Mean
docsSize = mean . map abstractSize . V.toList
  where
    abstractSize doc = fromIntegral (length (csv_abstract doc))
154 ---------------------------------------------------------------
-- | An 'Int' that may appear in a CSV field either as an integer or as a
-- decimal number (see the 'FromField' instance).
newtype IntOrDec = IntOrDec Int
  deriving (Show, Eq, Read)

-- | Extract the wrapped 'Int'.
unIntOrDec :: IntOrDec -> Int
unIntOrDec wrapped = case wrapped of
  IntOrDec n -> n
-- | Parse the field as an 'Int' first; when that fails, parse it as a
-- 'Double' and round it down with 'Prelude.floor'.
instance FromField IntOrDec where
  parseField raw =
    either (const viaDouble) (pure . IntOrDec) (runParser asInt)
    where
      asInt :: Parser Int
      asInt = parseField raw

      viaDouble :: Parser IntOrDec
      viaDouble = IntOrDec . Prelude.floor <$> (parseField raw :: Parser Double)
-- | Serialised exactly like the underlying 'Int'.
instance ToField IntOrDec where
  toField = toField . unIntOrDec
-- | Unwrap an optional 'IntOrDec', returning the given default when the value
-- is absent.
fromMIntOrDec :: Int -> Maybe IntOrDec -> Int
fromMIntOrDec default' = maybe default' unIntOrDec
177 , csv_source :: !Text
178 , csv_publication_year :: !(Maybe IntOrDec)
179 , csv_publication_month :: !(Maybe Int)
180 , csv_publication_day :: !(Maybe Int)
181 , csv_abstract :: !Text
182 , csv_authors :: !Text
186 instance FromNamedRecord CsvDoc where
187 parseNamedRecord r = do
188 csv_title <- r .: "title" <|> r .: "Title"
189 csv_source <- r .: "source" <|> r .: "Source"
190 csv_publication_year <- r .: "publication_year" <|> r .: "Publication Year"
191 csv_publication_month <- r .: "publication_month" <|> r .: "Publication Month"
192 csv_publication_day <- r .: "publication_day" <|> r .: "Publication Day"
193 csv_abstract <- r .: "abstract" <|> r .: "Abstract"
194 csv_authors <- r .: "authors" <|> r .: "Authors"
197 instance ToNamedRecord CsvDoc where
198 toNamedRecord (CsvDoc{ .. }) =
199 namedRecord [ "title" .= csv_title
200 , "source" .= csv_source
201 , "publication_year" .= csv_publication_year
202 , "publication_month" .= csv_publication_month
203 , "publication_day" .= csv_publication_day
204 , "abstract" .= csv_abstract
205 , "authors" .= csv_authors
-- | Flatten a 'HyperdataDocument' into a 'CsvDoc' row.
--
-- Missing text fields become the empty string and missing date components
-- become 0; the date components are always emitted wrapped in 'Just'.
hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
hyperdataDocument2csvDoc h =
  CsvDoc
    { csv_title             = textOrEmpty (_hd_title h)
    , csv_source            = textOrEmpty (_hd_source h)
    , csv_publication_year  = Just (IntOrDec (intOrZero (_hd_publication_year h)))
    , csv_publication_month = Just (intOrZero (_hd_publication_month h))
    , csv_publication_day   = Just (intOrZero (_hd_publication_day h))
    , csv_abstract          = textOrEmpty (_hd_abstract h)
    , csv_authors           = textOrEmpty (_hd_authors h)
    }
  where
    textOrEmpty = fromMaybe ""
    intOrZero   = fromMaybe 0
-- | Field separators supported when reading and writing CSV files.
data Delimiter
  = Tab
  | Comma
-- | The default decode options with the field delimiter overridden.
csvDecodeOptions :: Delimiter -> DecodeOptions
csvDecodeOptions sep = defaultDecodeOptions { decDelimiter = sepByte }
  where
    sepByte = delimiter sep
-- | The default encode options with the field delimiter overridden.
csvEncodeOptions :: Delimiter -> EncodeOptions
csvEncodeOptions sep = defaultEncodeOptions { encDelimiter = sepByte }
  where
    sepByte = delimiter sep
-- | The byte used as field separator for a given 'Delimiter':
-- the tab character for 'Tab', a comma for 'Comma'.
delimiter :: Delimiter -> Word8
delimiter sep = fromIntegral . ord $ case sep of
  Tab   -> '\t'
  Comma -> ','
233 ------------------------------------------------------------------------
234 ------------------------------------------------------------------------
235 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO (Either Prelude.String [Text])
236 readCsvOn' fields fp = do
239 . V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
242 ------------------------------------------------------------------------
-- | Read a file lazily and decode it as a named-record CSV.
--
-- The first argument is a proxy for the row type, the second the field
-- delimiter; both are handed to 'readByteStringLazy' together with the
-- file contents.
readFileLazy :: (FromNamedRecord a) => proxy a -> Delimiter -> FilePath -> IO (Either Prelude.String (Header, Vector a))
readFileLazy rowProxy sep path =
  readByteStringLazy rowProxy sep <$> BL.readFile path
-- | Read a file with the strict 'BS.readFile' and decode its contents through
-- 'readByteStringStrict'. The first two arguments are forwarded unchanged to
-- 'readByteStringStrict'; the file path is the last argument.
247 readFileStrict :: (FromNamedRecord a)
251 -> IO (Either Prelude.String (Header, Vector a))
252 readFileStrict d f = fmap (readByteStringStrict d f) . BS.readFile
-- | Decode a lazy 'BL.ByteString' as a named-record CSV, using
-- 'csvDecodeOptions' for the given delimiter. Yields the header and the
-- decoded rows, or the decoder's error message.
-- The first argument ('_f') is never inspected -- presumably a proxy that only
-- selects the row type; confirm against the full signature.
254 readByteStringLazy :: (FromNamedRecord a)
258 -> Either Prelude.String (Header, Vector a)
259 readByteStringLazy _f d bs = decodeByNameWith (csvDecodeOptions d) bs
-- | Strict-input variant of 'readByteStringLazy': the strict bytes are
-- converted with 'BL.fromStrict' and then decoded by 'readByteStringLazy',
-- which receives the first two arguments unchanged.
261 readByteStringStrict :: (FromNamedRecord a)
265 -> Either Prelude.String (Header, Vector a)
266 readByteStringStrict d ff = (readByteStringLazy d ff) . BL.fromStrict
268 ------------------------------------------------------------------------
269 -- | TODO use readFileLazy
-- Reads a 'CsvDoc' file, trying the comma delimiter first.
270 readCSVFile :: FilePath -> IO (Either Prelude.String (Header, Vector CsvDoc))
272 result <- fmap (readCsvLazyBS Comma) $ BL.readFile fp
-- Fallback: on a Left (presumably from a case on 'result'; that line is not
-- visible here) the file is decoded again as tab-delimited and the comma
-- error is discarded.
-- NOTE(review): this branch calls BL.readFile on the same path a second time.
274 Left _err -> fmap (readCsvLazyBS Tab) $ BL.readFile fp
275 Right res -> pure $ Right res
-- | Decode lazy bytes into 'CsvDoc' rows using the given field delimiter.
--
-- TODO use readByteStringLazy
readCsvLazyBS :: Delimiter -> BL.ByteString -> Either Prelude.String (Header, Vector CsvDoc)
readCsvLazyBS = decodeByNameWith . csvDecodeOptions
283 ------------------------------------------------------------------------
-- | Read a file lazily and decode it as 'CsvHal' rows.
--
-- TODO use readFileLazy
readCsvHal :: FilePath -> IO (Either Prelude.String (Header, Vector CsvHal))
readCsvHal path = readCsvHalLazyBS <$> BL.readFile path
-- | Decode lazy bytes as tab-delimited 'CsvHal' rows.
--
-- TODO use readByteStringLazy
readCsvHalLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvHal)
readCsvHalLazyBS = decodeByNameWith (csvDecodeOptions Tab)
-- | Strict-input variant of 'readCsvHalLazyBS': the bytes are converted with
-- 'BL.fromStrict' before decoding.
readCsvHalBSStrict :: BS.ByteString -> Either Prelude.String (Header, Vector CsvHal)
readCsvHalBSStrict strictBytes = readCsvHalLazyBS (BL.fromStrict strictBytes)
295 ------------------------------------------------------------------------
-- | Write a header and its 'CsvDoc' rows to a file as tab-delimited CSV.
writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeFile path (header, rows) = BL.writeFile path encoded
  where
    encoded = encodeByNameWith (csvEncodeOptions Tab) header (V.toList rows)
-- | Encode the documents with 'hyperdataDocument2csv' and write the result
-- to the given file.
writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Csv fp = BL.writeFile fp . hyperdataDocument2csv
-- | Encode documents as tab-delimited CSV under the 'headerCsvGargV3' header.
-- Each document is first converted with 'hyperdataDocument2csvDoc'.
hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
hyperdataDocument2csv =
  encodeByNameWith (csvEncodeOptions Tab) headerCsvGargV3
    . map hyperdataDocument2csvDoc
306 ------------------------------------------------------------------------
309 { csvHal_title :: !Text
310 , csvHal_source :: !Text
311 , csvHal_publication_year :: !Integer
312 , csvHal_publication_month :: !Int
313 , csvHal_publication_day :: !Int
314 , csvHal_abstract :: !Text
315 , csvHal_authors :: !Text
317 , csvHal_url :: !Text
318 , csvHal_isbn_s :: !Text
319 , csvHal_issue_s :: !Text
320 , csvHal_journalPublisher_s:: !Text
321 , csvHal_language_s :: !Text
323 , csvHal_doiId_s :: !Text
324 , csvHal_authId_i :: !Text
325 , csvHal_instStructId_i :: !Text
326 , csvHal_deptStructId_i :: !Text
327 , csvHal_labStructId_i :: !Text
329 , csvHal_rteamStructId_i :: !Text
330 , csvHal_docType_s :: !Text
334 instance FromNamedRecord CsvHal where
335 parseNamedRecord r = do
336 csvHal_title <- r .: "title"
337 csvHal_source <- r .: "source"
338 csvHal_publication_year <- r .: "publication_year"
339 csvHal_publication_month <- r .: "publication_month"
340 csvHal_publication_day <- r .: "publication_day"
341 csvHal_abstract <- r .: "abstract"
342 csvHal_authors <- r .: "authors"
343 csvHal_url <- r .: "url"
344 csvHal_isbn_s <- r .: "isbn_s"
345 csvHal_issue_s <- r .: "issue_s"
346 csvHal_journalPublisher_s <- r .: "journalPublisher_s"
347 csvHal_language_s <- r .: "language_s"
348 csvHal_doiId_s <- r .: "doiId_s"
349 csvHal_authId_i <- r .: "authId_i"
350 csvHal_instStructId_i <- r .: "instStructId_i"
351 csvHal_deptStructId_i <- r .: "deptStructId_i"
352 csvHal_labStructId_i <- r .: "labStructId_i"
353 csvHal_rteamStructId_i <- r .: "rteamStructId_i"
354 csvHal_docType_s <- r .: "docType_s"
357 instance ToNamedRecord CsvHal where
358 --toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
359 toNamedRecord (CsvHal { .. }) =
360 namedRecord [ "title" .= csvHal_title
361 , "source" .= csvHal_source
363 , "publication_year" .= csvHal_publication_year
364 , "publication_month" .= csvHal_publication_month
365 , "publication_day" .= csvHal_publication_day
367 , "abstract" .= csvHal_abstract
368 , "authors" .= csvHal_authors
370 , "url" .= csvHal_url
371 , "isbn_s" .= csvHal_isbn_s
372 , "issue_s" .= csvHal_issue_s
373 , "journalPublisher_s" .= csvHal_journalPublisher_s
374 , "language_s" .= csvHal_language_s
376 , "doiId_s" .= csvHal_doiId_s
377 , "authId_i" .= csvHal_authId_i
378 , "instStructId_i" .= csvHal_instStructId_i
379 , "deptStructId_i" .= csvHal_deptStructId_i
380 , "labStructId_i" .= csvHal_labStructId_i
382 , "rteamStructId_i" .= csvHal_rteamStructId_i
383 , "docType_s" .= csvHal_docType_s
-- | Convert one 'CsvHal' row into a 'HyperdataDocument' tagged with the bdd
-- "CsvHal". Title, authors, source, abstract, DOI and URL are copied from the
-- row; institutes are taken from 'csvHal_instStructId_i'.
386 csvHal2doc :: CsvHal -> HyperdataDocument
387 csvHal2doc (CsvHal { .. }) =
388 HyperdataDocument { _hd_bdd = Just "CsvHal"
389 , _hd_doi = Just csvHal_doiId_s
390 , _hd_url = Just csvHal_url
391 , _hd_uniqId = Nothing
392 , _hd_uniqIdBdd = Nothing
394 , _hd_title = Just csvHal_title
395 , _hd_authors = Just csvHal_authors
396 , _hd_institutes = Just csvHal_instStructId_i
397 , _hd_source = Just csvHal_source
398 , _hd_abstract = Just csvHal_abstract
-- The publication date is the 'show' rendering of 'jour' applied to the
-- row's year, month and day.
399 , _hd_publication_date = Just $ pack . show $ jour csvHal_publication_year
400 csvHal_publication_month
401 csvHal_publication_day
-- The year field of the row is converted with 'fromIntegral' for this slot.
402 , _hd_publication_year = Just $ fromIntegral csvHal_publication_year
403 , _hd_publication_month = Just csvHal_publication_month
404 , _hd_publication_day = Just csvHal_publication_day
405 , _hd_publication_hour = Nothing
406 , _hd_publication_minute = Nothing
407 , _hd_publication_second = Nothing
408 , _hd_language_iso2 = Nothing }
-- | Convert one 'CsvDoc' row into a 'HyperdataDocument'. Title, authors,
-- source and abstract are copied from the row; institutes are left empty.
411 csv2doc :: CsvDoc -> HyperdataDocument
412 csv2doc (CsvDoc { .. })
-- NOTE(review): the bdd tag is "CsvHal", the same literal 'csvHal2doc' uses,
-- whereas 'toDoc' tags its documents "CSV" -- confirm this is intended.
413 = HyperdataDocument { _hd_bdd = Just "CsvHal"
416 , _hd_uniqId = Nothing
417 , _hd_uniqIdBdd = Nothing
419 , _hd_title = Just csv_title
420 , _hd_authors = Just csv_authors
421 , _hd_institutes = Nothing
422 , _hd_source = Just csv_source
423 , _hd_abstract = Just csv_abstract
-- The publication date is built with 'jour' from the year (remaining
-- arguments are on lines not visible here) and rendered with 'show'.
424 , _hd_publication_date = Just $ pack . show $ jour (fromIntegral pubYear)
427 , _hd_publication_year = Just pubYear
428 , _hd_publication_month = Just pubMonth
429 , _hd_publication_day = Just pubDay
430 , _hd_publication_hour = Nothing
431 , _hd_publication_minute = Nothing
432 , _hd_publication_second = Nothing
433 , _hd_language_iso2 = Nothing }
-- Absent date components fall back to defaultYear / defaultMonth / defaultDay.
435 pubYear = fromMIntOrDec defaultYear csv_publication_year
436 pubMonth = fromMaybe defaultMonth csv_publication_month
437 pubDay = fromMaybe defaultDay csv_publication_day
439 ------------------------------------------------------------------------
-- | Parse a HAL CSV file into 'HyperdataDocument's.
440 parseHal :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
-- Every decoded row is converted with 'csvHal2doc' and the header is dropped
-- with 'snd'. ('r' is bound on a line not visible here -- presumably the
-- result of 'readCsvHal'; verify.)
443 pure $ (V.toList . V.map csvHal2doc . snd) <$> r
-- | Decode HAL CSV bytes with 'readCsvHalLazyBS' and convert every row with
-- 'csvHal2doc'; the header is discarded and a decode error is returned as is.
parseHal' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
parseHal' = fmap (V.toList . V.map csvHal2doc . snd) . readCsvHalLazyBS
448 ------------------------------------------------------------------------
-- | Read a CSV file with 'readCSVFile' and convert every row with 'csv2doc'.
-- The header is discarded; a decode error is returned unchanged.
parseCsv :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
parseCsv fp = do
  decoded <- readCSVFile fp
  pure (toHyperdata <$> decoded)
  where
    toHyperdata = V.toList . V.map csv2doc . snd
-- | Decode comma-delimited CSV bytes and convert every row with 'csv2doc'.
-- The header is discarded; a decode error is returned unchanged.
--
-- NOTE(review): another @parseCsv'@ with the same signature, which also
-- falls back to the tab delimiter, is defined right after this one in this
-- file -- confirm which of the two is meant to be live.
parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
parseCsv' = fmap (V.toList . V.map csv2doc . snd) . readCsvLazyBS Comma
-- | Decode CSV bytes into 'HyperdataDocument's, trying the comma delimiter
-- first and the tab delimiter when the comma decode fails.
-- NOTE(review): a @parseCsv'@ signature with a comma-only body appears
-- immediately before this one in this file -- confirm which one is live.
458 parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
461 result = case readCsvLazyBS Comma bs of
-- The comma error is dropped; only the outcome of the tab attempt is kept.
462 Left _err -> readCsvLazyBS Tab bs
463 Right res -> Right res
-- The header is discarded and every row is converted with 'csv2doc'.
464 (V.toList . V.map csv2doc . snd) <$> result
-- | Conduit variant of the CSV parser: on success it returns the number of
-- rows (always wrapped in 'Just') together with a conduit source of documents.
466 parseCsvC :: BL.ByteString
467 -> Either Prelude.String (Maybe Integer, ConduitT () HyperdataDocument Identity ())
-- Comma delimiter first; the tab delimiter is tried when the comma decode fails.
470 result = case readCsvLazyBS Comma bs of
471 Left _err -> readCsvLazyBS Tab bs
472 Right res -> Right res
-- The count is the length of the decoded row vector; the source yields each
-- row and maps it through 'csv2doc'.
475 Right r -> Right $ (Just $ Prelude.fromIntegral $ Prelude.length $ snd r, (yieldMany $ snd r) .| mapC csv2doc)
477 ------------------------------------------------------------------------
478 -- Csv v3 weighted for phylo
481 { csv'_title :: !Text
482 , csv'_source :: !Text
483 , csv'_publication_year :: !Int
484 , csv'_publication_month :: !Int
485 , csv'_publication_day :: !Int
486 , csv'_abstract :: !Text
487 , csv'_authors :: !Text
488 , csv'_weight :: !Double } deriving (Show)
491 instance FromNamedRecord Csv' where
492 parseNamedRecord r = do
493 csv'_title <- r .: "title"
494 csv'_source <- r .: "source"
495 csv'_publication_year <- r .: "publication_year"
496 csv'_publication_month <- r .: "publication_month"
497 csv'_publication_day <- r .: "publication_day"
498 csv'_abstract <- r .: "abstract"
499 csv'_authors <- r .: "authors"
500 csv'_weight <- r .: "weight"
503 readWeightedCsv :: FilePath -> IO (Header, Vector Csv')
506 case decodeByNameWith (csvDecodeOptions Tab) bs of
507 Left e -> panic (pack e)
508 Right corpus -> corpus