2 Module : Gargantext.Core.Text.Corpus.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
15 module Gargantext.Core.Text.Corpus.Parsers.CSV where
17 import Control.Applicative
18 import qualified Data.ByteString as BS
19 import qualified Data.ByteString.Lazy as BL
20 import Data.Char (ord)
22 import Data.Either (Either(..))
23 import Data.Maybe (fromMaybe)
24 import Data.Text (Text, pack, length, intercalate)
25 import Data.Time.Segment (jour)
26 import qualified Data.Vector as V
27 import Data.Vector (Vector)
28 import GHC.IO (FilePath)
29 import GHC.Word (Word8)
31 import qualified Prelude as Prelude
33 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
34 import Gargantext.Prelude hiding (length)
35 import Gargantext.Core.Text
36 import Gargantext.Core.Text.Context
38 ---------------------------------------------------------------
39 headerCsvGargV3 :: Header
49 ---------------------------------------------------------------
50 data CsvGargV3 = CsvGargV3
54 , d_publication_year :: !Int
55 , d_publication_month :: !Int
56 , d_publication_day :: !Int
61 ---------------------------------------------------------------
62 -- | Doc 2 HyperdataDocument
63 toDoc :: CsvGargV3 -> HyperdataDocument
64 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
65 HyperdataDocument { _hd_bdd = Just "CSV"
66 , _hd_doi = Just . pack . show $ did
68 , _hd_uniqId = Nothing
69 , _hd_uniqIdBdd = Nothing
72 , _hd_authors = Nothing
73 , _hd_institutes = Just dau
74 , _hd_source = Just dab
75 , _hd_abstract = Nothing
76 , _hd_publication_date = Nothing
77 , _hd_publication_year = Just dpy
78 , _hd_publication_month = Just dpm
79 , _hd_publication_day = Just dpd
80 , _hd_publication_hour = Nothing
81 , _hd_publication_minute = Nothing
82 , _hd_publication_second = Nothing
83 , _hd_language_iso2 = Nothing }
85 ---------------------------------------------------------------
86 -- | Types Conversions
87 toDocs :: Vector CsvDoc -> [CsvGargV3]
89 $ V.zipWith (\nId (CsvDoc { .. }) -- (CsvDoc t s mPy pm pd abst auth)
90 -> CsvGargV3 { d_docId = nId
92 , d_source = csv_source
93 , d_publication_year = fromMIntOrDec defaultYear csv_publication_year
94 , d_publication_month = fromMaybe defaultMonth csv_publication_month
95 , d_publication_day = fromMaybe defaultDay csv_publication_day
96 , d_abstract = csv_abstract
97 , d_authors = csv_authors })
98 (V.enumFromN 1 (V.length v'')) v''
100 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
101 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
103 ---------------------------------------------------------------
104 fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
105 fromDocs docs = V.map fromDocs' docs
107 fromDocs' (CsvGargV3 { .. }) = CsvDoc { csv_title = d_title
108 , csv_source = d_source
109 , csv_publication_year = Just $ IntOrDec d_publication_year
110 , csv_publication_month = Just d_publication_month
111 , csv_publication_day = Just d_publication_day
112 , csv_abstract = d_abstract
113 , csv_authors = d_authors }
115 ---------------------------------------------------------------
116 -- | Split a document in its context
117 -- TODO adapt the size of the paragraph according to the corpus average
118 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
119 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
122 if (mod (round m) docSize) >= 10
130 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
131 splitDoc' contextSize (CsvDoc { .. }) = V.fromList $ [firstDoc] <> nextDocs
133 firstDoc = CsvDoc { csv_abstract = firstAbstract, .. }
134 firstAbstract = head' "splitDoc'1" abstracts
136 nextDocs = map (\txt -> CsvDoc { csv_title = head' "splitDoc'2" $ sentences txt
137 , csv_abstract = unsentences $ tail' "splitDoc'1" $ sentences txt
139 ) (tail' "splitDoc'2" abstracts)
141 abstracts = (splitBy $ contextSize) csv_abstract
143 ---------------------------------------------------------------
144 ---------------------------------------------------------------
147 docsSize :: Vector CsvDoc -> Mean
148 docsSize csvDoc = mean ls
150 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
153 ---------------------------------------------------------------
154 newtype IntOrDec = IntOrDec Int
155 deriving (Show, Eq, Read)
156 unIntOrDec :: IntOrDec -> Int
157 unIntOrDec (IntOrDec i) = i
158 instance FromField IntOrDec where
159 parseField s = case runParser (parseField s :: Parser Int) of
160 Left _err -> IntOrDec <$> Prelude.floor <$> (parseField s :: Parser Double)
161 Right n -> pure $ IntOrDec n
162 instance ToField IntOrDec where
163 toField (IntOrDec i) = toField i
165 fromMIntOrDec :: Int -> Maybe IntOrDec -> Int
166 fromMIntOrDec default' mVal = unIntOrDec $ fromMaybe (IntOrDec default') mVal
176 , csv_source :: !Text
177 , csv_publication_year :: !(Maybe IntOrDec)
178 , csv_publication_month :: !(Maybe Int)
179 , csv_publication_day :: !(Maybe Int)
180 , csv_abstract :: !Text
181 , csv_authors :: !Text
185 instance FromNamedRecord CsvDoc where
186 parseNamedRecord r = do
187 csv_title <- r .: "title" <|> r .: "Title"
188 csv_source <- r .: "source" <|> r .: "Source"
189 csv_publication_year <- r .: "publication_year" <|> r .: "Publication Year"
190 csv_publication_month <- r .: "publication_month" <|> r .: "Publication Month"
191 csv_publication_day <- r .: "publication_day" <|> r .: "Publication Day"
192 csv_abstract <- r .: "abstract" <|> r .: "Abstract"
193 csv_authors <- r .: "authors" <|> r .: "Authors"
196 instance ToNamedRecord CsvDoc where
197 toNamedRecord (CsvDoc{ .. }) =
198 namedRecord [ "title" .= csv_title
199 , "source" .= csv_source
200 , "publication_year" .= csv_publication_year
201 , "publication_month" .= csv_publication_month
202 , "publication_day" .= csv_publication_day
203 , "abstract" .= csv_abstract
204 , "authors" .= csv_authors
207 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
208 hyperdataDocument2csvDoc h = CsvDoc { csv_title = m $ _hd_title h
209 , csv_source = m $ _hd_source h
210 , csv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h
211 , csv_publication_month = Just $ mI $ _hd_publication_month h
212 , csv_publication_day = Just $ mI $ _hd_publication_day h
213 , csv_abstract = m $ _hd_abstract h
214 , csv_authors = m $ _hd_authors h }
217 m = maybe "" identity
218 mI = maybe 0 identity
221 data Delimiter = Tab | Comma
223 csvDecodeOptions :: Delimiter -> DecodeOptions
224 csvDecodeOptions d = defaultDecodeOptions {decDelimiter = delimiter d}
226 csvEncodeOptions :: Delimiter -> EncodeOptions
227 csvEncodeOptions d = defaultEncodeOptions {encDelimiter = delimiter d}
229 delimiter :: Delimiter -> Word8
230 delimiter Tab = fromIntegral $ ord '\t'
231 delimiter Comma = fromIntegral $ ord ','
232 ------------------------------------------------------------------------
233 ------------------------------------------------------------------------
234 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO (Either Prelude.String [Text])
235 readCsvOn' fields fp = do
238 . V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
241 ------------------------------------------------------------------------
243 readFileLazy :: (FromNamedRecord a) => proxy a -> Delimiter -> FilePath -> IO (Either Prelude.String (Header, Vector a))
244 readFileLazy d f = fmap (readByteStringLazy d f) . BL.readFile
246 readFileStrict :: (FromNamedRecord a)
250 -> IO (Either Prelude.String (Header, Vector a))
251 readFileStrict d f = fmap (readByteStringStrict d f) . BS.readFile
253 readByteStringLazy :: (FromNamedRecord a)
257 -> Either Prelude.String (Header, Vector a)
258 readByteStringLazy _f d bs = decodeByNameWith (csvDecodeOptions d) bs
260 readByteStringStrict :: (FromNamedRecord a)
264 -> Either Prelude.String (Header, Vector a)
265 readByteStringStrict d ff = (readByteStringLazy d ff) . BL.fromStrict
267 ------------------------------------------------------------------------
268 -- | TODO use readFileLazy
269 readFile :: FilePath -> IO (Either Prelude.String (Header, Vector CsvDoc))
271 result <- fmap (readCsvLazyBS Comma) $ BL.readFile fp
273 Left _err -> fmap (readCsvLazyBS Tab) $ BL.readFile fp
274 Right res -> pure $ Right res
278 -- | TODO use readByteStringLazy
279 readCsvLazyBS :: Delimiter -> BL.ByteString -> Either Prelude.String (Header, Vector CsvDoc)
280 readCsvLazyBS d bs = decodeByNameWith (csvDecodeOptions d) bs
282 ------------------------------------------------------------------------
283 -- | TODO use readFileLazy
284 readCsvHal :: FilePath -> IO (Either Prelude.String (Header, Vector CsvHal))
285 readCsvHal = fmap readCsvHalLazyBS . BL.readFile
287 -- | TODO use readByteStringLazy
288 readCsvHalLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvHal)
289 readCsvHalLazyBS bs = decodeByNameWith (csvDecodeOptions Tab) bs
291 readCsvHalBSStrict :: BS.ByteString -> Either Prelude.String (Header, Vector CsvHal)
292 readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
294 ------------------------------------------------------------------------
295 writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
296 writeFile fp (h, vs) = BL.writeFile fp $
297 encodeByNameWith (csvEncodeOptions Tab) h (V.toList vs)
299 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
300 writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
302 hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
303 hyperdataDocument2csv hs = encodeByNameWith (csvEncodeOptions Tab) headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
305 ------------------------------------------------------------------------
308 { csvHal_title :: !Text
309 , csvHal_source :: !Text
310 , csvHal_publication_year :: !Integer
311 , csvHal_publication_month :: !Int
312 , csvHal_publication_day :: !Int
313 , csvHal_abstract :: !Text
314 , csvHal_authors :: !Text
316 , csvHal_url :: !Text
317 , csvHal_isbn_s :: !Text
318 , csvHal_issue_s :: !Text
319 , csvHal_journalPublisher_s:: !Text
320 , csvHal_language_s :: !Text
322 , csvHal_doiId_s :: !Text
323 , csvHal_authId_i :: !Text
324 , csvHal_instStructId_i :: !Text
325 , csvHal_deptStructId_i :: !Text
326 , csvHal_labStructId_i :: !Text
328 , csvHal_rteamStructId_i :: !Text
329 , csvHal_docType_s :: !Text
333 instance FromNamedRecord CsvHal where
334 parseNamedRecord r = do
335 csvHal_title <- r .: "title"
336 csvHal_source <- r .: "source"
337 csvHal_publication_year <- r .: "publication_year"
338 csvHal_publication_month <- r .: "publication_month"
339 csvHal_publication_day <- r .: "publication_day"
340 csvHal_abstract <- r .: "abstract"
341 csvHal_authors <- r .: "authors"
342 csvHal_url <- r .: "url"
343 csvHal_isbn_s <- r .: "isbn_s"
344 csvHal_issue_s <- r .: "issue_s"
345 csvHal_journalPublisher_s <- r .: "journalPublisher_s"
346 csvHal_language_s <- r .: "language_s"
347 csvHal_doiId_s <- r .: "doiId_s"
348 csvHal_authId_i <- r .: "authId_i"
349 csvHal_instStructId_i <- r .: "instStructId_i"
350 csvHal_deptStructId_i <- r .: "deptStructId_i"
351 csvHal_labStructId_i <- r .: "labStructId_i"
352 csvHal_rteamStructId_i <- r .: "rteamStructId_i"
353 csvHal_docType_s <- r .: "docType_s"
356 instance ToNamedRecord CsvHal where
357 --toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
358 toNamedRecord (CsvHal { .. }) =
359 namedRecord [ "title" .= csvHal_title
360 , "source" .= csvHal_source
362 , "publication_year" .= csvHal_publication_year
363 , "publication_month" .= csvHal_publication_month
364 , "publication_day" .= csvHal_publication_day
366 , "abstract" .= csvHal_abstract
367 , "authors" .= csvHal_authors
369 , "url" .= csvHal_url
370 , "isbn_s" .= csvHal_isbn_s
371 , "issue_s" .= csvHal_issue_s
372 , "journalPublisher_s" .= csvHal_journalPublisher_s
373 , "language_s" .= csvHal_language_s
375 , "doiId_s" .= csvHal_doiId_s
376 , "authId_i" .= csvHal_authId_i
377 , "instStructId_i" .= csvHal_instStructId_i
378 , "deptStructId_i" .= csvHal_deptStructId_i
379 , "labStructId_i" .= csvHal_labStructId_i
381 , "rteamStructId_i" .= csvHal_rteamStructId_i
382 , "docType_s" .= csvHal_docType_s
385 csvHal2doc :: CsvHal -> HyperdataDocument
386 csvHal2doc (CsvHal { .. }) =
387 HyperdataDocument { _hd_bdd = Just "CsvHal"
388 , _hd_doi = Just csvHal_doiId_s
389 , _hd_url = Just csvHal_url
390 , _hd_uniqId = Nothing
391 , _hd_uniqIdBdd = Nothing
393 , _hd_title = Just csvHal_title
394 , _hd_authors = Just csvHal_authors
395 , _hd_institutes = Just csvHal_instStructId_i
396 , _hd_source = Just csvHal_source
397 , _hd_abstract = Just csvHal_abstract
398 , _hd_publication_date = Just $ pack . show $ jour csvHal_publication_year
399 csvHal_publication_month
400 csvHal_publication_day
401 , _hd_publication_year = Just $ fromIntegral csvHal_publication_year
402 , _hd_publication_month = Just csvHal_publication_month
403 , _hd_publication_day = Just csvHal_publication_day
404 , _hd_publication_hour = Nothing
405 , _hd_publication_minute = Nothing
406 , _hd_publication_second = Nothing
407 , _hd_language_iso2 = Nothing }
410 csv2doc :: CsvDoc -> HyperdataDocument
411 csv2doc (CsvDoc { .. })
412 = HyperdataDocument { _hd_bdd = Just "CsvHal"
415 , _hd_uniqId = Nothing
416 , _hd_uniqIdBdd = Nothing
418 , _hd_title = Just csv_title
419 , _hd_authors = Just csv_authors
420 , _hd_institutes = Nothing
421 , _hd_source = Just csv_source
422 , _hd_abstract = Just csv_abstract
423 , _hd_publication_date = Just $ pack . show $ jour (fromIntegral pubYear)
426 , _hd_publication_year = Just pubYear
427 , _hd_publication_month = Just pubMonth
428 , _hd_publication_day = Just pubDay
429 , _hd_publication_hour = Nothing
430 , _hd_publication_minute = Nothing
431 , _hd_publication_second = Nothing
432 , _hd_language_iso2 = Nothing }
434 pubYear = fromMIntOrDec defaultYear csv_publication_year
435 pubMonth = fromMaybe defaultMonth csv_publication_month
436 pubDay = fromMaybe defaultDay csv_publication_day
438 ------------------------------------------------------------------------
439 parseHal :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
442 pure $ (V.toList . V.map csvHal2doc . snd) <$> r
444 parseHal' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
445 parseHal' bs = (V.toList . V.map csvHal2doc . snd) <$> readCsvHalLazyBS bs
447 ------------------------------------------------------------------------
449 parseCsv :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
450 parseCsv fp = fmap (V.toList . V.map csv2doc . snd) <$> readFile fp
453 parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
454 parseCsv' bs = (V.toList . V.map csv2doc . snd) <$> readCsvLazyBS Comma bs
457 parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
460 result = case readCsvLazyBS Comma bs of
461 Left _err -> readCsvLazyBS Tab bs
462 Right res -> Right res
463 (V.toList . V.map csv2doc . snd) <$> result
465 ------------------------------------------------------------------------
466 -- Csv v3 weighted for phylo
469 { csv'_title :: !Text
470 , csv'_source :: !Text
471 , csv'_publication_year :: !Int
472 , csv'_publication_month :: !Int
473 , csv'_publication_day :: !Int
474 , csv'_abstract :: !Text
475 , csv'_authors :: !Text
476 , csv'_weight :: !Double } deriving (Show)
479 instance FromNamedRecord Csv' where
480 parseNamedRecord r = do
481 csv'_title <- r .: "title"
482 csv'_source <- r .: "source"
483 csv'_publication_year <- r .: "publication_year"
484 csv'_publication_month <- r .: "publication_month"
485 csv'_publication_day <- r .: "publication_day"
486 csv'_abstract <- r .: "abstract"
487 csv'_authors <- r .: "authors"
488 csv'_weight <- r .: "weight"
491 readWeightedCsv :: FilePath -> IO (Header, Vector Csv')
494 case decodeByNameWith (csvDecodeOptions Tab) bs of
495 Left e -> panic (pack e)
496 Right corpus -> corpus