]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
Merge branch 'dev' into 70-dev-searx-parser
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14
15 module Gargantext.Core.Text.Corpus.Parsers.CSV where
16
17 import Control.Applicative
18 import qualified Data.ByteString as BS
19 import qualified Data.ByteString.Lazy as BL
20 import Data.Char (ord)
21 import Data.Csv
22 import Data.Either (Either(..))
23 import Data.Maybe (fromMaybe)
24 import Data.Text (Text, pack, length, intercalate)
25 import Data.Time.Segment (jour)
26 import qualified Data.Vector as V
27 import Data.Vector (Vector)
28 import GHC.IO (FilePath)
29 import GHC.Word (Word8)
30
31 import qualified Prelude as Prelude
32
33 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
34 import Gargantext.Prelude hiding (length)
35 import Gargantext.Core.Text
36 import Gargantext.Core.Text.Context
37
38 ---------------------------------------------------------------
39 headerCsvGargV3 :: Header
40 headerCsvGargV3 =
41 header [ "title"
42 , "source"
43 , "publication_year"
44 , "publication_month"
45 , "publication_day"
46 , "abstract"
47 , "authors"
48 ]
49 ---------------------------------------------------------------
50 data CsvGargV3 = CsvGargV3
51 { d_docId :: !Int
52 , d_title :: !Text
53 , d_source :: !Text
54 , d_publication_year :: !Int
55 , d_publication_month :: !Int
56 , d_publication_day :: !Int
57 , d_abstract :: !Text
58 , d_authors :: !Text
59 }
60 deriving (Show)
61 ---------------------------------------------------------------
62 -- | Doc 2 HyperdataDocument
63 toDoc :: CsvGargV3 -> HyperdataDocument
64 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
65 HyperdataDocument { _hd_bdd = Just "CSV"
66 , _hd_doi = Just . pack . show $ did
67 , _hd_url = Nothing
68 , _hd_uniqId = Nothing
69 , _hd_uniqIdBdd = Nothing
70 , _hd_page = Nothing
71 , _hd_title = Just dt
72 , _hd_authors = Nothing
73 , _hd_institutes = Just dau
74 , _hd_source = Just dab
75 , _hd_abstract = Nothing
76 , _hd_publication_date = Nothing
77 , _hd_publication_year = Just dpy
78 , _hd_publication_month = Just dpm
79 , _hd_publication_day = Just dpd
80 , _hd_publication_hour = Nothing
81 , _hd_publication_minute = Nothing
82 , _hd_publication_second = Nothing
83 , _hd_language_iso2 = Nothing }
84
85 ---------------------------------------------------------------
86 -- | Types Conversions
87 toDocs :: Vector CsvDoc -> [CsvGargV3]
88 toDocs v = V.toList
89 $ V.zipWith (\nId (CsvDoc { .. }) -- (CsvDoc t s mPy pm pd abst auth)
90 -> CsvGargV3 { d_docId = nId
91 , d_title = csv_title
92 , d_source = csv_source
93 , d_publication_year = fromMIntOrDec defaultYear csv_publication_year
94 , d_publication_month = fromMaybe defaultMonth csv_publication_month
95 , d_publication_day = fromMaybe defaultDay csv_publication_day
96 , d_abstract = csv_abstract
97 , d_authors = csv_authors })
98 (V.enumFromN 1 (V.length v'')) v''
99 where
100 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
101 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
102
103 ---------------------------------------------------------------
104 fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
105 fromDocs docs = V.map fromDocs' docs
106 where
107 fromDocs' (CsvGargV3 { .. }) = CsvDoc { csv_title = d_title
108 , csv_source = d_source
109 , csv_publication_year = Just $ IntOrDec d_publication_year
110 , csv_publication_month = Just d_publication_month
111 , csv_publication_day = Just d_publication_day
112 , csv_abstract = d_abstract
113 , csv_authors = d_authors }
114
115 ---------------------------------------------------------------
116 -- | Split a document in its context
117 -- TODO adapt the size of the paragraph according to the corpus average
118 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
119 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
120 if docSize > 1000
121 then
122 if (mod (round m) docSize) >= 10
123 then
124 splitDoc' splt doc
125 else
126 V.fromList [doc]
127 else
128 V.fromList [doc]
129 where
130 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
131 splitDoc' contextSize (CsvDoc { .. }) = V.fromList $ [firstDoc] <> nextDocs
132 where
133 firstDoc = CsvDoc { csv_abstract = firstAbstract, .. }
134 firstAbstract = head' "splitDoc'1" abstracts
135
136 nextDocs = map (\txt -> CsvDoc { csv_title = head' "splitDoc'2" $ sentences txt
137 , csv_abstract = unsentences $ tail' "splitDoc'1" $ sentences txt
138 , .. }
139 ) (tail' "splitDoc'2" abstracts)
140
141 abstracts = (splitBy $ contextSize) csv_abstract
142
143 ---------------------------------------------------------------
144 ---------------------------------------------------------------
145 type Mean = Double
146
147 docsSize :: Vector CsvDoc -> Mean
148 docsSize csvDoc = mean ls
149 where
150 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
151
152
153 ---------------------------------------------------------------
154 newtype IntOrDec = IntOrDec Int
155 deriving (Show, Eq, Read)
156 unIntOrDec :: IntOrDec -> Int
157 unIntOrDec (IntOrDec i) = i
158 instance FromField IntOrDec where
159 parseField s = case runParser (parseField s :: Parser Int) of
160 Left _err -> IntOrDec <$> Prelude.floor <$> (parseField s :: Parser Double)
161 Right n -> pure $ IntOrDec n
162 instance ToField IntOrDec where
163 toField (IntOrDec i) = toField i
164
165 fromMIntOrDec :: Int -> Maybe IntOrDec -> Int
166 fromMIntOrDec default' mVal = unIntOrDec $ fromMaybe (IntOrDec default') mVal
167 defaultYear :: Int
168 defaultYear = 1973
169 defaultMonth :: Int
170 defaultMonth = 1
171 defaultDay :: Int
172 defaultDay = 1
173
174 data CsvDoc = CsvDoc
175 { csv_title :: !Text
176 , csv_source :: !Text
177 , csv_publication_year :: !(Maybe IntOrDec)
178 , csv_publication_month :: !(Maybe Int)
179 , csv_publication_day :: !(Maybe Int)
180 , csv_abstract :: !Text
181 , csv_authors :: !Text
182 }
183 deriving (Show)
184
185 instance FromNamedRecord CsvDoc where
186 parseNamedRecord r = do
187 csv_title <- r .: "title" <|> r .: "Title"
188 csv_source <- r .: "source" <|> r .: "Source"
189 csv_publication_year <- r .: "publication_year" <|> r .: "Publication Year"
190 csv_publication_month <- r .: "publication_month" <|> r .: "Publication Month"
191 csv_publication_day <- r .: "publication_day" <|> r .: "Publication Day"
192 csv_abstract <- r .: "abstract" <|> r .: "Abstract"
193 csv_authors <- r .: "authors" <|> r .: "Authors"
194 pure $ CsvDoc { .. }
195
196 instance ToNamedRecord CsvDoc where
197 toNamedRecord (CsvDoc{ .. }) =
198 namedRecord [ "title" .= csv_title
199 , "source" .= csv_source
200 , "publication_year" .= csv_publication_year
201 , "publication_month" .= csv_publication_month
202 , "publication_day" .= csv_publication_day
203 , "abstract" .= csv_abstract
204 , "authors" .= csv_authors
205 ]
206
207 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
208 hyperdataDocument2csvDoc h = CsvDoc { csv_title = m $ _hd_title h
209 , csv_source = m $ _hd_source h
210 , csv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h
211 , csv_publication_month = Just $ mI $ _hd_publication_month h
212 , csv_publication_day = Just $ mI $ _hd_publication_day h
213 , csv_abstract = m $ _hd_abstract h
214 , csv_authors = m $ _hd_authors h }
215
216 where
217 m = maybe "" identity
218 mI = maybe 0 identity
219
220
221 data Delimiter = Tab | Comma
222
223 csvDecodeOptions :: Delimiter -> DecodeOptions
224 csvDecodeOptions d = defaultDecodeOptions {decDelimiter = delimiter d}
225
226 csvEncodeOptions :: Delimiter -> EncodeOptions
227 csvEncodeOptions d = defaultEncodeOptions {encDelimiter = delimiter d}
228
229 delimiter :: Delimiter -> Word8
230 delimiter Tab = fromIntegral $ ord '\t'
231 delimiter Comma = fromIntegral $ ord ','
232 ------------------------------------------------------------------------
233 ------------------------------------------------------------------------
234 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO (Either Prelude.String [Text])
235 readCsvOn' fields fp = do
236 r <- readFile fp
237 pure $ ( V.toList
238 . V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
239 . snd ) <$> r
240
241 ------------------------------------------------------------------------
242
243 readFileLazy :: (FromNamedRecord a) => proxy a -> Delimiter -> FilePath -> IO (Either Prelude.String (Header, Vector a))
244 readFileLazy d f = fmap (readByteStringLazy d f) . BL.readFile
245
246 readFileStrict :: (FromNamedRecord a)
247 => proxy a
248 -> Delimiter
249 -> FilePath
250 -> IO (Either Prelude.String (Header, Vector a))
251 readFileStrict d f = fmap (readByteStringStrict d f) . BS.readFile
252
253 readByteStringLazy :: (FromNamedRecord a)
254 => proxy a
255 -> Delimiter
256 -> BL.ByteString
257 -> Either Prelude.String (Header, Vector a)
258 readByteStringLazy _f d bs = decodeByNameWith (csvDecodeOptions d) bs
259
260 readByteStringStrict :: (FromNamedRecord a)
261 => proxy a
262 -> Delimiter
263 -> BS.ByteString
264 -> Either Prelude.String (Header, Vector a)
265 readByteStringStrict d ff = (readByteStringLazy d ff) . BL.fromStrict
266
267 ------------------------------------------------------------------------
268 -- | TODO use readFileLazy
269 readFile :: FilePath -> IO (Either Prelude.String (Header, Vector CsvDoc))
270 readFile fp = do
271 result <- fmap (readCsvLazyBS Comma) $ BL.readFile fp
272 case result of
273 Left _err -> fmap (readCsvLazyBS Tab) $ BL.readFile fp
274 Right res -> pure $ Right res
275
276
277
278 -- | TODO use readByteStringLazy
279 readCsvLazyBS :: Delimiter -> BL.ByteString -> Either Prelude.String (Header, Vector CsvDoc)
280 readCsvLazyBS d bs = decodeByNameWith (csvDecodeOptions d) bs
281
282 ------------------------------------------------------------------------
283 -- | TODO use readFileLazy
284 readCsvHal :: FilePath -> IO (Either Prelude.String (Header, Vector CsvHal))
285 readCsvHal = fmap readCsvHalLazyBS . BL.readFile
286
287 -- | TODO use readByteStringLazy
288 readCsvHalLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvHal)
289 readCsvHalLazyBS bs = decodeByNameWith (csvDecodeOptions Tab) bs
290
291 readCsvHalBSStrict :: BS.ByteString -> Either Prelude.String (Header, Vector CsvHal)
292 readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
293
294 ------------------------------------------------------------------------
295 writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
296 writeFile fp (h, vs) = BL.writeFile fp $
297 encodeByNameWith (csvEncodeOptions Tab) h (V.toList vs)
298
299 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
300 writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
301
302 hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
303 hyperdataDocument2csv hs = encodeByNameWith (csvEncodeOptions Tab) headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
304
305 ------------------------------------------------------------------------
306 -- Hal Format
307 data CsvHal = CsvHal
308 { csvHal_title :: !Text
309 , csvHal_source :: !Text
310 , csvHal_publication_year :: !Integer
311 , csvHal_publication_month :: !Int
312 , csvHal_publication_day :: !Int
313 , csvHal_abstract :: !Text
314 , csvHal_authors :: !Text
315
316 , csvHal_url :: !Text
317 , csvHal_isbn_s :: !Text
318 , csvHal_issue_s :: !Text
319 , csvHal_journalPublisher_s:: !Text
320 , csvHal_language_s :: !Text
321
322 , csvHal_doiId_s :: !Text
323 , csvHal_authId_i :: !Text
324 , csvHal_instStructId_i :: !Text
325 , csvHal_deptStructId_i :: !Text
326 , csvHal_labStructId_i :: !Text
327
328 , csvHal_rteamStructId_i :: !Text
329 , csvHal_docType_s :: !Text
330 }
331 deriving (Show)
332
333 instance FromNamedRecord CsvHal where
334 parseNamedRecord r = do
335 csvHal_title <- r .: "title"
336 csvHal_source <- r .: "source"
337 csvHal_publication_year <- r .: "publication_year"
338 csvHal_publication_month <- r .: "publication_month"
339 csvHal_publication_day <- r .: "publication_day"
340 csvHal_abstract <- r .: "abstract"
341 csvHal_authors <- r .: "authors"
342 csvHal_url <- r .: "url"
343 csvHal_isbn_s <- r .: "isbn_s"
344 csvHal_issue_s <- r .: "issue_s"
345 csvHal_journalPublisher_s <- r .: "journalPublisher_s"
346 csvHal_language_s <- r .: "language_s"
347 csvHal_doiId_s <- r .: "doiId_s"
348 csvHal_authId_i <- r .: "authId_i"
349 csvHal_instStructId_i <- r .: "instStructId_i"
350 csvHal_deptStructId_i <- r .: "deptStructId_i"
351 csvHal_labStructId_i <- r .: "labStructId_i"
352 csvHal_rteamStructId_i <- r .: "rteamStructId_i"
353 csvHal_docType_s <- r .: "docType_s"
354 pure $ CsvHal { .. }
355
356 instance ToNamedRecord CsvHal where
357 --toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
358 toNamedRecord (CsvHal { .. }) =
359 namedRecord [ "title" .= csvHal_title
360 , "source" .= csvHal_source
361
362 , "publication_year" .= csvHal_publication_year
363 , "publication_month" .= csvHal_publication_month
364 , "publication_day" .= csvHal_publication_day
365
366 , "abstract" .= csvHal_abstract
367 , "authors" .= csvHal_authors
368
369 , "url" .= csvHal_url
370 , "isbn_s" .= csvHal_isbn_s
371 , "issue_s" .= csvHal_issue_s
372 , "journalPublisher_s" .= csvHal_journalPublisher_s
373 , "language_s" .= csvHal_language_s
374
375 , "doiId_s" .= csvHal_doiId_s
376 , "authId_i" .= csvHal_authId_i
377 , "instStructId_i" .= csvHal_instStructId_i
378 , "deptStructId_i" .= csvHal_deptStructId_i
379 , "labStructId_i" .= csvHal_labStructId_i
380
381 , "rteamStructId_i" .= csvHal_rteamStructId_i
382 , "docType_s" .= csvHal_docType_s
383 ]
384
385 csvHal2doc :: CsvHal -> HyperdataDocument
386 csvHal2doc (CsvHal { .. }) =
387 HyperdataDocument { _hd_bdd = Just "CsvHal"
388 , _hd_doi = Just csvHal_doiId_s
389 , _hd_url = Just csvHal_url
390 , _hd_uniqId = Nothing
391 , _hd_uniqIdBdd = Nothing
392 , _hd_page = Nothing
393 , _hd_title = Just csvHal_title
394 , _hd_authors = Just csvHal_authors
395 , _hd_institutes = Just csvHal_instStructId_i
396 , _hd_source = Just csvHal_source
397 , _hd_abstract = Just csvHal_abstract
398 , _hd_publication_date = Just $ pack . show $ jour csvHal_publication_year
399 csvHal_publication_month
400 csvHal_publication_day
401 , _hd_publication_year = Just $ fromIntegral csvHal_publication_year
402 , _hd_publication_month = Just csvHal_publication_month
403 , _hd_publication_day = Just csvHal_publication_day
404 , _hd_publication_hour = Nothing
405 , _hd_publication_minute = Nothing
406 , _hd_publication_second = Nothing
407 , _hd_language_iso2 = Nothing }
408
409
410 csv2doc :: CsvDoc -> HyperdataDocument
411 csv2doc (CsvDoc { .. })
412 = HyperdataDocument { _hd_bdd = Just "CsvHal"
413 , _hd_doi = Nothing
414 , _hd_url = Nothing
415 , _hd_uniqId = Nothing
416 , _hd_uniqIdBdd = Nothing
417 , _hd_page = Nothing
418 , _hd_title = Just csv_title
419 , _hd_authors = Just csv_authors
420 , _hd_institutes = Nothing
421 , _hd_source = Just csv_source
422 , _hd_abstract = Just csv_abstract
423 , _hd_publication_date = Just $ pack . show $ jour (fromIntegral pubYear)
424 pubMonth
425 pubDay
426 , _hd_publication_year = Just pubYear
427 , _hd_publication_month = Just pubMonth
428 , _hd_publication_day = Just pubDay
429 , _hd_publication_hour = Nothing
430 , _hd_publication_minute = Nothing
431 , _hd_publication_second = Nothing
432 , _hd_language_iso2 = Nothing }
433 where
434 pubYear = fromMIntOrDec defaultYear csv_publication_year
435 pubMonth = fromMaybe defaultMonth csv_publication_month
436 pubDay = fromMaybe defaultDay csv_publication_day
437
438 ------------------------------------------------------------------------
439 parseHal :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
440 parseHal fp = do
441 r <- readCsvHal fp
442 pure $ (V.toList . V.map csvHal2doc . snd) <$> r
443
444 parseHal' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
445 parseHal' bs = (V.toList . V.map csvHal2doc . snd) <$> readCsvHalLazyBS bs
446
447 ------------------------------------------------------------------------
448
449 parseCsv :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
450 parseCsv fp = fmap (V.toList . V.map csv2doc . snd) <$> readFile fp
451
452 {-
453 parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
454 parseCsv' bs = (V.toList . V.map csv2doc . snd) <$> readCsvLazyBS Comma bs
455 -}
456
457 parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
458 parseCsv' bs = do
459 let
460 result = case readCsvLazyBS Comma bs of
461 Left _err -> readCsvLazyBS Tab bs
462 Right res -> Right res
463 (V.toList . V.map csv2doc . snd) <$> result
464
465 ------------------------------------------------------------------------
466 -- Csv v3 weighted for phylo
467
468 data Csv' = Csv'
469 { csv'_title :: !Text
470 , csv'_source :: !Text
471 , csv'_publication_year :: !Int
472 , csv'_publication_month :: !Int
473 , csv'_publication_day :: !Int
474 , csv'_abstract :: !Text
475 , csv'_authors :: !Text
476 , csv'_weight :: !Double } deriving (Show)
477
478
479 instance FromNamedRecord Csv' where
480 parseNamedRecord r = do
481 csv'_title <- r .: "title"
482 csv'_source <- r .: "source"
483 csv'_publication_year <- r .: "publication_year"
484 csv'_publication_month <- r .: "publication_month"
485 csv'_publication_day <- r .: "publication_day"
486 csv'_abstract <- r .: "abstract"
487 csv'_authors <- r .: "authors"
488 csv'_weight <- r .: "weight"
489 pure $ Csv' { .. }
490
491 readWeightedCsv :: FilePath -> IO (Header, Vector Csv')
492 readWeightedCsv fp =
493 fmap (\bs ->
494 case decodeByNameWith (csvDecodeOptions Tab) bs of
495 Left e -> panic (pack e)
496 Right corpus -> corpus
497 ) $ BL.readFile fp