]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
Merge branch 'dev' into dev-wikidata
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14
15 module Gargantext.Core.Text.Corpus.Parsers.CSV where
16
17 import Control.Applicative
18 import qualified Data.ByteString as BS
19 import qualified Data.ByteString.Lazy as BL
20 import Data.Char (ord)
21 import Data.Csv
22 import Data.Either (Either(..))
23 import Data.Maybe (fromMaybe)
24 import Data.Text (Text, pack, length, intercalate)
25 import Data.Time.Segment (jour)
26 import qualified Data.Vector as V
27 import Data.Vector (Vector)
28 import GHC.IO (FilePath)
29 import GHC.Word (Word8)
30
31 import qualified Prelude as Prelude
32
33 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
34 import Gargantext.Prelude hiding (length)
35 import Gargantext.Core.Text
36 import Gargantext.Core.Text.Context
37
38 ---------------------------------------------------------------
39 headerCsvGargV3 :: Header
40 headerCsvGargV3 =
41 header [ "title"
42 , "source"
43 , "publication_year"
44 , "publication_month"
45 , "publication_day"
46 , "abstract"
47 , "authors"
48 ]
49 ---------------------------------------------------------------
50 data CsvGargV3 = CsvGargV3
51 { d_docId :: !Int
52 , d_title :: !Text
53 , d_source :: !Text
54 , d_publication_year :: !Int
55 , d_publication_month :: !Int
56 , d_publication_day :: !Int
57 , d_abstract :: !Text
58 , d_authors :: !Text
59 }
60 deriving (Show)
61 ---------------------------------------------------------------
62 -- | Doc 2 HyperdataDocument
63 toDoc :: CsvGargV3 -> HyperdataDocument
64 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
65 HyperdataDocument { _hd_bdd = Just "CSV"
66 , _hd_doi = Just . pack . show $ did
67 , _hd_url = Nothing
68 , _hd_uniqId = Nothing
69 , _hd_uniqIdBdd = Nothing
70 , _hd_page = Nothing
71 , _hd_title = Just dt
72 , _hd_authors = Nothing
73 , _hd_institutes = Just dau
74 , _hd_source = Just dab
75 , _hd_abstract = Nothing
76 , _hd_publication_date = Nothing
77 , _hd_publication_year = Just dpy
78 , _hd_publication_month = Just dpm
79 , _hd_publication_day = Just dpd
80 , _hd_publication_hour = Nothing
81 , _hd_publication_minute = Nothing
82 , _hd_publication_second = Nothing
83 , _hd_language_iso2 = Nothing }
84
85 ---------------------------------------------------------------
86 -- | Types Conversions
87 toDocs :: Vector CsvDoc -> [CsvGargV3]
88 toDocs v = V.toList
89 $ V.zipWith (\nId (CsvDoc { .. }) -- (CsvDoc t s mPy pm pd abst auth)
90 -> CsvGargV3 { d_docId = nId
91 , d_title = csv_title
92 , d_source = csv_source
93 , d_publication_year = fromMIntOrDec defaultYear csv_publication_year
94 , d_publication_month = fromMaybe defaultMonth csv_publication_month
95 , d_publication_day = fromMaybe defaultDay csv_publication_day
96 , d_abstract = csv_abstract
97 , d_authors = csv_authors })
98 (V.enumFromN 1 (V.length v'')) v''
99 where
100 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
101 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
102
103 ---------------------------------------------------------------
104 fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
105 fromDocs docs = V.map fromDocs' docs
106 where
107 fromDocs' (CsvGargV3 { .. }) = CsvDoc { csv_title = d_title
108 , csv_source = d_source
109 , csv_publication_year = Just $ IntOrDec d_publication_year
110 , csv_publication_month = Just d_publication_month
111 , csv_publication_day = Just d_publication_day
112 , csv_abstract = d_abstract
113 , csv_authors = d_authors }
114
115 ---------------------------------------------------------------
116 -- | Split a document in its context
117 -- TODO adapt the size of the paragraph according to the corpus average
118 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
119 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
120 if docSize > 1000
121 then
122 if (mod (round m) docSize) >= 10
123 then
124 splitDoc' splt doc
125 else
126 V.fromList [doc]
127 else
128 V.fromList [doc]
129 where
130 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
131 splitDoc' contextSize (CsvDoc { .. }) = V.fromList $ [firstDoc] <> nextDocs
132 where
133 firstDoc = CsvDoc { csv_abstract = firstAbstract, .. }
134 firstAbstract = head' "splitDoc'1" abstracts
135
136 nextDocs = map (\txt -> CsvDoc { csv_title = head' "splitDoc'2" $ sentences txt
137 , csv_abstract = unsentences $ tail' "splitDoc'1" $ sentences txt
138 , .. }
139 ) (tail' "splitDoc'2" abstracts)
140
141 abstracts = (splitBy $ contextSize) csv_abstract
142
143 ---------------------------------------------------------------
144 ---------------------------------------------------------------
145 type Mean = Double
146
147 docsSize :: Vector CsvDoc -> Mean
148 docsSize csvDoc = mean ls
149 where
150 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
151
152
153 ---------------------------------------------------------------
154 newtype IntOrDec = IntOrDec Int
155 deriving (Show, Eq, Read)
156 unIntOrDec :: IntOrDec -> Int
157 unIntOrDec (IntOrDec i) = i
158 instance FromField IntOrDec where
159 parseField s = case runParser (parseField s :: Parser Int) of
160 Left _err -> IntOrDec <$> Prelude.floor <$> (parseField s :: Parser Double)
161 Right n -> pure $ IntOrDec n
162 instance ToField IntOrDec where
163 toField (IntOrDec i) = toField i
164
165 fromMIntOrDec :: Int -> Maybe IntOrDec -> Int
166 fromMIntOrDec default' mVal = unIntOrDec $ fromMaybe (IntOrDec default') mVal
167 defaultYear :: Int
168 defaultYear = 1973
169 defaultMonth :: Int
170 defaultMonth = 1
171 defaultDay :: Int
172 defaultDay = 1
173
174 data CsvDoc = CsvDoc
175 { csv_title :: !Text
176 , csv_source :: !Text
177 , csv_publication_year :: !(Maybe IntOrDec)
178 , csv_publication_month :: !(Maybe Int)
179 , csv_publication_day :: !(Maybe Int)
180 , csv_abstract :: !Text
181 , csv_authors :: !Text
182 }
183 deriving (Show)
184
185 instance FromNamedRecord CsvDoc where
186 parseNamedRecord r = do
187 csv_title <- r .: "title" <|> r .: "Title"
188 csv_source <- r .: "source" <|> r .: "Source"
189 csv_publication_year <- r .: "publication_year" <|> r .: "Publication Year"
190 csv_publication_month <- r .: "publication_month" <|> r .: "Publication Month"
191 csv_publication_day <- r .: "publication_day" <|> r .: "Publication Day"
192 csv_abstract <- r .: "abstract" <|> r .: "Abstract"
193 csv_authors <- r .: "authors" <|> r .: "Authors"
194 pure $ CsvDoc { .. }
195
196 instance ToNamedRecord CsvDoc where
197 toNamedRecord (CsvDoc{ .. }) =
198 namedRecord [ "title" .= csv_title
199 , "source" .= csv_source
200 , "publication_year" .= csv_publication_year
201 , "publication_month" .= csv_publication_month
202 , "publication_day" .= csv_publication_day
203 , "abstract" .= csv_abstract
204 , "authors" .= csv_authors
205 ]
206
207 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
208 hyperdataDocument2csvDoc h = CsvDoc { csv_title = m $ _hd_title h
209 , csv_source = m $ _hd_source h
210 , csv_publication_year = Just $ IntOrDec $ mI $ _hd_publication_year h
211 , csv_publication_month = Just $ mI $ _hd_publication_month h
212 , csv_publication_day = Just $ mI $ _hd_publication_day h
213 , csv_abstract = m $ _hd_abstract h
214 , csv_authors = m $ _hd_authors h }
215
216 where
217 m = maybe "" identity
218 mI = maybe 0 identity
219
220
221 csvDecodeOptions :: DecodeOptions
222 csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
223
224 csvEncodeOptions :: EncodeOptions
225 csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
226
227 delimiter :: Word8
228 delimiter = fromIntegral $ ord '\t'
229 ------------------------------------------------------------------------
230 ------------------------------------------------------------------------
231 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO (Either Prelude.String [Text])
232 readCsvOn' fields fp = do
233 r <- readFile fp
234 pure $ ( V.toList
235 . V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
236 . snd ) <$> r
237
238 ------------------------------------------------------------------------
239
240 readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Either Prelude.String (Header, Vector a))
241 readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
242
243 readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Either Prelude.String (Header, Vector a))
244 readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
245
246 readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> Either Prelude.String (Header, Vector a)
247 readByteStringLazy _f bs = decodeByNameWith csvDecodeOptions bs
248
249 readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> Either Prelude.String (Header, Vector a)
250 readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
251
252 ------------------------------------------------------------------------
253 -- | TODO use readFileLazy
254 readFile :: FilePath -> IO (Either Prelude.String (Header, Vector CsvDoc))
255 readFile = fmap readCsvLazyBS . BL.readFile
256
257
258 -- | TODO use readByteStringLazy
259 readCsvLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvDoc)
260 readCsvLazyBS bs = decodeByNameWith csvDecodeOptions bs
261
262 ------------------------------------------------------------------------
263 -- | TODO use readFileLazy
264 readCsvHal :: FilePath -> IO (Either Prelude.String (Header, Vector CsvHal))
265 readCsvHal = fmap readCsvHalLazyBS . BL.readFile
266
267 -- | TODO use readByteStringLazy
268 readCsvHalLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvHal)
269 readCsvHalLazyBS bs = decodeByNameWith csvDecodeOptions bs
270
271 readCsvHalBSStrict :: BS.ByteString -> Either Prelude.String (Header, Vector CsvHal)
272 readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
273
274 ------------------------------------------------------------------------
275 writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
276 writeFile fp (h, vs) = BL.writeFile fp $
277 encodeByNameWith csvEncodeOptions h (V.toList vs)
278
279 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
280 writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
281
282 hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
283 hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
284
285 ------------------------------------------------------------------------
286 -- Hal Format
287 data CsvHal = CsvHal
288 { csvHal_title :: !Text
289 , csvHal_source :: !Text
290 , csvHal_publication_year :: !Integer
291 , csvHal_publication_month :: !Int
292 , csvHal_publication_day :: !Int
293 , csvHal_abstract :: !Text
294 , csvHal_authors :: !Text
295
296 , csvHal_url :: !Text
297 , csvHal_isbn_s :: !Text
298 , csvHal_issue_s :: !Text
299 , csvHal_journalPublisher_s:: !Text
300 , csvHal_language_s :: !Text
301
302 , csvHal_doiId_s :: !Text
303 , csvHal_authId_i :: !Text
304 , csvHal_instStructId_i :: !Text
305 , csvHal_deptStructId_i :: !Text
306 , csvHal_labStructId_i :: !Text
307
308 , csvHal_rteamStructId_i :: !Text
309 , csvHal_docType_s :: !Text
310 }
311 deriving (Show)
312
313 instance FromNamedRecord CsvHal where
314 parseNamedRecord r = do
315 csvHal_title <- r .: "title"
316 csvHal_source <- r .: "source"
317 csvHal_publication_year <- r .: "publication_year"
318 csvHal_publication_month <- r .: "publication_month"
319 csvHal_publication_day <- r .: "publication_day"
320 csvHal_abstract <- r .: "abstract"
321 csvHal_authors <- r .: "authors"
322 csvHal_url <- r .: "url"
323 csvHal_isbn_s <- r .: "isbn_s"
324 csvHal_issue_s <- r .: "issue_s"
325 csvHal_journalPublisher_s <- r .: "journalPublisher_s"
326 csvHal_language_s <- r .: "language_s"
327 csvHal_doiId_s <- r .: "doiId_s"
328 csvHal_authId_i <- r .: "authId_i"
329 csvHal_instStructId_i <- r .: "instStructId_i"
330 csvHal_deptStructId_i <- r .: "deptStructId_i"
331 csvHal_labStructId_i <- r .: "labStructId_i"
332 csvHal_rteamStructId_i <- r .: "rteamStructId_i"
333 csvHal_docType_s <- r .: "docType_s"
334 pure $ CsvHal { .. }
335
336 instance ToNamedRecord CsvHal where
337 --toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
338 toNamedRecord (CsvHal { .. }) =
339 namedRecord [ "title" .= csvHal_title
340 , "source" .= csvHal_source
341
342 , "publication_year" .= csvHal_publication_year
343 , "publication_month" .= csvHal_publication_month
344 , "publication_day" .= csvHal_publication_day
345
346 , "abstract" .= csvHal_abstract
347 , "authors" .= csvHal_authors
348
349 , "url" .= csvHal_url
350 , "isbn_s" .= csvHal_isbn_s
351 , "issue_s" .= csvHal_issue_s
352 , "journalPublisher_s" .= csvHal_journalPublisher_s
353 , "language_s" .= csvHal_language_s
354
355 , "doiId_s" .= csvHal_doiId_s
356 , "authId_i" .= csvHal_authId_i
357 , "instStructId_i" .= csvHal_instStructId_i
358 , "deptStructId_i" .= csvHal_deptStructId_i
359 , "labStructId_i" .= csvHal_labStructId_i
360
361 , "rteamStructId_i" .= csvHal_rteamStructId_i
362 , "docType_s" .= csvHal_docType_s
363 ]
364
365 csvHal2doc :: CsvHal -> HyperdataDocument
366 csvHal2doc (CsvHal { .. }) =
367 HyperdataDocument { _hd_bdd = Just "CsvHal"
368 , _hd_doi = Just csvHal_doiId_s
369 , _hd_url = Just csvHal_url
370 , _hd_uniqId = Nothing
371 , _hd_uniqIdBdd = Nothing
372 , _hd_page = Nothing
373 , _hd_title = Just csvHal_title
374 , _hd_authors = Just csvHal_authors
375 , _hd_institutes = Just csvHal_instStructId_i
376 , _hd_source = Just csvHal_source
377 , _hd_abstract = Just csvHal_abstract
378 , _hd_publication_date = Just $ pack . show $ jour csvHal_publication_year
379 csvHal_publication_month
380 csvHal_publication_day
381 , _hd_publication_year = Just $ fromIntegral csvHal_publication_year
382 , _hd_publication_month = Just csvHal_publication_month
383 , _hd_publication_day = Just csvHal_publication_day
384 , _hd_publication_hour = Nothing
385 , _hd_publication_minute = Nothing
386 , _hd_publication_second = Nothing
387 , _hd_language_iso2 = Nothing }
388
389
390 csv2doc :: CsvDoc -> HyperdataDocument
391 csv2doc (CsvDoc { .. })
392 = HyperdataDocument { _hd_bdd = Just "CsvHal"
393 , _hd_doi = Nothing
394 , _hd_url = Nothing
395 , _hd_uniqId = Nothing
396 , _hd_uniqIdBdd = Nothing
397 , _hd_page = Nothing
398 , _hd_title = Just csv_title
399 , _hd_authors = Just csv_authors
400 , _hd_institutes = Nothing
401 , _hd_source = Just csv_source
402 , _hd_abstract = Just csv_abstract
403 , _hd_publication_date = Just $ pack . show $ jour (fromIntegral pubYear)
404 pubMonth
405 pubDay
406 , _hd_publication_year = Just pubYear
407 , _hd_publication_month = Just pubMonth
408 , _hd_publication_day = Just pubDay
409 , _hd_publication_hour = Nothing
410 , _hd_publication_minute = Nothing
411 , _hd_publication_second = Nothing
412 , _hd_language_iso2 = Nothing }
413 where
414 pubYear = fromMIntOrDec defaultYear csv_publication_year
415 pubMonth = fromMaybe defaultMonth csv_publication_month
416 pubDay = fromMaybe defaultDay csv_publication_day
417
418 ------------------------------------------------------------------------
419 parseHal :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
420 parseHal fp = do
421 r <- readCsvHal fp
422 pure $ (V.toList . V.map csvHal2doc . snd) <$> r
423
424 parseHal' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
425 parseHal' bs = (V.toList . V.map csvHal2doc . snd) <$> readCsvHalLazyBS bs
426
427 ------------------------------------------------------------------------
428 parseCsv :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
429 parseCsv fp = do
430 r <- readFile fp
431 pure $ (V.toList . V.map csv2doc . snd) <$> r
432
433 parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
434 parseCsv' bs = (V.toList . V.map csv2doc . snd) <$> readCsvLazyBS bs
435
436 ------------------------------------------------------------------------
437 -- Csv v3 weighted for phylo
438
439 data Csv' = Csv'
440 { csv'_title :: !Text
441 , csv'_source :: !Text
442 , csv'_publication_year :: !Int
443 , csv'_publication_month :: !Int
444 , csv'_publication_day :: !Int
445 , csv'_abstract :: !Text
446 , csv'_authors :: !Text
447 , csv'_weight :: !Double } deriving (Show)
448
449
450 instance FromNamedRecord Csv' where
451 parseNamedRecord r = do
452 csv'_title <- r .: "title"
453 csv'_source <- r .: "source"
454 csv'_publication_year <- r .: "publication_year"
455 csv'_publication_month <- r .: "publication_month"
456 csv'_publication_day <- r .: "publication_day"
457 csv'_abstract <- r .: "abstract"
458 csv'_authors <- r .: "authors"
459 csv'_weight <- r .: "weight"
460 pure $ Csv' { .. }
461
462 readWeightedCsv :: FilePath -> IO (Header, Vector Csv')
463 readWeightedCsv fp =
464 fmap (\bs ->
465 case decodeByNameWith csvDecodeOptions bs of
466 Left e -> panic (pack e)
467 Right corpus -> corpus
468 ) $ BL.readFile fp