]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
[pasers] framewrite parser for fetching documents from framewrite node
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14
15 module Gargantext.Core.Text.Corpus.Parsers.CSV where
16
17 import Control.Applicative
18 import qualified Data.ByteString as BS
19 import qualified Data.ByteString.Lazy as BL
20 import Data.Char (ord)
21 import Data.Csv
22 import Data.Either (Either(..))
23 import Data.Maybe (fromMaybe)
24 import Data.Text (Text, pack, length, intercalate)
25 import Data.Time.Segment (jour)
26 import qualified Data.Vector as V
27 import Data.Vector (Vector)
28 import GHC.IO (FilePath)
29 import GHC.Word (Word8)
30
31 import qualified Prelude as Prelude
32
33 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
34 import Gargantext.Prelude hiding (length)
35 import Gargantext.Core.Text
36 import Gargantext.Core.Text.Context
37
38 ---------------------------------------------------------------
39 headerCsvGargV3 :: Header
40 headerCsvGargV3 =
41 header [ "title"
42 , "source"
43 , "publication_year"
44 , "publication_month"
45 , "publication_day"
46 , "abstract"
47 , "authors"
48 ]
49 ---------------------------------------------------------------
50 data CsvGargV3 = CsvGargV3
51 { d_docId :: !Int
52 , d_title :: !Text
53 , d_source :: !Text
54 , d_publication_year :: !Int
55 , d_publication_month :: !Int
56 , d_publication_day :: !Int
57 , d_abstract :: !Text
58 , d_authors :: !Text
59 }
60 deriving (Show)
61 ---------------------------------------------------------------
62 -- | Doc 2 HyperdataDocument
63 toDoc :: CsvGargV3 -> HyperdataDocument
64 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
65 HyperdataDocument (Just "CSV")
66 (Just . pack . show $ did)
67 Nothing
68 Nothing
69 Nothing
70 Nothing
71 (Just dt)
72 Nothing
73 (Just dau)
74 (Just dab)
75 (Nothing)
76 Nothing
77 (Just dpy)
78 (Just dpm)
79 (Just dpd)
80 Nothing
81 Nothing
82 Nothing
83 Nothing
84
85 ---------------------------------------------------------------
86 -- | Types Conversions
87 toDocs :: Vector CsvDoc -> [CsvGargV3]
88 toDocs v = V.toList
89 $ V.zipWith (\nId (CsvDoc t s mPy pm pd abst auth)
90 -> CsvGargV3 nId t s
91 (fromMIntOrDec defaultYear mPy) (fromMaybe defaultMonth pm) (fromMaybe defaultDay pd)
92 abst auth )
93 (V.enumFromN 1 (V.length v'')) v''
94 where
95 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
96 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
97
98 ---------------------------------------------------------------
99 fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
100 fromDocs docs = V.map fromDocs' docs
101 where
102 fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s (Just $ IntOrDec py) (Just pm) (Just pd) abst auth)
103
104 ---------------------------------------------------------------
105 -- | Split a document in its context
106 -- TODO adapt the size of the paragraph according to the corpus average
107 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
108 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
109 if docSize > 1000
110 then
111 if (mod (round m) docSize) >= 10
112 then
113 splitDoc' splt doc
114 else
115 V.fromList [doc]
116 else
117 V.fromList [doc]
118 where
119 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
120 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
121 where
122 firstDoc = CsvDoc t s py pm pd firstAbstract auth
123 firstAbstract = head' "splitDoc'1" abstracts
124
125 nextDocs = map (\txt -> CsvDoc
126 (head' "splitDoc'2" $ sentences txt)
127 s py pm pd
128 (unsentences $ tail' "splitDoc'1" $ sentences txt)
129 auth
130 ) (tail' "splitDoc'2" abstracts)
131
132 abstracts = (splitBy $ contextSize) abst
133
134 ---------------------------------------------------------------
135 ---------------------------------------------------------------
136 type Mean = Double
137
138 docsSize :: Vector CsvDoc -> Mean
139 docsSize csvDoc = mean ls
140 where
141 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
142
143
144 ---------------------------------------------------------------
145 newtype IntOrDec = IntOrDec Int
146 deriving (Show, Eq, Read)
147 unIntOrDec :: IntOrDec -> Int
148 unIntOrDec (IntOrDec i) = i
149 instance FromField IntOrDec where
150 parseField s = case runParser (parseField s :: Parser Int) of
151 Left _err -> IntOrDec <$> Prelude.floor <$> (parseField s :: Parser Double)
152 Right n -> pure $ IntOrDec n
153 instance ToField IntOrDec where
154 toField (IntOrDec i) = toField i
155
156 fromMIntOrDec :: Int -> Maybe IntOrDec -> Int
157 fromMIntOrDec default' mVal = unIntOrDec $ fromMaybe (IntOrDec default') mVal
158 defaultYear :: Int
159 defaultYear = 1973
160 defaultMonth :: Int
161 defaultMonth = 1
162 defaultDay :: Int
163 defaultDay = 1
164
165 data CsvDoc = CsvDoc
166 { csv_title :: !Text
167 , csv_source :: !Text
168 , csv_publication_year :: !(Maybe IntOrDec)
169 , csv_publication_month :: !(Maybe Int)
170 , csv_publication_day :: !(Maybe Int)
171 , csv_abstract :: !Text
172 , csv_authors :: !Text
173 }
174 deriving (Show)
175
176 instance FromNamedRecord CsvDoc where
177 parseNamedRecord r = CsvDoc <$> (r .: "title" <|> r .: "Title")
178 <*> (r .: "source" <|> r .: "Source")
179 <*> (r .: "publication_year" <|> r .: "Publication Year")
180 <*> (r .: "publication_month" <|> r .: "Publication Month")
181 <*> (r .: "publication_day" <|> r .: "Publication Day")
182 <*> (r .: "abstract" <|> r .: "Abstract")
183 <*> (r .: "authors" <|> r .: "Authors")
184
185 instance ToNamedRecord CsvDoc where
186 toNamedRecord (CsvDoc t s py pm pd abst aut) =
187 namedRecord [ "title" .= t
188 , "source" .= s
189 , "publication_year" .= py
190 , "publication_month" .= pm
191 , "publication_day" .= pd
192 , "abstract" .= abst
193 , "authors" .= aut
194 ]
195
196 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
197 hyperdataDocument2csvDoc h = CsvDoc (m $ _hd_title h)
198 (m $ _hd_source h)
199 (Just $ IntOrDec $ mI $ _hd_publication_year h)
200 (Just $ mI $ _hd_publication_month h)
201 (Just $ mI $ _hd_publication_day h)
202 (m $ _hd_abstract h)
203 (m $ _hd_authors h)
204
205 where
206 m = maybe "" identity
207 mI = maybe 0 identity
208
209
210 csvDecodeOptions :: DecodeOptions
211 csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
212
213 csvEncodeOptions :: EncodeOptions
214 csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
215
216 delimiter :: Word8
217 delimiter = fromIntegral $ ord '\t'
218 ------------------------------------------------------------------------
219 ------------------------------------------------------------------------
220 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO (Either Prelude.String [Text])
221 readCsvOn' fields fp = do
222 r <- readFile fp
223 pure $ ( V.toList
224 . V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
225 . snd ) <$> r
226
227 ------------------------------------------------------------------------
228
229 readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Either Prelude.String (Header, Vector a))
230 readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
231
232 readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Either Prelude.String (Header, Vector a))
233 readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
234
235 readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> Either Prelude.String (Header, Vector a)
236 readByteStringLazy _f bs = decodeByNameWith csvDecodeOptions bs
237
238 readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> Either Prelude.String (Header, Vector a)
239 readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
240
241 ------------------------------------------------------------------------
242 -- | TODO use readFileLazy
243 readFile :: FilePath -> IO (Either Prelude.String (Header, Vector CsvDoc))
244 readFile = fmap readCsvLazyBS . BL.readFile
245
246
247 -- | TODO use readByteStringLazy
248 readCsvLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvDoc)
249 readCsvLazyBS bs = decodeByNameWith csvDecodeOptions bs
250
251 ------------------------------------------------------------------------
252 -- | TODO use readFileLazy
253 readCsvHal :: FilePath -> IO (Either Prelude.String (Header, Vector CsvHal))
254 readCsvHal = fmap readCsvHalLazyBS . BL.readFile
255
256 -- | TODO use readByteStringLazy
257 readCsvHalLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvHal)
258 readCsvHalLazyBS bs = decodeByNameWith csvDecodeOptions bs
259
260 readCsvHalBSStrict :: BS.ByteString -> Either Prelude.String (Header, Vector CsvHal)
261 readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
262
263 ------------------------------------------------------------------------
264 writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
265 writeFile fp (h, vs) = BL.writeFile fp $
266 encodeByNameWith csvEncodeOptions h (V.toList vs)
267
268 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
269 writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
270
271 hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
272 hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
273
274 ------------------------------------------------------------------------
275 -- Hal Format
276 data CsvHal = CsvHal
277 { csvHal_title :: !Text
278 , csvHal_source :: !Text
279 , csvHal_publication_year :: !Integer
280 , csvHal_publication_month :: !Int
281 , csvHal_publication_day :: !Int
282 , csvHal_abstract :: !Text
283 , csvHal_authors :: !Text
284
285 , csvHal_url :: !Text
286 , csvHal_isbn_s :: !Text
287 , csvHal_issue_s :: !Text
288 , csvHal_journalPublisher_s:: !Text
289 , csvHal_language_s :: !Text
290
291 , csvHal_doiId_s :: !Text
292 , csvHal_authId_i :: !Text
293 , csvHal_instStructId_i :: !Text
294 , csvHal_deptStructId_i :: !Text
295 , csvHal_labStructId_i :: !Text
296
297 , csvHal_rteamStructId_i :: !Text
298 , csvHal_docType_s :: !Text
299 }
300 deriving (Show)
301
302 instance FromNamedRecord CsvHal where
303 parseNamedRecord r = CsvHal <$> r .: "title"
304 <*> r .: "source"
305 <*> r .: "publication_year"
306 <*> r .: "publication_month"
307 <*> r .: "publication_day"
308 <*> r .: "abstract"
309 <*> r .: "authors"
310
311 <*> r .: "url"
312 <*> r .: "isbn_s"
313 <*> r .: "issue_s"
314 <*> r .: "journalPublisher_s"
315 <*> r .: "language_s"
316
317 <*> r .: "doiId_s"
318 <*> r .: "authId_i"
319 <*> r .: "instStructId_i"
320 <*> r .: "deptStructId_i"
321 <*> r .: "labStructId_i"
322
323 <*> r .: "rteamStructId_i"
324 <*> r .: "docType_s"
325
326 instance ToNamedRecord CsvHal where
327 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
328 namedRecord [ "title" .= t
329 , "source" .= s
330
331 , "publication_year" .= py
332 , "publication_month" .= pm
333 , "publication_day" .= pd
334
335 , "abstract" .= abst
336 , "authors" .= aut
337
338 , "url" .= url
339 , "isbn_s" .= isbn
340 , "issue_s" .= iss
341 , "journalPublisher_s" .= j
342 , "language_s" .= lang
343
344 , "doiId_s" .= doi
345 , "authId_i" .= auth
346 , "instStructId_i" .= inst
347 , "deptStructId_i" .= dept
348 , "labStructId_i" .= lab
349
350 , "rteamStructId_i" .= team
351 , "docType_s" .= doct
352 ]
353
354 csvHal2doc :: CsvHal -> HyperdataDocument
355 csvHal2doc (CsvHal title source
356 pub_year pub_month pub_day
357 abstract authors
358 url _ _ _ _
359 doi _ inst _ _
360 _ _ ) = HyperdataDocument (Just "CsvHal")
361 (Just doi)
362 (Just url)
363 Nothing
364 Nothing
365 Nothing
366 (Just title)
367 (Just authors)
368 (Just inst)
369 (Just source)
370 (Just abstract)
371 (Just $ pack . show $ jour pub_year pub_month pub_day)
372 (Just $ fromIntegral pub_year)
373 (Just pub_month)
374 (Just pub_day)
375 Nothing
376 Nothing
377 Nothing
378 Nothing
379
380
381 csv2doc :: CsvDoc -> HyperdataDocument
382 csv2doc (CsvDoc title source
383 mPubYear mPubMonth mPubDay
384 abstract authors ) = HyperdataDocument (Just "CsvHal")
385 Nothing
386 Nothing
387 Nothing
388 Nothing
389 Nothing
390 (Just title)
391 (Just authors)
392 Nothing
393 (Just source)
394 (Just abstract)
395 (Just $ pack . show $ jour (fromIntegral pubYear) pubMonth pubDay)
396 (Just pubYear)
397 (Just pubMonth)
398 (Just pubDay)
399 Nothing
400 Nothing
401 Nothing
402 Nothing
403 where
404 pubYear = fromMIntOrDec defaultYear mPubYear
405 pubMonth = fromMaybe defaultMonth mPubMonth
406 pubDay = fromMaybe defaultDay mPubDay
407
408 ------------------------------------------------------------------------
409 parseHal :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
410 parseHal fp = do
411 r <- readCsvHal fp
412 pure $ (V.toList . V.map csvHal2doc . snd) <$> r
413
414 parseHal' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
415 parseHal' bs = (V.toList . V.map csvHal2doc . snd) <$> readCsvHalLazyBS bs
416
417 ------------------------------------------------------------------------
418 parseCsv :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
419 parseCsv fp = do
420 r <- readFile fp
421 pure $ (V.toList . V.map csv2doc . snd) <$> r
422
423 parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
424 parseCsv' bs = (V.toList . V.map csv2doc . snd) <$> readCsvLazyBS bs
425
426 ------------------------------------------------------------------------
427 -- Csv v3 weighted for phylo
428
429 data Csv' = Csv'
430 { csv'_title :: !Text
431 , csv'_source :: !Text
432 , csv'_publication_year :: !Int
433 , csv'_publication_month :: !Int
434 , csv'_publication_day :: !Int
435 , csv'_abstract :: !Text
436 , csv'_authors :: !Text
437 , csv'_weight :: !Double } deriving (Show)
438
439
440 instance FromNamedRecord Csv' where
441 parseNamedRecord r = Csv' <$> r .: "title"
442 <*> r .: "source"
443 <*> r .: "publication_year"
444 <*> r .: "publication_month"
445 <*> r .: "publication_day"
446 <*> r .: "abstract"
447 <*> r .: "authors"
448 <*> r .: "weight"
449
450 readWeightedCsv :: FilePath -> IO (Header, Vector Csv')
451 readWeightedCsv fp =
452 fmap (\bs ->
453 case decodeByNameWith csvDecodeOptions bs of
454 Left e -> panic (pack e)
455 Right corpus -> corpus
456 ) $ BL.readFile fp