]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
[csv] more work on making the parser accept looser input
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14
15 module Gargantext.Core.Text.Corpus.Parsers.CSV where
16
17 import Control.Applicative
18 import qualified Data.ByteString as BS
19 import qualified Data.ByteString.Lazy as BL
20 import Data.Char (ord)
21 import Data.Csv
22 import Data.Either (Either(..))
23 import Data.Text (Text, pack, length, intercalate)
24 import Data.Time.Segment (jour)
25 import qualified Data.Vector as V
26 import Data.Vector (Vector)
27 import GHC.IO (FilePath)
28 import GHC.Word (Word8)
29
30 import qualified Prelude as Prelude
31
32 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
33 import Gargantext.Prelude hiding (length)
34 import Gargantext.Core.Text
35 import Gargantext.Core.Text.Context
36
37 ---------------------------------------------------------------
38 headerCsvGargV3 :: Header
39 headerCsvGargV3 =
40 header [ "title"
41 , "source"
42 , "publication_year"
43 , "publication_month"
44 , "publication_day"
45 , "abstract"
46 , "authors"
47 ]
48 ---------------------------------------------------------------
49 data CsvGargV3 = CsvGargV3
50 { d_docId :: !Int
51 , d_title :: !Text
52 , d_source :: !Text
53 , d_publication_year :: !Int
54 , d_publication_month :: !Int
55 , d_publication_day :: !Int
56 , d_abstract :: !Text
57 , d_authors :: !Text
58 }
59 deriving (Show)
60 ---------------------------------------------------------------
61 -- | Doc 2 HyperdataDocument
62 toDoc :: CsvGargV3 -> HyperdataDocument
63 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
64 HyperdataDocument (Just "CSV")
65 (Just . pack . show $ did)
66 Nothing
67 Nothing
68 Nothing
69 Nothing
70 (Just dt)
71 Nothing
72 (Just dau)
73 (Just dab)
74 (Nothing)
75 Nothing
76 (Just dpy)
77 (Just dpm)
78 (Just dpd)
79 Nothing
80 Nothing
81 Nothing
82 Nothing
83
84 ---------------------------------------------------------------
85 -- | Types Conversions
86 toDocs :: Vector CsvDoc -> [CsvGargV3]
87 toDocs v = V.toList
88 $ V.zipWith (\nId (CsvDoc t s (IntOrDec py) pm pd abst auth)
89 -> CsvGargV3 nId t s py pm pd abst auth )
90 (V.enumFromN 1 (V.length v'')) v''
91 where
92 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
93 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
94
95 ---------------------------------------------------------------
96 fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
97 fromDocs docs = V.map fromDocs' docs
98 where
99 fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s (IntOrDec py) pm pd abst auth)
100
101 ---------------------------------------------------------------
102 -- | Split a document in its context
103 -- TODO adapt the size of the paragraph according to the corpus average
104 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
105 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
106 if docSize > 1000
107 then
108 if (mod (round m) docSize) >= 10
109 then
110 splitDoc' splt doc
111 else
112 V.fromList [doc]
113 else
114 V.fromList [doc]
115 where
116 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
117 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
118 where
119 firstDoc = CsvDoc t s py pm pd firstAbstract auth
120 firstAbstract = head' "splitDoc'1" abstracts
121
122 nextDocs = map (\txt -> CsvDoc
123 (head' "splitDoc'2" $ sentences txt)
124 s py pm pd
125 (unsentences $ tail' "splitDoc'1" $ sentences txt)
126 auth
127 ) (tail' "splitDoc'2" abstracts)
128
129 abstracts = (splitBy $ contextSize) abst
130
131 ---------------------------------------------------------------
132 ---------------------------------------------------------------
133 type Mean = Double
134
135 docsSize :: Vector CsvDoc -> Mean
136 docsSize csvDoc = mean ls
137 where
138 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
139
140
141 ---------------------------------------------------------------
142 newtype IntOrDec = IntOrDec Int
143 deriving (Show, Eq, Read)
144 unIntOrDec :: IntOrDec -> Int
145 unIntOrDec (IntOrDec i) = i
146 instance FromField IntOrDec where
147 parseField s = case runParser (parseField s :: Parser Int) of
148 Left _err -> IntOrDec <$> Prelude.floor <$> (parseField s :: Parser Double)
149 Right n -> pure $ IntOrDec n
150 instance ToField IntOrDec where
151 toField (IntOrDec i) = toField i
152
153 data CsvDoc = CsvDoc
154 { csv_title :: !Text
155 , csv_source :: !Text
156 , csv_publication_year :: !IntOrDec
157 , csv_publication_month :: !Int
158 , csv_publication_day :: !Int
159 , csv_abstract :: !Text
160 , csv_authors :: !Text
161 }
162 deriving (Show)
163
164 instance FromNamedRecord CsvDoc where
165 parseNamedRecord r = CsvDoc <$> (r .: "title" <|> r .: "Title")
166 <*> (r .: "source" <|> r .: "Source")
167 <*> (r .: "publication_year" <|> r .: "Publication Year")
168 <*> (r .: "publication_month" <|> r .: "Publication Month")
169 <*> (r .: "publication_day" <|> r .: "Publication Day")
170 <*> (r .: "abstract" <|> r .: "Abstract")
171 <*> (r .: "authors" <|> r .: "Authors")
172
173 instance ToNamedRecord CsvDoc where
174 toNamedRecord (CsvDoc t s py pm pd abst aut) =
175 namedRecord [ "title" .= t
176 , "source" .= s
177 , "publication_year" .= py
178 , "publication_month" .= pm
179 , "publication_day" .= pd
180 , "abstract" .= abst
181 , "authors" .= aut
182 ]
183
184 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
185 hyperdataDocument2csvDoc h = CsvDoc (m $ _hd_title h)
186 (m $ _hd_source h)
187 (IntOrDec $ mI $ _hd_publication_year h)
188 (mI $ _hd_publication_month h)
189 (mI $ _hd_publication_day h)
190 (m $ _hd_abstract h)
191 (m $ _hd_authors h)
192
193 where
194 m = maybe "" identity
195 mI = maybe 0 identity
196
197
198 csvDecodeOptions :: DecodeOptions
199 csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
200
201 csvEncodeOptions :: EncodeOptions
202 csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
203
204 delimiter :: Word8
205 delimiter = fromIntegral $ ord '\t'
206 ------------------------------------------------------------------------
207 ------------------------------------------------------------------------
208 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO (Either Prelude.String [Text])
209 readCsvOn' fields fp = do
210 r <- readFile fp
211 pure $ ( V.toList
212 . V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
213 . snd ) <$> r
214
215 ------------------------------------------------------------------------
216
217 readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Either Prelude.String (Header, Vector a))
218 readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
219
220 readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Either Prelude.String (Header, Vector a))
221 readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
222
223 readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> Either Prelude.String (Header, Vector a)
224 readByteStringLazy _f bs = decodeByNameWith csvDecodeOptions bs
225
226 readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> Either Prelude.String (Header, Vector a)
227 readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
228
229 ------------------------------------------------------------------------
230 -- | TODO use readFileLazy
231 readFile :: FilePath -> IO (Either Prelude.String (Header, Vector CsvDoc))
232 readFile = fmap readCsvLazyBS . BL.readFile
233
234
235 -- | TODO use readByteStringLazy
236 readCsvLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvDoc)
237 readCsvLazyBS bs = decodeByNameWith csvDecodeOptions bs
238
239 ------------------------------------------------------------------------
240 -- | TODO use readFileLazy
241 readCsvHal :: FilePath -> IO (Either Prelude.String (Header, Vector CsvHal))
242 readCsvHal = fmap readCsvHalLazyBS . BL.readFile
243
244 -- | TODO use readByteStringLazy
245 readCsvHalLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvHal)
246 readCsvHalLazyBS bs = decodeByNameWith csvDecodeOptions bs
247
248 readCsvHalBSStrict :: BS.ByteString -> Either Prelude.String (Header, Vector CsvHal)
249 readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
250
251 ------------------------------------------------------------------------
252 writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
253 writeFile fp (h, vs) = BL.writeFile fp $
254 encodeByNameWith csvEncodeOptions h (V.toList vs)
255
256 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
257 writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
258
259 hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
260 hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
261
262 ------------------------------------------------------------------------
263 -- Hal Format
264 data CsvHal = CsvHal
265 { csvHal_title :: !Text
266 , csvHal_source :: !Text
267 , csvHal_publication_year :: !Integer
268 , csvHal_publication_month :: !Int
269 , csvHal_publication_day :: !Int
270 , csvHal_abstract :: !Text
271 , csvHal_authors :: !Text
272
273 , csvHal_url :: !Text
274 , csvHal_isbn_s :: !Text
275 , csvHal_issue_s :: !Text
276 , csvHal_journalPublisher_s:: !Text
277 , csvHal_language_s :: !Text
278
279 , csvHal_doiId_s :: !Text
280 , csvHal_authId_i :: !Text
281 , csvHal_instStructId_i :: !Text
282 , csvHal_deptStructId_i :: !Text
283 , csvHal_labStructId_i :: !Text
284
285 , csvHal_rteamStructId_i :: !Text
286 , csvHal_docType_s :: !Text
287 }
288 deriving (Show)
289
290 instance FromNamedRecord CsvHal where
291 parseNamedRecord r = CsvHal <$> r .: "title"
292 <*> r .: "source"
293 <*> r .: "publication_year"
294 <*> r .: "publication_month"
295 <*> r .: "publication_day"
296 <*> r .: "abstract"
297 <*> r .: "authors"
298
299 <*> r .: "url"
300 <*> r .: "isbn_s"
301 <*> r .: "issue_s"
302 <*> r .: "journalPublisher_s"
303 <*> r .: "language_s"
304
305 <*> r .: "doiId_s"
306 <*> r .: "authId_i"
307 <*> r .: "instStructId_i"
308 <*> r .: "deptStructId_i"
309 <*> r .: "labStructId_i"
310
311 <*> r .: "rteamStructId_i"
312 <*> r .: "docType_s"
313
314 instance ToNamedRecord CsvHal where
315 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
316 namedRecord [ "title" .= t
317 , "source" .= s
318
319 , "publication_year" .= py
320 , "publication_month" .= pm
321 , "publication_day" .= pd
322
323 , "abstract" .= abst
324 , "authors" .= aut
325
326 , "url" .= url
327 , "isbn_s" .= isbn
328 , "issue_s" .= iss
329 , "journalPublisher_s" .= j
330 , "language_s" .= lang
331
332 , "doiId_s" .= doi
333 , "authId_i" .= auth
334 , "instStructId_i" .= inst
335 , "deptStructId_i" .= dept
336 , "labStructId_i" .= lab
337
338 , "rteamStructId_i" .= team
339 , "docType_s" .= doct
340 ]
341
342 csvHal2doc :: CsvHal -> HyperdataDocument
343 csvHal2doc (CsvHal title source
344 pub_year pub_month pub_day
345 abstract authors
346 url _ _ _ _
347 doi _ inst _ _
348 _ _ ) = HyperdataDocument (Just "CsvHal")
349 (Just doi)
350 (Just url)
351 Nothing
352 Nothing
353 Nothing
354 (Just title)
355 (Just authors)
356 (Just inst)
357 (Just source)
358 (Just abstract)
359 (Just $ pack . show $ jour pub_year pub_month pub_day)
360 (Just $ fromIntegral pub_year)
361 (Just pub_month)
362 (Just pub_day)
363 Nothing
364 Nothing
365 Nothing
366 Nothing
367
368
369 csv2doc :: CsvDoc -> HyperdataDocument
370 csv2doc (CsvDoc title source
371 (IntOrDec pub_year) pub_month pub_day
372 abstract authors ) = HyperdataDocument (Just "CsvHal")
373 Nothing
374 Nothing
375 Nothing
376 Nothing
377 Nothing
378 (Just title)
379 (Just authors)
380 Nothing
381 (Just source)
382 (Just abstract)
383 (Just $ pack . show $ jour (fromIntegral pub_year) pub_month pub_day)
384 (Just $ fromIntegral pub_year)
385 (Just pub_month)
386 (Just pub_day)
387 Nothing
388 Nothing
389 Nothing
390 Nothing
391
392 ------------------------------------------------------------------------
393 parseHal :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
394 parseHal fp = do
395 r <- readCsvHal fp
396 pure $ (V.toList . V.map csvHal2doc . snd) <$> r
397
398 parseHal' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
399 parseHal' bs = (V.toList . V.map csvHal2doc . snd) <$> readCsvHalLazyBS bs
400
401 ------------------------------------------------------------------------
402 parseCsv :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
403 parseCsv fp = do
404 r <- readFile fp
405 pure $ (V.toList . V.map csv2doc . snd) <$> r
406
407 parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
408 parseCsv' bs = (V.toList . V.map csv2doc . snd) <$> readCsvLazyBS bs
409
410 ------------------------------------------------------------------------
411 -- Csv v3 weighted for phylo
412
413 data Csv' = Csv'
414 { csv'_title :: !Text
415 , csv'_source :: !Text
416 , csv'_publication_year :: !Int
417 , csv'_publication_month :: !Int
418 , csv'_publication_day :: !Int
419 , csv'_abstract :: !Text
420 , csv'_authors :: !Text
421 , csv'_weight :: !Double } deriving (Show)
422
423
424 instance FromNamedRecord Csv' where
425 parseNamedRecord r = Csv' <$> r .: "title"
426 <*> r .: "source"
427 <*> r .: "publication_year"
428 <*> r .: "publication_month"
429 <*> r .: "publication_day"
430 <*> r .: "abstract"
431 <*> r .: "authors"
432 <*> r .: "weight"
433
434 readWeightedCsv :: FilePath -> IO (Header, Vector Csv')
435 readWeightedCsv fp =
436 fmap (\bs ->
437 case decodeByNameWith csvDecodeOptions bs of
438 Left e -> panic (pack e)
439 Right corpus -> corpus
440 ) $ BL.readFile fp