2 Module : Gargantext.Text.Corpus.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
18 module Gargantext.Text.Corpus.Parsers.CSV where
20 import Control.Applicative
21 import qualified Data.ByteString as BS
22 import qualified Data.ByteString.Lazy as BL
23 import Data.Char (ord)
25 import Data.Either (Either(Left, Right))
26 import Data.Text (Text, pack, length, intercalate)
27 import Data.Time.Segment (jour)
28 import qualified Data.Vector as V
29 import Data.Vector (Vector)
30 import GHC.IO (FilePath)
31 import GHC.Real (round)
32 import GHC.Word (Word8)
34 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
35 import Gargantext.Prelude hiding (length)
36 import Gargantext.Text
37 import Gargantext.Text.Context
39 ---------------------------------------------------------------
40 headerCsvGargV3 :: Header
41 headerCsvGargV3 = header [ "title"
49 ---------------------------------------------------------------
50 data CsvGargV3 = CsvGargV3
54 , d_publication_year :: !Int
55 , d_publication_month :: !Int
56 , d_publication_day :: !Int
61 ---------------------------------------------------------------
-- | Convert a 'CsvGargV3' row into a 'HyperdataDocument'.
63 toDoc :: CsvGargV3 -> HyperdataDocument
64 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
65 HyperdataDocument (Just "CSV")
66 (Just . pack . show $ did)
85 ---------------------------------------------------------------
86 -- | Types Conversions
87 toDocs :: Vector CsvDoc -> [CsvGargV3]
89 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
90 -> CsvGargV3 nId t s py pm pd abst auth )
91 (V.enumFromN 1 (V.length v'')) v''
93 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
94 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
96 ---------------------------------------------------------------
-- | Turn a vector of 'CsvGargV3' rows into plain 'CsvDoc' rows.
--
-- The first field of each 'CsvGargV3' is discarded; the remaining seven
-- fields are copied over unchanged and in the same order.
fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
fromDocs = V.map dropFirstField
  where
    -- Rebuild the row without its leading field.
    dropFirstField (CsvGargV3 _ ttl src yr mo dy abstr auths) =
      CsvDoc ttl src yr mo dy abstr auths
102 ---------------------------------------------------------------
103 -- | Split a document in its context
104 -- TODO adapt the size of the paragraph according to the corpus average
105 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
106 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
109 if (mod (round m) docSize) >= 10
117 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
118 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
120 firstDoc = CsvDoc t s py pm pd firstAbstract auth
121 firstAbstract = head' "splitDoc'1" abstracts
123 nextDocs = map (\txt -> CsvDoc
124 (head' "splitDoc'2" $ sentences txt)
126 (unsentences $ tail' "splitDoc'1" $ sentences txt)
128 ) (tail' "splitDoc'2" abstracts)
130 abstracts = (splitBy $ contextSize) abst
132 ---------------------------------------------------------------
133 ---------------------------------------------------------------
-- | Mean abstract length over a collection of documents.
--
-- Every document contributes the 'Data.Text' length of its 'csv_abstract';
-- the resulting list is handed to 'mean'.
docsSize :: Vector CsvDoc -> Mean
docsSize = mean . V.toList . V.map abstractSize
  where
    -- 'length' is the one imported from "Data.Text" (the Prelude's is hidden).
    abstractSize doc = fromIntegral (length (csv_abstract doc))
142 ---------------------------------------------------------------
145 , csv_source :: !Text
146 , csv_publication_year :: !Int
147 , csv_publication_month :: !Int
148 , csv_publication_day :: !Int
149 , csv_abstract :: !Text
150 , csv_authors :: !Text
154 instance FromNamedRecord CsvDoc where
155 parseNamedRecord r = CsvDoc <$> r .: "title"
157 <*> r .: "publication_year"
158 <*> r .: "publication_month"
159 <*> r .: "publication_day"
163 instance ToNamedRecord CsvDoc where
164 toNamedRecord (CsvDoc t s py pm pd abst aut) =
165 namedRecord [ "title" .= t
167 , "publication_year" .= py
168 , "publication_month" .= pm
169 , "publication_day" .= pd
-- | Flatten a 'HyperdataDocument' into a 'CsvDoc'.
--
-- Title, source, publication year\/month\/day, abstract and authors are read
-- from the hyperdata. All of them are optional there: a missing text field
-- becomes @""@ and a missing date component becomes @0@.
hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
hyperdataDocument2csvDoc h =
  CsvDoc
    (orEmpty $ _hyperdataDocument_title h)
    (orEmpty $ _hyperdataDocument_source h)
    (orZero  $ _hyperdataDocument_publication_year h)
    (orZero  $ _hyperdataDocument_publication_month h)
    (orZero  $ _hyperdataDocument_publication_day h)
    (orEmpty $ _hyperdataDocument_abstract h)
    (orEmpty $ _hyperdataDocument_authors h)
  where
    -- Default for absent textual fields.
    orEmpty = maybe "" identity
    -- Default for absent numeric fields.
    orZero = maybe 0 identity
-- | Decoding options used by the readers of this module: the library
-- defaults ('defaultDecodeOptions') with the field separator overridden by
-- 'delimiter'.
csvDecodeOptions :: DecodeOptions
csvDecodeOptions =
  defaultDecodeOptions
    { decDelimiter = delimiter
    }
-- | Encoding options used by the writers of this module: the library
-- defaults ('defaultEncodeOptions') with the field separator overridden by
-- 'delimiter'.
csvEncodeOptions :: EncodeOptions
csvEncodeOptions =
  defaultEncodeOptions
    { encDelimiter = delimiter
    }
-- Field separator shared by the decode and encode options: the numeric
-- code of the tab character.
delimiter = fromIntegral (ord '\t')
196 ------------------------------------------------------------------------
197 ------------------------------------------------------------------------
198 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO [Text]
199 readCsvOn' fields fp = V.toList
200 <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
204 ------------------------------------------------------------------------
-- | Read a file as a lazy 'BL.ByteString' and decode it with
-- 'readByteStringLazy'.
--
-- The first argument is forwarded untouched to the decoder, where it only
-- serves to select the record type @a@.
readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
readFileLazy f path = readByteStringLazy f <$> BL.readFile path
-- | Read a file as a strict 'BS.ByteString' and decode it with
-- 'readByteStringStrict'.
--
-- The first argument is forwarded untouched to the decoder, where it only
-- serves to select the record type @a@.
readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
readFileStrict f path = readByteStringStrict f <$> BS.readFile path
-- | Decode a lazy 'BL.ByteString' into its header and a vector of named
-- records, using 'csvDecodeOptions'.
--
-- The proxy argument is never inspected; it only fixes the record type @a@.
-- A decoding failure is not returned to the caller: the decoder's error
-- message is passed to 'panic'.
readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a)
readByteStringLazy _ input =
  case decodeByNameWith csvDecodeOptions input of
    Right decoded -> decoded
    Left message  -> panic (pack message)
-- | Strict-input variant of 'readByteStringLazy': the strict bytes are
-- converted with 'BL.fromStrict' and then decoded by the lazy reader.
readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a)
readByteStringStrict ff strictBytes =
  readByteStringLazy ff (BL.fromStrict strictBytes)
220 ------------------------------------------------------------------------
-- | Read a 'CsvDoc' file from disk, returning its header and its rows.
--
-- Delegates to 'readFileLazy' so that file reading, decoding options and
-- error handling are defined in one place only.
readFile :: FilePath -> IO (Header, Vector CsvDoc)
readFile = readFileLazy csvDocWitness
  where
    -- 'readFileLazy' accepts any @proxy a@ value as a type witness; an empty
    -- list is used here so that no additional import is required.
    csvDocWitness :: [CsvDoc]
    csvDocWitness = []
-- | Decode a lazy 'BL.ByteString' into a header and a vector of 'CsvDoc'.
--
-- Delegates to 'readByteStringLazy' instead of repeating the decoder call,
-- so decoding options and failure handling (the decoder's error message is
-- passed to 'panic') stay identical for every reader.
readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
readCsvLazyBS = readByteStringLazy csvDocWitness
  where
    -- 'readByteStringLazy' accepts any @proxy a@ value as a type witness; an
    -- empty list is used here so that no additional import is required.
    csvDocWitness :: [CsvDoc]
    csvDocWitness = []
232 ------------------------------------------------------------------------
-- | Read a 'CsvHal' file from disk, returning its header and its rows.
--
-- Delegates to 'readFileLazy' so that file reading, decoding options and
-- error handling are defined in one place only.
readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
readCsvHal = readFileLazy csvHalWitness
  where
    -- 'readFileLazy' accepts any @proxy a@ value as a type witness; an empty
    -- list is used here so that no additional import is required.
    csvHalWitness :: [CsvHal]
    csvHalWitness = []
-- | Decode a lazy 'BL.ByteString' into a header and a vector of 'CsvHal'.
--
-- Delegates to 'readByteStringLazy' instead of repeating the decoder call,
-- so decoding options and failure handling (the decoder's error message is
-- passed to 'panic') stay identical for every reader.
readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
readCsvHalLazyBS = readByteStringLazy csvHalWitness
  where
    -- 'readByteStringLazy' accepts any @proxy a@ value as a type witness; an
    -- empty list is used here so that no additional import is required.
    csvHalWitness :: [CsvHal]
    csvHalWitness = []
-- | Strict-input variant of 'readCsvHalLazyBS': the strict bytes are
-- converted with 'BL.fromStrict' and then decoded by the lazy reader.
readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
readCsvHalBSStrict strictBytes = readCsvHalLazyBS (BL.fromStrict strictBytes)
247 ------------------------------------------------------------------------
-- | Encode the given rows under the given header with 'csvEncodeOptions'
-- and write the resulting bytes to the given path.
writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeFile path (hdr, docs) = BL.writeFile path encoded
  where
    -- The encoder works on a list, so the vector is converted first.
    encoded = encodeByNameWith csvEncodeOptions hdr (V.toList docs)
-- | Serialise the documents with 'hyperdataDocument2csv' and write the
-- resulting bytes to the given path.
writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Csv fp = BL.writeFile fp . hyperdataDocument2csv
-- | Serialise a list of 'HyperdataDocument' to bytes.
--
-- Each document is first converted with 'hyperdataDocument2csvDoc'; the
-- rows are then encoded under 'headerCsvGargV3' using 'csvEncodeOptions'.
hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
hyperdataDocument2csv =
  encodeByNameWith csvEncodeOptions headerCsvGargV3
    . map hyperdataDocument2csvDoc
258 ------------------------------------------------------------------------
261 { csvHal_title :: !Text
262 , csvHal_source :: !Text
263 , csvHal_publication_year :: !Integer
264 , csvHal_publication_month :: !Int
265 , csvHal_publication_day :: !Int
266 , csvHal_abstract :: !Text
267 , csvHal_authors :: !Text
269 , csvHal_url :: !Text
270 , csvHal_isbn_s :: !Text
271 , csvHal_issue_s :: !Text
272 , csvHal_journalPublisher_s:: !Text
273 , csvHal_language_s :: !Text
275 , csvHal_doiId_s :: !Text
276 , csvHal_authId_i :: !Text
277 , csvHal_instStructId_i :: !Text
278 , csvHal_deptStructId_i :: !Text
279 , csvHal_labStructId_i :: !Text
281 , csvHal_rteamStructId_i :: !Text
282 , csvHal_docType_s :: !Text
286 instance FromNamedRecord CsvHal where
287 parseNamedRecord r = CsvHal <$> r .: "title"
289 <*> r .: "publication_year"
290 <*> r .: "publication_month"
291 <*> r .: "publication_day"
298 <*> r .: "journalPublisher_s"
299 <*> r .: "language_s"
303 <*> r .: "instStructId_i"
304 <*> r .: "deptStructId_i"
305 <*> r .: "labStructId_i"
307 <*> r .: "rteamStructId_i"
310 instance ToNamedRecord CsvHal where
311 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
312 namedRecord [ "title" .= t
315 , "publication_year" .= py
316 , "publication_month" .= pm
317 , "publication_day" .= pd
325 , "journalPublisher_s" .= j
326 , "language_s" .= lang
330 , "instStructId_i" .= inst
331 , "deptStructId_i" .= dept
332 , "labStructId_i" .= lab
334 , "rteamStructId_i" .= team
335 , "docType_s" .= doct
338 csvHal2doc :: CsvHal -> HyperdataDocument
339 csvHal2doc (CsvHal title source
340 pub_year pub_month pub_day
344 _ _ ) = HyperdataDocument (Just "CsvHal")
355 (Just $ pack . show $ jour pub_year pub_month pub_day)
356 (Just $ fromIntegral pub_year)
365 csv2doc :: CsvDoc -> HyperdataDocument
366 csv2doc (CsvDoc title source
367 pub_year pub_month pub_day
368 abstract authors ) = HyperdataDocument (Just "CsvHal")
379 (Just $ pack . show $ jour (fromIntegral pub_year) pub_month pub_day)
380 (Just $ fromIntegral pub_year)
388 ------------------------------------------------------------------------
-- | Read a 'CsvHal' file with 'readCsvHal' and convert every row to a
-- 'HyperdataDocument' with 'csvHal2doc'. The header is discarded.
parseHal :: FilePath -> IO [HyperdataDocument]
parseHal fp = V.toList . V.map csvHal2doc . snd <$> readCsvHal fp
-- | In-memory variant of 'parseHal': decode the bytes with
-- 'readCsvHalLazyBS' and convert every row with 'csvHal2doc'. The header is
-- discarded.
parseHal' :: BL.ByteString -> [HyperdataDocument]
parseHal' bs = V.toList (V.map csvHal2doc rows)
  where
    rows = snd (readCsvHalLazyBS bs)
395 ------------------------------------------------------------------------
-- | Read a 'CsvDoc' file with 'readFile' and convert every row to a
-- 'HyperdataDocument' with 'csv2doc'. The header is discarded.
parseCsv :: FilePath -> IO [HyperdataDocument]
parseCsv fp = V.toList . V.map csv2doc . snd <$> readFile fp
-- | In-memory variant of 'parseCsv': decode the bytes with 'readCsvLazyBS'
-- and convert every row with 'csv2doc'. The header is discarded.
parseCsv' :: BL.ByteString -> [HyperdataDocument]
parseCsv' = V.toList . V.map csv2doc . snd . readCsvLazyBS