2 Module : Gargantext.Text.Corpus.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
15 module Gargantext.Text.Corpus.Parsers.CSV where
17 import Control.Applicative
18 import qualified Data.ByteString as BS
19 import qualified Data.ByteString.Lazy as BL
20 import Data.Char (ord)
22 import Data.Either (Either(Left, Right))
23 import Data.Text (Text, pack, length, intercalate)
24 import Data.Time.Segment (jour)
25 import qualified Data.Vector as V
26 import Data.Vector (Vector)
27 import GHC.IO (FilePath)
28 import GHC.Real (round)
29 import GHC.Word (Word8)
31 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
32 import Gargantext.Prelude hiding (length)
33 import Gargantext.Text
34 import Gargantext.Text.Context
36 ---------------------------------------------------------------
37 headerCsvGargV3 :: Header
38 headerCsvGargV3 = header [ "title"
46 ---------------------------------------------------------------
47 data CsvGargV3 = CsvGargV3
51 , d_publication_year :: !Int
52 , d_publication_month :: !Int
53 , d_publication_day :: !Int
58 ---------------------------------------------------------------
-- | Convert a 'CsvGargV3' document into a 'HyperdataDocument'.
60 toDoc :: CsvGargV3 -> HyperdataDocument
61 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
62 HyperdataDocument (Just "CSV")
63 (Just . pack . show $ did)
82 ---------------------------------------------------------------
83 -- | Types Conversions
84 toDocs :: Vector CsvDoc -> [CsvGargV3]
86 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
87 -> CsvGargV3 nId t s py pm pd abst auth )
88 (V.enumFromN 1 (V.length v'')) v''
90 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
91 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
93 ---------------------------------------------------------------
-- | Turn every 'CsvGargV3' row back into a 'CsvDoc' by discarding its first
-- field and keeping the remaining seven fields in the same order.
fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
fromDocs = V.map stripFirst
  where
    -- Only the leading field of 'CsvGargV3' has no counterpart in 'CsvDoc'.
    stripFirst (CsvGargV3 _ title source year month day abstract authors) =
      CsvDoc title source year month day abstract authors
99 ---------------------------------------------------------------
100 -- | Split a document in its context
101 -- TODO adapt the size of the paragraph according to the corpus average
102 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
103 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
106 if (mod (round m) docSize) >= 10
114 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
115 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
117 firstDoc = CsvDoc t s py pm pd firstAbstract auth
118 firstAbstract = head' "splitDoc'1" abstracts
120 nextDocs = map (\txt -> CsvDoc
121 (head' "splitDoc'2" $ sentences txt)
123 (unsentences $ tail' "splitDoc'1" $ sentences txt)
125 ) (tail' "splitDoc'2" abstracts)
127 abstracts = (splitBy $ contextSize) abst
129 ---------------------------------------------------------------
130 ---------------------------------------------------------------
-- | Mean of the abstract lengths (as given by 'length' on the
-- @csv_abstract@ text) over all documents of the vector.
docsSize :: Vector CsvDoc -> Mean
docsSize = mean . V.toList . V.map abstractSize
  where
    -- Length of one document's abstract, converted for use with 'mean'.
    abstractSize doc = fromIntegral (length (csv_abstract doc))
139 ---------------------------------------------------------------
142 , csv_source :: !Text
143 , csv_publication_year :: !Int
144 , csv_publication_month :: !Int
145 , csv_publication_day :: !Int
146 , csv_abstract :: !Text
147 , csv_authors :: !Text
151 instance FromNamedRecord CsvDoc where
152 parseNamedRecord r = CsvDoc <$> r .: "title"
154 <*> r .: "publication_year"
155 <*> r .: "publication_month"
156 <*> r .: "publication_day"
160 instance ToNamedRecord CsvDoc where
161 toNamedRecord (CsvDoc t s py pm pd abst aut) =
162 namedRecord [ "title" .= t
164 , "publication_year" .= py
165 , "publication_month" .= pm
166 , "publication_day" .= pd
-- | Flatten a 'HyperdataDocument' into a 'CsvDoc' row.
--
-- Absent textual fields (title, source, abstract, authors) become the empty
-- string; absent publication year, month or day become @0@.
hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
hyperdataDocument2csvDoc h =
  CsvDoc
    (orEmpty $ _hyperdataDocument_title h)
    (orEmpty $ _hyperdataDocument_source h)
    (orZero  $ _hyperdataDocument_publication_year h)
    (orZero  $ _hyperdataDocument_publication_month h)
    (orZero  $ _hyperdataDocument_publication_day h)
    (orEmpty $ _hyperdataDocument_abstract h)
    (orEmpty $ _hyperdataDocument_authors h)
  where
    -- Defaults substituted for 'Nothing'.
    orEmpty = maybe "" identity
    orZero  = maybe 0 identity
-- | Decoding options used by every reader in this module: the library
-- defaults, with the field delimiter overridden by 'delimiter'.
csvDecodeOptions :: DecodeOptions
csvDecodeOptions =
  defaultDecodeOptions
    { decDelimiter = delimiter
    }
-- | Encoding options used by every writer in this module: the library
-- defaults, with the field delimiter overridden by 'delimiter'.
csvEncodeOptions :: EncodeOptions
csvEncodeOptions =
  defaultEncodeOptions
    { encDelimiter = delimiter
    }
-- Field separator shared by the decode and encode options: the code point of
-- the tab character, converted to the numeric type the options expect.
delimiter = fromIntegral (ord '\t')
193 ------------------------------------------------------------------------
194 ------------------------------------------------------------------------
195 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO [Text]
196 readCsvOn' fields fp = V.toList
197 <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
201 ------------------------------------------------------------------------
-- | Read the file at the given path as a lazy 'BL.ByteString' and decode it
-- with 'readByteStringLazy'.
--
-- The first argument is only a proxy that fixes the record type @a@; it is
-- handed unchanged to 'readByteStringLazy'.
readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
readFileLazy f fp = readByteStringLazy f <$> BL.readFile fp
-- | Read the file at the given path as a strict 'BS.ByteString' and decode
-- it with 'readByteStringStrict'.
--
-- The first argument is only a proxy that fixes the record type @a@.
readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
readFileStrict f fp = readByteStringStrict f <$> BS.readFile fp
-- | Decode a lazy 'BL.ByteString' by column name using 'csvDecodeOptions',
-- returning the header together with the decoded records.
--
-- The proxy argument is never inspected; it only selects the record type.
-- If decoding fails, the decoder's error message is passed to 'panic'.
readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a)
readByteStringLazy _f bs =
  case decodeByNameWith csvDecodeOptions bs of
    Right decoded -> decoded
    Left err      -> panic $ pack err
-- | Strict-'BS.ByteString' variant of 'readByteStringLazy': the input is
-- converted with 'BL.fromStrict' and then decoded by 'readByteStringLazy'.
readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a)
readByteStringStrict ff bs = readByteStringLazy ff (BL.fromStrict bs)
217 ------------------------------------------------------------------------
-- | Read a 'CsvDoc' file: the content is loaded as a lazy 'BL.ByteString'
-- and decoded with 'readCsvLazyBS'.
--
-- TODO: implement on top of 'readFileLazy'.
readFile :: FilePath -> IO (Header, Vector CsvDoc)
readFile fp = readCsvLazyBS <$> BL.readFile fp
-- | Decode a lazy 'BL.ByteString' into its header and 'CsvDoc' records.
--
-- This is 'readByteStringLazy' specialised to 'CsvDoc', so decoding options
-- and failure behaviour (a 'panic' with the decoder's message) are defined in
-- exactly one place instead of being duplicated here.
readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
readCsvLazyBS = readByteStringLazy csvDocProxy
  where
    -- 'readByteStringLazy' accepts any @proxy a@ and ignores its value;
    -- 'Maybe' serves as the proxy so no extra import is required.
    csvDocProxy = Nothing :: Maybe CsvDoc
229 ------------------------------------------------------------------------
-- | Read a 'CsvHal' file: the content is loaded as a lazy 'BL.ByteString'
-- and decoded with 'readCsvHalLazyBS'.
--
-- TODO: implement on top of 'readFileLazy'.
readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
readCsvHal fp = readCsvHalLazyBS <$> BL.readFile fp
-- | Decode a lazy 'BL.ByteString' into its header and 'CsvHal' records.
--
-- This is 'readByteStringLazy' specialised to 'CsvHal', so decoding options
-- and failure behaviour (a 'panic' with the decoder's message) are defined in
-- exactly one place instead of being duplicated here.
readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
readCsvHalLazyBS = readByteStringLazy csvHalProxy
  where
    -- 'readByteStringLazy' accepts any @proxy a@ and ignores its value;
    -- 'Maybe' serves as the proxy so no extra import is required.
    csvHalProxy = Nothing :: Maybe CsvHal
-- | Strict-'BS.ByteString' variant of 'readCsvHalLazyBS': the input is
-- converted with 'BL.fromStrict' before decoding.
readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
readCsvHalBSStrict bs = readCsvHalLazyBS (BL.fromStrict bs)
244 ------------------------------------------------------------------------
-- | Encode the given records by name under the given header, using
-- 'csvEncodeOptions', and write the resulting bytes to the file at @fp@.
writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeFile fp (h, vs) = BL.writeFile fp encoded
  where
    -- The encoder takes a list, so the vector is converted first.
    encoded = encodeByNameWith csvEncodeOptions h $ V.toList vs
-- | Serialise the documents with 'hyperdataDocument2csv' and write the
-- result to the file at @fp@.
writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Csv fp = BL.writeFile fp . hyperdataDocument2csv
-- | Serialise documents to CSV bytes: each document is converted with
-- 'hyperdataDocument2csvDoc', then the rows are encoded by name under
-- 'headerCsvGargV3' using 'csvEncodeOptions'.
hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
hyperdataDocument2csv =
  encodeByNameWith csvEncodeOptions headerCsvGargV3
    . map hyperdataDocument2csvDoc
255 ------------------------------------------------------------------------
258 { csvHal_title :: !Text
259 , csvHal_source :: !Text
260 , csvHal_publication_year :: !Integer
261 , csvHal_publication_month :: !Int
262 , csvHal_publication_day :: !Int
263 , csvHal_abstract :: !Text
264 , csvHal_authors :: !Text
266 , csvHal_url :: !Text
267 , csvHal_isbn_s :: !Text
268 , csvHal_issue_s :: !Text
269 , csvHal_journalPublisher_s:: !Text
270 , csvHal_language_s :: !Text
272 , csvHal_doiId_s :: !Text
273 , csvHal_authId_i :: !Text
274 , csvHal_instStructId_i :: !Text
275 , csvHal_deptStructId_i :: !Text
276 , csvHal_labStructId_i :: !Text
278 , csvHal_rteamStructId_i :: !Text
279 , csvHal_docType_s :: !Text
283 instance FromNamedRecord CsvHal where
284 parseNamedRecord r = CsvHal <$> r .: "title"
286 <*> r .: "publication_year"
287 <*> r .: "publication_month"
288 <*> r .: "publication_day"
295 <*> r .: "journalPublisher_s"
296 <*> r .: "language_s"
300 <*> r .: "instStructId_i"
301 <*> r .: "deptStructId_i"
302 <*> r .: "labStructId_i"
304 <*> r .: "rteamStructId_i"
307 instance ToNamedRecord CsvHal where
308 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
309 namedRecord [ "title" .= t
312 , "publication_year" .= py
313 , "publication_month" .= pm
314 , "publication_day" .= pd
322 , "journalPublisher_s" .= j
323 , "language_s" .= lang
327 , "instStructId_i" .= inst
328 , "deptStructId_i" .= dept
329 , "labStructId_i" .= lab
331 , "rteamStructId_i" .= team
332 , "docType_s" .= doct
335 csvHal2doc :: CsvHal -> HyperdataDocument
336 csvHal2doc (CsvHal title source
337 pub_year pub_month pub_day
341 _ _ ) = HyperdataDocument (Just "CsvHal")
352 (Just $ pack . show $ jour pub_year pub_month pub_day)
353 (Just $ fromIntegral pub_year)
362 csv2doc :: CsvDoc -> HyperdataDocument
363 csv2doc (CsvDoc title source
364 pub_year pub_month pub_day
365 abstract authors ) = HyperdataDocument (Just "CsvHal")
376 (Just $ pack . show $ jour (fromIntegral pub_year) pub_month pub_day)
377 (Just $ fromIntegral pub_year)
385 ------------------------------------------------------------------------
-- | Read a HAL CSV file with 'readCsvHal', drop the header, and convert
-- every row to a 'HyperdataDocument' with 'csvHal2doc'.
parseHal :: FilePath -> IO [HyperdataDocument]
parseHal fp = V.toList . V.map csvHal2doc . snd <$> readCsvHal fp
-- | In-memory counterpart of 'parseHal': decode the bytes with
-- 'readCsvHalLazyBS', drop the header, and convert every row with
-- 'csvHal2doc'.
parseHal' :: BL.ByteString -> [HyperdataDocument]
parseHal' bs = V.toList $ V.map csvHal2doc $ snd $ readCsvHalLazyBS bs
392 ------------------------------------------------------------------------
-- | Read a CSV file with 'readFile', drop the header, and convert every row
-- to a 'HyperdataDocument' with 'csv2doc'.
parseCsv :: FilePath -> IO [HyperdataDocument]
parseCsv fp = fmap rowsToDocs (readFile fp)
  where
    -- Ignore the header and convert the decoded rows.
    rowsToDocs = V.toList . V.map csv2doc . snd
-- | In-memory counterpart of 'parseCsv': decode the bytes with
-- 'readCsvLazyBS', drop the header, and convert every row with 'csv2doc'.
parseCsv' :: BL.ByteString -> [HyperdataDocument]
parseCsv' = V.toList . V.map csv2doc . snd . readCsvLazyBS