2 Module : Gargantext.Text.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
18 module Gargantext.Text.Parsers.CSV where
20 import Control.Applicative
21 import Data.Char (ord)
23 import Data.Either (Either(Left, Right))
24 import Data.Text (Text, pack, length, intercalate)
25 import Data.Time.Segment (jour)
26 import Data.Vector (Vector)
27 import GHC.IO (FilePath)
28 import GHC.Real (round)
29 import GHC.Word (Word8)
30 import Gargantext.Database.Types.Node -- (HyperdataDocument(..))
31 import Gargantext.Prelude hiding (length)
32 import Gargantext.Text
33 import Gargantext.Text.Context
34 import qualified Data.ByteString.Lazy as BL
35 import qualified Data.ByteString as BS
36 import qualified Data.Vector as V
38 ---------------------------------------------------------------
39 headerCsvGargV3 :: Header
40 headerCsvGargV3 = header [ "title"
48 ---------------------------------------------------------------
49 data CsvGargV3 = CsvGargV3
53 , d_publication_year :: !Int
54 , d_publication_month :: !Int
55 , d_publication_day :: !Int
60 ---------------------------------------------------------------
-- | Convert a 'CsvGargV3' row to a 'HyperdataDocument'.
62 toDoc :: CsvGargV3 -> HyperdataDocument
63 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
64 HyperdataDocument (Just "CSV")
65 (Just . pack . show $ did)
84 ---------------------------------------------------------------
85 -- | Types Conversions
86 toDocs :: Vector CsvDoc -> [CsvGargV3]
88 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
89 -> CsvGargV3 nId t s py pm pd abst auth )
90 (V.enumFromN 1 (V.length v'')) v''
92 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
93 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
95 ---------------------------------------------------------------
-- | Turn numbered 'CsvGargV3' rows back into plain 'CsvDoc' rows.
--
-- The first constructor field of each 'CsvGargV3' is discarded; the
-- remaining seven fields are copied over in the same order.
fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
fromDocs = V.map dropFirstField
  where
    dropFirstField (CsvGargV3 _ title source year month day abstract authors) =
      CsvDoc title source year month day abstract authors
101 ---------------------------------------------------------------
-- | Split a document into its contexts.
-- TODO: adapt the paragraph size to the corpus average.
105 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
106 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
109 if (mod (round m) docSize) >= 10
118 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
119 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
121 firstDoc = CsvDoc t s py pm pd firstAbstract auth
122 firstAbstract = head' "splitDoc'1" abstracts
124 nextDocs = map (\txt -> CsvDoc
125 (head' "splitDoc'2" $ sentences txt)
127 (unsentences $ tail' "splitDoc'1" $ sentences txt)
129 ) (tail' "splitDoc'2" abstracts)
131 abstracts = (splitBy $ contextSize) abst
133 ---------------------------------------------------------------
134 ---------------------------------------------------------------
-- | Mean length of the abstracts of the given documents.
--
-- Each abstract is measured with 'Data.Text.length' and converted with
-- 'fromIntegral' before the list of sizes is handed to 'mean'.
docsSize :: Vector CsvDoc -> Mean
docsSize csvDoc = mean abstractSizes
  where
    abstractSize  = fromIntegral . length . csv_abstract
    abstractSizes = V.toList (V.map abstractSize csvDoc)
143 ---------------------------------------------------------------
146 , csv_source :: !Text
147 , csv_publication_year :: !Int
148 , csv_publication_month :: !Int
149 , csv_publication_day :: !Int
150 , csv_abstract :: !Text
151 , csv_authors :: !Text
155 instance FromNamedRecord CsvDoc where
156 parseNamedRecord r = CsvDoc <$> r .: "title"
158 <*> r .: "publication_year"
159 <*> r .: "publication_month"
160 <*> r .: "publication_day"
164 instance ToNamedRecord CsvDoc where
165 toNamedRecord (CsvDoc t s py pm pd abst aut) =
166 namedRecord [ "title" .= t
168 , "publication_year" .= py
169 , "publication_month" .= pm
170 , "publication_day" .= pd
-- | Build a 'CsvDoc' from a 'HyperdataDocument'.
--
-- Every field read here is optional in the source document. A missing
-- title, source, abstract or authors value becomes the empty string, and a
-- missing publication year, month or day becomes @0@.
hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
hyperdataDocument2csvDoc h =
  CsvDoc (orEmpty (_hyperdataDocument_title             h))
         (orEmpty (_hyperdataDocument_source            h))
         (orZero  (_hyperdataDocument_publication_year  h))
         (orZero  (_hyperdataDocument_publication_month h))
         (orZero  (_hyperdataDocument_publication_day   h))
         (orEmpty (_hyperdataDocument_abstract          h))
         (orEmpty (_hyperdataDocument_authors           h))
  where
    -- Defaults substituted for absent ('Nothing') fields.
    orEmpty = maybe "" identity
    orZero  = maybe 0 identity
-- | Options used when decoding: the library defaults, with the field
-- separator replaced by 'delimiter'.
csvDecodeOptions :: DecodeOptions
csvDecodeOptions =
  defaultDecodeOptions
    { decDelimiter = delimiter
    }
-- | Options used when encoding: the library defaults, with the field
-- separator replaced by 'delimiter'.
csvEncodeOptions :: EncodeOptions
csvEncodeOptions =
  defaultEncodeOptions
    { encDelimiter = delimiter
    }
-- Field separator shared by the decode and encode options: the code point
-- of the TAB character, converted to the numeric type the options expect.
delimiter = fromIntegral (ord '\t')
197 ------------------------------------------------------------------------
198 ------------------------------------------------------------------------
199 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
200 readCsvOn fields fp = V.toList
201 <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
205 ------------------------------------------------------------------------
-- | Read the file at the given path as a lazy 'BL.ByteString' and decode it
-- with 'readByteStringLazy'.
--
-- The first argument is only a type-level witness selecting the record
-- type @a@; it is passed on unchanged.
readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
readFileLazy f path = readByteStringLazy f <$> BL.readFile path
-- | Read the file at the given path as a strict 'BS.ByteString' and decode
-- it with 'readByteStringStrict'.
--
-- The first argument is only a type-level witness selecting the record
-- type @a@; it is passed on unchanged.
readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
readFileStrict f path = readByteStringStrict f <$> BS.readFile path
-- | Decode a lazy 'BL.ByteString' into its header and records using
-- 'csvDecodeOptions'.
--
-- The proxy argument is ignored at run time; it only fixes the record
-- type. When decoding fails, the decoder's error message is passed to
-- 'panic'.
readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a)
readByteStringLazy _proxy input =
  case decodeByNameWith csvDecodeOptions input of
    Right decoded -> decoded
    Left  message -> panic (pack message)
-- | Strict-'BS.ByteString' variant of 'readByteStringLazy': the input is
-- converted with 'BL.fromStrict' and decoded by the lazy version.
readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a)
readByteStringStrict ff strictInput =
  readByteStringLazy ff (BL.fromStrict strictInput)
221 ------------------------------------------------------------------------
-- | Read a file of 'CsvDoc' records, returning the header and the rows.
--
-- This is 'readFileLazy' specialised to 'CsvDoc', so the file is read
-- lazily and a decoding failure ends in 'panic' exactly as it does there.
readFile :: FilePath -> IO (Header, Vector CsvDoc)
readFile = readFileLazy csvDocProxy
  where
    -- 'readFileLazy' accepts any @proxy a@; the list type constructor is
    -- used as the proxy because it needs no additional import.
    csvDocProxy = [] :: [CsvDoc]
-- | Decode a lazy 'BL.ByteString' into a header and 'CsvDoc' rows.
--
-- This is 'readByteStringLazy' specialised to 'CsvDoc': decoding uses
-- 'csvDecodeOptions' and a decoding failure ends in 'panic' with the
-- decoder's message.
readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
readCsvLazyBS = readByteStringLazy csvDocProxy
  where
    -- 'readByteStringLazy' accepts any @proxy a@; the list type constructor
    -- is used as the proxy because it needs no additional import.
    csvDocProxy = [] :: [CsvDoc]
233 ------------------------------------------------------------------------
-- | Read a file of 'CsvHal' records, returning the header and the rows.
--
-- This is 'readFileLazy' specialised to 'CsvHal', so the file is read
-- lazily and a decoding failure ends in 'panic' exactly as it does there.
readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
readCsvHal = readFileLazy csvHalProxy
  where
    -- 'readFileLazy' accepts any @proxy a@; the list type constructor is
    -- used as the proxy because it needs no additional import.
    csvHalProxy = [] :: [CsvHal]
-- | Decode a lazy 'BL.ByteString' into a header and 'CsvHal' rows.
--
-- This is 'readByteStringLazy' specialised to 'CsvHal': decoding uses
-- 'csvDecodeOptions' and a decoding failure ends in 'panic' with the
-- decoder's message.
readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
readCsvHalLazyBS = readByteStringLazy csvHalProxy
  where
    -- 'readByteStringLazy' accepts any @proxy a@; the list type constructor
    -- is used as the proxy because it needs no additional import.
    csvHalProxy = [] :: [CsvHal]
-- | Strict-'BS.ByteString' variant of 'readCsvHalLazyBS': the input is
-- converted with 'BL.fromStrict' and decoded by the lazy version.
readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
readCsvHalBSStrict strictInput = readCsvHalLazyBS (BL.fromStrict strictInput)
247 ------------------------------------------------------------------------
-- | Encode a header and its 'CsvDoc' rows with 'csvEncodeOptions' and
-- write the resulting bytes to the given path.
writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeFile fp (hdr, rows) = BL.writeFile fp encoded
  where
    encoded = encodeByNameWith csvEncodeOptions hdr (V.toList rows)
-- | Serialise the documents with 'hyperdataDocument2csv' and write the
-- resulting bytes to the given path.
writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Csv fp = BL.writeFile fp . hyperdataDocument2csv
-- | Encode documents as a lazy 'BL.ByteString'.
--
-- Each document is first converted with 'hyperdataDocument2csvDoc'; the
-- rows are then encoded under 'headerCsvGargV3' using 'csvEncodeOptions'.
hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 rows
  where
    rows = map hyperdataDocument2csvDoc hs
258 ------------------------------------------------------------------------
261 { csvHal_title :: !Text
262 , csvHal_source :: !Text
263 , csvHal_publication_year :: !Integer
264 , csvHal_publication_month :: !Int
265 , csvHal_publication_day :: !Int
266 , csvHal_abstract :: !Text
267 , csvHal_authors :: !Text
269 , csvHal_url :: !Text
270 , csvHal_isbn_s :: !Text
271 , csvHal_issue_s :: !Text
272 , csvHal_journalPublisher_s:: !Text
273 , csvHal_language_s :: !Text
275 , csvHal_doiId_s :: !Text
276 , csvHal_authId_i :: !Text
277 , csvHal_instStructId_i :: !Text
278 , csvHal_deptStructId_i :: !Text
279 , csvHal_labStructId_i :: !Text
281 , csvHal_rteamStructId_i :: !Text
282 , csvHal_docType_s :: !Text
286 instance FromNamedRecord CsvHal where
287 parseNamedRecord r = CsvHal <$> r .: "title"
289 <*> r .: "publication_year"
290 <*> r .: "publication_month"
291 <*> r .: "publication_day"
298 <*> r .: "journalPublisher_s"
299 <*> r .: "language_s"
303 <*> r .: "instStructId_i"
304 <*> r .: "deptStructId_i"
305 <*> r .: "labStructId_i"
307 <*> r .: "rteamStructId_i"
310 instance ToNamedRecord CsvHal where
311 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
312 namedRecord [ "title" .= t
315 , "publication_year" .= py
316 , "publication_month" .= pm
317 , "publication_day" .= pd
325 , "journalPublisher_s" .= j
326 , "language_s" .= lang
330 , "instStructId_i" .= inst
331 , "deptStructId_i" .= dept
332 , "labStructId_i" .= lab
334 , "rteamStructId_i" .= team
335 , "docType_s" .= doct
338 csvHal2doc :: CsvHal -> HyperdataDocument
339 csvHal2doc (CsvHal title source
340 pub_year pub_month pub_day
344 _ _ ) = HyperdataDocument (Just "CsvHal")
355 (Just $ pack . show $ jour pub_year pub_month pub_day)
356 (Just $ fromIntegral pub_year)
364 ------------------------------------------------------------------------
-- | Read a file with 'readCsvHal', drop the header, and convert every
-- record to a 'HyperdataDocument' with 'csvHal2doc'.
parseHal :: FilePath -> IO [HyperdataDocument]
parseHal fp = map csvHal2doc . V.toList . snd <$> readCsvHal fp
367 ------------------------------------------------------------------------