2 Module : Gargantext.Text.Corpus.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
18 module Gargantext.Text.Corpus.Parsers.CSV where
20 import Control.Applicative
21 import Data.Char (ord)
23 import Data.Either (Either(Left, Right))
24 import Data.Text (Text, pack, length, intercalate)
25 import Data.Time.Segment (jour)
26 import Data.Vector (Vector)
27 import GHC.IO (FilePath)
28 import GHC.Real (round)
29 import GHC.Word (Word8)
30 import Gargantext.Database.Types.Node -- (HyperdataDocument(..))
31 import Gargantext.Prelude hiding (length)
32 import Gargantext.Text
33 import Gargantext.Text.Context
34 import qualified Data.ByteString.Lazy as BL
35 import qualified Data.ByteString as BS
36 import qualified Data.Vector as V
38 ---------------------------------------------------------------
39 headerCsvGargV3 :: Header
40 headerCsvGargV3 = header [ "title"
48 ---------------------------------------------------------------
49 data CsvGargV3 = CsvGargV3
53 , d_publication_year :: !Int
54 , d_publication_month :: !Int
55 , d_publication_day :: !Int
60 ---------------------------------------------------------------
61 -- | Doc 2 HyperdataDocument
62 toDoc :: CsvGargV3 -> HyperdataDocument
63 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
64 HyperdataDocument (Just "CSV")
65 (Just . pack . show $ did)
84 ---------------------------------------------------------------
85 -- | Types Conversions
86 toDocs :: Vector CsvDoc -> [CsvGargV3]
88 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
89 -> CsvGargV3 nId t s py pm pd abst auth )
90 (V.enumFromN 1 (V.length v'')) v''
92 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
93 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
95 ---------------------------------------------------------------
-- | Convert numbered 'CsvGargV3' rows back into plain 'CsvDoc' rows.
--
-- The first constructor field of every 'CsvGargV3' is discarded; the other
-- seven fields are handed to 'CsvDoc' unchanged and in the same order.
fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
fromDocs = V.map dropFirstField
  where
    dropFirstField (CsvGargV3 _ title source year month day abstract authors) =
      CsvDoc title source year month day abstract authors
101 ---------------------------------------------------------------
102 -- | Split a document in its context
103 -- TODO adapt the size of the paragraph according to the corpus average
104 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
105 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
108 if (mod (round m) docSize) >= 10
116 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
117 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
119 firstDoc = CsvDoc t s py pm pd firstAbstract auth
120 firstAbstract = head' "splitDoc'1" abstracts
122 nextDocs = map (\txt -> CsvDoc
123 (head' "splitDoc'2" $ sentences txt)
125 (unsentences $ tail' "splitDoc'1" $ sentences txt)
127 ) (tail' "splitDoc'2" abstracts)
129 abstracts = (splitBy $ contextSize) abst
131 ---------------------------------------------------------------
132 ---------------------------------------------------------------
-- | Average abstract size of a collection of documents.
--
-- Every document contributes the 'length' (Data.Text) of its 'csv_abstract';
-- the resulting list of sizes is summarised with 'mean'.
docsSize :: Vector CsvDoc -> Mean
docsSize csvDoc = mean abstractSizes
  where
    abstractSize doc = fromIntegral (length (csv_abstract doc))
    abstractSizes    = V.toList (V.map abstractSize csvDoc)
141 ---------------------------------------------------------------
144 , csv_source :: !Text
145 , csv_publication_year :: !Int
146 , csv_publication_month :: !Int
147 , csv_publication_day :: !Int
148 , csv_abstract :: !Text
149 , csv_authors :: !Text
153 instance FromNamedRecord CsvDoc where
154 parseNamedRecord r = CsvDoc <$> r .: "title"
156 <*> r .: "publication_year"
157 <*> r .: "publication_month"
158 <*> r .: "publication_day"
162 instance ToNamedRecord CsvDoc where
163 toNamedRecord (CsvDoc t s py pm pd abst aut) =
164 namedRecord [ "title" .= t
166 , "publication_year" .= py
167 , "publication_month" .= pm
168 , "publication_day" .= pd
-- | Flatten a 'HyperdataDocument' into a 'CsvDoc'.
--
-- The optional fields of the source become mandatory ones: a missing text
-- field (title, source, abstract, authors) is written as the empty string and
-- a missing date component (year, month, day) as @0@.
hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
hyperdataDocument2csvDoc h =
  CsvDoc
    (textOrEmpty (_hyperdataDocument_title h))
    (textOrEmpty (_hyperdataDocument_source h))
    (numberOrZero (_hyperdataDocument_publication_year h))
    (numberOrZero (_hyperdataDocument_publication_month h))
    (numberOrZero (_hyperdataDocument_publication_day h))
    (textOrEmpty (_hyperdataDocument_abstract h))
    (textOrEmpty (_hyperdataDocument_authors h))
  where
    -- Defaults substituted for 'Nothing'.
    textOrEmpty field  = maybe "" identity field
    numberOrZero field = maybe 0 identity field
-- | Decoding options used by the readers of this module: the library
-- defaults with the field separator replaced by 'delimiter'.
csvDecodeOptions :: DecodeOptions
csvDecodeOptions =
  defaultDecodeOptions
    { decDelimiter = delimiter
    }
-- | Encoding options used by the writers of this module: the library
-- defaults with the field separator replaced by 'delimiter'.
csvEncodeOptions :: EncodeOptions
csvEncodeOptions =
  defaultEncodeOptions
    { encDelimiter = delimiter
    }
-- Field separator shared by the decode and encode options: the numeric code
-- of the TAB character.
delimiter = fromIntegral (ord '\t')
195 ------------------------------------------------------------------------
196 ------------------------------------------------------------------------
197 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO [Text]
198 readCsvOn' fields fp = V.toList
199 <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
203 ------------------------------------------------------------------------
-- | Read a file through the lazy 'BL.readFile' and decode its contents with
-- 'readByteStringLazy'.
--
-- The first argument is forwarded untouched to 'readByteStringLazy', where it
-- only selects the row type @a@.  Decoding errors are reported the way
-- 'readByteStringLazy' reports them (a call to 'panic').
readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
readFileLazy witness path = readByteStringLazy witness <$> BL.readFile path
-- | Read a file through the strict 'BS.readFile' and decode its contents with
-- 'readByteStringStrict'.
--
-- The first argument is forwarded untouched to 'readByteStringStrict', where
-- it only selects the row type @a@.
readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
readFileStrict witness path = readByteStringStrict witness <$> BS.readFile path
-- | Decode a lazy 'BL.ByteString' as a CSV document with a header line,
-- using 'csvDecodeOptions'.
--
-- The first argument is a pure type witness selecting the row type @a@; its
-- value is never inspected.  When decoding fails the function calls 'panic'
-- with the decoder's error message instead of returning an error value.
readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a)
readByteStringLazy _witness input =
  case decodeByNameWith csvDecodeOptions input of
    Right parsed -> parsed
    Left  err    -> panic (pack err)
-- | Strict-'BS.ByteString' entry point: the input is converted with
-- 'BL.fromStrict' and then decoded by 'readByteStringLazy', to which the type
-- witness is passed along unchanged.
readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a)
readByteStringStrict witness strictInput =
  readByteStringLazy witness (BL.fromStrict strictInput)
219 ------------------------------------------------------------------------
-- | Read a file of 'CsvDoc' rows from disk.
--
-- Thin specialisation of 'readFileLazy' to the 'CsvDoc' row type, so that the
-- reading and decoding logic (options, error reporting through 'panic') is
-- defined in one place only.
readFile :: FilePath -> IO (Header, Vector CsvDoc)
readFile = readFileLazy csvDocWitness
  where
    -- 'readFileLazy' accepts any @proxy a@; an empty list of 'CsvDoc' fixes
    -- the row type without requiring an additional import.
    csvDocWitness :: [CsvDoc]
    csvDocWitness = []
-- | Decode a lazy 'BL.ByteString' into a header and its 'CsvDoc' rows.
--
-- Thin specialisation of 'readByteStringLazy' to the 'CsvDoc' row type: the
-- same 'csvDecodeOptions' are used and a decoding failure still ends in
-- 'panic', but the decode-and-report logic is no longer duplicated here.
readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
readCsvLazyBS = readByteStringLazy csvDocWitness
  where
    -- 'readByteStringLazy' accepts any @proxy a@; an empty list of 'CsvDoc'
    -- fixes the row type without requiring an additional import.
    csvDocWitness :: [CsvDoc]
    csvDocWitness = []
231 ------------------------------------------------------------------------
-- | Read a file of 'CsvHal' rows from disk.
--
-- Thin specialisation of 'readFileLazy' to the 'CsvHal' row type, so that the
-- reading and decoding logic (options, error reporting through 'panic') is
-- defined in one place only.
readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
readCsvHal = readFileLazy csvHalWitness
  where
    -- 'readFileLazy' accepts any @proxy a@; an empty list of 'CsvHal' fixes
    -- the row type without requiring an additional import.
    csvHalWitness :: [CsvHal]
    csvHalWitness = []
-- | Decode a lazy 'BL.ByteString' into a header and its 'CsvHal' rows.
--
-- Thin specialisation of 'readByteStringLazy' to the 'CsvHal' row type: the
-- same 'csvDecodeOptions' are used and a decoding failure still ends in
-- 'panic', but the decode-and-report logic is no longer duplicated here.
readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
readCsvHalLazyBS = readByteStringLazy csvHalWitness
  where
    -- 'readByteStringLazy' accepts any @proxy a@; an empty list of 'CsvHal'
    -- fixes the row type without requiring an additional import.
    csvHalWitness :: [CsvHal]
    csvHalWitness = []
-- | Strict-'BS.ByteString' variant of 'readCsvHalLazyBS': the input is
-- converted with 'BL.fromStrict' and decoded by the lazy reader.
readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
readCsvHalBSStrict strictInput = readCsvHalLazyBS (BL.fromStrict strictInput)
246 ------------------------------------------------------------------------
-- | Write a header together with its 'CsvDoc' rows to the given path.
--
-- The rows are turned into a list and encoded by name with
-- 'csvEncodeOptions'; the resulting lazy bytes go to 'BL.writeFile'.
writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeFile path (columns, rows) = BL.writeFile path encoded
  where
    encoded = encodeByNameWith csvEncodeOptions columns (V.toList rows)
-- | Serialise a list of 'HyperdataDocument' with 'hyperdataDocument2csv' and
-- write the resulting bytes to the given path.
writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Csv path = BL.writeFile path . hyperdataDocument2csv
-- | Encode a list of 'HyperdataDocument' as CSV bytes.
--
-- Each document is first converted with 'hyperdataDocument2csvDoc'; the rows
-- are then encoded by name under the 'headerCsvGargV3' header using
-- 'csvEncodeOptions'.
hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
hyperdataDocument2csv hs =
  encodeByNameWith csvEncodeOptions headerCsvGargV3 rows
  where
    rows = map hyperdataDocument2csvDoc hs
257 ------------------------------------------------------------------------
260 { csvHal_title :: !Text
261 , csvHal_source :: !Text
262 , csvHal_publication_year :: !Integer
263 , csvHal_publication_month :: !Int
264 , csvHal_publication_day :: !Int
265 , csvHal_abstract :: !Text
266 , csvHal_authors :: !Text
268 , csvHal_url :: !Text
269 , csvHal_isbn_s :: !Text
270 , csvHal_issue_s :: !Text
271 , csvHal_journalPublisher_s:: !Text
272 , csvHal_language_s :: !Text
274 , csvHal_doiId_s :: !Text
275 , csvHal_authId_i :: !Text
276 , csvHal_instStructId_i :: !Text
277 , csvHal_deptStructId_i :: !Text
278 , csvHal_labStructId_i :: !Text
280 , csvHal_rteamStructId_i :: !Text
281 , csvHal_docType_s :: !Text
285 instance FromNamedRecord CsvHal where
286 parseNamedRecord r = CsvHal <$> r .: "title"
288 <*> r .: "publication_year"
289 <*> r .: "publication_month"
290 <*> r .: "publication_day"
297 <*> r .: "journalPublisher_s"
298 <*> r .: "language_s"
302 <*> r .: "instStructId_i"
303 <*> r .: "deptStructId_i"
304 <*> r .: "labStructId_i"
306 <*> r .: "rteamStructId_i"
309 instance ToNamedRecord CsvHal where
310 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
311 namedRecord [ "title" .= t
314 , "publication_year" .= py
315 , "publication_month" .= pm
316 , "publication_day" .= pd
324 , "journalPublisher_s" .= j
325 , "language_s" .= lang
329 , "instStructId_i" .= inst
330 , "deptStructId_i" .= dept
331 , "labStructId_i" .= lab
333 , "rteamStructId_i" .= team
334 , "docType_s" .= doct
337 csvHal2doc :: CsvHal -> HyperdataDocument
338 csvHal2doc (CsvHal title source
339 pub_year pub_month pub_day
343 _ _ ) = HyperdataDocument (Just "CsvHal")
354 (Just $ pack . show $ jour pub_year pub_month pub_day)
355 (Just $ fromIntegral pub_year)
364 csv2doc :: CsvDoc -> HyperdataDocument
365 csv2doc (CsvDoc title source
366 pub_year pub_month pub_day
367 abstract authors ) = HyperdataDocument (Just "CsvHal")
378 (Just $ pack . show $ jour (fromIntegral pub_year) pub_month pub_day)
379 (Just $ fromIntegral pub_year)
387 ------------------------------------------------------------------------
-- | Read a HAL CSV file with 'readCsvHal' and convert every row to a
-- 'HyperdataDocument' via 'csvHal2doc'.  The header is dropped.
parseHal :: FilePath -> IO [HyperdataDocument]
parseHal fp = rowsToDocuments <$> readCsvHal fp
  where
    rowsToDocuments (_, rows) = V.toList (V.map csvHal2doc rows)
-- | In-memory variant of 'parseHal': decode the bytes with
-- 'readCsvHalLazyBS', drop the header and convert every row with
-- 'csvHal2doc'.
parseHal' :: BL.ByteString -> [HyperdataDocument]
parseHal' bytes = V.toList (V.map csvHal2doc rows)
  where
    rows = snd (readCsvHalLazyBS bytes)
394 ------------------------------------------------------------------------
-- | Read a CSV file with 'readFile' and convert every row to a
-- 'HyperdataDocument' via 'csv2doc'.  The header is dropped.
parseCsv :: FilePath -> IO [HyperdataDocument]
parseCsv fp = fmap (V.toList . V.map csv2doc . snd) (readFile fp)
-- | In-memory variant of 'parseCsv': decode the bytes with 'readCsvLazyBS',
-- drop the header and convert every row with 'csv2doc'.
parseCsv' :: BL.ByteString -> [HyperdataDocument]
parseCsv' = V.toList . V.map csv2doc . snd . readCsvLazyBS