2 Module : Gargantext.Core.Text.Corpus.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
15 module Gargantext.Core.Text.Corpus.Parsers.CSV where
17 import Control.Applicative
18 import qualified Data.ByteString as BS
19 import qualified Data.ByteString.Lazy as BL
20 import Data.Char (ord)
22 import Data.Either (Either(Left, Right))
23 import Data.Text (Text, pack, length, intercalate)
24 import Data.Time.Segment (jour)
25 import qualified Data.Vector as V
26 import Data.Vector (Vector)
27 import GHC.IO (FilePath)
28 import GHC.Word (Word8)
30 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
31 import Gargantext.Prelude hiding (length)
32 import Gargantext.Core.Text
33 import Gargantext.Core.Text.Context
35 ---------------------------------------------------------------
36 headerCsvGargV3 :: Header
46 ---------------------------------------------------------------
47 data CsvGargV3 = CsvGargV3
51 , d_publication_year :: !Int
52 , d_publication_month :: !Int
53 , d_publication_day :: !Int
58 ---------------------------------------------------------------
59 -- | Doc 2 HyperdataDocument
60 toDoc :: CsvGargV3 -> HyperdataDocument
61 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
62 HyperdataDocument (Just "CSV")
63 (Just . pack . show $ did)
82 ---------------------------------------------------------------
83 -- | Types Conversions
84 toDocs :: Vector CsvDoc -> [CsvGargV3]
86 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
87 -> CsvGargV3 nId t s py pm pd abst auth )
88 (V.enumFromN 1 (V.length v'')) v''
90 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
91 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
93 ---------------------------------------------------------------
94 fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
95 fromDocs docs = V.map fromDocs' docs
97 fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
99 ---------------------------------------------------------------
100 -- | Split a document in its context
101 -- TODO adapt the size of the paragraph according to the corpus average
102 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
103 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
106 if (mod (round m) docSize) >= 10
114 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
115 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
117 firstDoc = CsvDoc t s py pm pd firstAbstract auth
118 firstAbstract = head' "splitDoc'1" abstracts
120 nextDocs = map (\txt -> CsvDoc
121 (head' "splitDoc'2" $ sentences txt)
123 (unsentences $ tail' "splitDoc'1" $ sentences txt)
125 ) (tail' "splitDoc'2" abstracts)
127 abstracts = (splitBy $ contextSize) abst
129 ---------------------------------------------------------------
130 ---------------------------------------------------------------
133 docsSize :: Vector CsvDoc -> Mean
134 docsSize csvDoc = mean ls
136 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
139 ---------------------------------------------------------------
142 , csv_source :: !Text
143 , csv_publication_year :: !Int
144 , csv_publication_month :: !Int
145 , csv_publication_day :: !Int
146 , csv_abstract :: !Text
147 , csv_authors :: !Text
151 instance FromNamedRecord CsvDoc where
152 parseNamedRecord r = CsvDoc <$> r .: "title"
154 <*> r .: "publication_year"
155 <*> r .: "publication_month"
156 <*> r .: "publication_day"
160 instance ToNamedRecord CsvDoc where
161 toNamedRecord (CsvDoc t s py pm pd abst aut) =
162 namedRecord [ "title" .= t
164 , "publication_year" .= py
165 , "publication_month" .= pm
166 , "publication_day" .= pd
171 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
172 hyperdataDocument2csvDoc h = CsvDoc (m $ _hd_title h)
174 (mI $ _hd_publication_year h)
175 (mI $ _hd_publication_month h)
176 (mI $ _hd_publication_day h)
181 m = maybe "" identity
182 mI = maybe 0 identity
185 csvDecodeOptions :: DecodeOptions
186 csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
188 csvEncodeOptions :: EncodeOptions
189 csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
192 delimiter = fromIntegral $ ord '\t'
193 ------------------------------------------------------------------------
194 ------------------------------------------------------------------------
195 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO [Text]
196 readCsvOn' fields fp = V.toList
197 <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
201 ------------------------------------------------------------------------
203 readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
204 readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
206 readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
207 readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
209 readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a)
210 readByteStringLazy _f bs = case decodeByNameWith csvDecodeOptions bs of
211 Left e -> panic (pack e)
212 Right csvDocs -> csvDocs
214 readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a)
215 readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
217 ------------------------------------------------------------------------
218 -- | TODO use readFileLazy
219 readFile :: FilePath -> IO (Header, Vector CsvDoc)
220 readFile = fmap readCsvLazyBS . BL.readFile
223 -- | TODO use readByteStringLazy
224 readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
225 readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
226 Left e -> panic (pack e)
227 Right csvDocs -> csvDocs
229 ------------------------------------------------------------------------
230 -- | TODO use readFileLazy
231 readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
232 readCsvHal = fmap readCsvHalLazyBS . BL.readFile
234 -- | TODO use readByteStringLazy
235 readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
236 readCsvHalLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
237 Left e -> panic (pack e)
238 Right csvDocs -> csvDocs
240 readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
241 readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
243 ------------------------------------------------------------------------
244 writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
245 writeFile fp (h, vs) = BL.writeFile fp $
246 encodeByNameWith csvEncodeOptions h (V.toList vs)
248 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
249 writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
251 hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
252 hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
254 ------------------------------------------------------------------------
257 { csvHal_title :: !Text
258 , csvHal_source :: !Text
259 , csvHal_publication_year :: !Integer
260 , csvHal_publication_month :: !Int
261 , csvHal_publication_day :: !Int
262 , csvHal_abstract :: !Text
263 , csvHal_authors :: !Text
265 , csvHal_url :: !Text
266 , csvHal_isbn_s :: !Text
267 , csvHal_issue_s :: !Text
268 , csvHal_journalPublisher_s:: !Text
269 , csvHal_language_s :: !Text
271 , csvHal_doiId_s :: !Text
272 , csvHal_authId_i :: !Text
273 , csvHal_instStructId_i :: !Text
274 , csvHal_deptStructId_i :: !Text
275 , csvHal_labStructId_i :: !Text
277 , csvHal_rteamStructId_i :: !Text
278 , csvHal_docType_s :: !Text
282 instance FromNamedRecord CsvHal where
283 parseNamedRecord r = CsvHal <$> r .: "title"
285 <*> r .: "publication_year"
286 <*> r .: "publication_month"
287 <*> r .: "publication_day"
294 <*> r .: "journalPublisher_s"
295 <*> r .: "language_s"
299 <*> r .: "instStructId_i"
300 <*> r .: "deptStructId_i"
301 <*> r .: "labStructId_i"
303 <*> r .: "rteamStructId_i"
306 instance ToNamedRecord CsvHal where
307 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
308 namedRecord [ "title" .= t
311 , "publication_year" .= py
312 , "publication_month" .= pm
313 , "publication_day" .= pd
321 , "journalPublisher_s" .= j
322 , "language_s" .= lang
326 , "instStructId_i" .= inst
327 , "deptStructId_i" .= dept
328 , "labStructId_i" .= lab
330 , "rteamStructId_i" .= team
331 , "docType_s" .= doct
334 csvHal2doc :: CsvHal -> HyperdataDocument
335 csvHal2doc (CsvHal title source
336 pub_year pub_month pub_day
340 _ _ ) = HyperdataDocument (Just "CsvHal")
351 (Just $ pack . show $ jour pub_year pub_month pub_day)
352 (Just $ fromIntegral pub_year)
361 csv2doc :: CsvDoc -> HyperdataDocument
362 csv2doc (CsvDoc title source
363 pub_year pub_month pub_day
364 abstract authors ) = HyperdataDocument (Just "CsvHal")
375 (Just $ pack . show $ jour (fromIntegral pub_year) pub_month pub_day)
376 (Just $ fromIntegral pub_year)
384 ------------------------------------------------------------------------
385 parseHal :: FilePath -> IO [HyperdataDocument]
386 parseHal fp = V.toList <$> V.map csvHal2doc <$> snd <$> readCsvHal fp
388 parseHal' :: BL.ByteString -> [HyperdataDocument]
389 parseHal' = V.toList . V.map csvHal2doc . snd . readCsvHalLazyBS
391 ------------------------------------------------------------------------
392 parseCsv :: FilePath -> IO [HyperdataDocument]
393 parseCsv fp = V.toList <$> V.map csv2doc <$> snd <$> readFile fp
395 parseCsv' :: BL.ByteString -> [HyperdataDocument]
396 parseCsv' bs = V.toList $ V.map csv2doc $ snd $ readCsvLazyBS bs
398 ------------------------------------------------------------------------
399 -- Csv v3 weighted for phylo
402 { csv'_title :: !Text
403 , csv'_source :: !Text
404 , csv'_publication_year :: !Int
405 , csv'_publication_month :: !Int
406 , csv'_publication_day :: !Int
407 , csv'_abstract :: !Text
408 , csv'_authors :: !Text
409 , csv'_weight :: !Double } deriving (Show)
412 instance FromNamedRecord Csv' where
413 parseNamedRecord r = Csv' <$> r .: "title"
415 <*> r .: "publication_year"
416 <*> r .: "publication_month"
417 <*> r .: "publication_day"
422 readWeightedCsv :: FilePath -> IO (Header, Vector Csv')
425 case decodeByNameWith csvDecodeOptions bs of
426 Left e -> panic (pack e)
427 Right corpus -> corpus