2 Module : Gargantext.Core.Text.Corpus.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
15 module Gargantext.Core.Text.Corpus.Parsers.CSV where
17 import Control.Applicative
18 import qualified Data.ByteString as BS
19 import qualified Data.ByteString.Lazy as BL
20 import Data.Char (ord)
22 import Data.Either (Either(..))
23 import Data.Text (Text, pack, length, intercalate)
24 import Data.Time.Segment (jour)
25 import qualified Data.Vector as V
26 import Data.Vector (Vector)
27 import GHC.IO (FilePath)
28 import GHC.Word (Word8)
30 import qualified Prelude as Prelude
32 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
33 import Gargantext.Prelude hiding (length)
34 import Gargantext.Core.Text
35 import Gargantext.Core.Text.Context
37 ---------------------------------------------------------------
38 headerCsvGargV3 :: Header
48 ---------------------------------------------------------------
49 data CsvGargV3 = CsvGargV3
53 , d_publication_year :: !Int
54 , d_publication_month :: !Int
55 , d_publication_day :: !Int
60 ---------------------------------------------------------------
61 -- | Doc 2 HyperdataDocument
62 toDoc :: CsvGargV3 -> HyperdataDocument
63 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
64 HyperdataDocument (Just "CSV")
65 (Just . pack . show $ did)
84 ---------------------------------------------------------------
85 -- | Types Conversions
86 toDocs :: Vector CsvDoc -> [CsvGargV3]
88 $ V.zipWith (\nId (CsvDoc t s (IntOrDec py) pm pd abst auth)
89 -> CsvGargV3 nId t s py pm pd abst auth )
90 (V.enumFromN 1 (V.length v'')) v''
92 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
93 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
95 ---------------------------------------------------------------
-- | Convert a vector of 'CsvGargV3' rows to 'CsvDoc' rows.
--
-- The first field of each 'CsvGargV3' is discarded and the publication
-- year is wrapped in 'IntOrDec'; every other field is copied unchanged.
fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
fromDocs = V.map withoutId
  where
    withoutId (CsvGargV3 _ title source year month day abstract authors) =
      CsvDoc title source (IntOrDec year) month day abstract authors
101 ---------------------------------------------------------------
102 -- | Split a document in its context
103 -- TODO adapt the size of the paragraph according to the corpus average
104 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
105 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
108 if (mod (round m) docSize) >= 10
116 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
117 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
119 firstDoc = CsvDoc t s py pm pd firstAbstract auth
120 firstAbstract = head' "splitDoc'1" abstracts
122 nextDocs = map (\txt -> CsvDoc
123 (head' "splitDoc'2" $ sentences txt)
125 (unsentences $ tail' "splitDoc'1" $ sentences txt)
127 ) (tail' "splitDoc'2" abstracts)
129 abstracts = (splitBy $ contextSize) abst
131 ---------------------------------------------------------------
132 ---------------------------------------------------------------
-- | Apply 'mean' to the abstract lengths (in characters) of the given
-- documents.
docsSize :: Vector CsvDoc -> Mean
docsSize =
  mean . V.toList . V.map (fromIntegral . length . csv_abstract)
141 ---------------------------------------------------------------
-- | An 'Int' wrapper with its own CSV field instances, so that a column may
-- be parsed either as an integer or as a decimal number (see the
-- 'FromField' instance).
newtype IntOrDec = IntOrDec Int
  deriving (Eq, Read, Show)
-- | Unwrap the 'Int' carried by an 'IntOrDec'.
unIntOrDec :: IntOrDec -> Int
unIntOrDec wrapped =
  case wrapped of
    IntOrDec n -> n
-- | Parse the field as an 'Int' first; if that parser fails, parse it as a
-- 'Double' and keep its floor.
instance FromField IntOrDec where
  parseField s = asInt <|> asFlooredDouble
    where
      asInt           = IntOrDec <$> (parseField s :: Parser Int)
      asFlooredDouble = IntOrDec . Prelude.floor <$> (parseField s :: Parser Double)
-- | Encode an 'IntOrDec' exactly as the 'Int' it wraps.
instance ToField IntOrDec where
  toField = toField . unIntOrDec
155 , csv_source :: !Text
156 , csv_publication_year :: !IntOrDec
157 , csv_publication_month :: !Int
158 , csv_publication_day :: !Int
159 , csv_abstract :: !Text
160 , csv_authors :: !Text
-- | Decode a 'CsvDoc' from a named record.
--
-- Every column is looked up under its lower-case name first and, if that
-- parser fails, under its capitalised spelling.
instance FromNamedRecord CsvDoc where
  parseNamedRecord r =
    CsvDoc
      <$> column "title"             "Title"
      <*> column "source"            "Source"
      <*> column "publication_year"  "Publication Year"
      <*> column "publication_month" "Publication Month"
      <*> column "publication_day"   "Publication Day"
      <*> column "abstract"          "Abstract"
      <*> column "authors"           "Authors"
    where
      -- Try the first column name, falling back to the second one.
      column :: FromField a => BS.ByteString -> BS.ByteString -> Parser a
      column firstName secondName = r .: firstName <|> r .: secondName
173 instance ToNamedRecord CsvDoc where
174 toNamedRecord (CsvDoc t s py pm pd abst aut) =
175 namedRecord [ "title" .= t
177 , "publication_year" .= py
178 , "publication_month" .= pm
179 , "publication_day" .= pd
184 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
185 hyperdataDocument2csvDoc h = CsvDoc (m $ _hd_title h)
187 (IntOrDec $ mI $ _hd_publication_year h)
188 (mI $ _hd_publication_month h)
189 (mI $ _hd_publication_day h)
194 m = maybe "" identity
195 mI = maybe 0 identity
-- | Decoding options used by every reader in this module: the library
-- defaults with the field separator replaced by 'delimiter'.
csvDecodeOptions :: DecodeOptions
csvDecodeOptions =
  defaultDecodeOptions
    { decDelimiter = delimiter
    }
-- | Encoding options used by every writer in this module: the library
-- defaults with the field separator replaced by 'delimiter'.
csvEncodeOptions :: EncodeOptions
csvEncodeOptions =
  defaultEncodeOptions
    { encDelimiter = delimiter
    }
-- Field separator: the numeric code of the tab character.
delimiter = fromIntegral (ord '\t')
206 ------------------------------------------------------------------------
207 ------------------------------------------------------------------------
208 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO (Either Prelude.String [Text])
209 readCsvOn' fields fp = do
212 . V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
215 ------------------------------------------------------------------------
-- | Read the file at the given path as a lazy 'BL.ByteString' and decode it
-- with 'readByteStringLazy'.  The proxy argument is passed straight through.
readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Either Prelude.String (Header, Vector a))
readFileLazy f path = readByteStringLazy f <$> BL.readFile path
-- | Read the file at the given path as a strict 'BS.ByteString' and decode
-- it with 'readByteStringStrict'.  The proxy argument is passed straight
-- through.
readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Either Prelude.String (Header, Vector a))
readFileStrict f path = readByteStringStrict f <$> BS.readFile path
-- | Decode a lazy 'BL.ByteString' as a CSV with a header line, using
-- 'csvDecodeOptions'.  The proxy value itself is ignored; it only serves to
-- pick the record type @a@ at the call site.
readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> Either Prelude.String (Header, Vector a)
readByteStringLazy _proxy = decodeByNameWith csvDecodeOptions
-- | Strict-'BS.ByteString' variant of 'readByteStringLazy': the input is
-- converted with 'BL.fromStrict' and then decoded by 'readByteStringLazy'.
readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> Either Prelude.String (Header, Vector a)
readByteStringStrict ff strictBytes =
  readByteStringLazy ff (BL.fromStrict strictBytes)
229 ------------------------------------------------------------------------
-- | Read the file at the given path as a lazy 'BL.ByteString' and decode it
-- into 'CsvDoc' rows with 'readCsvLazyBS'.
--
-- TODO use readFileLazy
readFile :: FilePath -> IO (Either Prelude.String (Header, Vector CsvDoc))
readFile path = readCsvLazyBS <$> BL.readFile path
-- | Decode a lazy 'BL.ByteString' into a header and 'CsvDoc' rows, using
-- 'csvDecodeOptions'.
--
-- TODO use readByteStringLazy
readCsvLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvDoc)
readCsvLazyBS = decodeByNameWith csvDecodeOptions
239 ------------------------------------------------------------------------
-- | Read the file at the given path as a lazy 'BL.ByteString' and decode it
-- into 'CsvHal' rows with 'readCsvHalLazyBS'.
--
-- TODO use readFileLazy
readCsvHal :: FilePath -> IO (Either Prelude.String (Header, Vector CsvHal))
readCsvHal path = readCsvHalLazyBS <$> BL.readFile path
-- | Decode a lazy 'BL.ByteString' into a header and 'CsvHal' rows, using
-- 'csvDecodeOptions'.
--
-- TODO use readByteStringLazy
readCsvHalLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvHal)
readCsvHalLazyBS = decodeByNameWith csvDecodeOptions
-- | Strict-'BS.ByteString' variant of 'readCsvHalLazyBS': the input is
-- converted with 'BL.fromStrict' before decoding.
readCsvHalBSStrict :: BS.ByteString -> Either Prelude.String (Header, Vector CsvHal)
readCsvHalBSStrict strictBytes = readCsvHalLazyBS (BL.fromStrict strictBytes)
251 ------------------------------------------------------------------------
-- | Encode the given 'CsvDoc' rows under the given header with
-- 'csvEncodeOptions' and write the result to the file at the given path.
writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeFile fp (hdr, rows) = BL.writeFile fp encoded
  where
    encoded = encodeByNameWith csvEncodeOptions hdr (V.toList rows)
-- | Serialise the documents with 'hyperdataDocument2csv' and write the
-- resulting bytes to the file at the given path.
writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Csv fp = BL.writeFile fp . hyperdataDocument2csv
-- | Convert each document with 'hyperdataDocument2csvDoc' and encode the
-- rows under 'headerCsvGargV3', using 'csvEncodeOptions'.
hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
hyperdataDocument2csv =
  encodeByNameWith csvEncodeOptions headerCsvGargV3
    . map hyperdataDocument2csvDoc
262 ------------------------------------------------------------------------
265 { csvHal_title :: !Text
266 , csvHal_source :: !Text
267 , csvHal_publication_year :: !Integer
268 , csvHal_publication_month :: !Int
269 , csvHal_publication_day :: !Int
270 , csvHal_abstract :: !Text
271 , csvHal_authors :: !Text
273 , csvHal_url :: !Text
274 , csvHal_isbn_s :: !Text
275 , csvHal_issue_s :: !Text
276 , csvHal_journalPublisher_s:: !Text
277 , csvHal_language_s :: !Text
279 , csvHal_doiId_s :: !Text
280 , csvHal_authId_i :: !Text
281 , csvHal_instStructId_i :: !Text
282 , csvHal_deptStructId_i :: !Text
283 , csvHal_labStructId_i :: !Text
285 , csvHal_rteamStructId_i :: !Text
286 , csvHal_docType_s :: !Text
290 instance FromNamedRecord CsvHal where
291 parseNamedRecord r = CsvHal <$> r .: "title"
293 <*> r .: "publication_year"
294 <*> r .: "publication_month"
295 <*> r .: "publication_day"
302 <*> r .: "journalPublisher_s"
303 <*> r .: "language_s"
307 <*> r .: "instStructId_i"
308 <*> r .: "deptStructId_i"
309 <*> r .: "labStructId_i"
311 <*> r .: "rteamStructId_i"
314 instance ToNamedRecord CsvHal where
315 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
316 namedRecord [ "title" .= t
319 , "publication_year" .= py
320 , "publication_month" .= pm
321 , "publication_day" .= pd
329 , "journalPublisher_s" .= j
330 , "language_s" .= lang
334 , "instStructId_i" .= inst
335 , "deptStructId_i" .= dept
336 , "labStructId_i" .= lab
338 , "rteamStructId_i" .= team
339 , "docType_s" .= doct
342 csvHal2doc :: CsvHal -> HyperdataDocument
343 csvHal2doc (CsvHal title source
344 pub_year pub_month pub_day
348 _ _ ) = HyperdataDocument (Just "CsvHal")
359 (Just $ pack . show $ jour pub_year pub_month pub_day)
360 (Just $ fromIntegral pub_year)
369 csv2doc :: CsvDoc -> HyperdataDocument
370 csv2doc (CsvDoc title source
371 (IntOrDec pub_year) pub_month pub_day
372 abstract authors ) = HyperdataDocument (Just "CsvHal")
383 (Just $ pack . show $ jour (fromIntegral pub_year) pub_month pub_day)
384 (Just $ fromIntegral pub_year)
392 ------------------------------------------------------------------------
393 parseHal :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
396 pure $ (V.toList . V.map csvHal2doc . snd) <$> r
-- | Decode a lazy 'BL.ByteString' with 'readCsvHalLazyBS', drop the header
-- and convert every row with 'csvHal2doc'.  A decoding error is returned
-- unchanged in the 'Left' case.
parseHal' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
parseHal' bs = fmap rowsToDocs (readCsvHalLazyBS bs)
  where
    rowsToDocs = V.toList . V.map csvHal2doc . snd
401 ------------------------------------------------------------------------
402 parseCsv :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
405 pure $ (V.toList . V.map csv2doc . snd) <$> r
-- | Decode a lazy 'BL.ByteString' with 'readCsvLazyBS', drop the header and
-- convert every row with 'csv2doc'.  A decoding error is returned unchanged
-- in the 'Left' case.
parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
parseCsv' bs = fmap rowsToDocs (readCsvLazyBS bs)
  where
    rowsToDocs = V.toList . V.map csv2doc . snd
410 ------------------------------------------------------------------------
411 -- Csv v3 weighted for phylo
414 { csv'_title :: !Text
415 , csv'_source :: !Text
416 , csv'_publication_year :: !Int
417 , csv'_publication_month :: !Int
418 , csv'_publication_day :: !Int
419 , csv'_abstract :: !Text
420 , csv'_authors :: !Text
421 , csv'_weight :: !Double } deriving (Show)
424 instance FromNamedRecord Csv' where
425 parseNamedRecord r = Csv' <$> r .: "title"
427 <*> r .: "publication_year"
428 <*> r .: "publication_month"
429 <*> r .: "publication_day"
434 readWeightedCsv :: FilePath -> IO (Header, Vector Csv')
437 case decodeByNameWith csvDecodeOptions bs of
438 Left e -> panic (pack e)
439 Right corpus -> corpus