2 Module : Gargantext.Core.Text.Corpus.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
15 module Gargantext.Core.Text.Corpus.Parsers.CSV where
17 import Control.Applicative
18 import qualified Data.ByteString as BS
19 import qualified Data.ByteString.Lazy as BL
20 import Data.Char (ord)
22 import Data.Either (Either(..))
23 import Data.Maybe (fromMaybe)
24 import Data.Text (Text, pack, length, intercalate)
25 import Data.Time.Segment (jour)
26 import qualified Data.Vector as V
27 import Data.Vector (Vector)
28 import GHC.IO (FilePath)
29 import GHC.Word (Word8)
31 import qualified Prelude as Prelude
33 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
34 import Gargantext.Prelude hiding (length)
35 import Gargantext.Core.Text
36 import Gargantext.Core.Text.Context
38 ---------------------------------------------------------------
39 headerCsvGargV3 :: Header
49 ---------------------------------------------------------------
50 data CsvGargV3 = CsvGargV3
54 , d_publication_year :: !Int
55 , d_publication_month :: !Int
56 , d_publication_day :: !Int
61 ---------------------------------------------------------------
62 -- | Doc 2 HyperdataDocument
63 toDoc :: CsvGargV3 -> HyperdataDocument
64 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
65 HyperdataDocument (Just "CSV")
66 (Just . pack . show $ did)
85 ---------------------------------------------------------------
86 -- | Types Conversions
87 toDocs :: Vector CsvDoc -> [CsvGargV3]
89 $ V.zipWith (\nId (CsvDoc t s mPy pm pd abst auth)
91 (fromMIntOrDec defaultYear mPy) (fromMaybe defaultMonth pm) (fromMaybe defaultDay pd)
93 (V.enumFromN 1 (V.length v'')) v''
95 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
96 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
98 ---------------------------------------------------------------
99 fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
100 fromDocs docs = V.map fromDocs' docs
102 fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s (Just $ IntOrDec py) (Just pm) (Just pd) abst auth)
104 ---------------------------------------------------------------
105 -- | Split a document in its context
106 -- TODO adapt the size of the paragraph according to the corpus average
107 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
108 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
111 if (mod (round m) docSize) >= 10
119 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
120 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
122 firstDoc = CsvDoc t s py pm pd firstAbstract auth
123 firstAbstract = head' "splitDoc'1" abstracts
125 nextDocs = map (\txt -> CsvDoc
126 (head' "splitDoc'2" $ sentences txt)
128 (unsentences $ tail' "splitDoc'1" $ sentences txt)
130 ) (tail' "splitDoc'2" abstracts)
132 abstracts = (splitBy $ contextSize) abst
134 ---------------------------------------------------------------
135 ---------------------------------------------------------------
138 docsSize :: Vector CsvDoc -> Mean
139 docsSize csvDoc = mean ls
141 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
144 ---------------------------------------------------------------
145 newtype IntOrDec = IntOrDec Int
146 deriving (Show, Eq, Read)
147 unIntOrDec :: IntOrDec -> Int
148 unIntOrDec (IntOrDec i) = i
149 instance FromField IntOrDec where
150 parseField s = case runParser (parseField s :: Parser Int) of
151 Left _err -> IntOrDec <$> Prelude.floor <$> (parseField s :: Parser Double)
152 Right n -> pure $ IntOrDec n
153 instance ToField IntOrDec where
154 toField (IntOrDec i) = toField i
156 fromMIntOrDec :: Int -> Maybe IntOrDec -> Int
157 fromMIntOrDec default' mVal = unIntOrDec $ fromMaybe (IntOrDec default') mVal
167 , csv_source :: !Text
168 , csv_publication_year :: !(Maybe IntOrDec)
169 , csv_publication_month :: !(Maybe Int)
170 , csv_publication_day :: !(Maybe Int)
171 , csv_abstract :: !Text
172 , csv_authors :: !Text
176 instance FromNamedRecord CsvDoc where
177 parseNamedRecord r = CsvDoc <$> (r .: "title" <|> r .: "Title")
178 <*> (r .: "source" <|> r .: "Source")
179 <*> (r .: "publication_year" <|> r .: "Publication Year")
180 <*> (r .: "publication_month" <|> r .: "Publication Month")
181 <*> (r .: "publication_day" <|> r .: "Publication Day")
182 <*> (r .: "abstract" <|> r .: "Abstract")
183 <*> (r .: "authors" <|> r .: "Authors")
185 instance ToNamedRecord CsvDoc where
186 toNamedRecord (CsvDoc t s py pm pd abst aut) =
187 namedRecord [ "title" .= t
189 , "publication_year" .= py
190 , "publication_month" .= pm
191 , "publication_day" .= pd
196 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
197 hyperdataDocument2csvDoc h = CsvDoc (m $ _hd_title h)
199 (Just $ IntOrDec $ mI $ _hd_publication_year h)
200 (Just $ mI $ _hd_publication_month h)
201 (Just $ mI $ _hd_publication_day h)
206 m = maybe "" identity
207 mI = maybe 0 identity
210 csvDecodeOptions :: DecodeOptions
211 csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
213 csvEncodeOptions :: EncodeOptions
214 csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
217 delimiter = fromIntegral $ ord '\t'
218 ------------------------------------------------------------------------
219 ------------------------------------------------------------------------
220 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO (Either Prelude.String [Text])
221 readCsvOn' fields fp = do
224 . V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
227 ------------------------------------------------------------------------
229 readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Either Prelude.String (Header, Vector a))
230 readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
232 readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Either Prelude.String (Header, Vector a))
233 readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
235 readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> Either Prelude.String (Header, Vector a)
236 readByteStringLazy _f bs = decodeByNameWith csvDecodeOptions bs
238 readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> Either Prelude.String (Header, Vector a)
239 readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
241 ------------------------------------------------------------------------
242 -- | TODO use readFileLazy
243 readFile :: FilePath -> IO (Either Prelude.String (Header, Vector CsvDoc))
244 readFile = fmap readCsvLazyBS . BL.readFile
247 -- | TODO use readByteStringLazy
248 readCsvLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvDoc)
249 readCsvLazyBS bs = decodeByNameWith csvDecodeOptions bs
251 ------------------------------------------------------------------------
252 -- | TODO use readFileLazy
253 readCsvHal :: FilePath -> IO (Either Prelude.String (Header, Vector CsvHal))
254 readCsvHal = fmap readCsvHalLazyBS . BL.readFile
256 -- | TODO use readByteStringLazy
257 readCsvHalLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvHal)
258 readCsvHalLazyBS bs = decodeByNameWith csvDecodeOptions bs
260 readCsvHalBSStrict :: BS.ByteString -> Either Prelude.String (Header, Vector CsvHal)
261 readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
263 ------------------------------------------------------------------------
264 writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
265 writeFile fp (h, vs) = BL.writeFile fp $
266 encodeByNameWith csvEncodeOptions h (V.toList vs)
268 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
269 writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
271 hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
272 hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
274 ------------------------------------------------------------------------
277 { csvHal_title :: !Text
278 , csvHal_source :: !Text
279 , csvHal_publication_year :: !Integer
280 , csvHal_publication_month :: !Int
281 , csvHal_publication_day :: !Int
282 , csvHal_abstract :: !Text
283 , csvHal_authors :: !Text
285 , csvHal_url :: !Text
286 , csvHal_isbn_s :: !Text
287 , csvHal_issue_s :: !Text
288 , csvHal_journalPublisher_s:: !Text
289 , csvHal_language_s :: !Text
291 , csvHal_doiId_s :: !Text
292 , csvHal_authId_i :: !Text
293 , csvHal_instStructId_i :: !Text
294 , csvHal_deptStructId_i :: !Text
295 , csvHal_labStructId_i :: !Text
297 , csvHal_rteamStructId_i :: !Text
298 , csvHal_docType_s :: !Text
302 instance FromNamedRecord CsvHal where
303 parseNamedRecord r = CsvHal <$> r .: "title"
305 <*> r .: "publication_year"
306 <*> r .: "publication_month"
307 <*> r .: "publication_day"
314 <*> r .: "journalPublisher_s"
315 <*> r .: "language_s"
319 <*> r .: "instStructId_i"
320 <*> r .: "deptStructId_i"
321 <*> r .: "labStructId_i"
323 <*> r .: "rteamStructId_i"
326 instance ToNamedRecord CsvHal where
327 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
328 namedRecord [ "title" .= t
331 , "publication_year" .= py
332 , "publication_month" .= pm
333 , "publication_day" .= pd
341 , "journalPublisher_s" .= j
342 , "language_s" .= lang
346 , "instStructId_i" .= inst
347 , "deptStructId_i" .= dept
348 , "labStructId_i" .= lab
350 , "rteamStructId_i" .= team
351 , "docType_s" .= doct
354 csvHal2doc :: CsvHal -> HyperdataDocument
355 csvHal2doc (CsvHal title source
356 pub_year pub_month pub_day
360 _ _ ) = HyperdataDocument (Just "CsvHal")
371 (Just $ pack . show $ jour pub_year pub_month pub_day)
372 (Just $ fromIntegral pub_year)
381 csv2doc :: CsvDoc -> HyperdataDocument
382 csv2doc (CsvDoc title source
383 mPubYear mPubMonth mPubDay
384 abstract authors ) = HyperdataDocument (Just "CsvHal")
395 (Just $ pack . show $ jour (fromIntegral pubYear) pubMonth pubDay)
404 pubYear = fromMIntOrDec defaultYear mPubYear
405 pubMonth = fromMaybe defaultMonth mPubMonth
406 pubDay = fromMaybe defaultDay mPubDay
408 ------------------------------------------------------------------------
409 parseHal :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
412 pure $ (V.toList . V.map csvHal2doc . snd) <$> r
414 parseHal' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
415 parseHal' bs = (V.toList . V.map csvHal2doc . snd) <$> readCsvHalLazyBS bs
417 ------------------------------------------------------------------------
418 parseCsv :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
421 pure $ (V.toList . V.map csv2doc . snd) <$> r
423 parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
424 parseCsv' bs = (V.toList . V.map csv2doc . snd) <$> readCsvLazyBS bs
426 ------------------------------------------------------------------------
427 -- Csv v3 weighted for phylo
430 { csv'_title :: !Text
431 , csv'_source :: !Text
432 , csv'_publication_year :: !Int
433 , csv'_publication_month :: !Int
434 , csv'_publication_day :: !Int
435 , csv'_abstract :: !Text
436 , csv'_authors :: !Text
437 , csv'_weight :: !Double } deriving (Show)
440 instance FromNamedRecord Csv' where
441 parseNamedRecord r = Csv' <$> r .: "title"
443 <*> r .: "publication_year"
444 <*> r .: "publication_month"
445 <*> r .: "publication_day"
450 readWeightedCsv :: FilePath -> IO (Header, Vector Csv')
453 case decodeByNameWith csvDecodeOptions bs of
454 Left e -> panic (pack e)
455 Right corpus -> corpus