src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Corpus.Parsers.CSV
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 CSV parser for Gargantext corpus files.
  11
  12 -}
  13
  14
  15 module Gargantext.Core.Text.Corpus.Parsers.CSV where
  16
  17 import Control.Applicative
  18 import qualified Data.ByteString      as BS
  19 import qualified Data.ByteString.Lazy as BL
  20 import Data.Char (ord)
  21 import Data.Csv
  22 import Data.Either (Either(..))
  23 import Data.Maybe (fromMaybe)
  24 import Data.Text (Text, pack, length, intercalate)
  25 import Data.Time.Segment (jour)
  26 import qualified Data.Vector          as V
  27 import Data.Vector (Vector)
  28 import GHC.IO (FilePath)
  29 import GHC.Word (Word8)
  30
  31 import qualified Prelude as Prelude
  32
  33 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
  34 import Gargantext.Prelude hiding (length)
  35 import Gargantext.Core.Text
  36 import Gargantext.Core.Text.Context
  37
  38 ---------------------------------------------------------------
  39 headerCsvGargV3 :: Header
  40 headerCsvGargV3 =
  41   header [ "title"
  42          , "source"
  43          , "publication_year"
  44          , "publication_month"
  45          , "publication_day"
  46          , "abstract"
  47          , "authors"
  48          ]
  49 ---------------------------------------------------------------
  50 data CsvGargV3 = CsvGargV3
  51     { d_docId             :: !Int
  52     , d_title             :: !Text
  53     , d_source            :: !Text
  54     , d_publication_year  :: !Int
  55     , d_publication_month :: !Int
  56     , d_publication_day   :: !Int
  57     , d_abstract          :: !Text
  58     , d_authors           :: !Text
  59     }
  60     deriving (Show)
  61 ---------------------------------------------------------------
  62 -- | Doc 2 HyperdataDocument
  63 toDoc :: CsvGargV3 -> HyperdataDocument
  64 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
  65   HyperdataDocument (Just "CSV")
  66                     (Just . pack . show $ did)
  67                     Nothing
  68                     Nothing
  69                     Nothing
  70                     Nothing
  71                     (Just dt)
  72                     Nothing
  73                     (Just dau)
  74                     (Just dab)
  75                     (Nothing)
  76                     Nothing
  77                     (Just dpy)
  78                     (Just dpm)
  79                     (Just dpd)
  80                     Nothing
  81                     Nothing
  82                     Nothing
  83                     Nothing
  84
  85 ---------------------------------------------------------------
  86 -- | Types Conversions
  87 toDocs :: Vector CsvDoc -> [CsvGargV3]
  88 toDocs v = V.toList
  89          $ V.zipWith (\nId (CsvDoc t s mPy pm pd abst auth)
  90                        -> CsvGargV3 nId t s
  91                           (fromMIntOrDec defaultYear mPy) (fromMaybe defaultMonth pm) (fromMaybe defaultDay pd)
  92                           abst auth )
  93                        (V.enumFromN 1 (V.length v'')) v''
  94           where
  95             v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
  96             seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
  97
  98 ---------------------------------------------------------------
  99 fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
 100 fromDocs docs = V.map fromDocs' docs
 101   where
 102     fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s (Just $ IntOrDec py) (Just pm) (Just pd) abst auth)
 103
 104 ---------------------------------------------------------------
 105 -- | Split a document in its context
 106 -- TODO adapt the size of the paragraph according to the corpus average
 107 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
 108 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
 109                           if docSize > 1000
 110                             then
 111                               if (mod (round m) docSize) >= 10
 112                                 then
 113                                   splitDoc' splt doc
 114                                 else
 115                                   V.fromList [doc]
 116                             else
 117                               V.fromList [doc]
 118   where
 119     splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
 120     splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
 121         where
 122           firstDoc = CsvDoc t s py pm pd firstAbstract auth
 123           firstAbstract = head' "splitDoc'1" abstracts
 124
 125           nextDocs = map (\txt -> CsvDoc
 126                                     (head' "splitDoc'2" $ sentences txt)
 127                                     s py pm pd
 128                                     (unsentences $ tail' "splitDoc'1" $ sentences txt)
 129                                     auth
 130                           ) (tail' "splitDoc'2" abstracts)
 131
 132           abstracts    = (splitBy $ contextSize) abst
 133
 134 ---------------------------------------------------------------
 135 ---------------------------------------------------------------
 136 type Mean = Double
 137
 138 docsSize :: Vector CsvDoc -> Mean
 139 docsSize csvDoc = mean ls
 140   where
 141     ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
 142
 143
 144 ---------------------------------------------------------------
 145 newtype IntOrDec = IntOrDec Int
 146   deriving (Show, Eq, Read)
 147 unIntOrDec :: IntOrDec -> Int
 148 unIntOrDec (IntOrDec i) = i
 149 instance FromField IntOrDec where
 150   parseField s = case runParser (parseField s :: Parser Int) of
 151     Left _err -> IntOrDec <$> Prelude.floor <$> (parseField s :: Parser Double)
 152     Right n   -> pure $ IntOrDec n
 153 instance ToField IntOrDec where
 154   toField (IntOrDec i) = toField i
 155
 156 fromMIntOrDec :: Int -> Maybe IntOrDec -> Int
 157 fromMIntOrDec default' mVal = unIntOrDec $ fromMaybe (IntOrDec default') mVal
 158 defaultYear :: Int
 159 defaultYear = 1973
 160 defaultMonth :: Int
 161 defaultMonth = 1
 162 defaultDay :: Int
 163 defaultDay = 1
 164
 165 data CsvDoc = CsvDoc
 166     { csv_title             :: !Text
 167     , csv_source            :: !Text
 168     , csv_publication_year  :: !(Maybe IntOrDec)
 169     , csv_publication_month :: !(Maybe Int)
 170     , csv_publication_day   :: !(Maybe Int)
 171     , csv_abstract          :: !Text
 172     , csv_authors           :: !Text
 173     }
 174     deriving (Show)
 175
 176 instance FromNamedRecord CsvDoc where
 177   parseNamedRecord r = CsvDoc <$> (r .: "title" <|> r .: "Title")
 178                               <*> (r .: "source" <|> r .: "Source")
 179                               <*> (r .: "publication_year" <|> r .: "Publication Year")
 180                               <*> (r .: "publication_month" <|> r .: "Publication Month")
 181                               <*> (r .: "publication_day" <|> r .: "Publication Day")
 182                               <*> (r .: "abstract" <|> r .: "Abstract")
 183                               <*> (r .: "authors" <|> r .: "Authors")
 184
 185 instance ToNamedRecord CsvDoc where
 186   toNamedRecord (CsvDoc t s py pm pd abst aut) =
 187     namedRecord [ "title"             .= t
 188                 , "source"            .= s
 189                 , "publication_year"  .= py
 190                 , "publication_month" .= pm
 191                 , "publication_day"   .= pd
 192                 , "abstract"          .= abst
 193                 , "authors"           .= aut
 194                 ]
 195
 196 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
 197 hyperdataDocument2csvDoc h = CsvDoc (m  $ _hd_title h)
 198                                     (m  $ _hd_source h)
 199                                     (Just $ IntOrDec $ mI $ _hd_publication_year h)
 200                                     (Just $ mI $ _hd_publication_month h)
 201                                     (Just $ mI $ _hd_publication_day   h)
 202                                     (m  $ _hd_abstract h)
 203                                     (m  $ _hd_authors h)
 204
 205   where
 206     m = maybe "" identity
 207     mI = maybe 0 identity
 208
 209
 210 csvDecodeOptions :: DecodeOptions
 211 csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
 212
 213 csvEncodeOptions :: EncodeOptions
 214 csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
 215
 216 delimiter :: Word8
 217 delimiter = fromIntegral $ ord '\t'
 218 ------------------------------------------------------------------------
 219 ------------------------------------------------------------------------
 220 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO (Either Prelude.String [Text])
 221 readCsvOn' fields fp = do
 222   r <- readFile fp
 223   pure $ ( V.toList
 224           . V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
 225           . snd ) <$> r
 226
 227 ------------------------------------------------------------------------
 228
 229 readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Either Prelude.String (Header, Vector a))
 230 readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
 231
 232 readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Either Prelude.String (Header, Vector a))
 233 readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
 234
 235 readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> Either Prelude.String (Header, Vector a)
 236 readByteStringLazy _f bs = decodeByNameWith csvDecodeOptions bs
 237
 238 readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> Either Prelude.String (Header, Vector a)
 239 readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
 240
 241 ------------------------------------------------------------------------
 242 -- | TODO use readFileLazy
 243 readFile :: FilePath -> IO (Either Prelude.String (Header, Vector CsvDoc))
 244 readFile = fmap readCsvLazyBS . BL.readFile
 245
 246
 247 -- | TODO use readByteStringLazy
 248 readCsvLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvDoc)
 249 readCsvLazyBS bs = decodeByNameWith csvDecodeOptions bs
 250
 251 ------------------------------------------------------------------------
 252 -- | TODO use readFileLazy
 253 readCsvHal :: FilePath -> IO (Either Prelude.String (Header, Vector CsvHal))
 254 readCsvHal = fmap readCsvHalLazyBS . BL.readFile
 255
 256 -- | TODO use readByteStringLazy
 257 readCsvHalLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvHal)
 258 readCsvHalLazyBS bs = decodeByNameWith csvDecodeOptions bs
 259
 260 readCsvHalBSStrict :: BS.ByteString -> Either Prelude.String (Header, Vector CsvHal)
 261 readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
 262
 263 ------------------------------------------------------------------------
 264 writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
 265 writeFile fp (h, vs) = BL.writeFile fp $
 266                       encodeByNameWith csvEncodeOptions h (V.toList vs)
 267
 268 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
 269 writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
 270
 271 hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
 272 hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
 273
 274 ------------------------------------------------------------------------
 275 -- Hal Format
 276 data CsvHal = CsvHal
 277     { csvHal_title  :: !Text
 278     , csvHal_source :: !Text
 279     , csvHal_publication_year  :: !Integer
 280     , csvHal_publication_month :: !Int
 281     , csvHal_publication_day   :: !Int
 282     , csvHal_abstract          :: !Text
 283     , csvHal_authors           :: !Text
 284
 285     , csvHal_url               :: !Text
 286     , csvHal_isbn_s            :: !Text
 287     , csvHal_issue_s           :: !Text
 288     , csvHal_journalPublisher_s:: !Text
 289     , csvHal_language_s        :: !Text
 290
 291     , csvHal_doiId_s           :: !Text
 292     , csvHal_authId_i          :: !Text
 293     , csvHal_instStructId_i    :: !Text
 294     , csvHal_deptStructId_i    :: !Text
 295     , csvHal_labStructId_i     :: !Text
 296
 297     , csvHal_rteamStructId_i   :: !Text
 298     , csvHal_docType_s         :: !Text
 299     }
 300     deriving (Show)
 301
 302 instance FromNamedRecord CsvHal where
 303   parseNamedRecord r = CsvHal <$> r .: "title"
 304                               <*> r .: "source"
 305                               <*> r .: "publication_year"
 306                               <*> r .: "publication_month"
 307                               <*> r .: "publication_day"
 308                               <*> r .: "abstract"
 309                               <*> r .: "authors"
 310
 311                               <*> r .: "url"
 312                               <*> r .: "isbn_s"
 313                               <*> r .: "issue_s"
 314                               <*> r .: "journalPublisher_s"
 315                               <*> r .: "language_s"
 316
 317                               <*> r .: "doiId_s"
 318                               <*> r .: "authId_i"
 319                               <*> r .: "instStructId_i"
 320                               <*> r .: "deptStructId_i"
 321                               <*> r .: "labStructId_i"
 322
 323                               <*> r .: "rteamStructId_i"
 324                               <*> r .: "docType_s"
 325
 326 instance ToNamedRecord CsvHal where
 327   toNamedRecord (CsvHal t s py  pm pd abst aut  url isbn iss j lang  doi auth inst dept lab team doct) =
 328     namedRecord [ "title"  .= t
 329                 , "source" .= s
 330
 331                 , "publication_year"  .= py
 332                 , "publication_month" .= pm
 333                 , "publication_day"   .= pd
 334
 335                 , "abstract"          .= abst
 336                 , "authors"           .= aut
 337
 338                 , "url"                .= url
 339                 , "isbn_s"             .= isbn
 340                 , "issue_s"            .= iss
 341                 , "journalPublisher_s" .= j
 342                 , "language_s"         .= lang
 343
 344                 , "doiId_s"            .= doi
 345                 , "authId_i"           .= auth
 346                 , "instStructId_i"     .= inst
 347                 , "deptStructId_i"     .= dept
 348                 , "labStructId_i"      .= lab
 349
 350                 , "rteamStructId_i"    .= team
 351                 , "docType_s"          .= doct
 352                ]
 353
 354 csvHal2doc :: CsvHal -> HyperdataDocument
 355 csvHal2doc (CsvHal title source
 356        pub_year pub_month pub_day
 357        abstract authors
 358        url _ _ _ _
 359        doi _ inst _ _
 360        _ _ ) = HyperdataDocument (Just "CsvHal")
 361                                (Just doi)
 362                                (Just url)
 363                                Nothing
 364                                Nothing
 365                                Nothing
 366                                (Just title)
 367                                (Just authors)
 368                                (Just inst)
 369                                (Just source)
 370                                (Just abstract)
 371                                (Just $ pack . show $ jour pub_year pub_month pub_day)
 372                                (Just $ fromIntegral pub_year)
 373                                (Just pub_month)
 374                                (Just pub_day)
 375                                Nothing
 376                                Nothing
 377                                Nothing
 378                                Nothing
 379
 380
 381 csv2doc :: CsvDoc -> HyperdataDocument
 382 csv2doc (CsvDoc title source
 383        mPubYear mPubMonth mPubDay
 384        abstract authors ) = HyperdataDocument (Just "CsvHal")
 385                                Nothing
 386                                Nothing
 387                                Nothing
 388                                Nothing
 389                                Nothing
 390                                (Just title)
 391                                (Just authors)
 392                                Nothing
 393                                (Just source)
 394                                (Just abstract)
 395                                (Just $ pack . show $ jour (fromIntegral pubYear) pubMonth pubDay)
 396                                (Just pubYear)
 397                                (Just pubMonth)
 398                                (Just pubDay)
 399                                Nothing
 400                                Nothing
 401                                Nothing
 402                                Nothing
 403   where
 404     pubYear = fromMIntOrDec defaultYear mPubYear
 405     pubMonth = fromMaybe defaultMonth mPubMonth
 406     pubDay = fromMaybe defaultDay mPubDay
 407
 408 ------------------------------------------------------------------------
 409 parseHal :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
 410 parseHal fp = do
 411   r <- readCsvHal fp
 412   pure $ (V.toList . V.map csvHal2doc . snd) <$> r
 413
 414 parseHal' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
 415 parseHal' bs = (V.toList . V.map csvHal2doc . snd) <$> readCsvHalLazyBS bs
 416
 417 ------------------------------------------------------------------------
 418 parseCsv :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
 419 parseCsv fp = do
 420   r <- readFile fp
 421   pure $ (V.toList . V.map csv2doc . snd) <$> r
 422
 423 parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
 424 parseCsv' bs = (V.toList . V.map csv2doc . snd) <$> readCsvLazyBS bs
 425
 426 ------------------------------------------------------------------------
 427 -- Csv v3 weighted for phylo
 428
 429 data Csv' = Csv'
 430       { csv'_title             :: !Text
 431       , csv'_source            :: !Text
 432       , csv'_publication_year  :: !Int
 433       , csv'_publication_month :: !Int
 434       , csv'_publication_day   :: !Int
 435       , csv'_abstract          :: !Text
 436       , csv'_authors           :: !Text
 437       , csv'_weight            :: !Double } deriving (Show)
 438
 439
 440 instance FromNamedRecord Csv' where
 441   parseNamedRecord r = Csv' <$> r .: "title"
 442                             <*> r .: "source"
 443                             <*> r .: "publication_year"
 444                             <*> r .: "publication_month"
 445                             <*> r .: "publication_day"
 446                             <*> r .: "abstract"
 447                             <*> r .: "authors"
 448                             <*> r .: "weight"
 449
 450 readWeightedCsv :: FilePath -> IO (Header, Vector Csv')
 451 readWeightedCsv fp =
 452   fmap (\bs ->
 453     case decodeByNameWith csvDecodeOptions bs of
 454       Left e       -> panic (pack e)
 455       Right corpus -> corpus
 456     ) $ BL.readFile fp