src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Corpus.Parsers.CSV
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 CSV parser for Gargantext corpus files.
  11
  12 -}
  13
  14
  15 module Gargantext.Core.Text.Corpus.Parsers.CSV where
  16
  17 import Control.Applicative
  18 import qualified Data.ByteString      as BS
  19 import qualified Data.ByteString.Lazy as BL
  20 import Data.Char (ord)
  21 import Data.Csv
  22 import Data.Either (Either(..))
  23 import Data.Text (Text, pack, length, intercalate)
  24 import Data.Time.Segment (jour)
  25 import qualified Data.Vector          as V
  26 import Data.Vector (Vector)
  27 import GHC.IO (FilePath)
  28 import GHC.Word (Word8)
  29
  30 import qualified Prelude as Prelude
  31
  32 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
  33 import Gargantext.Prelude hiding (length)
  34 import Gargantext.Core.Text
  35 import Gargantext.Core.Text.Context
  36
  37 ---------------------------------------------------------------
  38 headerCsvGargV3 :: Header
  39 headerCsvGargV3 =
  40   header [ "title"
  41          , "source"
  42          , "publication_year"
  43          , "publication_month"
  44          , "publication_day"
  45          , "abstract"
  46          , "authors"
  47          ]
  48 ---------------------------------------------------------------
  49 data CsvGargV3 = CsvGargV3
  50     { d_docId             :: !Int
  51     , d_title             :: !Text
  52     , d_source            :: !Text
  53     , d_publication_year  :: !Int
  54     , d_publication_month :: !Int
  55     , d_publication_day   :: !Int
  56     , d_abstract          :: !Text
  57     , d_authors           :: !Text
  58     }
  59     deriving (Show)
  60 ---------------------------------------------------------------
  61 -- | Doc 2 HyperdataDocument
  62 toDoc :: CsvGargV3 -> HyperdataDocument
  63 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
  64   HyperdataDocument (Just "CSV")
  65                     (Just . pack . show $ did)
  66                     Nothing
  67                     Nothing
  68                     Nothing
  69                     Nothing
  70                     (Just dt)
  71                     Nothing
  72                     (Just dau)
  73                     (Just dab)
  74                     (Nothing)
  75                     Nothing
  76                     (Just dpy)
  77                     (Just dpm)
  78                     (Just dpd)
  79                     Nothing
  80                     Nothing
  81                     Nothing
  82                     Nothing
  83
  84 ---------------------------------------------------------------
  85 -- | Types Conversions
  86 toDocs :: Vector CsvDoc -> [CsvGargV3]
  87 toDocs v = V.toList
  88          $ V.zipWith (\nId (CsvDoc t s (IntOrDec py) pm pd abst auth)
  89                        -> CsvGargV3 nId t s py pm pd abst auth )
  90                        (V.enumFromN 1 (V.length v'')) v''
  91           where
  92             v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
  93             seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
  94
  95 ---------------------------------------------------------------
  96 fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
  97 fromDocs docs = V.map fromDocs' docs
  98   where
  99     fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s (IntOrDec py) pm pd abst auth)
 100
 101 ---------------------------------------------------------------
 102 -- | Split a document in its context
 103 -- TODO adapt the size of the paragraph according to the corpus average
 104 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
 105 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
 106                           if docSize > 1000
 107                             then
 108                               if (mod (round m) docSize) >= 10
 109                                 then
 110                                   splitDoc' splt doc
 111                                 else
 112                                   V.fromList [doc]
 113                             else
 114                               V.fromList [doc]
 115   where
 116     splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
 117     splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
 118         where
 119           firstDoc = CsvDoc t s py pm pd firstAbstract auth
 120           firstAbstract = head' "splitDoc'1" abstracts
 121
 122           nextDocs = map (\txt -> CsvDoc
 123                                     (head' "splitDoc'2" $ sentences txt)
 124                                     s py pm pd
 125                                     (unsentences $ tail' "splitDoc'1" $ sentences txt)
 126                                     auth
 127                           ) (tail' "splitDoc'2" abstracts)
 128
 129           abstracts    = (splitBy $ contextSize) abst
 130
 131 ---------------------------------------------------------------
 132 ---------------------------------------------------------------
 133 type Mean = Double
 134
 135 docsSize :: Vector CsvDoc -> Mean
 136 docsSize csvDoc = mean ls
 137   where
 138     ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
 139
 140
 141 ---------------------------------------------------------------
 142 newtype IntOrDec = IntOrDec Int
 143   deriving (Show, Eq, Read)
 144 unIntOrDec :: IntOrDec -> Int
 145 unIntOrDec (IntOrDec i) = i
 146 instance FromField IntOrDec where
 147   parseField s = case runParser (parseField s :: Parser Int) of
 148     Left _err -> IntOrDec <$> Prelude.floor <$> (parseField s :: Parser Double)
 149     Right n   -> pure $ IntOrDec n
 150 instance ToField IntOrDec where
 151   toField (IntOrDec i) = toField i
 152
 153 data CsvDoc = CsvDoc
 154     { csv_title  :: !Text
 155     , csv_source :: !Text
 156     , csv_publication_year  :: !IntOrDec
 157     , csv_publication_month :: !Int
 158     , csv_publication_day   :: !Int
 159     , csv_abstract          :: !Text
 160     , csv_authors           :: !Text
 161     }
 162     deriving (Show)
 163
 164 instance FromNamedRecord CsvDoc where
 165   parseNamedRecord r = CsvDoc <$> (r .: "title" <|> r .: "Title")
 166                               <*> (r .: "source" <|> r .: "Source")
 167                               <*> (r .: "publication_year" <|> r .: "Publication Year")
 168                               <*> (r .: "publication_month" <|> r .: "Publication Month")
 169                               <*> (r .: "publication_day" <|> r .: "Publication Day")
 170                               <*> (r .: "abstract" <|> r .: "Abstract")
 171                               <*> (r .: "authors" <|> r .: "Authors")
 172
 173 instance ToNamedRecord CsvDoc where
 174   toNamedRecord (CsvDoc t s py pm pd abst aut) =
 175     namedRecord [ "title"  .= t
 176                 , "source" .= s
 177                 , "publication_year"  .= py
 178                 , "publication_month" .= pm
 179                 , "publication_day"   .= pd
 180                 , "abstract"          .= abst
 181                 , "authors"           .= aut
 182                ]
 183
 184 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
 185 hyperdataDocument2csvDoc h = CsvDoc (m  $ _hd_title h)
 186                                     (m  $ _hd_source h)
 187                                     (IntOrDec $ mI $ _hd_publication_year h)
 188                                     (mI $ _hd_publication_month h)
 189                                     (mI $ _hd_publication_day   h)
 190                                     (m  $ _hd_abstract h)
 191                                     (m  $ _hd_authors h)
 192
 193   where
 194     m = maybe "" identity
 195     mI = maybe 0 identity
 196
 197
 198 csvDecodeOptions :: DecodeOptions
 199 csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
 200
 201 csvEncodeOptions :: EncodeOptions
 202 csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
 203
 204 delimiter :: Word8
 205 delimiter = fromIntegral $ ord '\t'
 206 ------------------------------------------------------------------------
 207 ------------------------------------------------------------------------
 208 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO (Either Prelude.String [Text])
 209 readCsvOn' fields fp = do
 210   r <- readFile fp
 211   pure $ ( V.toList
 212           . V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
 213           . snd ) <$> r
 214
 215 ------------------------------------------------------------------------
 216
 217 readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Either Prelude.String (Header, Vector a))
 218 readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
 219
 220 readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Either Prelude.String (Header, Vector a))
 221 readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
 222
 223 readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> Either Prelude.String (Header, Vector a)
 224 readByteStringLazy _f bs = decodeByNameWith csvDecodeOptions bs
 225
 226 readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> Either Prelude.String (Header, Vector a)
 227 readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
 228
 229 ------------------------------------------------------------------------
 230 -- | TODO use readFileLazy
 231 readFile :: FilePath -> IO (Either Prelude.String (Header, Vector CsvDoc))
 232 readFile = fmap readCsvLazyBS . BL.readFile
 233
 234
 235 -- | TODO use readByteStringLazy
 236 readCsvLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvDoc)
 237 readCsvLazyBS bs = decodeByNameWith csvDecodeOptions bs
 238
 239 ------------------------------------------------------------------------
 240 -- | TODO use readFileLazy
 241 readCsvHal :: FilePath -> IO (Either Prelude.String (Header, Vector CsvHal))
 242 readCsvHal = fmap readCsvHalLazyBS . BL.readFile
 243
 244 -- | TODO use readByteStringLazy
 245 readCsvHalLazyBS :: BL.ByteString -> Either Prelude.String (Header, Vector CsvHal)
 246 readCsvHalLazyBS bs = decodeByNameWith csvDecodeOptions bs
 247
 248 readCsvHalBSStrict :: BS.ByteString -> Either Prelude.String (Header, Vector CsvHal)
 249 readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
 250
 251 ------------------------------------------------------------------------
 252 writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
 253 writeFile fp (h, vs) = BL.writeFile fp $
 254                       encodeByNameWith csvEncodeOptions h (V.toList vs)
 255
 256 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
 257 writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
 258
 259 hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
 260 hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
 261
 262 ------------------------------------------------------------------------
 263 -- Hal Format
 264 data CsvHal = CsvHal
 265     { csvHal_title  :: !Text
 266     , csvHal_source :: !Text
 267     , csvHal_publication_year  :: !Integer
 268     , csvHal_publication_month :: !Int
 269     , csvHal_publication_day   :: !Int
 270     , csvHal_abstract          :: !Text
 271     , csvHal_authors           :: !Text
 272
 273     , csvHal_url               :: !Text
 274     , csvHal_isbn_s            :: !Text
 275     , csvHal_issue_s           :: !Text
 276     , csvHal_journalPublisher_s:: !Text
 277     , csvHal_language_s        :: !Text
 278
 279     , csvHal_doiId_s           :: !Text
 280     , csvHal_authId_i          :: !Text
 281     , csvHal_instStructId_i    :: !Text
 282     , csvHal_deptStructId_i    :: !Text
 283     , csvHal_labStructId_i     :: !Text
 284
 285     , csvHal_rteamStructId_i   :: !Text
 286     , csvHal_docType_s         :: !Text
 287     }
 288     deriving (Show)
 289
 290 instance FromNamedRecord CsvHal where
 291   parseNamedRecord r = CsvHal <$> r .: "title"
 292                               <*> r .: "source"
 293                               <*> r .: "publication_year"
 294                               <*> r .: "publication_month"
 295                               <*> r .: "publication_day"
 296                               <*> r .: "abstract"
 297                               <*> r .: "authors"
 298
 299                               <*> r .: "url"
 300                               <*> r .: "isbn_s"
 301                               <*> r .: "issue_s"
 302                               <*> r .: "journalPublisher_s"
 303                               <*> r .: "language_s"
 304
 305                               <*> r .: "doiId_s"
 306                               <*> r .: "authId_i"
 307                               <*> r .: "instStructId_i"
 308                               <*> r .: "deptStructId_i"
 309                               <*> r .: "labStructId_i"
 310
 311                               <*> r .: "rteamStructId_i"
 312                               <*> r .: "docType_s"
 313
 314 instance ToNamedRecord CsvHal where
 315   toNamedRecord (CsvHal t s py  pm pd abst aut  url isbn iss j lang  doi auth inst dept lab team doct) =
 316     namedRecord [ "title"  .= t
 317                 , "source" .= s
 318
 319                 , "publication_year"  .= py
 320                 , "publication_month" .= pm
 321                 , "publication_day"   .= pd
 322
 323                 , "abstract"          .= abst
 324                 , "authors"           .= aut
 325
 326                 , "url"                .= url
 327                 , "isbn_s"             .= isbn
 328                 , "issue_s"            .= iss
 329                 , "journalPublisher_s" .= j
 330                 , "language_s"         .= lang
 331
 332                 , "doiId_s"            .= doi
 333                 , "authId_i"           .= auth
 334                 , "instStructId_i"     .= inst
 335                 , "deptStructId_i"     .= dept
 336                 , "labStructId_i"      .= lab
 337
 338                 , "rteamStructId_i"    .= team
 339                 , "docType_s"          .= doct
 340                ]
 341
 342 csvHal2doc :: CsvHal -> HyperdataDocument
 343 csvHal2doc (CsvHal title source
 344        pub_year pub_month pub_day
 345        abstract authors
 346        url _ _ _ _
 347        doi _ inst _ _
 348        _ _ ) = HyperdataDocument (Just "CsvHal")
 349                                (Just doi)
 350                                (Just url)
 351                                Nothing
 352                                Nothing
 353                                Nothing
 354                                (Just title)
 355                                (Just authors)
 356                                (Just inst)
 357                                (Just source)
 358                                (Just abstract)
 359                                (Just $ pack . show $ jour pub_year pub_month pub_day)
 360                                (Just $ fromIntegral pub_year)
 361                                (Just pub_month)
 362                                (Just pub_day)
 363                                Nothing
 364                                Nothing
 365                                Nothing
 366                                Nothing
 367
 368
 369 csv2doc :: CsvDoc -> HyperdataDocument
 370 csv2doc (CsvDoc title source
 371        (IntOrDec pub_year) pub_month pub_day
 372        abstract authors ) = HyperdataDocument (Just "CsvHal")
 373                                Nothing
 374                                Nothing
 375                                Nothing
 376                                Nothing
 377                                Nothing
 378                                (Just title)
 379                                (Just authors)
 380                                Nothing
 381                                (Just source)
 382                                (Just abstract)
 383                                (Just $ pack . show $ jour (fromIntegral pub_year) pub_month pub_day)
 384                                (Just $ fromIntegral pub_year)
 385                                (Just pub_month)
 386                                (Just pub_day)
 387                                Nothing
 388                                Nothing
 389                                Nothing
 390                                Nothing
 391
 392 ------------------------------------------------------------------------
 393 parseHal :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
 394 parseHal fp = do
 395   r <- readCsvHal fp
 396   pure $ (V.toList . V.map csvHal2doc . snd) <$> r
 397
 398 parseHal' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
 399 parseHal' bs = (V.toList . V.map csvHal2doc . snd) <$> readCsvHalLazyBS bs
 400
 401 ------------------------------------------------------------------------
 402 parseCsv :: FilePath -> IO (Either Prelude.String [HyperdataDocument])
 403 parseCsv fp = do
 404   r <- readFile fp
 405   pure $ (V.toList . V.map csv2doc . snd) <$> r
 406
 407 parseCsv' :: BL.ByteString -> Either Prelude.String [HyperdataDocument]
 408 parseCsv' bs = (V.toList . V.map csv2doc . snd) <$> readCsvLazyBS bs
 409
 410 ------------------------------------------------------------------------
 411 -- Csv v3 weighted for phylo
 412
 413 data Csv' = Csv'
 414       { csv'_title             :: !Text
 415       , csv'_source            :: !Text
 416       , csv'_publication_year  :: !Int
 417       , csv'_publication_month :: !Int
 418       , csv'_publication_day   :: !Int
 419       , csv'_abstract          :: !Text
 420       , csv'_authors           :: !Text
 421       , csv'_weight            :: !Double } deriving (Show)
 422
 423
 424 instance FromNamedRecord Csv' where
 425   parseNamedRecord r = Csv' <$> r .: "title"
 426                             <*> r .: "source"
 427                             <*> r .: "publication_year"
 428                             <*> r .: "publication_month"
 429                             <*> r .: "publication_day"
 430                             <*> r .: "abstract"
 431                             <*> r .: "authors"
 432                             <*> r .: "weight"
 433
 434 readWeightedCsv :: FilePath -> IO (Header, Vector Csv')
 435 readWeightedCsv fp =
 436   fmap (\bs ->
 437     case decodeByNameWith csvDecodeOptions bs of
 438       Left e       -> panic (pack e)
 439       Right corpus -> corpus
 440     ) $ BL.readFile fp