src/Gargantext/Text/Corpus/Parsers/CSV.hs

   1 {-|
   2 Module      : Gargantext.Text.Corpus.Parsers.CSV
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 CSV parser for Gargantext corpus files.
  11
  12 -}
  13
  14 {-# LANGUAGE NoImplicitPrelude #-}
  15 {-# LANGUAGE OverloadedStrings #-}
  16 {-# LANGUAGE DeriveGeneric     #-}
  17
  18 module Gargantext.Text.Corpus.Parsers.CSV where
  19
  20 import Control.Applicative
  21 import Data.Char (ord)
  22 import Data.Csv
  23 import Data.Either (Either(Left, Right))
  24 import Data.Text (Text, pack, length, intercalate)
  25 import Data.Time.Segment (jour)
  26 import Data.Vector (Vector)
  27 import GHC.IO (FilePath)
  28 import GHC.Real (round)
  29 import GHC.Word (Word8)
  30 import Gargantext.Database.Admin.Types.Node -- (HyperdataDocument(..))
  31 import Gargantext.Prelude hiding (length)
  32 import Gargantext.Text
  33 import Gargantext.Text.Context
  34 import qualified Data.ByteString      as BS
  35 import qualified Data.ByteString.Lazy as BL
  36 import qualified Data.Vector          as V
  37
  38 ---------------------------------------------------------------
  39 headerCsvGargV3 :: Header
  40 headerCsvGargV3 = header [ "title"
  41          , "source"
  42          , "publication_year"
  43          , "publication_month"
  44          , "publication_day"
  45          , "abstract"
  46          , "authors"
  47          ]
  48 ---------------------------------------------------------------
  49 data CsvGargV3 = CsvGargV3
  50     { d_docId  :: !Int
  51     , d_title  :: !Text
  52     , d_source :: !Text
  53     , d_publication_year  :: !Int
  54     , d_publication_month :: !Int
  55     , d_publication_day   :: !Int
  56     , d_abstract          :: !Text
  57     , d_authors           :: !Text
  58     }
  59     deriving (Show)
  60 ---------------------------------------------------------------
  61 -- | Doc 2 HyperdataDocument
  62 toDoc :: CsvGargV3 -> HyperdataDocument
  63 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
  64   HyperdataDocument (Just "CSV")
  65                     (Just . pack . show $ did)
  66                     Nothing
  67                     Nothing
  68                     Nothing
  69                     Nothing
  70                     (Just dt)
  71                     Nothing
  72                     (Just dau)
  73                     (Just dab)
  74                     (Nothing)
  75                     Nothing
  76                     (Just dpy)
  77                     (Just dpm)
  78                     (Just dpd)
  79                     Nothing
  80                     Nothing
  81                     Nothing
  82                     Nothing
  83
  84 ---------------------------------------------------------------
  85 -- | Types Conversions
  86 toDocs :: Vector CsvDoc -> [CsvGargV3]
  87 toDocs v = V.toList
  88          $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
  89                        -> CsvGargV3 nId t s py pm pd abst auth )
  90                        (V.enumFromN 1 (V.length v'')) v''
  91           where
  92             v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
  93             seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
  94
  95 ---------------------------------------------------------------
  96 fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
  97 fromDocs docs = V.map fromDocs' docs
  98   where
  99     fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
 100
 101 ---------------------------------------------------------------
 102 -- | Split a document in its context
 103 -- TODO adapt the size of the paragraph according to the corpus average
 104 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
 105 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
 106                           if docSize > 1000
 107                             then
 108                               if (mod (round m) docSize) >= 10
 109                                 then
 110                                   splitDoc' splt doc
 111                                 else
 112                                   V.fromList [doc]
 113                             else
 114                               V.fromList [doc]
 115   where
 116     splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
 117     splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
 118         where
 119           firstDoc = CsvDoc t s py pm pd firstAbstract auth
 120           firstAbstract = head' "splitDoc'1" abstracts
 121
 122           nextDocs = map (\txt -> CsvDoc
 123                                     (head' "splitDoc'2" $ sentences txt)
 124                                     s py pm pd
 125                                     (unsentences $ tail' "splitDoc'1" $ sentences txt)
 126                                     auth
 127                           ) (tail' "splitDoc'2" abstracts)
 128
 129           abstracts    = (splitBy $ contextSize) abst
 130
 131 ---------------------------------------------------------------
 132 ---------------------------------------------------------------
 133 type Mean = Double
 134
 135 docsSize :: Vector CsvDoc -> Mean
 136 docsSize csvDoc = mean ls
 137   where
 138     ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
 139
 140
 141 ---------------------------------------------------------------
 142 data CsvDoc = CsvDoc
 143     { csv_title  :: !Text
 144     , csv_source :: !Text
 145     , csv_publication_year  :: !Int
 146     , csv_publication_month :: !Int
 147     , csv_publication_day   :: !Int
 148     , csv_abstract          :: !Text
 149     , csv_authors           :: !Text
 150     }
 151     deriving (Show)
 152
 153 instance FromNamedRecord CsvDoc where
 154   parseNamedRecord r = CsvDoc <$> r .: "title"
 155                               <*> r .: "source"
 156                               <*> r .: "publication_year"
 157                               <*> r .: "publication_month"
 158                               <*> r .: "publication_day"
 159                               <*> r .: "abstract"
 160                               <*> r .: "authors"
 161
 162 instance ToNamedRecord CsvDoc where
 163   toNamedRecord (CsvDoc t s py pm pd abst aut) =
 164     namedRecord [ "title"  .= t
 165                 , "source" .= s
 166                 , "publication_year"  .= py
 167                 , "publication_month" .= pm
 168                 , "publication_day"   .= pd
 169                 , "abstract"          .= abst
 170                 , "authors"           .= aut
 171                ]
 172
 173 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
 174 hyperdataDocument2csvDoc h = CsvDoc (m $ _hyperdataDocument_title h)
 175                                     (m $ _hyperdataDocument_source h)
 176                                     (mI $ _hyperdataDocument_publication_year h)
 177                                     (mI $ _hyperdataDocument_publication_month h)
 178                                     (mI $ _hyperdataDocument_publication_day   h)
 179                                     (m $ _hyperdataDocument_abstract h)
 180                                     (m $ _hyperdataDocument_authors h)
 181
 182   where
 183     m = maybe "" identity
 184     mI = maybe 0 identity
 185
 186
 187 csvDecodeOptions :: DecodeOptions
 188 csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
 189
 190 csvEncodeOptions :: EncodeOptions
 191 csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
 192
 193 delimiter :: Word8
 194 delimiter = fromIntegral $ ord '\t'
 195 ------------------------------------------------------------------------
 196 ------------------------------------------------------------------------
 197 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO [Text]
 198 readCsvOn' fields fp = V.toList
 199                    <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
 200                    <$> snd
 201                    <$> readFile fp
 202
 203 ------------------------------------------------------------------------
 204
 205 readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
 206 readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
 207
 208 readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
 209 readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
 210
 211 readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a)
 212 readByteStringLazy _f bs = case decodeByNameWith csvDecodeOptions bs of
 213       Left e        -> panic (pack e)
 214       Right csvDocs -> csvDocs
 215
 216 readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a)
 217 readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
 218
 219 ------------------------------------------------------------------------
 220 -- | TODO use readFileLazy
 221 readFile :: FilePath -> IO (Header, Vector CsvDoc)
 222 readFile = fmap readCsvLazyBS . BL.readFile
 223
 224
 225 -- | TODO use readByteStringLazy
 226 readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
 227 readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
 228       Left e        -> panic (pack e)
 229       Right csvDocs -> csvDocs
 230
 231 ------------------------------------------------------------------------
 232
 233 -- | TODO use readFileLazy
 234 readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
 235 readCsvHal = fmap readCsvHalLazyBS . BL.readFile
 236
 237 -- | TODO use readByteStringLazy
 238 readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
 239 readCsvHalLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
 240       Left e        -> panic (pack e)
 241       Right csvDocs -> csvDocs
 242
 243 readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
 244 readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
 245
 246 ------------------------------------------------------------------------
 247 writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
 248 writeFile fp (h, vs) = BL.writeFile fp $
 249                       encodeByNameWith csvEncodeOptions h (V.toList vs)
 250
 251 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
 252 writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
 253
 254 hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
 255 hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
 256
 257 ------------------------------------------------------------------------
 258 -- Hal Format
 259 data CsvHal = CsvHal
 260     { csvHal_title  :: !Text
 261     , csvHal_source :: !Text
 262     , csvHal_publication_year  :: !Integer
 263     , csvHal_publication_month :: !Int
 264     , csvHal_publication_day   :: !Int
 265     , csvHal_abstract          :: !Text
 266     , csvHal_authors           :: !Text
 267
 268     , csvHal_url               :: !Text
 269     , csvHal_isbn_s            :: !Text
 270     , csvHal_issue_s           :: !Text
 271     , csvHal_journalPublisher_s:: !Text
 272     , csvHal_language_s        :: !Text
 273
 274     , csvHal_doiId_s           :: !Text
 275     , csvHal_authId_i          :: !Text
 276     , csvHal_instStructId_i    :: !Text
 277     , csvHal_deptStructId_i    :: !Text
 278     , csvHal_labStructId_i     :: !Text
 279
 280     , csvHal_rteamStructId_i   :: !Text
 281     , csvHal_docType_s         :: !Text
 282     }
 283     deriving (Show)
 284
 285 instance FromNamedRecord CsvHal where
 286   parseNamedRecord r = CsvHal <$> r .: "title"
 287                               <*> r .: "source"
 288                               <*> r .: "publication_year"
 289                               <*> r .: "publication_month"
 290                               <*> r .: "publication_day"
 291                               <*> r .: "abstract"
 292                               <*> r .: "authors"
 293
 294                               <*> r .: "url"
 295                               <*> r .: "isbn_s"
 296                               <*> r .: "issue_s"
 297                               <*> r .: "journalPublisher_s"
 298                               <*> r .: "language_s"
 299
 300                               <*> r .: "doiId_s"
 301                               <*> r .: "authId_i"
 302                               <*> r .: "instStructId_i"
 303                               <*> r .: "deptStructId_i"
 304                               <*> r .: "labStructId_i"
 305
 306                               <*> r .: "rteamStructId_i"
 307                               <*> r .: "docType_s"
 308
 309 instance ToNamedRecord CsvHal where
 310   toNamedRecord (CsvHal t s py  pm pd abst aut  url isbn iss j lang  doi auth inst dept lab team doct) =
 311     namedRecord [ "title"  .= t
 312                 , "source" .= s
 313
 314                 , "publication_year"  .= py
 315                 , "publication_month" .= pm
 316                 , "publication_day"   .= pd
 317
 318                 , "abstract"          .= abst
 319                 , "authors"           .= aut
 320
 321                 , "url"                .= url
 322                 , "isbn_s"             .= isbn
 323                 , "issue_s"            .= iss
 324                 , "journalPublisher_s" .= j
 325                 , "language_s"         .= lang
 326
 327                 , "doiId_s"            .= doi
 328                 , "authId_i"           .= auth
 329                 , "instStructId_i"     .= inst
 330                 , "deptStructId_i"     .= dept
 331                 , "labStructId_i"      .= lab
 332
 333                 , "rteamStructId_i"    .= team
 334                 , "docType_s"          .= doct
 335                ]
 336
 337 csvHal2doc :: CsvHal -> HyperdataDocument
 338 csvHal2doc (CsvHal title source
 339        pub_year pub_month pub_day
 340        abstract authors
 341        url _ _ _ _
 342        doi _ inst _ _
 343        _ _ ) = HyperdataDocument (Just "CsvHal")
 344                                (Just doi)
 345                                (Just url)
 346                                Nothing
 347                                Nothing
 348                                Nothing
 349                                (Just title)
 350                                (Just authors)
 351                                (Just inst)
 352                                (Just source)
 353                                (Just abstract)
 354                                (Just $ pack . show $ jour pub_year pub_month pub_day)
 355                                (Just $ fromIntegral pub_year)
 356                                (Just pub_month)
 357                                (Just pub_day)
 358                                Nothing
 359                                Nothing
 360                                Nothing
 361                                Nothing
 362
 363
 364 csv2doc :: CsvDoc -> HyperdataDocument
 365 csv2doc (CsvDoc title source
 366        pub_year pub_month pub_day
 367        abstract authors ) = HyperdataDocument (Just "CsvHal")
 368                                Nothing
 369                                Nothing
 370                                Nothing
 371                                Nothing
 372                                Nothing
 373                                (Just title)
 374                                (Just authors)
 375                                Nothing
 376                                (Just source)
 377                                (Just abstract)
 378                                (Just $ pack . show $ jour (fromIntegral pub_year) pub_month pub_day)
 379                                (Just $ fromIntegral pub_year)
 380                                (Just pub_month)
 381                                (Just pub_day)
 382                                Nothing
 383                                Nothing
 384                                Nothing
 385                                Nothing
 386
 387 ------------------------------------------------------------------------
 388 parseHal :: FilePath -> IO [HyperdataDocument]
 389 parseHal fp = V.toList <$> V.map csvHal2doc <$> snd <$> readCsvHal fp
 390
 391 parseHal' :: BL.ByteString -> [HyperdataDocument]
 392 parseHal' = V.toList . V.map csvHal2doc . snd . readCsvHalLazyBS
 393
 394 ------------------------------------------------------------------------
 395
 396 parseCsv :: FilePath -> IO [HyperdataDocument]
 397 parseCsv fp = V.toList <$> V.map csv2doc <$> snd <$> readFile fp
 398
 399 parseCsv' :: BL.ByteString -> [HyperdataDocument]
 400 parseCsv' bs = V.toList $ V.map csv2doc $ snd $ readCsvLazyBS bs
 401