src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Corpus.Parsers.CSV
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 CSV parser for Gargantext corpus files.
  11
  12 -}
  13
  14
  15 module Gargantext.Core.Text.Corpus.Parsers.CSV where
  16
  17 import Control.Applicative
  18 import qualified Data.ByteString      as BS
  19 import qualified Data.ByteString.Lazy as BL
  20 import Data.Char (ord)
  21 import Data.Csv
  22 import Data.Either (Either(Left, Right))
  23 import Data.Text (Text, pack, length, intercalate)
  24 import Data.Time.Segment (jour)
  25 import qualified Data.Vector          as V
  26 import Data.Vector (Vector)
  27 import GHC.IO (FilePath)
  28 import GHC.Word (Word8)
  29
  30 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
  31 import Gargantext.Prelude hiding (length)
  32 import Gargantext.Core.Text
  33 import Gargantext.Core.Text.Context
  34
  35 ---------------------------------------------------------------
  36 headerCsvGargV3 :: Header
  37 headerCsvGargV3 =
  38   header [ "title"
  39          , "source"
  40          , "publication_year"
  41          , "publication_month"
  42          , "publication_day"
  43          , "abstract"
  44          , "authors"
  45          ]
  46 ---------------------------------------------------------------
  47 data CsvGargV3 = CsvGargV3
  48     { d_docId             :: !Int
  49     , d_title             :: !Text
  50     , d_source            :: !Text
  51     , d_publication_year  :: !Int
  52     , d_publication_month :: !Int
  53     , d_publication_day   :: !Int
  54     , d_abstract          :: !Text
  55     , d_authors           :: !Text
  56     }
  57     deriving (Show)
  58 ---------------------------------------------------------------
  59 -- | Doc 2 HyperdataDocument
  60 toDoc :: CsvGargV3 -> HyperdataDocument
  61 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
  62   HyperdataDocument (Just "CSV")
  63                     (Just . pack . show $ did)
  64                     Nothing
  65                     Nothing
  66                     Nothing
  67                     Nothing
  68                     (Just dt)
  69                     Nothing
  70                     (Just dau)
  71                     (Just dab)
  72                     (Nothing)
  73                     Nothing
  74                     (Just dpy)
  75                     (Just dpm)
  76                     (Just dpd)
  77                     Nothing
  78                     Nothing
  79                     Nothing
  80                     Nothing
  81
  82 ---------------------------------------------------------------
  83 -- | Types Conversions
  84 toDocs :: Vector CsvDoc -> [CsvGargV3]
  85 toDocs v = V.toList
  86          $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
  87                        -> CsvGargV3 nId t s py pm pd abst auth )
  88                        (V.enumFromN 1 (V.length v'')) v''
  89           where
  90             v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
  91             seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
  92
  93 ---------------------------------------------------------------
  94 fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
  95 fromDocs docs = V.map fromDocs' docs
  96   where
  97     fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
  98
  99 ---------------------------------------------------------------
 100 -- | Split a document in its context
 101 -- TODO adapt the size of the paragraph according to the corpus average
 102 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
 103 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
 104                           if docSize > 1000
 105                             then
 106                               if (mod (round m) docSize) >= 10
 107                                 then
 108                                   splitDoc' splt doc
 109                                 else
 110                                   V.fromList [doc]
 111                             else
 112                               V.fromList [doc]
 113   where
 114     splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
 115     splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
 116         where
 117           firstDoc = CsvDoc t s py pm pd firstAbstract auth
 118           firstAbstract = head' "splitDoc'1" abstracts
 119
 120           nextDocs = map (\txt -> CsvDoc
 121                                     (head' "splitDoc'2" $ sentences txt)
 122                                     s py pm pd
 123                                     (unsentences $ tail' "splitDoc'1" $ sentences txt)
 124                                     auth
 125                           ) (tail' "splitDoc'2" abstracts)
 126
 127           abstracts    = (splitBy $ contextSize) abst
 128
 129 ---------------------------------------------------------------
 130 ---------------------------------------------------------------
 131 type Mean = Double
 132
 133 docsSize :: Vector CsvDoc -> Mean
 134 docsSize csvDoc = mean ls
 135   where
 136     ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
 137
 138
 139 ---------------------------------------------------------------
 140 data CsvDoc = CsvDoc
 141     { csv_title  :: !Text
 142     , csv_source :: !Text
 143     , csv_publication_year  :: !Int
 144     , csv_publication_month :: !Int
 145     , csv_publication_day   :: !Int
 146     , csv_abstract          :: !Text
 147     , csv_authors           :: !Text
 148     }
 149     deriving (Show)
 150
 151 instance FromNamedRecord CsvDoc where
 152   parseNamedRecord r = CsvDoc <$> r .: "title"
 153                               <*> r .: "source"
 154                               <*> r .: "publication_year"
 155                               <*> r .: "publication_month"
 156                               <*> r .: "publication_day"
 157                               <*> r .: "abstract"
 158                               <*> r .: "authors"
 159
 160 instance ToNamedRecord CsvDoc where
 161   toNamedRecord (CsvDoc t s py pm pd abst aut) =
 162     namedRecord [ "title"  .= t
 163                 , "source" .= s
 164                 , "publication_year"  .= py
 165                 , "publication_month" .= pm
 166                 , "publication_day"   .= pd
 167                 , "abstract"          .= abst
 168                 , "authors"           .= aut
 169                ]
 170
 171 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
 172 hyperdataDocument2csvDoc h = CsvDoc (m  $ _hd_title h)
 173                                     (m  $ _hd_source h)
 174                                     (mI $ _hd_publication_year h)
 175                                     (mI $ _hd_publication_month h)
 176                                     (mI $ _hd_publication_day   h)
 177                                     (m  $ _hd_abstract h)
 178                                     (m  $ _hd_authors h)
 179
 180   where
 181     m = maybe "" identity
 182     mI = maybe 0 identity
 183
 184
 185 csvDecodeOptions :: DecodeOptions
 186 csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
 187
 188 csvEncodeOptions :: EncodeOptions
 189 csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
 190
 191 delimiter :: Word8
 192 delimiter = fromIntegral $ ord '\t'
 193 ------------------------------------------------------------------------
 194 ------------------------------------------------------------------------
 195 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO [Text]
 196 readCsvOn' fields fp = V.toList
 197                    <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
 198                    <$> snd
 199                    <$> readFile fp
 200
 201 ------------------------------------------------------------------------
 202
 203 readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
 204 readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
 205
 206 readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
 207 readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
 208
 209 readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a)
 210 readByteStringLazy _f bs = case decodeByNameWith csvDecodeOptions bs of
 211       Left e        -> panic (pack e)
 212       Right csvDocs -> csvDocs
 213
 214 readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a)
 215 readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
 216
 217 ------------------------------------------------------------------------
 218 -- | TODO use readFileLazy
 219 readFile :: FilePath -> IO (Header, Vector CsvDoc)
 220 readFile = fmap readCsvLazyBS . BL.readFile
 221
 222
 223 -- | TODO use readByteStringLazy
 224 readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
 225 readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
 226       Left e        -> panic (pack e)
 227       Right csvDocs -> csvDocs
 228
 229 ------------------------------------------------------------------------
 230 -- | TODO use readFileLazy
 231 readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
 232 readCsvHal = fmap readCsvHalLazyBS . BL.readFile
 233
 234 -- | TODO use readByteStringLazy
 235 readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
 236 readCsvHalLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
 237       Left e        -> panic (pack e)
 238       Right csvDocs -> csvDocs
 239
 240 readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
 241 readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
 242
 243 ------------------------------------------------------------------------
 244 writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
 245 writeFile fp (h, vs) = BL.writeFile fp $
 246                       encodeByNameWith csvEncodeOptions h (V.toList vs)
 247
 248 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
 249 writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
 250
 251 hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
 252 hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
 253
 254 ------------------------------------------------------------------------
 255 -- Hal Format
 256 data CsvHal = CsvHal
 257     { csvHal_title  :: !Text
 258     , csvHal_source :: !Text
 259     , csvHal_publication_year  :: !Integer
 260     , csvHal_publication_month :: !Int
 261     , csvHal_publication_day   :: !Int
 262     , csvHal_abstract          :: !Text
 263     , csvHal_authors           :: !Text
 264
 265     , csvHal_url               :: !Text
 266     , csvHal_isbn_s            :: !Text
 267     , csvHal_issue_s           :: !Text
 268     , csvHal_journalPublisher_s:: !Text
 269     , csvHal_language_s        :: !Text
 270
 271     , csvHal_doiId_s           :: !Text
 272     , csvHal_authId_i          :: !Text
 273     , csvHal_instStructId_i    :: !Text
 274     , csvHal_deptStructId_i    :: !Text
 275     , csvHal_labStructId_i     :: !Text
 276
 277     , csvHal_rteamStructId_i   :: !Text
 278     , csvHal_docType_s         :: !Text
 279     }
 280     deriving (Show)
 281
 282 instance FromNamedRecord CsvHal where
 283   parseNamedRecord r = CsvHal <$> r .: "title"
 284                               <*> r .: "source"
 285                               <*> r .: "publication_year"
 286                               <*> r .: "publication_month"
 287                               <*> r .: "publication_day"
 288                               <*> r .: "abstract"
 289                               <*> r .: "authors"
 290
 291                               <*> r .: "url"
 292                               <*> r .: "isbn_s"
 293                               <*> r .: "issue_s"
 294                               <*> r .: "journalPublisher_s"
 295                               <*> r .: "language_s"
 296
 297                               <*> r .: "doiId_s"
 298                               <*> r .: "authId_i"
 299                               <*> r .: "instStructId_i"
 300                               <*> r .: "deptStructId_i"
 301                               <*> r .: "labStructId_i"
 302
 303                               <*> r .: "rteamStructId_i"
 304                               <*> r .: "docType_s"
 305
 306 instance ToNamedRecord CsvHal where
 307   toNamedRecord (CsvHal t s py  pm pd abst aut  url isbn iss j lang  doi auth inst dept lab team doct) =
 308     namedRecord [ "title"  .= t
 309                 , "source" .= s
 310
 311                 , "publication_year"  .= py
 312                 , "publication_month" .= pm
 313                 , "publication_day"   .= pd
 314
 315                 , "abstract"          .= abst
 316                 , "authors"           .= aut
 317
 318                 , "url"                .= url
 319                 , "isbn_s"             .= isbn
 320                 , "issue_s"            .= iss
 321                 , "journalPublisher_s" .= j
 322                 , "language_s"         .= lang
 323
 324                 , "doiId_s"            .= doi
 325                 , "authId_i"           .= auth
 326                 , "instStructId_i"     .= inst
 327                 , "deptStructId_i"     .= dept
 328                 , "labStructId_i"      .= lab
 329
 330                 , "rteamStructId_i"    .= team
 331                 , "docType_s"          .= doct
 332                ]
 333
 334 csvHal2doc :: CsvHal -> HyperdataDocument
 335 csvHal2doc (CsvHal title source
 336        pub_year pub_month pub_day
 337        abstract authors
 338        url _ _ _ _
 339        doi _ inst _ _
 340        _ _ ) = HyperdataDocument (Just "CsvHal")
 341                                (Just doi)
 342                                (Just url)
 343                                Nothing
 344                                Nothing
 345                                Nothing
 346                                (Just title)
 347                                (Just authors)
 348                                (Just inst)
 349                                (Just source)
 350                                (Just abstract)
 351                                (Just $ pack . show $ jour pub_year pub_month pub_day)
 352                                (Just $ fromIntegral pub_year)
 353                                (Just pub_month)
 354                                (Just pub_day)
 355                                Nothing
 356                                Nothing
 357                                Nothing
 358                                Nothing
 359
 360
 361 csv2doc :: CsvDoc -> HyperdataDocument
 362 csv2doc (CsvDoc title source
 363        pub_year pub_month pub_day
 364        abstract authors ) = HyperdataDocument (Just "CsvHal")
 365                                Nothing
 366                                Nothing
 367                                Nothing
 368                                Nothing
 369                                Nothing
 370                                (Just title)
 371                                (Just authors)
 372                                Nothing
 373                                (Just source)
 374                                (Just abstract)
 375                                (Just $ pack . show $ jour (fromIntegral pub_year) pub_month pub_day)
 376                                (Just $ fromIntegral pub_year)
 377                                (Just pub_month)
 378                                (Just pub_day)
 379                                Nothing
 380                                Nothing
 381                                Nothing
 382                                Nothing
 383
 384 ------------------------------------------------------------------------
 385 parseHal :: FilePath -> IO [HyperdataDocument]
 386 parseHal fp = V.toList <$> V.map csvHal2doc <$> snd <$> readCsvHal fp
 387
 388 parseHal' :: BL.ByteString -> [HyperdataDocument]
 389 parseHal' = V.toList . V.map csvHal2doc . snd . readCsvHalLazyBS
 390
 391 ------------------------------------------------------------------------
 392 parseCsv :: FilePath -> IO [HyperdataDocument]
 393 parseCsv fp = V.toList <$> V.map csv2doc <$> snd <$> readFile fp
 394
 395 parseCsv' :: BL.ByteString -> [HyperdataDocument]
 396 parseCsv' bs = V.toList $ V.map csv2doc $ snd $ readCsvLazyBS bs
 397
 398 ------------------------------------------------------------------------
 399 -- Csv v3 weighted for phylo
 400
 401 data Csv' = Csv'
 402       { csv'_title             :: !Text
 403       , csv'_source            :: !Text
 404       , csv'_publication_year  :: !Int
 405       , csv'_publication_month :: !Int
 406       , csv'_publication_day   :: !Int
 407       , csv'_abstract          :: !Text
 408       , csv'_authors           :: !Text
 409       , csv'_weight            :: !Double } deriving (Show)
 410
 411
 412 instance FromNamedRecord Csv' where
 413   parseNamedRecord r = Csv' <$> r .: "title"
 414                             <*> r .: "source"
 415                             <*> r .: "publication_year"
 416                             <*> r .: "publication_month"
 417                             <*> r .: "publication_day"
 418                             <*> r .: "abstract"
 419                             <*> r .: "authors"
 420                             <*> r .: "weight"
 421
 422 readWeightedCsv :: FilePath -> IO (Header, Vector Csv')
 423 readWeightedCsv fp =
 424   fmap (\bs ->
 425     case decodeByNameWith csvDecodeOptions bs of
 426       Left e       -> panic (pack e)
 427       Right corpus -> corpus
 428     ) $ BL.readFile fp