src/Gargantext/Text/Corpus/Parsers/CSV.hs

   1 {-|
   2 Module      : Gargantext.Text.Corpus.Parsers.CSV
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 CSV parser for Gargantext corpus files.
  11
  12 -}
  13
  14 {-# LANGUAGE NoImplicitPrelude #-}
  15 {-# LANGUAGE OverloadedStrings #-}
  16 {-# LANGUAGE DeriveGeneric     #-}
  17
  18 module Gargantext.Text.Corpus.Parsers.CSV where
  19
  20 import Control.Applicative
  21 import qualified Data.ByteString      as BS
  22 import qualified Data.ByteString.Lazy as BL
  23 import Data.Char (ord)
  24 import Data.Csv
  25 import Data.Either (Either(Left, Right))
  26 import Data.Text (Text, pack, length, intercalate)
  27 import Data.Time.Segment (jour)
  28 import qualified Data.Vector          as V
  29 import Data.Vector (Vector)
  30 import GHC.IO (FilePath)
  31 import GHC.Real (round)
  32 import GHC.Word (Word8)
  33
  34 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
  35 import Gargantext.Prelude hiding (length)
  36 import Gargantext.Text
  37 import Gargantext.Text.Context
  38
  39 ---------------------------------------------------------------
  40 headerCsvGargV3 :: Header
  41 headerCsvGargV3 = header [ "title"
  42          , "source"
  43          , "publication_year"
  44          , "publication_month"
  45          , "publication_day"
  46          , "abstract"
  47          , "authors"
  48          ]
  49 ---------------------------------------------------------------
  50 data CsvGargV3 = CsvGargV3
  51     { d_docId  :: !Int
  52     , d_title  :: !Text
  53     , d_source :: !Text
  54     , d_publication_year  :: !Int
  55     , d_publication_month :: !Int
  56     , d_publication_day   :: !Int
  57     , d_abstract          :: !Text
  58     , d_authors           :: !Text
  59     }
  60     deriving (Show)
  61 ---------------------------------------------------------------
  62 -- | Doc 2 HyperdataDocument
  63 toDoc :: CsvGargV3 -> HyperdataDocument
  64 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
  65   HyperdataDocument (Just "CSV")
  66                     (Just . pack . show $ did)
  67                     Nothing
  68                     Nothing
  69                     Nothing
  70                     Nothing
  71                     (Just dt)
  72                     Nothing
  73                     (Just dau)
  74                     (Just dab)
  75                     (Nothing)
  76                     Nothing
  77                     (Just dpy)
  78                     (Just dpm)
  79                     (Just dpd)
  80                     Nothing
  81                     Nothing
  82                     Nothing
  83                     Nothing
  84
  85 ---------------------------------------------------------------
  86 -- | Types Conversions
  87 toDocs :: Vector CsvDoc -> [CsvGargV3]
  88 toDocs v = V.toList
  89          $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
  90                        -> CsvGargV3 nId t s py pm pd abst auth )
  91                        (V.enumFromN 1 (V.length v'')) v''
  92           where
  93             v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
  94             seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
  95
  96 ---------------------------------------------------------------
  97 fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
  98 fromDocs docs = V.map fromDocs' docs
  99   where
 100     fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
 101
 102 ---------------------------------------------------------------
 103 -- | Split a document in its context
 104 -- TODO adapt the size of the paragraph according to the corpus average
 105 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
 106 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
 107                           if docSize > 1000
 108                             then
 109                               if (mod (round m) docSize) >= 10
 110                                 then
 111                                   splitDoc' splt doc
 112                                 else
 113                                   V.fromList [doc]
 114                             else
 115                               V.fromList [doc]
 116   where
 117     splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
 118     splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
 119         where
 120           firstDoc = CsvDoc t s py pm pd firstAbstract auth
 121           firstAbstract = head' "splitDoc'1" abstracts
 122
 123           nextDocs = map (\txt -> CsvDoc
 124                                     (head' "splitDoc'2" $ sentences txt)
 125                                     s py pm pd
 126                                     (unsentences $ tail' "splitDoc'1" $ sentences txt)
 127                                     auth
 128                           ) (tail' "splitDoc'2" abstracts)
 129
 130           abstracts    = (splitBy $ contextSize) abst
 131
 132 ---------------------------------------------------------------
 133 ---------------------------------------------------------------
 134 type Mean = Double
 135
 136 docsSize :: Vector CsvDoc -> Mean
 137 docsSize csvDoc = mean ls
 138   where
 139     ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
 140
 141
 142 ---------------------------------------------------------------
 143 data CsvDoc = CsvDoc
 144     { csv_title  :: !Text
 145     , csv_source :: !Text
 146     , csv_publication_year  :: !Int
 147     , csv_publication_month :: !Int
 148     , csv_publication_day   :: !Int
 149     , csv_abstract          :: !Text
 150     , csv_authors           :: !Text
 151     }
 152     deriving (Show)
 153
 154 instance FromNamedRecord CsvDoc where
 155   parseNamedRecord r = CsvDoc <$> r .: "title"
 156                               <*> r .: "source"
 157                               <*> r .: "publication_year"
 158                               <*> r .: "publication_month"
 159                               <*> r .: "publication_day"
 160                               <*> r .: "abstract"
 161                               <*> r .: "authors"
 162
 163 instance ToNamedRecord CsvDoc where
 164   toNamedRecord (CsvDoc t s py pm pd abst aut) =
 165     namedRecord [ "title"  .= t
 166                 , "source" .= s
 167                 , "publication_year"  .= py
 168                 , "publication_month" .= pm
 169                 , "publication_day"   .= pd
 170                 , "abstract"          .= abst
 171                 , "authors"           .= aut
 172                ]
 173
 174 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
 175 hyperdataDocument2csvDoc h = CsvDoc (m $ _hyperdataDocument_title h)
 176                                     (m $ _hyperdataDocument_source h)
 177                                     (mI $ _hyperdataDocument_publication_year h)
 178                                     (mI $ _hyperdataDocument_publication_month h)
 179                                     (mI $ _hyperdataDocument_publication_day   h)
 180                                     (m $ _hyperdataDocument_abstract h)
 181                                     (m $ _hyperdataDocument_authors h)
 182
 183   where
 184     m = maybe "" identity
 185     mI = maybe 0 identity
 186
 187
 188 csvDecodeOptions :: DecodeOptions
 189 csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
 190
 191 csvEncodeOptions :: EncodeOptions
 192 csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
 193
 194 delimiter :: Word8
 195 delimiter = fromIntegral $ ord '\t'
 196 ------------------------------------------------------------------------
 197 ------------------------------------------------------------------------
 198 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO [Text]
 199 readCsvOn' fields fp = V.toList
 200                    <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
 201                    <$> snd
 202                    <$> readFile fp
 203
 204 ------------------------------------------------------------------------
 205
 206 readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
 207 readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
 208
 209 readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
 210 readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
 211
 212 readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a)
 213 readByteStringLazy _f bs = case decodeByNameWith csvDecodeOptions bs of
 214       Left e        -> panic (pack e)
 215       Right csvDocs -> csvDocs
 216
 217 readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a)
 218 readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
 219
 220 ------------------------------------------------------------------------
 221 -- | TODO use readFileLazy
 222 readFile :: FilePath -> IO (Header, Vector CsvDoc)
 223 readFile = fmap readCsvLazyBS . BL.readFile
 224
 225
 226 -- | TODO use readByteStringLazy
 227 readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
 228 readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
 229       Left e        -> panic (pack e)
 230       Right csvDocs -> csvDocs
 231
 232 ------------------------------------------------------------------------
 233
 234 -- | TODO use readFileLazy
 235 readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
 236 readCsvHal = fmap readCsvHalLazyBS . BL.readFile
 237
 238 -- | TODO use readByteStringLazy
 239 readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
 240 readCsvHalLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
 241       Left e        -> panic (pack e)
 242       Right csvDocs -> csvDocs
 243
 244 readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
 245 readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
 246
 247 ------------------------------------------------------------------------
 248 writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
 249 writeFile fp (h, vs) = BL.writeFile fp $
 250                       encodeByNameWith csvEncodeOptions h (V.toList vs)
 251
 252 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
 253 writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
 254
 255 hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
 256 hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
 257
 258 ------------------------------------------------------------------------
 259 -- Hal Format
 260 data CsvHal = CsvHal
 261     { csvHal_title  :: !Text
 262     , csvHal_source :: !Text
 263     , csvHal_publication_year  :: !Integer
 264     , csvHal_publication_month :: !Int
 265     , csvHal_publication_day   :: !Int
 266     , csvHal_abstract          :: !Text
 267     , csvHal_authors           :: !Text
 268
 269     , csvHal_url               :: !Text
 270     , csvHal_isbn_s            :: !Text
 271     , csvHal_issue_s           :: !Text
 272     , csvHal_journalPublisher_s:: !Text
 273     , csvHal_language_s        :: !Text
 274
 275     , csvHal_doiId_s           :: !Text
 276     , csvHal_authId_i          :: !Text
 277     , csvHal_instStructId_i    :: !Text
 278     , csvHal_deptStructId_i    :: !Text
 279     , csvHal_labStructId_i     :: !Text
 280
 281     , csvHal_rteamStructId_i   :: !Text
 282     , csvHal_docType_s         :: !Text
 283     }
 284     deriving (Show)
 285
 286 instance FromNamedRecord CsvHal where
 287   parseNamedRecord r = CsvHal <$> r .: "title"
 288                               <*> r .: "source"
 289                               <*> r .: "publication_year"
 290                               <*> r .: "publication_month"
 291                               <*> r .: "publication_day"
 292                               <*> r .: "abstract"
 293                               <*> r .: "authors"
 294
 295                               <*> r .: "url"
 296                               <*> r .: "isbn_s"
 297                               <*> r .: "issue_s"
 298                               <*> r .: "journalPublisher_s"
 299                               <*> r .: "language_s"
 300
 301                               <*> r .: "doiId_s"
 302                               <*> r .: "authId_i"
 303                               <*> r .: "instStructId_i"
 304                               <*> r .: "deptStructId_i"
 305                               <*> r .: "labStructId_i"
 306
 307                               <*> r .: "rteamStructId_i"
 308                               <*> r .: "docType_s"
 309
 310 instance ToNamedRecord CsvHal where
 311   toNamedRecord (CsvHal t s py  pm pd abst aut  url isbn iss j lang  doi auth inst dept lab team doct) =
 312     namedRecord [ "title"  .= t
 313                 , "source" .= s
 314
 315                 , "publication_year"  .= py
 316                 , "publication_month" .= pm
 317                 , "publication_day"   .= pd
 318
 319                 , "abstract"          .= abst
 320                 , "authors"           .= aut
 321
 322                 , "url"                .= url
 323                 , "isbn_s"             .= isbn
 324                 , "issue_s"            .= iss
 325                 , "journalPublisher_s" .= j
 326                 , "language_s"         .= lang
 327
 328                 , "doiId_s"            .= doi
 329                 , "authId_i"           .= auth
 330                 , "instStructId_i"     .= inst
 331                 , "deptStructId_i"     .= dept
 332                 , "labStructId_i"      .= lab
 333
 334                 , "rteamStructId_i"    .= team
 335                 , "docType_s"          .= doct
 336                ]
 337
 338 csvHal2doc :: CsvHal -> HyperdataDocument
 339 csvHal2doc (CsvHal title source
 340        pub_year pub_month pub_day
 341        abstract authors
 342        url _ _ _ _
 343        doi _ inst _ _
 344        _ _ ) = HyperdataDocument (Just "CsvHal")
 345                                (Just doi)
 346                                (Just url)
 347                                Nothing
 348                                Nothing
 349                                Nothing
 350                                (Just title)
 351                                (Just authors)
 352                                (Just inst)
 353                                (Just source)
 354                                (Just abstract)
 355                                (Just $ pack . show $ jour pub_year pub_month pub_day)
 356                                (Just $ fromIntegral pub_year)
 357                                (Just pub_month)
 358                                (Just pub_day)
 359                                Nothing
 360                                Nothing
 361                                Nothing
 362                                Nothing
 363
 364
 365 csv2doc :: CsvDoc -> HyperdataDocument
 366 csv2doc (CsvDoc title source
 367        pub_year pub_month pub_day
 368        abstract authors ) = HyperdataDocument (Just "CsvHal")
 369                                Nothing
 370                                Nothing
 371                                Nothing
 372                                Nothing
 373                                Nothing
 374                                (Just title)
 375                                (Just authors)
 376                                Nothing
 377                                (Just source)
 378                                (Just abstract)
 379                                (Just $ pack . show $ jour (fromIntegral pub_year) pub_month pub_day)
 380                                (Just $ fromIntegral pub_year)
 381                                (Just pub_month)
 382                                (Just pub_day)
 383                                Nothing
 384                                Nothing
 385                                Nothing
 386                                Nothing
 387
 388 ------------------------------------------------------------------------
 389 parseHal :: FilePath -> IO [HyperdataDocument]
 390 parseHal fp = V.toList <$> V.map csvHal2doc <$> snd <$> readCsvHal fp
 391
 392 parseHal' :: BL.ByteString -> [HyperdataDocument]
 393 parseHal' = V.toList . V.map csvHal2doc . snd . readCsvHalLazyBS
 394
 395 ------------------------------------------------------------------------
 396
 397 parseCsv :: FilePath -> IO [HyperdataDocument]
 398 parseCsv fp = V.toList <$> V.map csv2doc <$> snd <$> readFile fp
 399
 400 parseCsv' :: BL.ByteString -> [HyperdataDocument]
 401 parseCsv' bs = V.toList $ V.map csv2doc $ snd $ readCsvLazyBS bs
 402