src/Gargantext/Text/Corpus/Parsers/CSV.hs

   1 {-|
   2 Module      : Gargantext.Text.Corpus.Parsers.CSV
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 CSV parser for Gargantext corpus files.
  11
  12 -}
  13
  14
  15 module Gargantext.Text.Corpus.Parsers.CSV where
  16
  17 import Control.Applicative
  18 import qualified Data.ByteString      as BS
  19 import qualified Data.ByteString.Lazy as BL
  20 import Data.Char (ord)
  21 import Data.Csv
  22 import Data.Either (Either(Left, Right))
  23 import Data.Text (Text, pack, length, intercalate)
  24 import Data.Time.Segment (jour)
  25 import qualified Data.Vector          as V
  26 import Data.Vector (Vector)
  27 import GHC.IO (FilePath)
  28 import GHC.Real (round)
  29 import GHC.Word (Word8)
  30
  31 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
  32 import Gargantext.Prelude hiding (length)
  33 import Gargantext.Text
  34 import Gargantext.Text.Context
  35
  36 ---------------------------------------------------------------
  37 headerCsvGargV3 :: Header
  38 headerCsvGargV3 = header [ "title"
  39          , "source"
  40          , "publication_year"
  41          , "publication_month"
  42          , "publication_day"
  43          , "abstract"
  44          , "authors"
  45          ]
  46 ---------------------------------------------------------------
  47 data CsvGargV3 = CsvGargV3
  48     { d_docId  :: !Int
  49     , d_title  :: !Text
  50     , d_source :: !Text
  51     , d_publication_year  :: !Int
  52     , d_publication_month :: !Int
  53     , d_publication_day   :: !Int
  54     , d_abstract          :: !Text
  55     , d_authors           :: !Text
  56     }
  57     deriving (Show)
  58 ---------------------------------------------------------------
  59 -- | Doc 2 HyperdataDocument
  60 toDoc :: CsvGargV3 -> HyperdataDocument
  61 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
  62   HyperdataDocument (Just "CSV")
  63                     (Just . pack . show $ did)
  64                     Nothing
  65                     Nothing
  66                     Nothing
  67                     Nothing
  68                     (Just dt)
  69                     Nothing
  70                     (Just dau)
  71                     (Just dab)
  72                     (Nothing)
  73                     Nothing
  74                     (Just dpy)
  75                     (Just dpm)
  76                     (Just dpd)
  77                     Nothing
  78                     Nothing
  79                     Nothing
  80                     Nothing
  81
  82 ---------------------------------------------------------------
  83 -- | Types Conversions
  84 toDocs :: Vector CsvDoc -> [CsvGargV3]
  85 toDocs v = V.toList
  86          $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
  87                        -> CsvGargV3 nId t s py pm pd abst auth )
  88                        (V.enumFromN 1 (V.length v'')) v''
  89           where
  90             v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
  91             seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
  92
  93 ---------------------------------------------------------------
  94 fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
  95 fromDocs docs = V.map fromDocs' docs
  96   where
  97     fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
  98
  99 ---------------------------------------------------------------
 100 -- | Split a document in its context
 101 -- TODO adapt the size of the paragraph according to the corpus average
 102 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
 103 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
 104                           if docSize > 1000
 105                             then
 106                               if (mod (round m) docSize) >= 10
 107                                 then
 108                                   splitDoc' splt doc
 109                                 else
 110                                   V.fromList [doc]
 111                             else
 112                               V.fromList [doc]
 113   where
 114     splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
 115     splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
 116         where
 117           firstDoc = CsvDoc t s py pm pd firstAbstract auth
 118           firstAbstract = head' "splitDoc'1" abstracts
 119
 120           nextDocs = map (\txt -> CsvDoc
 121                                     (head' "splitDoc'2" $ sentences txt)
 122                                     s py pm pd
 123                                     (unsentences $ tail' "splitDoc'1" $ sentences txt)
 124                                     auth
 125                           ) (tail' "splitDoc'2" abstracts)
 126
 127           abstracts    = (splitBy $ contextSize) abst
 128
 129 ---------------------------------------------------------------
 130 ---------------------------------------------------------------
 131 type Mean = Double
 132
 133 docsSize :: Vector CsvDoc -> Mean
 134 docsSize csvDoc = mean ls
 135   where
 136     ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
 137
 138
 139 ---------------------------------------------------------------
 140 data CsvDoc = CsvDoc
 141     { csv_title  :: !Text
 142     , csv_source :: !Text
 143     , csv_publication_year  :: !Int
 144     , csv_publication_month :: !Int
 145     , csv_publication_day   :: !Int
 146     , csv_abstract          :: !Text
 147     , csv_authors           :: !Text
 148     }
 149     deriving (Show)
 150
 151 instance FromNamedRecord CsvDoc where
 152   parseNamedRecord r = CsvDoc <$> r .: "title"
 153                               <*> r .: "source"
 154                               <*> r .: "publication_year"
 155                               <*> r .: "publication_month"
 156                               <*> r .: "publication_day"
 157                               <*> r .: "abstract"
 158                               <*> r .: "authors"
 159
 160 instance ToNamedRecord CsvDoc where
 161   toNamedRecord (CsvDoc t s py pm pd abst aut) =
 162     namedRecord [ "title"  .= t
 163                 , "source" .= s
 164                 , "publication_year"  .= py
 165                 , "publication_month" .= pm
 166                 , "publication_day"   .= pd
 167                 , "abstract"          .= abst
 168                 , "authors"           .= aut
 169                ]
 170
 171 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
 172 hyperdataDocument2csvDoc h = CsvDoc (m $ _hyperdataDocument_title h)
 173                                     (m $ _hyperdataDocument_source h)
 174                                     (mI $ _hyperdataDocument_publication_year h)
 175                                     (mI $ _hyperdataDocument_publication_month h)
 176                                     (mI $ _hyperdataDocument_publication_day   h)
 177                                     (m $ _hyperdataDocument_abstract h)
 178                                     (m $ _hyperdataDocument_authors h)
 179
 180   where
 181     m = maybe "" identity
 182     mI = maybe 0 identity
 183
 184
 185 csvDecodeOptions :: DecodeOptions
 186 csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
 187
 188 csvEncodeOptions :: EncodeOptions
 189 csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
 190
 191 delimiter :: Word8
 192 delimiter = fromIntegral $ ord '\t'
 193 ------------------------------------------------------------------------
 194 ------------------------------------------------------------------------
 195 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO [Text]
 196 readCsvOn' fields fp = V.toList
 197                    <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
 198                    <$> snd
 199                    <$> readFile fp
 200
 201 ------------------------------------------------------------------------
 202
 203 readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
 204 readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
 205
 206 readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
 207 readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
 208
 209 readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a)
 210 readByteStringLazy _f bs = case decodeByNameWith csvDecodeOptions bs of
 211       Left e        -> panic (pack e)
 212       Right csvDocs -> csvDocs
 213
 214 readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a)
 215 readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
 216
 217 ------------------------------------------------------------------------
 218 -- | TODO use readFileLazy
 219 readFile :: FilePath -> IO (Header, Vector CsvDoc)
 220 readFile = fmap readCsvLazyBS . BL.readFile
 221
 222
 223 -- | TODO use readByteStringLazy
 224 readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
 225 readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
 226       Left e        -> panic (pack e)
 227       Right csvDocs -> csvDocs
 228
 229 ------------------------------------------------------------------------
 230
 231 -- | TODO use readFileLazy
 232 readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
 233 readCsvHal = fmap readCsvHalLazyBS . BL.readFile
 234
 235 -- | TODO use readByteStringLazy
 236 readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
 237 readCsvHalLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
 238       Left e        -> panic (pack e)
 239       Right csvDocs -> csvDocs
 240
 241 readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
 242 readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
 243
 244 ------------------------------------------------------------------------
 245 writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
 246 writeFile fp (h, vs) = BL.writeFile fp $
 247                       encodeByNameWith csvEncodeOptions h (V.toList vs)
 248
 249 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
 250 writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
 251
 252 hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
 253 hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
 254
 255 ------------------------------------------------------------------------
 256 -- Hal Format
 257 data CsvHal = CsvHal
 258     { csvHal_title  :: !Text
 259     , csvHal_source :: !Text
 260     , csvHal_publication_year  :: !Integer
 261     , csvHal_publication_month :: !Int
 262     , csvHal_publication_day   :: !Int
 263     , csvHal_abstract          :: !Text
 264     , csvHal_authors           :: !Text
 265
 266     , csvHal_url               :: !Text
 267     , csvHal_isbn_s            :: !Text
 268     , csvHal_issue_s           :: !Text
 269     , csvHal_journalPublisher_s:: !Text
 270     , csvHal_language_s        :: !Text
 271
 272     , csvHal_doiId_s           :: !Text
 273     , csvHal_authId_i          :: !Text
 274     , csvHal_instStructId_i    :: !Text
 275     , csvHal_deptStructId_i    :: !Text
 276     , csvHal_labStructId_i     :: !Text
 277
 278     , csvHal_rteamStructId_i   :: !Text
 279     , csvHal_docType_s         :: !Text
 280     }
 281     deriving (Show)
 282
 283 instance FromNamedRecord CsvHal where
 284   parseNamedRecord r = CsvHal <$> r .: "title"
 285                               <*> r .: "source"
 286                               <*> r .: "publication_year"
 287                               <*> r .: "publication_month"
 288                               <*> r .: "publication_day"
 289                               <*> r .: "abstract"
 290                               <*> r .: "authors"
 291
 292                               <*> r .: "url"
 293                               <*> r .: "isbn_s"
 294                               <*> r .: "issue_s"
 295                               <*> r .: "journalPublisher_s"
 296                               <*> r .: "language_s"
 297
 298                               <*> r .: "doiId_s"
 299                               <*> r .: "authId_i"
 300                               <*> r .: "instStructId_i"
 301                               <*> r .: "deptStructId_i"
 302                               <*> r .: "labStructId_i"
 303
 304                               <*> r .: "rteamStructId_i"
 305                               <*> r .: "docType_s"
 306
 307 instance ToNamedRecord CsvHal where
 308   toNamedRecord (CsvHal t s py  pm pd abst aut  url isbn iss j lang  doi auth inst dept lab team doct) =
 309     namedRecord [ "title"  .= t
 310                 , "source" .= s
 311
 312                 , "publication_year"  .= py
 313                 , "publication_month" .= pm
 314                 , "publication_day"   .= pd
 315
 316                 , "abstract"          .= abst
 317                 , "authors"           .= aut
 318
 319                 , "url"                .= url
 320                 , "isbn_s"             .= isbn
 321                 , "issue_s"            .= iss
 322                 , "journalPublisher_s" .= j
 323                 , "language_s"         .= lang
 324
 325                 , "doiId_s"            .= doi
 326                 , "authId_i"           .= auth
 327                 , "instStructId_i"     .= inst
 328                 , "deptStructId_i"     .= dept
 329                 , "labStructId_i"      .= lab
 330
 331                 , "rteamStructId_i"    .= team
 332                 , "docType_s"          .= doct
 333                ]
 334
 335 csvHal2doc :: CsvHal -> HyperdataDocument
 336 csvHal2doc (CsvHal title source
 337        pub_year pub_month pub_day
 338        abstract authors
 339        url _ _ _ _
 340        doi _ inst _ _
 341        _ _ ) = HyperdataDocument (Just "CsvHal")
 342                                (Just doi)
 343                                (Just url)
 344                                Nothing
 345                                Nothing
 346                                Nothing
 347                                (Just title)
 348                                (Just authors)
 349                                (Just inst)
 350                                (Just source)
 351                                (Just abstract)
 352                                (Just $ pack . show $ jour pub_year pub_month pub_day)
 353                                (Just $ fromIntegral pub_year)
 354                                (Just pub_month)
 355                                (Just pub_day)
 356                                Nothing
 357                                Nothing
 358                                Nothing
 359                                Nothing
 360
 361
 362 csv2doc :: CsvDoc -> HyperdataDocument
 363 csv2doc (CsvDoc title source
 364        pub_year pub_month pub_day
 365        abstract authors ) = HyperdataDocument (Just "CsvHal")
 366                                Nothing
 367                                Nothing
 368                                Nothing
 369                                Nothing
 370                                Nothing
 371                                (Just title)
 372                                (Just authors)
 373                                Nothing
 374                                (Just source)
 375                                (Just abstract)
 376                                (Just $ pack . show $ jour (fromIntegral pub_year) pub_month pub_day)
 377                                (Just $ fromIntegral pub_year)
 378                                (Just pub_month)
 379                                (Just pub_day)
 380                                Nothing
 381                                Nothing
 382                                Nothing
 383                                Nothing
 384
 385 ------------------------------------------------------------------------
 386 parseHal :: FilePath -> IO [HyperdataDocument]
 387 parseHal fp = V.toList <$> V.map csvHal2doc <$> snd <$> readCsvHal fp
 388
 389 parseHal' :: BL.ByteString -> [HyperdataDocument]
 390 parseHal' = V.toList . V.map csvHal2doc . snd . readCsvHalLazyBS
 391
 392 ------------------------------------------------------------------------
 393
 394 parseCsv :: FilePath -> IO [HyperdataDocument]
 395 parseCsv fp = V.toList <$> V.map csv2doc <$> snd <$> readFile fp
 396
 397 parseCsv' :: BL.ByteString -> [HyperdataDocument]
 398 parseCsv' bs = V.toList $ V.map csv2doc $ snd $ readCsvLazyBS bs
 399