src/Gargantext/Text/Parsers/CSV.hs

   1 {-|
   2 Module      : Gargantext.Text.Parsers.CSV
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 CSV parser for Gargantext corpus files.
  11
  12 -}
  13
  14 {-# LANGUAGE NoImplicitPrelude #-}
  15 {-# LANGUAGE OverloadedStrings #-}
  16 {-# LANGUAGE DeriveGeneric     #-}
  17
  18 module Gargantext.Text.Parsers.CSV where
  19
  20 import Control.Applicative
  21 import Data.Char (ord)
  22 import Data.Csv
  23 import Data.Either (Either(Left, Right))
  24 import Data.Text (Text, pack, length, intercalate)
  25 import Data.Time.Segment (jour)
  26 import Data.Vector (Vector)
  27 import GHC.IO (FilePath)
  28 import GHC.Real (round)
  29 import GHC.Word (Word8)
  30 import Gargantext.Database.Types.Node -- (HyperdataDocument(..))
  31 import Gargantext.Prelude hiding (length)
  32 import Gargantext.Text
  33 import Gargantext.Text.Context
  34 import qualified Data.ByteString.Lazy as BL
  35 import qualified Data.ByteString as  BS
  36 import qualified Data.Vector as V
  37
  38 ---------------------------------------------------------------
  39 headerCsvGargV3 :: Header
  40 headerCsvGargV3 = header [ "title"
  41          , "source"
  42          , "publication_year"
  43          , "publication_month"
  44          , "publication_day"
  45          , "abstract"
  46          , "authors"
  47          ]
  48 ---------------------------------------------------------------
  49 data CsvGargV3 = CsvGargV3
  50     { d_docId  :: !Int
  51     , d_title  :: !Text
  52     , d_source :: !Text
  53     , d_publication_year  :: !Int
  54     , d_publication_month :: !Int
  55     , d_publication_day   :: !Int
  56     , d_abstract          :: !Text
  57     , d_authors           :: !Text
  58     }
  59     deriving (Show)
  60 ---------------------------------------------------------------
  61 -- | Doc 2 HyperdataDocument
  62 toDoc :: CsvGargV3 -> HyperdataDocument
  63 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
  64   HyperdataDocument (Just "CSV")
  65                     (Just . pack . show $ did)
  66                     Nothing
  67                     Nothing
  68                     Nothing
  69                     Nothing
  70                     (Just dt)
  71                     Nothing
  72                     (Just dau)
  73                     (Just dab)
  74                     (Nothing)
  75                     Nothing
  76                     (Just dpy)
  77                     (Just dpm)
  78                     (Just dpd)
  79                     Nothing
  80                     Nothing
  81                     Nothing
  82                     Nothing
  83
  84 ---------------------------------------------------------------
  85 -- | Types Conversions
  86 toDocs :: Vector CsvDoc -> [CsvGargV3]
  87 toDocs v = V.toList
  88          $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
  89                        -> CsvGargV3 nId t s py pm pd abst auth )
  90                        (V.enumFromN 1 (V.length v'')) v''
  91           where
  92             v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
  93             seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
  94
  95 ---------------------------------------------------------------
  96 fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
  97 fromDocs docs = V.map fromDocs' docs
  98   where
  99     fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
 100
 101 ---------------------------------------------------------------
 102 -- | Split a document in its context
 103 -- TODO adapt the size of the paragraph according to the corpus average
 104
 105 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
 106 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
 107                           if docSize > 1000
 108                             then
 109                               if (mod (round m) docSize) >= 10
 110                                 then
 111                                   splitDoc' splt doc
 112                                 else
 113                                   V.fromList [doc]
 114                             else
 115                               V.fromList [doc]
 116
 117
 118 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
 119 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
 120     where
 121       firstDoc = CsvDoc t s py pm pd firstAbstract auth
 122       firstAbstract = head' "splitDoc'1" abstracts
 123
 124       nextDocs = map (\txt -> CsvDoc
 125                                 (head' "splitDoc'2" $ sentences txt)
 126                                 s py pm pd
 127                                 (unsentences $ tail' "splitDoc'1" $ sentences txt)
 128                                 auth
 129                       ) (tail' "splitDoc'2" abstracts)
 130
 131       abstracts    = (splitBy $ contextSize) abst
 132
 133 ---------------------------------------------------------------
 134 ---------------------------------------------------------------
 135 type Mean = Double
 136
 137 docsSize :: Vector CsvDoc -> Mean
 138 docsSize csvDoc = mean ls
 139   where
 140     ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
 141
 142
 143 ---------------------------------------------------------------
 144 data CsvDoc = CsvDoc
 145     { csv_title  :: !Text
 146     , csv_source :: !Text
 147     , csv_publication_year  :: !Int
 148     , csv_publication_month :: !Int
 149     , csv_publication_day   :: !Int
 150     , csv_abstract          :: !Text
 151     , csv_authors           :: !Text
 152     }
 153     deriving (Show)
 154
 155 instance FromNamedRecord CsvDoc where
 156   parseNamedRecord r = CsvDoc <$> r .: "title"
 157                               <*> r .: "source"
 158                               <*> r .: "publication_year"
 159                               <*> r .: "publication_month"
 160                               <*> r .: "publication_day"
 161                               <*> r .: "abstract"
 162                               <*> r .: "authors"
 163
 164 instance ToNamedRecord CsvDoc where
 165   toNamedRecord (CsvDoc t s py pm pd abst aut) =
 166     namedRecord [ "title"  .= t
 167                 , "source" .= s
 168                 , "publication_year"  .= py
 169                 , "publication_month" .= pm
 170                 , "publication_day"   .= pd
 171                 , "abstract"          .= abst
 172                 , "authors"           .= aut
 173                ]
 174
 175 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
 176 hyperdataDocument2csvDoc h = CsvDoc (m $ _hyperdataDocument_title h)
 177                                     (m $ _hyperdataDocument_source h)
 178                                     (mI $ _hyperdataDocument_publication_year h)
 179                                     (mI $ _hyperdataDocument_publication_month h)
 180                                     (mI $ _hyperdataDocument_publication_day   h)
 181                                     (m $ _hyperdataDocument_abstract h)
 182                                     (m $ _hyperdataDocument_authors h)
 183
 184   where
 185     m = maybe "" identity
 186     mI = maybe 0 identity
 187
 188
 189 csvDecodeOptions :: DecodeOptions
 190 csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
 191
 192 csvEncodeOptions :: EncodeOptions
 193 csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
 194
 195 delimiter :: Word8
 196 delimiter = fromIntegral $ ord '\t'
 197 ------------------------------------------------------------------------
 198 ------------------------------------------------------------------------
 199 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
 200 readCsvOn fields fp = V.toList
 201                    <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
 202                    <$> snd
 203                    <$> readFile fp
 204
 205 ------------------------------------------------------------------------
 206
 207 readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
 208 readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
 209
 210 readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
 211 readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
 212
 213 readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a)
 214 readByteStringLazy _f bs = case decodeByNameWith csvDecodeOptions bs of
 215       Left e        -> panic (pack e)
 216       Right csvDocs -> csvDocs
 217
 218 readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a)
 219 readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
 220
 221 ------------------------------------------------------------------------
 222 -- | TODO use readFileLazy
 223 readFile :: FilePath -> IO (Header, Vector CsvDoc)
 224 readFile = fmap readCsvLazyBS . BL.readFile
 225
 226
 227 -- | TODO use readByteStringLazy
 228 readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
 229 readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
 230       Left e        -> panic (pack e)
 231       Right csvDocs -> csvDocs
 232
 233 ------------------------------------------------------------------------
 234 -- | TODO use readFileLazy
 235 readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
 236 readCsvHal = fmap readCsvHalLazyBS . BL.readFile
 237
 238 -- | TODO use readByteStringLazy
 239 readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
 240 readCsvHalLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
 241       Left e        -> panic (pack e)
 242       Right csvDocs -> csvDocs
 243
 244 readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
 245 readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
 246
 247 ------------------------------------------------------------------------
 248 writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
 249 writeFile fp (h, vs) = BL.writeFile fp $
 250                       encodeByNameWith csvEncodeOptions h (V.toList vs)
 251
 252 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
 253 writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
 254
 255 hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
 256 hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
 257
 258 ------------------------------------------------------------------------
 259 -- Hal Format
 260 data CsvHal = CsvHal
 261     { csvHal_title  :: !Text
 262     , csvHal_source :: !Text
 263     , csvHal_publication_year  :: !Integer
 264     , csvHal_publication_month :: !Int
 265     , csvHal_publication_day   :: !Int
 266     , csvHal_abstract          :: !Text
 267     , csvHal_authors           :: !Text
 268
 269     , csvHal_url               :: !Text
 270     , csvHal_isbn_s            :: !Text
 271     , csvHal_issue_s           :: !Text
 272     , csvHal_journalPublisher_s:: !Text
 273     , csvHal_language_s        :: !Text
 274
 275     , csvHal_doiId_s           :: !Text
 276     , csvHal_authId_i          :: !Text
 277     , csvHal_instStructId_i    :: !Text
 278     , csvHal_deptStructId_i    :: !Text
 279     , csvHal_labStructId_i     :: !Text
 280
 281     , csvHal_rteamStructId_i   :: !Text
 282     , csvHal_docType_s         :: !Text
 283     }
 284     deriving (Show)
 285
 286 instance FromNamedRecord CsvHal where
 287   parseNamedRecord r = CsvHal <$> r .: "title"
 288                               <*> r .: "source"
 289                               <*> r .: "publication_year"
 290                               <*> r .: "publication_month"
 291                               <*> r .: "publication_day"
 292                               <*> r .: "abstract"
 293                               <*> r .: "authors"
 294
 295                               <*> r .: "url"
 296                               <*> r .: "isbn_s"
 297                               <*> r .: "issue_s"
 298                               <*> r .: "journalPublisher_s"
 299                               <*> r .: "language_s"
 300
 301                               <*> r .: "doiId_s"
 302                               <*> r .: "authId_i"
 303                               <*> r .: "instStructId_i"
 304                               <*> r .: "deptStructId_i"
 305                               <*> r .: "labStructId_i"
 306
 307                               <*> r .: "rteamStructId_i"
 308                               <*> r .: "docType_s"
 309
 310 instance ToNamedRecord CsvHal where
 311   toNamedRecord (CsvHal t s py  pm pd abst aut  url isbn iss j lang  doi auth inst dept lab team doct) =
 312     namedRecord [ "title"  .= t
 313                 , "source" .= s
 314
 315                 , "publication_year"  .= py
 316                 , "publication_month" .= pm
 317                 , "publication_day"   .= pd
 318
 319                 , "abstract"          .= abst
 320                 , "authors"           .= aut
 321
 322                 , "url"                .= url
 323                 , "isbn_s"             .= isbn
 324                 , "issue_s"            .= iss
 325                 , "journalPublisher_s" .= j
 326                 , "language_s"         .= lang
 327
 328                 , "doiId_s"            .= doi
 329                 , "authId_i"           .= auth
 330                 , "instStructId_i"     .= inst
 331                 , "deptStructId_i"     .= dept
 332                 , "labStructId_i"      .= lab
 333
 334                 , "rteamStructId_i"    .= team
 335                 , "docType_s"          .= doct
 336                ]
 337
 338 csvHal2doc :: CsvHal -> HyperdataDocument
 339 csvHal2doc (CsvHal title source
 340        pub_year pub_month pub_day
 341        abstract authors
 342        url _ _ _ _
 343        doi _ inst _ _
 344        _ _ ) = HyperdataDocument (Just "CsvHal")
 345                                (Just doi)
 346                                (Just url)
 347                                Nothing
 348                                Nothing
 349                                Nothing
 350                                (Just title)
 351                                (Just authors)
 352                                (Just inst)
 353                                (Just source)
 354                                (Just abstract)
 355                                (Just $ pack . show $ jour pub_year pub_month pub_day)
 356                                (Just $ fromIntegral pub_year)
 357                                (Just pub_month)
 358                                (Just pub_day)
 359                                Nothing
 360                                Nothing
 361                                Nothing
 362                                Nothing
 363
 364 ------------------------------------------------------------------------
 365 parseHal :: FilePath -> IO [HyperdataDocument]
 366 parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readCsvHal fp
 367 ------------------------------------------------------------------------
 368