src/Gargantext/Text/Parsers/CSV.hs

   1 {-|
   2 Module      : Gargantext.Text.Parsers.CSV
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 CSV parser for Gargantext corpus files.
  11
  12 -}
  13
  14 {-# LANGUAGE NoImplicitPrelude #-}
  15 {-# LANGUAGE OverloadedStrings #-}
  16 {-# LANGUAGE DeriveGeneric     #-}
  17
  18 module Gargantext.Text.Parsers.CSV where
  19
  20 import GHC.Real (round)
  21 import GHC.IO (FilePath)
  22
  23 import Control.Applicative
  24
  25 import Data.Char (ord)
  26 import Data.Csv
  27 import Data.Either (Either(Left, Right))
  28 import Data.Text (Text, pack, length, intercalate)
  29 import qualified Data.ByteString.Lazy as BL
  30 import Data.Time.Segment (jour)
  31
  32 import Data.Vector (Vector)
  33 import qualified Data.Vector as V
  34 import Safe (tailMay)
  35
  36 import Gargantext.Database.Types.Node (HyperdataDocument(..))
  37 import Gargantext.Text
  38 import Gargantext.Text.Context
  39 import Gargantext.Prelude hiding (length)
  40
  41 ---------------------------------------------------------------
  42 headerCsvGargV3 :: Header
  43 headerCsvGargV3 = header [ "title"
  44          , "source"
  45          , "publication_year"
  46          , "publication_month"
  47          , "publication_day"
  48          , "abstract"
  49          , "authors"
  50          ]
  51 ---------------------------------------------------------------
  52 data Doc = Doc
  53     { d_docId  :: !Int
  54     , d_title  :: !Text
  55     , d_source :: !Text
  56     , d_publication_year  :: !Int
  57     , d_publication_month :: !Int
  58     , d_publication_day   :: !Int
  59     , d_abstract          :: !Text
  60     , d_authors           :: !Text
  61     }
  62     deriving (Show)
  63 ---------------------------------------------------------------
  64 -- | Doc 2 HyperdataDocument
  65 doc2hyperdataDocument :: Doc -> HyperdataDocument
  66 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
  67 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
  68   HyperdataDocument (Just "CSV")
  69                     (Just . pack . show $ did)
  70                     Nothing
  71                     Nothing
  72                     Nothing
  73                     Nothing
  74                     (Just dt)
  75                     Nothing
  76                     (Just dau)
  77                     (Just dab)
  78                     (Nothing)
  79                     Nothing
  80                     (Just dpy)
  81                     (Just dpm)
  82                     (Just dpd)
  83                     Nothing
  84                     Nothing
  85                     Nothing
  86                     Nothing
  87 ---------------------------------------------------------------
  88 -- | Types Conversions
  89 toDocs :: Vector CsvDoc -> [Doc]
  90 toDocs v = V.toList
  91          $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
  92                        -> Doc nId t s py pm pd abst auth )
  93                        (V.enumFromN 1 (V.length v'')) v''
  94           where
  95             v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
  96             seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
  97
  98 ---------------------------------------------------------------
  99 fromDocs :: Vector Doc -> Vector CsvDoc
 100 fromDocs docs = V.map fromDocs' docs
 101   where
 102     fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
 103
 104 ---------------------------------------------------------------
 105 -- | Split a document in its context
 106 -- TODO adapt the size of the paragraph according to the corpus average
 107
 108 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
 109 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
 110                           if docSize > 1000
 111                             then
 112                               if (mod (round m) docSize) >= 10
 113                                 then
 114                                   splitDoc' splt doc
 115                                 else
 116                                   V.fromList [doc]
 117                             else
 118                               V.fromList [doc]
 119
 120
 121 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
 122 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
 123     where
 124       firstDoc = CsvDoc t s py pm pd firstAbstract auth
 125       firstAbstract = head' abstracts
 126
 127       nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
 128
 129       abstracts    = (splitBy $ contextSize) abst
 130       head' x = maybe ""   identity (head x)
 131       tail' x = maybe [""] identity (tailMay x)
 132
 133 ---------------------------------------------------------------
 134 ---------------------------------------------------------------
 135 type Mean = Double
 136
 137 docsSize :: Vector CsvDoc -> Mean
 138 docsSize csvDoc = mean ls
 139   where
 140     ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
 141
 142
 143 ---------------------------------------------------------------
 144 data CsvDoc = CsvDoc
 145     { csv_title  :: !Text
 146     , csv_source :: !Text
 147     , csv_publication_year  :: !Int
 148     , csv_publication_month :: !Int
 149     , csv_publication_day   :: !Int
 150     , csv_abstract          :: !Text
 151     , csv_authors           :: !Text
 152     }
 153     deriving (Show)
 154
 155 instance FromNamedRecord CsvDoc where
 156   parseNamedRecord r = CsvDoc <$> r .: "title"
 157                               <*> r .: "source"
 158                               <*> r .: "publication_year"
 159                               <*> r .: "publication_month"
 160                               <*> r .: "publication_day"
 161                               <*> r .: "abstract"
 162                               <*> r .: "authors"
 163
 164 instance ToNamedRecord CsvDoc where
 165   toNamedRecord (CsvDoc t s py pm pd abst aut) =
 166     namedRecord [ "title"  .= t
 167                 , "source" .= s
 168                 , "publication_year"  .= py
 169                 , "publication_month" .= pm
 170                 , "publication_day"   .= pd
 171                 , "abstract"          .= abst
 172                 , "authors"           .= aut
 173                ]
 174
 175
 176 csvDecodeOptions :: DecodeOptions
 177 csvDecodeOptions = (defaultDecodeOptions
 178                       {decDelimiter = fromIntegral $ ord '\t'}
 179                     )
 180
 181 csvEncodeOptions :: EncodeOptions
 182 csvEncodeOptions = ( defaultEncodeOptions
 183                       {encDelimiter = fromIntegral $ ord '\t'}
 184                     )
 185
 186 ------------------------------------------------------------------------
 187 ------------------------------------------------------------------------
 188 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
 189 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
 190                       <$> snd
 191                       <$> readCsv fp
 192
 193 ------------------------------------------------------------------------
 194 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
 195 readCsv fp = do
 196     csvData <- BL.readFile fp
 197     case decodeByNameWith csvDecodeOptions csvData of
 198       Left e        -> panic (pack e)
 199       Right csvDocs -> pure csvDocs
 200
 201
 202 readHal :: FilePath -> IO (Header, Vector CsvHal)
 203 readHal fp = do
 204     csvData <- BL.readFile fp
 205     case decodeByNameWith csvDecodeOptions csvData of
 206       Left e        -> panic (pack e)
 207       Right csvDocs -> pure csvDocs
 208 ------------------------------------------------------------------------
 209 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
 210 writeCsv fp (h, vs) = BL.writeFile fp $
 211                       encodeByNameWith csvEncodeOptions h (V.toList vs)
 212
 213
 214 ------------------------------------------------------------------------
 215 -- Hal Format
 216 data CsvHal = CsvHal
 217     { csvHal_title  :: !Text
 218     , csvHal_source :: !Text
 219     , csvHal_publication_year  :: !Integer
 220     , csvHal_publication_month :: !Int
 221     , csvHal_publication_day   :: !Int
 222     , csvHal_abstract          :: !Text
 223     , csvHal_authors           :: !Text
 224
 225     , csvHal_url               :: !Text
 226     , csvHal_isbn_s            :: !Text
 227     , csvHal_issue_s           :: !Text
 228     , csvHal_journalPublisher_s:: !Text
 229     , csvHal_language_s        :: !Text
 230
 231     , csvHal_doiId_s           :: !Text
 232     , csvHal_authId_i          :: !Text
 233     , csvHal_instStructId_i    :: !Text
 234     , csvHal_deptStructId_i    :: !Text
 235     , csvHal_labStructId_i     :: !Text
 236
 237     , csvHal_rteamStructId_i   :: !Text
 238     , csvHal_docType_s         :: !Text
 239     }
 240     deriving (Show)
 241
 242 instance FromNamedRecord CsvHal where
 243   parseNamedRecord r = CsvHal <$> r .: "title"
 244                               <*> r .: "source"
 245                               <*> r .: "publication_year"
 246                               <*> r .: "publication_month"
 247                               <*> r .: "publication_day"
 248                               <*> r .: "abstract"
 249                               <*> r .: "authors"
 250
 251                               <*> r .: "url"
 252                               <*> r .: "isbn_s"
 253                               <*> r .: "issue_s"
 254                               <*> r .: "journalPublisher_s"
 255                               <*> r .: "language_s"
 256
 257                               <*> r .: "doiId_s"
 258                               <*> r .: "authId_i"
 259                               <*> r .: "instStructId_i"
 260                               <*> r .: "deptStructId_i"
 261                               <*> r .: "labStructId_i"
 262
 263                               <*> r .: "rteamStructId_i"
 264                               <*> r .: "docType_s"
 265
 266 instance ToNamedRecord CsvHal where
 267   toNamedRecord (CsvHal t s py  pm pd abst aut  url isbn iss j lang  doi auth inst dept lab team doct) =
 268     namedRecord [ "title"  .= t
 269                 , "source" .= s
 270
 271                 , "publication_year"  .= py
 272                 , "publication_month" .= pm
 273                 , "publication_day"   .= pd
 274
 275                 , "abstract"          .= abst
 276                 , "authors"           .= aut
 277
 278                 , "url"                .= url
 279                 , "isbn_s"             .= isbn
 280                 , "issue_s"            .= iss
 281                 , "journalPublisher_s" .= j
 282                 , "language_s"         .= lang
 283
 284                 , "doiId_s"            .= doi
 285                 , "authId_i"           .= auth
 286                 , "instStructId_i"     .= inst
 287                 , "deptStructId_i"     .= dept
 288                 , "labStructId_i"      .= lab
 289
 290                 , "rteamStructId_i"    .= team
 291                 , "docType_s"          .= doct
 292                ]
 293
 294 csvHal2doc :: CsvHal -> HyperdataDocument
 295 csvHal2doc (CsvHal title source
 296        pub_year pub_month pub_day
 297        abstract authors
 298        url _ _ _ _
 299        doi _ inst _ _
 300        _ _ ) = HyperdataDocument (Just "CsvHal")
 301                                (Just doi)
 302                                (Just url)
 303                                Nothing
 304                                Nothing
 305                                Nothing
 306                                (Just title)
 307                                (Just authors)
 308                                (Just inst)
 309                                (Just source)
 310                                (Just abstract)
 311                                (Just $ pack . show $ jour pub_year pub_month pub_day)
 312                                (Just $ fromIntegral pub_year)
 313                                (Just pub_month)
 314                                (Just pub_day)
 315                                Nothing
 316                                Nothing
 317                                Nothing
 318                                Nothing
 319
 320 ------------------------------------------------------------------------
 321 parseHal :: FilePath -> IO [HyperdataDocument]
 322 parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readHal fp
 323 ------------------------------------------------------------------------
 324
 325