src/Gargantext/Text/Parsers/CSV.hs

   1 {-|
   2 Module      : Gargantext.Text.Parsers.CSV
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 CSV parser for Gargantext corpus files.
  11
  12 -}
  13
  14 {-# LANGUAGE NoImplicitPrelude #-}
  15 {-# LANGUAGE OverloadedStrings #-}
  16 {-# LANGUAGE DeriveGeneric     #-}
  17
  18 module Gargantext.Text.Parsers.CSV where
  19
  20 import GHC.Real (round)
  21 import GHC.IO (FilePath)
  22
  23 import Control.Applicative
  24
  25 import Data.Char (ord)
  26 import Data.Csv
  27 import Data.Either (Either(Left, Right))
  28 import Data.Text (Text, pack, length, intercalate)
  29 import qualified Data.ByteString.Lazy as BL
  30 import Data.Time.Segment (jour)
  31
  32 import Data.Vector (Vector)
  33 import qualified Data.Vector as V
  34 import Safe (tailMay)
  35
  36 import Gargantext.Database.Types.Node (HyperdataDocument(..))
  37 import Gargantext.Text
  38 import Gargantext.Text.Context
  39 import Gargantext.Prelude hiding (length)
  40
  41 ---------------------------------------------------------------
  42 headerCsvGargV3 :: Header
  43 headerCsvGargV3 = header [ "title"
  44          , "source"
  45          , "publication_year"
  46          , "publication_month"
  47          , "publication_day"
  48          , "abstract"
  49          , "authors"
  50          ]
  51 ---------------------------------------------------------------
  52 data Doc = Doc
  53     { d_docId  :: !Int
  54     , d_title  :: !Text
  55     , d_source :: !Text
  56     , d_publication_year  :: !Int
  57     , d_publication_month :: !Int
  58     , d_publication_day   :: !Int
  59     , d_abstract          :: !Text
  60     , d_authors           :: !Text
  61     }
  62     deriving (Show)
  63 ---------------------------------------------------------------
  64 -- | Doc 2 HyperdataDocument
  65 doc2hyperdataDocument :: Doc -> HyperdataDocument
  66 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
  67 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
  68   HyperdataDocument (Just "CSV")
  69                     (Just . pack . show $ did)
  70                     Nothing
  71                     Nothing
  72                     Nothing
  73                     Nothing
  74                     (Just dt)
  75                     Nothing
  76                     (Just dau)
  77                     (Just dab)
  78                     (Nothing)
  79                     Nothing
  80                     (Just dpy)
  81                     (Just dpm)
  82                     (Just dpd)
  83                     Nothing
  84                     Nothing
  85                     Nothing
  86                     Nothing
  87 ---------------------------------------------------------------
  88 -- | Types Conversions
  89 toDocs :: Vector CsvDoc -> [Doc]
  90 toDocs v = V.toList
  91          $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
  92                        -> Doc nId t s py pm pd abst auth )
  93                        (V.enumFromN 1 (V.length v'')) v''
  94           where
  95             v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
  96             seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
  97
  98 ---------------------------------------------------------------
  99 fromDocs :: Vector Doc -> Vector CsvDoc
 100 fromDocs docs = V.map fromDocs' docs
 101   where
 102     fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
 103
 104 ---------------------------------------------------------------
 105 -- | Split a document in its context
 106 -- TODO adapt the size of the paragraph according to the corpus average
 107
 108 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
 109 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
 110                           if docSize > 1000
 111                             then
 112                               if (mod (round m) docSize) >= 10
 113                                 then
 114                                   splitDoc' splt doc
 115                                 else
 116                                   V.fromList [doc]
 117                             else
 118                               V.fromList [doc]
 119
 120
 121 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
 122 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
 123     where
 124       firstDoc = CsvDoc t s py pm pd firstAbstract auth
 125       firstAbstract = head' "splitDoc'1" abstracts
 126
 127       nextDocs = map (\txt -> CsvDoc (head' "splitDoc'2" $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
 128
 129       abstracts    = (splitBy $ contextSize) abst
 130       tail' x = maybe [""] identity (tailMay x)
 131
 132 ---------------------------------------------------------------
 133 ---------------------------------------------------------------
 134 type Mean = Double
 135
 136 docsSize :: Vector CsvDoc -> Mean
 137 docsSize csvDoc = mean ls
 138   where
 139     ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
 140
 141
 142 ---------------------------------------------------------------
 143 data CsvDoc = CsvDoc
 144     { csv_title  :: !Text
 145     , csv_source :: !Text
 146     , csv_publication_year  :: !Int
 147     , csv_publication_month :: !Int
 148     , csv_publication_day   :: !Int
 149     , csv_abstract          :: !Text
 150     , csv_authors           :: !Text
 151     }
 152     deriving (Show)
 153
 154 instance FromNamedRecord CsvDoc where
 155   parseNamedRecord r = CsvDoc <$> r .: "title"
 156                               <*> r .: "source"
 157                               <*> r .: "publication_year"
 158                               <*> r .: "publication_month"
 159                               <*> r .: "publication_day"
 160                               <*> r .: "abstract"
 161                               <*> r .: "authors"
 162
 163 instance ToNamedRecord CsvDoc where
 164   toNamedRecord (CsvDoc t s py pm pd abst aut) =
 165     namedRecord [ "title"  .= t
 166                 , "source" .= s
 167                 , "publication_year"  .= py
 168                 , "publication_month" .= pm
 169                 , "publication_day"   .= pd
 170                 , "abstract"          .= abst
 171                 , "authors"           .= aut
 172                ]
 173
 174
 175 csvDecodeOptions :: DecodeOptions
 176 csvDecodeOptions = (defaultDecodeOptions
 177                       {decDelimiter = fromIntegral $ ord '\t'}
 178                     )
 179
 180 csvEncodeOptions :: EncodeOptions
 181 csvEncodeOptions = ( defaultEncodeOptions
 182                       {encDelimiter = fromIntegral $ ord '\t'}
 183                     )
 184
 185 ------------------------------------------------------------------------
 186 ------------------------------------------------------------------------
 187 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
 188 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
 189                       <$> snd
 190                       <$> readCsv fp
 191
 192 ------------------------------------------------------------------------
 193 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
 194 readCsv fp = do
 195     csvData <- BL.readFile fp
 196     case decodeByNameWith csvDecodeOptions csvData of
 197       Left e        -> panic (pack e)
 198       Right csvDocs -> pure csvDocs
 199
 200
 201 readHal :: FilePath -> IO (Header, Vector CsvHal)
 202 readHal fp = do
 203     csvData <- BL.readFile fp
 204     case decodeByNameWith csvDecodeOptions csvData of
 205       Left e        -> panic (pack e)
 206       Right csvDocs -> pure csvDocs
 207 ------------------------------------------------------------------------
 208 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
 209 writeCsv fp (h, vs) = BL.writeFile fp $
 210                       encodeByNameWith csvEncodeOptions h (V.toList vs)
 211
 212
 213 ------------------------------------------------------------------------
 214 -- Hal Format
 215 data CsvHal = CsvHal
 216     { csvHal_title  :: !Text
 217     , csvHal_source :: !Text
 218     , csvHal_publication_year  :: !Integer
 219     , csvHal_publication_month :: !Int
 220     , csvHal_publication_day   :: !Int
 221     , csvHal_abstract          :: !Text
 222     , csvHal_authors           :: !Text
 223
 224     , csvHal_url               :: !Text
 225     , csvHal_isbn_s            :: !Text
 226     , csvHal_issue_s           :: !Text
 227     , csvHal_journalPublisher_s:: !Text
 228     , csvHal_language_s        :: !Text
 229
 230     , csvHal_doiId_s           :: !Text
 231     , csvHal_authId_i          :: !Text
 232     , csvHal_instStructId_i    :: !Text
 233     , csvHal_deptStructId_i    :: !Text
 234     , csvHal_labStructId_i     :: !Text
 235
 236     , csvHal_rteamStructId_i   :: !Text
 237     , csvHal_docType_s         :: !Text
 238     }
 239     deriving (Show)
 240
 241 instance FromNamedRecord CsvHal where
 242   parseNamedRecord r = CsvHal <$> r .: "title"
 243                               <*> r .: "source"
 244                               <*> r .: "publication_year"
 245                               <*> r .: "publication_month"
 246                               <*> r .: "publication_day"
 247                               <*> r .: "abstract"
 248                               <*> r .: "authors"
 249
 250                               <*> r .: "url"
 251                               <*> r .: "isbn_s"
 252                               <*> r .: "issue_s"
 253                               <*> r .: "journalPublisher_s"
 254                               <*> r .: "language_s"
 255
 256                               <*> r .: "doiId_s"
 257                               <*> r .: "authId_i"
 258                               <*> r .: "instStructId_i"
 259                               <*> r .: "deptStructId_i"
 260                               <*> r .: "labStructId_i"
 261
 262                               <*> r .: "rteamStructId_i"
 263                               <*> r .: "docType_s"
 264
 265 instance ToNamedRecord CsvHal where
 266   toNamedRecord (CsvHal t s py  pm pd abst aut  url isbn iss j lang  doi auth inst dept lab team doct) =
 267     namedRecord [ "title"  .= t
 268                 , "source" .= s
 269
 270                 , "publication_year"  .= py
 271                 , "publication_month" .= pm
 272                 , "publication_day"   .= pd
 273
 274                 , "abstract"          .= abst
 275                 , "authors"           .= aut
 276
 277                 , "url"                .= url
 278                 , "isbn_s"             .= isbn
 279                 , "issue_s"            .= iss
 280                 , "journalPublisher_s" .= j
 281                 , "language_s"         .= lang
 282
 283                 , "doiId_s"            .= doi
 284                 , "authId_i"           .= auth
 285                 , "instStructId_i"     .= inst
 286                 , "deptStructId_i"     .= dept
 287                 , "labStructId_i"      .= lab
 288
 289                 , "rteamStructId_i"    .= team
 290                 , "docType_s"          .= doct
 291                ]
 292
 293 csvHal2doc :: CsvHal -> HyperdataDocument
 294 csvHal2doc (CsvHal title source
 295        pub_year pub_month pub_day
 296        abstract authors
 297        url _ _ _ _
 298        doi _ inst _ _
 299        _ _ ) = HyperdataDocument (Just "CsvHal")
 300                                (Just doi)
 301                                (Just url)
 302                                Nothing
 303                                Nothing
 304                                Nothing
 305                                (Just title)
 306                                (Just authors)
 307                                (Just inst)
 308                                (Just source)
 309                                (Just abstract)
 310                                (Just $ pack . show $ jour pub_year pub_month pub_day)
 311                                (Just $ fromIntegral pub_year)
 312                                (Just pub_month)
 313                                (Just pub_day)
 314                                Nothing
 315                                Nothing
 316                                Nothing
 317                                Nothing
 318
 319 ------------------------------------------------------------------------
 320 parseHal :: FilePath -> IO [HyperdataDocument]
 321 parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readHal fp
 322 ------------------------------------------------------------------------
 323
 324