src/Gargantext/Text/Parsers/CSV.hs

   1 {-|
   2 Module      : Gargantext.Text.Parsers.CSV
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 CSV parser for Gargantext corpus files.
  11
  12 -}
  13
  14 {-# LANGUAGE NoImplicitPrelude #-}
  15 {-# LANGUAGE OverloadedStrings #-}
  16 {-# LANGUAGE DeriveGeneric     #-}
  17
  18 module Gargantext.Text.Parsers.CSV where
  19
  20 import GHC.Real (round)
  21 import GHC.IO (FilePath)
  22
  23 import Control.Applicative
  24
  25 import Data.Char (ord)
  26 import Data.Csv
  27 import Data.Either (Either(Left, Right))
  28 import Data.Text (Text, pack, length, intercalate)
  29 import qualified Data.ByteString.Lazy as BL
  30
  31 import Data.Vector (Vector)
  32 import qualified Data.Vector as V
  33 import Safe (tailMay)
  34
  35 import Gargantext.Core.Types.Node (HyperdataDocument(..))
  36 import Gargantext.Text
  37 import Gargantext.Text.Context
  38 import Gargantext.Prelude hiding (length)
  39
  40 ---------------------------------------------------------------
  41 data Doc = Doc
  42     { d_docId  :: !Int
  43     , d_title  :: !Text
  44     , d_source :: !Text
  45     , d_publication_year  :: !Int
  46     , d_publication_month :: !Int
  47     , d_publication_day   :: !Int
  48     , d_abstract          :: !Text
  49     , d_authors           :: !Text
  50     }
  51     deriving (Show)
  52 ---------------------------------------------------------------
  53 -- | Doc 2 HyperdataDocument
  54 doc2hyperdataDocument :: Doc -> HyperdataDocument
  55 doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
  56   HyperdataDocument (Just "CSV")
  57                     (Just did)
  58                     Nothing
  59                     Nothing
  60                     (Just dt)
  61                     (Just dau)
  62                     (Just ds)
  63                     (Just dab)
  64                     (Nothing)
  65                     Nothing
  66                     (Just dpy)
  67                     (Just dpm)
  68                     (Just dpd)
  69                     Nothing
  70                     Nothing
  71                     Nothing
  72 ---------------------------------------------------------------
  73 -- | Types Conversions
  74 toDocs :: Vector CsvDoc -> [Doc]
  75 toDocs v = V.toList
  76          $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
  77                        -> Doc nId t s py pm pd abst auth )
  78                        (V.enumFromN 1 (V.length v'')) v''
  79           where
  80             v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
  81             seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
  82
  83 ---------------------------------------------------------------
  84 fromDocs :: Vector Doc -> Vector CsvDoc
  85 fromDocs docs = V.map fromDocs' docs
  86   where
  87     fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
  88
  89 ---------------------------------------------------------------
  90 -- | Split a document in its context
  91 -- TODO adapt the size of the paragraph according to the corpus average
  92
  93 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
  94 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
  95                           if docSize > 1000
  96                             then
  97                               if (mod (round m) docSize) >= 10
  98                                 then
  99                                   splitDoc' splt doc
 100                                 else
 101                                   V.fromList [doc]
 102                             else
 103                               V.fromList [doc]
 104
 105
 106 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
 107 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
 108     where
 109       firstDoc = CsvDoc t s py pm pd firstAbstract auth
 110       firstAbstract = head' abstracts
 111
 112       nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
 113
 114       abstracts    = (splitBy $ contextSize) abst
 115       head' x = maybe ""   identity (head x)
 116       tail' x = maybe [""] identity (tailMay x)
 117
 118 ---------------------------------------------------------------
 119 ---------------------------------------------------------------
 120 type Mean = Double
 121
 122 docsSize :: Vector CsvDoc -> Mean
 123 docsSize csvDoc = mean ls
 124   where
 125     ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
 126
 127
 128 ---------------------------------------------------------------
 129 data CsvDoc = CsvDoc
 130     { csv_title  :: !Text
 131     , csv_source :: !Text
 132     , csv_publication_year  :: !Int
 133     , csv_publication_month :: !Int
 134     , csv_publication_day   :: !Int
 135     , csv_abstract          :: !Text
 136     , csv_authors           :: !Text
 137     }
 138     deriving (Show)
 139
 140 instance FromNamedRecord CsvDoc where
 141   parseNamedRecord r = CsvDoc <$> r .: "title"
 142                               <*> r .: "source"
 143                               <*> r .: "publication_year"
 144                               <*> r .: "publication_month"
 145                               <*> r .: "publication_day"
 146                               <*> r .: "abstract"
 147                               <*> r .: "authors"
 148
 149 instance ToNamedRecord CsvDoc where
 150   toNamedRecord (CsvDoc t s py pm pd abst aut) =
 151     namedRecord [ "title"  .= t
 152                 , "source" .= s
 153                 , "publication_year"  .= py
 154                 , "publication_month" .= pm
 155                 , "publication_day"   .= pd
 156                 , "abstract"          .= abst
 157                 , "authors"           .= aut
 158                ]
 159
 160
 161 csvDecodeOptions :: DecodeOptions
 162 csvDecodeOptions = (defaultDecodeOptions
 163                       {decDelimiter = fromIntegral $ ord '\t'}
 164                     )
 165
 166 csvEncodeOptions :: EncodeOptions
 167 csvEncodeOptions = ( defaultEncodeOptions
 168                       {encDelimiter = fromIntegral $ ord '\t'}
 169                     )
 170
 171 ------------------------------------------------------------------------
 172 ------------------------------------------------------------------------
 173 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
 174 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
 175                       <$> snd
 176                       <$> readCsv fp
 177
 178 ------------------------------------------------------------------------
 179 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
 180 readCsv fp = do
 181     csvData <- BL.readFile fp
 182     case decodeByNameWith csvDecodeOptions csvData of
 183       Left e        -> panic (pack e)
 184       Right csvDocs -> pure csvDocs
 185
 186
 187 readHal :: FilePath -> IO (Header, Vector CsvHal)
 188 readHal fp = do
 189     csvData <- BL.readFile fp
 190     case decodeByNameWith csvDecodeOptions csvData of
 191       Left e        -> panic (pack e)
 192       Right csvDocs -> pure csvDocs
 193 ------------------------------------------------------------------------
 194
 195
 196 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
 197 writeCsv fp (h, vs) = BL.writeFile fp $
 198                       encodeByNameWith csvEncodeOptions h (V.toList vs)
 199
 200
 201 ------------------------------------------------------------------------
 202 -- Hal Format
 203 data CsvHal = CsvHal
 204     { csvHal_title  :: !Text
 205     , csvHal_source :: !Text
 206     , csvHal_publication_year  :: !Int
 207     , csvHal_publication_month :: !Int
 208     , csvHal_publication_day   :: !Int
 209     , csvHal_abstract          :: !Text
 210     , csvHal_authors           :: !Text
 211
 212     , csvHal_url               :: !Text
 213     , csvHal_isbn_s            :: !Text
 214     , csvHal_issue_s           :: !Text
 215     , csvHal_journalPublisher_s:: !Text
 216     , csvHal_language_s        :: !Text
 217
 218     , csvHal_doiId_s           :: !Text
 219     , csvHal_authId_i          :: !Text
 220     , csvHal_instStructId_i    :: !Text
 221     , csvHal_deptStructId_i    :: !Text
 222     , csvHal_labStructId_i     :: !Text
 223
 224     , csvHal_rteamStructId_i   :: !Text
 225     , csvHal_docType_s         :: !Text
 226     }
 227     deriving (Show)
 228
 229 instance FromNamedRecord CsvHal where
 230   parseNamedRecord r = CsvHal <$> r .: "title"
 231                               <*> r .: "source"
 232                               <*> r .: "publication_year"
 233                               <*> r .: "publication_month"
 234                               <*> r .: "publication_day"
 235                               <*> r .: "abstract"
 236                               <*> r .: "authors"
 237
 238                               <*> r .: "url"
 239                               <*> r .: "isbn_s"
 240                               <*> r .: "issue_s"
 241                               <*> r .: "journalPublisher_s"
 242                               <*> r .: "language_s"
 243
 244                               <*> r .: "doiId_s"
 245                               <*> r .: "authId_i"
 246                               <*> r .: "instStructId_i"
 247                               <*> r .: "deptStructId_i"
 248                               <*> r .: "labStructId_i"
 249
 250                               <*> r .: "rteamStructId_i"
 251                               <*> r .: "docType_s"
 252
 253 instance ToNamedRecord CsvHal where
 254   toNamedRecord (CsvHal t s py  pm pd abst aut  url isbn iss jour lang  doi auth inst dept lab team doct) =
 255     namedRecord [ "title"  .= t
 256                 , "source" .= s
 257                 , "publication_year"  .= py
 258                 , "publication_month" .= pm
 259                 , "publication_day"   .= pd
 260                 , "abstract"          .= abst
 261                 , "authors"           .= aut
 262
 263                 , "url"                .= url
 264                 , "isbn_s"             .= isbn
 265                 , "issue_s"            .= iss
 266                 , "journalPublisher_s" .= jour
 267                 , "language_s"         .= lang
 268
 269                 , "doiId_s"            .= doi
 270                 , "authId_i"           .= auth
 271                 , "instStructId_i"     .= inst
 272                 , "deptStructId_i"     .= dept
 273                 , "labStructId_i"      .= lab
 274
 275                 , "rteamStructId_i"    .= team
 276                 , "docType_s"          .= doct
 277                ]