src/Gargantext/Text/Parsers/CSV.hs

   1 {-|
   2 Module      : Gargantext.Text.Parsers.CSV
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 CSV parser for Gargantext corpus files.
  11
  12 -}
  13
  14 {-# LANGUAGE NoImplicitPrelude #-}
  15 {-# LANGUAGE OverloadedStrings #-}
  16 {-# LANGUAGE DeriveGeneric     #-}
  17
  18 module Gargantext.Text.Parsers.CSV where
  19
  20 import GHC.Real (round)
  21 import GHC.IO (FilePath)
  22
  23 import Control.Applicative
  24
  25 import Data.Char (ord)
  26 import Data.Csv
  27 import Data.Either (Either(Left, Right))
  28 import Data.Text (Text, pack, length, intercalate)
  29 import qualified Data.ByteString.Lazy as BL
  30
  31 import Data.Vector (Vector)
  32 import qualified Data.Vector as V
  33 import Safe (tailMay)
  34
  35 import Gargantext.Database.Types.Node (HyperdataDocument(..))
  36 import Gargantext.Text
  37 import Gargantext.Text.Context
  38 import Gargantext.Prelude hiding (length)
  39
  40 ---------------------------------------------------------------
  41 data Doc = Doc
  42     { d_docId  :: !Int
  43     , d_title  :: !Text
  44     , d_source :: !Text
  45     , d_publication_year  :: !Int
  46     , d_publication_month :: !Int
  47     , d_publication_day   :: !Int
  48     , d_abstract          :: !Text
  49     , d_authors           :: !Text
  50     }
  51     deriving (Show)
  52 ---------------------------------------------------------------
  53 -- | Doc 2 HyperdataDocument
  54 doc2hyperdataDocument :: Doc -> HyperdataDocument
  55 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
  56 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
  57   HyperdataDocument (Just "CSV")
  58                     (Just . pack . show $ did)
  59                     Nothing
  60                     Nothing
  61                     Nothing
  62                     Nothing
  63                     (Just dt)
  64                     (Just dau)
  65                     (Just dab)
  66                     (Nothing)
  67                     Nothing
  68                     (Just dpy)
  69                     (Just dpm)
  70                     (Just dpd)
  71                     Nothing
  72                     Nothing
  73                     Nothing
  74                     Nothing
  75 ---------------------------------------------------------------
  76 -- | Types Conversions
  77 toDocs :: Vector CsvDoc -> [Doc]
  78 toDocs v = V.toList
  79          $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
  80                        -> Doc nId t s py pm pd abst auth )
  81                        (V.enumFromN 1 (V.length v'')) v''
  82           where
  83             v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
  84             seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
  85
  86 ---------------------------------------------------------------
  87 fromDocs :: Vector Doc -> Vector CsvDoc
  88 fromDocs docs = V.map fromDocs' docs
  89   where
  90     fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
  91
  92 ---------------------------------------------------------------
  93 -- | Split a document in its context
  94 -- TODO adapt the size of the paragraph according to the corpus average
  95
  96 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
  97 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
  98                           if docSize > 1000
  99                             then
 100                               if (mod (round m) docSize) >= 10
 101                                 then
 102                                   splitDoc' splt doc
 103                                 else
 104                                   V.fromList [doc]
 105                             else
 106                               V.fromList [doc]
 107
 108
 109 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
 110 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
 111     where
 112       firstDoc = CsvDoc t s py pm pd firstAbstract auth
 113       firstAbstract = head' abstracts
 114
 115       nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
 116
 117       abstracts    = (splitBy $ contextSize) abst
 118       head' x = maybe ""   identity (head x)
 119       tail' x = maybe [""] identity (tailMay x)
 120
 121 ---------------------------------------------------------------
 122 ---------------------------------------------------------------
 123 type Mean = Double
 124
 125 docsSize :: Vector CsvDoc -> Mean
 126 docsSize csvDoc = mean ls
 127   where
 128     ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
 129
 130
 131 ---------------------------------------------------------------
 132 data CsvDoc = CsvDoc
 133     { csv_title  :: !Text
 134     , csv_source :: !Text
 135     , csv_publication_year  :: !Int
 136     , csv_publication_month :: !Int
 137     , csv_publication_day   :: !Int
 138     , csv_abstract          :: !Text
 139     , csv_authors           :: !Text
 140     }
 141     deriving (Show)
 142
 143 instance FromNamedRecord CsvDoc where
 144   parseNamedRecord r = CsvDoc <$> r .: "title"
 145                               <*> r .: "source"
 146                               <*> r .: "publication_year"
 147                               <*> r .: "publication_month"
 148                               <*> r .: "publication_day"
 149                               <*> r .: "abstract"
 150                               <*> r .: "authors"
 151
 152 instance ToNamedRecord CsvDoc where
 153   toNamedRecord (CsvDoc t s py pm pd abst aut) =
 154     namedRecord [ "title"  .= t
 155                 , "source" .= s
 156                 , "publication_year"  .= py
 157                 , "publication_month" .= pm
 158                 , "publication_day"   .= pd
 159                 , "abstract"          .= abst
 160                 , "authors"           .= aut
 161                ]
 162
 163
 164 csvDecodeOptions :: DecodeOptions
 165 csvDecodeOptions = (defaultDecodeOptions
 166                       {decDelimiter = fromIntegral $ ord '\t'}
 167                     )
 168
 169 csvEncodeOptions :: EncodeOptions
 170 csvEncodeOptions = ( defaultEncodeOptions
 171                       {encDelimiter = fromIntegral $ ord '\t'}
 172                     )
 173
 174 ------------------------------------------------------------------------
 175 ------------------------------------------------------------------------
 176 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
 177 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
 178                       <$> snd
 179                       <$> readCsv fp
 180
 181 ------------------------------------------------------------------------
 182 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
 183 readCsv fp = do
 184     csvData <- BL.readFile fp
 185     case decodeByNameWith csvDecodeOptions csvData of
 186       Left e        -> panic (pack e)
 187       Right csvDocs -> pure csvDocs
 188
 189
 190 readHal :: FilePath -> IO (Header, Vector CsvHal)
 191 readHal fp = do
 192     csvData <- BL.readFile fp
 193     case decodeByNameWith csvDecodeOptions csvData of
 194       Left e        -> panic (pack e)
 195       Right csvDocs -> pure csvDocs
 196 ------------------------------------------------------------------------
 197
 198
 199 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
 200 writeCsv fp (h, vs) = BL.writeFile fp $
 201                       encodeByNameWith csvEncodeOptions h (V.toList vs)
 202
 203
 204 ------------------------------------------------------------------------
 205 -- Hal Format
 206 data CsvHal = CsvHal
 207     { csvHal_title  :: !Text
 208     , csvHal_source :: !Text
 209     , csvHal_publication_year  :: !Int
 210     , csvHal_publication_month :: !Int
 211     , csvHal_publication_day   :: !Int
 212     , csvHal_abstract          :: !Text
 213     , csvHal_authors           :: !Text
 214
 215     , csvHal_url               :: !Text
 216     , csvHal_isbn_s            :: !Text
 217     , csvHal_issue_s           :: !Text
 218     , csvHal_journalPublisher_s:: !Text
 219     , csvHal_language_s        :: !Text
 220
 221     , csvHal_doiId_s           :: !Text
 222     , csvHal_authId_i          :: !Text
 223     , csvHal_instStructId_i    :: !Text
 224     , csvHal_deptStructId_i    :: !Text
 225     , csvHal_labStructId_i     :: !Text
 226
 227     , csvHal_rteamStructId_i   :: !Text
 228     , csvHal_docType_s         :: !Text
 229     }
 230     deriving (Show)
 231
 232 instance FromNamedRecord CsvHal where
 233   parseNamedRecord r = CsvHal <$> r .: "title"
 234                               <*> r .: "source"
 235                               <*> r .: "publication_year"
 236                               <*> r .: "publication_month"
 237                               <*> r .: "publication_day"
 238                               <*> r .: "abstract"
 239                               <*> r .: "authors"
 240
 241                               <*> r .: "url"
 242                               <*> r .: "isbn_s"
 243                               <*> r .: "issue_s"
 244                               <*> r .: "journalPublisher_s"
 245                               <*> r .: "language_s"
 246
 247                               <*> r .: "doiId_s"
 248                               <*> r .: "authId_i"
 249                               <*> r .: "instStructId_i"
 250                               <*> r .: "deptStructId_i"
 251                               <*> r .: "labStructId_i"
 252
 253                               <*> r .: "rteamStructId_i"
 254                               <*> r .: "docType_s"
 255
 256 instance ToNamedRecord CsvHal where
 257   toNamedRecord (CsvHal t s py  pm pd abst aut  url isbn iss jour lang  doi auth inst dept lab team doct) =
 258     namedRecord [ "title"  .= t
 259                 , "source" .= s
 260                 , "publication_year"  .= py
 261                 , "publication_month" .= pm
 262                 , "publication_day"   .= pd
 263                 , "abstract"          .= abst
 264                 , "authors"           .= aut
 265
 266                 , "url"                .= url
 267                 , "isbn_s"             .= isbn
 268                 , "issue_s"            .= iss
 269                 , "journalPublisher_s" .= jour
 270                 , "language_s"         .= lang
 271
 272                 , "doiId_s"            .= doi
 273                 , "authId_i"           .= auth
 274                 , "instStructId_i"     .= inst
 275                 , "deptStructId_i"     .= dept
 276                 , "labStructId_i"      .= lab
 277
 278                 , "rteamStructId_i"    .= team
 279                 , "docType_s"          .= doct
 280                ]