src/Gargantext/Text/Parsers/CSV.hs

   1 {-|
   2 Module      : Gargantext.Text.Parsers.CSV
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 CSV parser for Gargantext corpus files.
  11
  12 -}
  13
  14 {-# LANGUAGE NoImplicitPrelude #-}
  15 {-# LANGUAGE OverloadedStrings #-}
  16 {-# LANGUAGE DeriveGeneric     #-}
  17
  18 module Gargantext.Text.Parsers.CSV where
  19
  20 import GHC.Real (round)
  21 import GHC.IO (FilePath)
  22
  23 import Control.Applicative
  24
  25 import Data.Char (ord)
  26 import Data.Csv
  27 import Data.Either (Either(Left, Right))
  28 import Data.Text (Text, pack, length, intercalate)
  29 import qualified Data.ByteString.Lazy as BL
  30
  31 import Data.Vector (Vector)
  32 import qualified Data.Vector as V
  33 import Safe (tailMay)
  34
  35 import Gargantext.Database.Types.Node (HyperdataDocument(..))
  36 import Gargantext.Text
  37 import Gargantext.Text.Context
  38 import Gargantext.Prelude hiding (length)
  39
  40 ---------------------------------------------------------------
  41 data Doc = Doc
  42     { d_docId  :: !Int
  43     , d_title  :: !Text
  44     , d_source :: !Text
  45     , d_publication_year  :: !Int
  46     , d_publication_month :: !Int
  47     , d_publication_day   :: !Int
  48     , d_abstract          :: !Text
  49     , d_authors           :: !Text
  50     }
  51     deriving (Show)
  52 ---------------------------------------------------------------
  53 -- | Doc 2 HyperdataDocument
  54 doc2hyperdataDocument :: Doc -> HyperdataDocument
  55 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
  56 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
  57   HyperdataDocument (Just "CSV")
  58                     (Just did)
  59                     Nothing
  60                     Nothing
  61                     Nothing
  62                     (Just dt)
  63                     (Just dau)
  64                     (Just dab)
  65                     (Nothing)
  66                     Nothing
  67                     (Just dpy)
  68                     (Just dpm)
  69                     (Just dpd)
  70                     Nothing
  71                     Nothing
  72                     Nothing
  73                     Nothing
  74 ---------------------------------------------------------------
  75 -- | Types Conversions
  76 toDocs :: Vector CsvDoc -> [Doc]
  77 toDocs v = V.toList
  78          $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
  79                        -> Doc nId t s py pm pd abst auth )
  80                        (V.enumFromN 1 (V.length v'')) v''
  81           where
  82             v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
  83             seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
  84
  85 ---------------------------------------------------------------
  86 fromDocs :: Vector Doc -> Vector CsvDoc
  87 fromDocs docs = V.map fromDocs' docs
  88   where
  89     fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
  90
  91 ---------------------------------------------------------------
  92 -- | Split a document in its context
  93 -- TODO adapt the size of the paragraph according to the corpus average
  94
  95 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
  96 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
  97                           if docSize > 1000
  98                             then
  99                               if (mod (round m) docSize) >= 10
 100                                 then
 101                                   splitDoc' splt doc
 102                                 else
 103                                   V.fromList [doc]
 104                             else
 105                               V.fromList [doc]
 106
 107
 108 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
 109 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
 110     where
 111       firstDoc = CsvDoc t s py pm pd firstAbstract auth
 112       firstAbstract = head' abstracts
 113
 114       nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
 115
 116       abstracts    = (splitBy $ contextSize) abst
 117       head' x = maybe ""   identity (head x)
 118       tail' x = maybe [""] identity (tailMay x)
 119
 120 ---------------------------------------------------------------
 121 ---------------------------------------------------------------
 122 type Mean = Double
 123
 124 docsSize :: Vector CsvDoc -> Mean
 125 docsSize csvDoc = mean ls
 126   where
 127     ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
 128
 129
 130 ---------------------------------------------------------------
 131 data CsvDoc = CsvDoc
 132     { csv_title  :: !Text
 133     , csv_source :: !Text
 134     , csv_publication_year  :: !Int
 135     , csv_publication_month :: !Int
 136     , csv_publication_day   :: !Int
 137     , csv_abstract          :: !Text
 138     , csv_authors           :: !Text
 139     }
 140     deriving (Show)
 141
 142 instance FromNamedRecord CsvDoc where
 143   parseNamedRecord r = CsvDoc <$> r .: "title"
 144                               <*> r .: "source"
 145                               <*> r .: "publication_year"
 146                               <*> r .: "publication_month"
 147                               <*> r .: "publication_day"
 148                               <*> r .: "abstract"
 149                               <*> r .: "authors"
 150
 151 instance ToNamedRecord CsvDoc where
 152   toNamedRecord (CsvDoc t s py pm pd abst aut) =
 153     namedRecord [ "title"  .= t
 154                 , "source" .= s
 155                 , "publication_year"  .= py
 156                 , "publication_month" .= pm
 157                 , "publication_day"   .= pd
 158                 , "abstract"          .= abst
 159                 , "authors"           .= aut
 160                ]
 161
 162
 163 csvDecodeOptions :: DecodeOptions
 164 csvDecodeOptions = (defaultDecodeOptions
 165                       {decDelimiter = fromIntegral $ ord '\t'}
 166                     )
 167
 168 csvEncodeOptions :: EncodeOptions
 169 csvEncodeOptions = ( defaultEncodeOptions
 170                       {encDelimiter = fromIntegral $ ord '\t'}
 171                     )
 172
 173 ------------------------------------------------------------------------
 174 ------------------------------------------------------------------------
 175 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
 176 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
 177                       <$> snd
 178                       <$> readCsv fp
 179
 180 ------------------------------------------------------------------------
 181 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
 182 readCsv fp = do
 183     csvData <- BL.readFile fp
 184     case decodeByNameWith csvDecodeOptions csvData of
 185       Left e        -> panic (pack e)
 186       Right csvDocs -> pure csvDocs
 187
 188
 189 readHal :: FilePath -> IO (Header, Vector CsvHal)
 190 readHal fp = do
 191     csvData <- BL.readFile fp
 192     case decodeByNameWith csvDecodeOptions csvData of
 193       Left e        -> panic (pack e)
 194       Right csvDocs -> pure csvDocs
 195 ------------------------------------------------------------------------
 196
 197
 198 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
 199 writeCsv fp (h, vs) = BL.writeFile fp $
 200                       encodeByNameWith csvEncodeOptions h (V.toList vs)
 201
 202
 203 ------------------------------------------------------------------------
 204 -- Hal Format
 205 data CsvHal = CsvHal
 206     { csvHal_title  :: !Text
 207     , csvHal_source :: !Text
 208     , csvHal_publication_year  :: !Int
 209     , csvHal_publication_month :: !Int
 210     , csvHal_publication_day   :: !Int
 211     , csvHal_abstract          :: !Text
 212     , csvHal_authors           :: !Text
 213
 214     , csvHal_url               :: !Text
 215     , csvHal_isbn_s            :: !Text
 216     , csvHal_issue_s           :: !Text
 217     , csvHal_journalPublisher_s:: !Text
 218     , csvHal_language_s        :: !Text
 219
 220     , csvHal_doiId_s           :: !Text
 221     , csvHal_authId_i          :: !Text
 222     , csvHal_instStructId_i    :: !Text
 223     , csvHal_deptStructId_i    :: !Text
 224     , csvHal_labStructId_i     :: !Text
 225
 226     , csvHal_rteamStructId_i   :: !Text
 227     , csvHal_docType_s         :: !Text
 228     }
 229     deriving (Show)
 230
 231 instance FromNamedRecord CsvHal where
 232   parseNamedRecord r = CsvHal <$> r .: "title"
 233                               <*> r .: "source"
 234                               <*> r .: "publication_year"
 235                               <*> r .: "publication_month"
 236                               <*> r .: "publication_day"
 237                               <*> r .: "abstract"
 238                               <*> r .: "authors"
 239
 240                               <*> r .: "url"
 241                               <*> r .: "isbn_s"
 242                               <*> r .: "issue_s"
 243                               <*> r .: "journalPublisher_s"
 244                               <*> r .: "language_s"
 245
 246                               <*> r .: "doiId_s"
 247                               <*> r .: "authId_i"
 248                               <*> r .: "instStructId_i"
 249                               <*> r .: "deptStructId_i"
 250                               <*> r .: "labStructId_i"
 251
 252                               <*> r .: "rteamStructId_i"
 253                               <*> r .: "docType_s"
 254
 255 instance ToNamedRecord CsvHal where
 256   toNamedRecord (CsvHal t s py  pm pd abst aut  url isbn iss jour lang  doi auth inst dept lab team doct) =
 257     namedRecord [ "title"  .= t
 258                 , "source" .= s
 259                 , "publication_year"  .= py
 260                 , "publication_month" .= pm
 261                 , "publication_day"   .= pd
 262                 , "abstract"          .= abst
 263                 , "authors"           .= aut
 264
 265                 , "url"                .= url
 266                 , "isbn_s"             .= isbn
 267                 , "issue_s"            .= iss
 268                 , "journalPublisher_s" .= jour
 269                 , "language_s"         .= lang
 270
 271                 , "doiId_s"            .= doi
 272                 , "authId_i"           .= auth
 273                 , "instStructId_i"     .= inst
 274                 , "deptStructId_i"     .= dept
 275                 , "labStructId_i"      .= lab
 276
 277                 , "rteamStructId_i"    .= team
 278                 , "docType_s"          .= doct
 279                ]