src/Gargantext/Text/Parsers/CSV.hs

   1 {-|
   2 Module      : Gargantext.Text.Parsers.CSV
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 CSV parser for Gargantext corpus files.
  11
  12 -}
  13
  14 {-# LANGUAGE NoImplicitPrelude #-}
  15 {-# LANGUAGE OverloadedStrings #-}
  16 {-# LANGUAGE DeriveGeneric     #-}
  17
  18 module Gargantext.Text.Parsers.CSV where
  19
  20 import GHC.Real (round)
  21 import GHC.IO (FilePath)
  22
  23 import Control.Applicative
  24
  25 import Data.Char (ord)
  26 import Data.Csv
  27 import Data.Either (Either(Left, Right))
  28 import Data.Text (Text, pack, length, intercalate)
  29 import qualified Data.ByteString.Lazy as BL
  30 import Data.Time.Segment (jour)
  31
  32 import Data.Vector (Vector)
  33 import qualified Data.Vector as V
  34 import Safe (tailMay)
  35
  36 import Gargantext.Database.Types.Node (HyperdataDocument(..))
  37 import Gargantext.Text
  38 import Gargantext.Text.Context
  39 import Gargantext.Prelude hiding (length)
  40
  41 ---------------------------------------------------------------
  42 data Doc = Doc
  43     { d_docId  :: !Int
  44     , d_title  :: !Text
  45     , d_source :: !Text
  46     , d_publication_year  :: !Int
  47     , d_publication_month :: !Int
  48     , d_publication_day   :: !Int
  49     , d_abstract          :: !Text
  50     , d_authors           :: !Text
  51     }
  52     deriving (Show)
  53 ---------------------------------------------------------------
  54 -- | Doc 2 HyperdataDocument
  55 doc2hyperdataDocument :: Doc -> HyperdataDocument
  56 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
  57 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
  58   HyperdataDocument (Just "CSV")
  59                     (Just . pack . show $ did)
  60                     Nothing
  61                     Nothing
  62                     Nothing
  63                     Nothing
  64                     (Just dt)
  65                     Nothing
  66                     (Just dau)
  67                     (Just dab)
  68                     (Nothing)
  69                     Nothing
  70                     (Just dpy)
  71                     (Just dpm)
  72                     (Just dpd)
  73                     Nothing
  74                     Nothing
  75                     Nothing
  76                     Nothing
  77 ---------------------------------------------------------------
  78 -- | Types Conversions
  79 toDocs :: Vector CsvDoc -> [Doc]
  80 toDocs v = V.toList
  81          $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
  82                        -> Doc nId t s py pm pd abst auth )
  83                        (V.enumFromN 1 (V.length v'')) v''
  84           where
  85             v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
  86             seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
  87
  88 ---------------------------------------------------------------
  89 fromDocs :: Vector Doc -> Vector CsvDoc
  90 fromDocs docs = V.map fromDocs' docs
  91   where
  92     fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
  93
  94 ---------------------------------------------------------------
  95 -- | Split a document in its context
  96 -- TODO adapt the size of the paragraph according to the corpus average
  97
  98 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
  99 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
 100                           if docSize > 1000
 101                             then
 102                               if (mod (round m) docSize) >= 10
 103                                 then
 104                                   splitDoc' splt doc
 105                                 else
 106                                   V.fromList [doc]
 107                             else
 108                               V.fromList [doc]
 109
 110
 111 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
 112 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
 113     where
 114       firstDoc = CsvDoc t s py pm pd firstAbstract auth
 115       firstAbstract = head' abstracts
 116
 117       nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
 118
 119       abstracts    = (splitBy $ contextSize) abst
 120       head' x = maybe ""   identity (head x)
 121       tail' x = maybe [""] identity (tailMay x)
 122
 123 ---------------------------------------------------------------
 124 ---------------------------------------------------------------
 125 type Mean = Double
 126
 127 docsSize :: Vector CsvDoc -> Mean
 128 docsSize csvDoc = mean ls
 129   where
 130     ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
 131
 132
 133 ---------------------------------------------------------------
 134 data CsvDoc = CsvDoc
 135     { csv_title  :: !Text
 136     , csv_source :: !Text
 137     , csv_publication_year  :: !Int
 138     , csv_publication_month :: !Int
 139     , csv_publication_day   :: !Int
 140     , csv_abstract          :: !Text
 141     , csv_authors           :: !Text
 142     }
 143     deriving (Show)
 144
 145 instance FromNamedRecord CsvDoc where
 146   parseNamedRecord r = CsvDoc <$> r .: "title"
 147                               <*> r .: "source"
 148                               <*> r .: "publication_year"
 149                               <*> r .: "publication_month"
 150                               <*> r .: "publication_day"
 151                               <*> r .: "abstract"
 152                               <*> r .: "authors"
 153
 154 instance ToNamedRecord CsvDoc where
 155   toNamedRecord (CsvDoc t s py pm pd abst aut) =
 156     namedRecord [ "title"  .= t
 157                 , "source" .= s
 158                 , "publication_year"  .= py
 159                 , "publication_month" .= pm
 160                 , "publication_day"   .= pd
 161                 , "abstract"          .= abst
 162                 , "authors"           .= aut
 163                ]
 164
 165
 166 csvDecodeOptions :: DecodeOptions
 167 csvDecodeOptions = (defaultDecodeOptions
 168                       {decDelimiter = fromIntegral $ ord '\t'}
 169                     )
 170
 171 csvEncodeOptions :: EncodeOptions
 172 csvEncodeOptions = ( defaultEncodeOptions
 173                       {encDelimiter = fromIntegral $ ord '\t'}
 174                     )
 175
 176 ------------------------------------------------------------------------
 177 ------------------------------------------------------------------------
 178 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
 179 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
 180                       <$> snd
 181                       <$> readCsv fp
 182
 183 ------------------------------------------------------------------------
 184 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
 185 readCsv fp = do
 186     csvData <- BL.readFile fp
 187     case decodeByNameWith csvDecodeOptions csvData of
 188       Left e        -> panic (pack e)
 189       Right csvDocs -> pure csvDocs
 190
 191
 192 readHal :: FilePath -> IO (Header, Vector CsvHal)
 193 readHal fp = do
 194     csvData <- BL.readFile fp
 195     case decodeByNameWith csvDecodeOptions csvData of
 196       Left e        -> panic (pack e)
 197       Right csvDocs -> pure csvDocs
 198 ------------------------------------------------------------------------
 199 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
 200 writeCsv fp (h, vs) = BL.writeFile fp $
 201                       encodeByNameWith csvEncodeOptions h (V.toList vs)
 202
 203
 204 ------------------------------------------------------------------------
 205 -- Hal Format
 206 data CsvHal = CsvHal
 207     { csvHal_title  :: !Text
 208     , csvHal_source :: !Text
 209     , csvHal_publication_year  :: !Integer
 210     , csvHal_publication_month :: !Int
 211     , csvHal_publication_day   :: !Int
 212     , csvHal_abstract          :: !Text
 213     , csvHal_authors           :: !Text
 214
 215     , csvHal_url               :: !Text
 216     , csvHal_isbn_s            :: !Text
 217     , csvHal_issue_s           :: !Text
 218     , csvHal_journalPublisher_s:: !Text
 219     , csvHal_language_s        :: !Text
 220
 221     , csvHal_doiId_s           :: !Text
 222     , csvHal_authId_i          :: !Text
 223     , csvHal_instStructId_i    :: !Text
 224     , csvHal_deptStructId_i    :: !Text
 225     , csvHal_labStructId_i     :: !Text
 226
 227     , csvHal_rteamStructId_i   :: !Text
 228     , csvHal_docType_s         :: !Text
 229     }
 230     deriving (Show)
 231
 232 instance FromNamedRecord CsvHal where
 233   parseNamedRecord r = CsvHal <$> r .: "title"
 234                               <*> r .: "source"
 235                               <*> r .: "publication_year"
 236                               <*> r .: "publication_month"
 237                               <*> r .: "publication_day"
 238                               <*> r .: "abstract"
 239                               <*> r .: "authors"
 240
 241                               <*> r .: "url"
 242                               <*> r .: "isbn_s"
 243                               <*> r .: "issue_s"
 244                               <*> r .: "journalPublisher_s"
 245                               <*> r .: "language_s"
 246
 247                               <*> r .: "doiId_s"
 248                               <*> r .: "authId_i"
 249                               <*> r .: "instStructId_i"
 250                               <*> r .: "deptStructId_i"
 251                               <*> r .: "labStructId_i"
 252
 253                               <*> r .: "rteamStructId_i"
 254                               <*> r .: "docType_s"
 255
 256 instance ToNamedRecord CsvHal where
 257   toNamedRecord (CsvHal t s py  pm pd abst aut  url isbn iss j lang  doi auth inst dept lab team doct) =
 258     namedRecord [ "title"  .= t
 259                 , "source" .= s
 260
 261                 , "publication_year"  .= py
 262                 , "publication_month" .= pm
 263                 , "publication_day"   .= pd
 264
 265                 , "abstract"          .= abst
 266                 , "authors"           .= aut
 267
 268                 , "url"                .= url
 269                 , "isbn_s"             .= isbn
 270                 , "issue_s"            .= iss
 271                 , "journalPublisher_s" .= j
 272                 , "language_s"         .= lang
 273
 274                 , "doiId_s"            .= doi
 275                 , "authId_i"           .= auth
 276                 , "instStructId_i"     .= inst
 277                 , "deptStructId_i"     .= dept
 278                 , "labStructId_i"      .= lab
 279
 280                 , "rteamStructId_i"    .= team
 281                 , "docType_s"          .= doct
 282                ]
 283
 284 csvHal2doc :: CsvHal -> HyperdataDocument
 285 csvHal2doc (CsvHal title source
 286        pub_year pub_month pub_day
 287        abstract authors
 288        url _ _ _ _
 289        doi _ inst _ _
 290        _ _ ) = HyperdataDocument (Just "CsvHal")
 291                                (Just doi)
 292                                (Just url)
 293                                Nothing
 294                                Nothing
 295                                Nothing
 296                                (Just title)
 297                                (Just inst)
 298                                (Just authors)
 299                                (Just source)
 300                                (Just abstract)
 301                                (Just $ pack . show $ jour pub_year pub_month pub_day)
 302                                (Just $ fromIntegral pub_year)
 303                                (Just pub_month)
 304                                (Just pub_day)
 305                                Nothing
 306                                Nothing
 307                                Nothing
 308                                Nothing
 309
 310 ------------------------------------------------------------------------
 311 parseHal :: FilePath -> IO [HyperdataDocument]
 312 parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readHal fp
 313 ------------------------------------------------------------------------
 314
 315