src/Gargantext/Text/Parsers/CSV.hs

   1 {-|
   2 Module      : Gargantext.Text.Parsers.CSV
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 CSV parser for Gargantext corpus files.
  11
  12 -}
  13
  14 {-# LANGUAGE NoImplicitPrelude #-}
  15 {-# LANGUAGE OverloadedStrings #-}
  16 {-# LANGUAGE DeriveGeneric     #-}
  17
  18 module Gargantext.Text.Parsers.CSV where
  19
  20 import GHC.Real (round)
  21 import GHC.IO (FilePath)
  22
  23 import Control.Applicative
  24
  25 import Data.Char (ord)
  26 import Data.Csv
  27 import Data.Either (Either(Left, Right))
  28 import Data.Text (Text, pack, length, intercalate)
  29 import qualified Data.ByteString.Lazy as BL
  30
  31 import Data.Vector (Vector)
  32 import qualified Data.Vector as V
  33 import Safe (tailMay)
  34
  35 import Gargantext.Core.Types.Node (HyperdataDocument(..))
  36 import Gargantext.Text
  37 import Gargantext.Text.Context
  38 import Gargantext.Prelude hiding (length)
  39
  40 ---------------------------------------------------------------
  41 data Doc = Doc
  42     { d_docId  :: !Int
  43     , d_title  :: !Text
  44     , d_source :: !Text
  45     , d_publication_year  :: !Int
  46     , d_publication_month :: !Int
  47     , d_publication_day   :: !Int
  48     , d_abstract          :: !Text
  49     , d_authors           :: !Text
  50     }
  51     deriving (Show)
  52 ---------------------------------------------------------------
  53 -- | Doc 2 HyperdataDocument
  54 doc2hyperdataDocument :: Doc -> HyperdataDocument
  55 doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
  56   HyperdataDocument (Just "CSV")
  57                     (Just did)
  58                     Nothing
  59                     Nothing
  60                     (Just dt)
  61                     (Just dau)
  62                     (Just ds)
  63                     (Just dab)
  64                     (Nothing)
  65                     Nothing
  66                     (Just dpy)
  67                     (Just dpm)
  68                     (Just dpd)
  69                     Nothing
  70                     Nothing
  71                     Nothing
  72 ---------------------------------------------------------------
  73 -- | Types Conversions
  74 toDocs :: Vector CsvDoc -> [Doc]
  75 toDocs v = V.toList
  76          $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
  77                        -> Doc nId t s py pm pd abst auth )
  78                        (V.enumFromN 1 (V.length v'')) v''
  79           where
  80             v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
  81             seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
  82
  83 ---------------------------------------------------------------
  84 fromDocs :: Vector Doc -> Vector CsvDoc
  85 fromDocs docs = V.map fromDocs' docs
  86   where
  87     fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
  88
  89 ---------------------------------------------------------------
  90 -- | Split a document in its context
  91 -- TODO adapt the size of the paragraph according to the corpus average
  92
  93 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
  94 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
  95                           if docSize > 1000
  96                             then
  97                               if (mod (round m) docSize) >= 10
  98                                 then
  99                                   splitDoc' splt doc
 100                                 else
 101                                   V.fromList [doc]
 102                             else
 103                               V.fromList [doc]
 104
 105
 106 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
 107 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
 108     where
 109       firstDoc = CsvDoc t s py pm pd firstAbstract auth
 110       firstAbstract = head' abstracts
 111
 112       nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
 113
 114       abstracts    = (splitBy $ contextSize) abst
 115       head' x = maybe ""   identity (head x)
 116       tail' x = maybe [""] identity (tailMay x)
 117
 118 ---------------------------------------------------------------
 119 ---------------------------------------------------------------
 120 type Mean = Double
 121
 122 docsSize :: Vector CsvDoc -> Mean
 123 docsSize csvDoc = mean ls
 124   where
 125     ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
 126
 127
 128 ---------------------------------------------------------------
 129 data CsvDoc = CsvDoc
 130     { csv_title  :: !Text
 131     , csv_source :: !Text
 132     , csv_publication_year  :: !Int
 133     , csv_publication_month :: !Int
 134     , csv_publication_day   :: !Int
 135     , csv_abstract          :: !Text
 136     , csv_authors           :: !Text
 137     }
 138     deriving (Show)
 139
 140 instance FromNamedRecord CsvDoc where
 141   parseNamedRecord r = CsvDoc <$> r .: "title"
 142                               <*> r .: "source"
 143                               <*> r .: "publication_year"
 144                               <*> r .: "publication_month"
 145                               <*> r .: "publication_day"
 146                               <*> r .: "abstract"
 147                               <*> r .: "authors"
 148
 149 instance ToNamedRecord CsvDoc where
 150   toNamedRecord (CsvDoc t s py pm pd abst aut) =
 151     namedRecord [ "title"  .= t
 152                 , "source" .= s
 153                 , "publication_year"  .= py
 154                 , "publication_month" .= pm
 155                 , "publication_day"   .= pd
 156                 , "abstract"          .= abst
 157                 , "authors"           .= aut
 158                 ]
 159
 160
 161 csvDecodeOptions :: DecodeOptions
 162 csvDecodeOptions = (defaultDecodeOptions
 163                       {decDelimiter = fromIntegral $ ord '\t'}
 164                     )
 165
 166 csvEncodeOptions :: EncodeOptions
 167 csvEncodeOptions = ( defaultEncodeOptions
 168                       {encDelimiter = fromIntegral $ ord '\t'}
 169                     )
 170
 171 ------------------------------------------------------------------------
 172 ------------------------------------------------------------------------
 173 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
 174 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
 175                       <$> snd
 176                       <$> readCsv fp
 177
 178 ------------------------------------------------------------------------
 179 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
 180 readCsv fp = do
 181     csvData <- BL.readFile fp
 182     case decodeByNameWith csvDecodeOptions csvData of
 183       Left e        -> panic (pack e)
 184       Right csvDocs -> pure csvDocs
 185
 186
 187 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
 188 writeCsv fp (h, vs) = BL.writeFile fp $
 189                       encodeByNameWith csvEncodeOptions h (V.toList vs)
 190