src/Gargantext/Text/Parsers/CSV.hs

   1 {-|
   2 Module      : Gargantext.Text.Parsers.CSV
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 CSV parser for Gargantext corpus files.
  11
  12 -}
  13
  14 {-# LANGUAGE NoImplicitPrelude #-}
  15 {-# LANGUAGE OverloadedStrings #-}
  16 {-# LANGUAGE DeriveGeneric     #-}
  17
  18 module Gargantext.Text.Parsers.CSV where
  19
  20 import GHC.Real (round)
  21 import GHC.IO (FilePath)
  22
  23 import Control.Applicative
  24
  25 import Data.Char (ord)
  26 import Data.Csv
  27 import Data.Either (Either(Left, Right))
  28 import Data.Text (Text, pack, length)
  29 import qualified Data.ByteString.Lazy as BL
  30
  31 import Data.Vector (Vector)
  32 import qualified Data.Vector as V
  33 import Safe (tailMay)
  34
  35 import Gargantext.Text
  36 import Gargantext.Text.Context
  37 import Gargantext.Prelude hiding (length)
  38
  39 ---------------------------------------------------------------
  40 data Doc = Doc
  41     { d_docId  :: !Int
  42     , d_title  :: !Text
  43     , d_source :: !Text
  44     , d_publication_year  :: !Int
  45     , d_publication_month :: !Int
  46     , d_publication_day   :: !Int
  47     , d_abstract          :: !Text
  48     , d_authors           :: !Text
  49     }
  50     deriving (Show)
  51 ---------------------------------------------------------------
  52 toDocs :: Vector CsvDoc -> [Doc]
  53 toDocs v = V.toList
  54          $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
  55                        -> Doc nId t s py pm pd abst auth )
  56                        (V.enumFromN 1 (V.length v'')) v''
  57           where
  58             v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
  59             seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
  60
  61 ---------------------------------------------------------------
  62 fromDocs :: Vector Doc -> Vector CsvDoc
  63 fromDocs docs = V.map fromDocs' docs
  64   where
  65     fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
  66
  67 ---------------------------------------------------------------
  68 -- | Split a document in its context
  69 -- TODO adapt the size of the paragraph according to the corpus average
  70
  71
  72 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
  73 splitDoc m splt doc = let docSize = (length $ c_abstract doc) in
  74                           if docSize > 1000
  75                             then
  76                               if (mod (round m) docSize) >= 10
  77                                 then
  78                                   splitDoc' splt doc
  79                                 else
  80                                   V.fromList [doc]
  81                             else
  82                               V.fromList [doc]
  83
  84
  85 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
  86 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
  87     where
  88       firstDoc = CsvDoc t s py pm pd firstAbstract auth
  89       firstAbstract = head' abstracts
  90
  91       nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
  92
  93       abstracts    = (splitBy $ contextSize) abst
  94       head' x = maybe ""   identity (head x)
  95       tail' x = maybe [""] identity (tailMay x)
  96
  97 ---------------------------------------------------------------
  98 ---------------------------------------------------------------
  99 type Mean = Double
 100
 101 docsSize :: Vector CsvDoc -> Mean
 102 docsSize csvDoc = mean ls
 103   where
 104     ls = V.toList $ V.map (fromIntegral . length . c_abstract) csvDoc
 105
 106
 107 ---------------------------------------------------------------
 108 data CsvDoc = CsvDoc
 109     { c_title  :: !Text
 110     , c_source :: !Text
 111     , c_publication_year  :: !Int
 112     , c_publication_month :: !Int
 113     , c_publication_day   :: !Int
 114     , c_abstract          :: !Text
 115     , c_authors           :: !Text
 116     }
 117     deriving (Show)
 118
 119 instance FromNamedRecord CsvDoc where
 120   parseNamedRecord r = CsvDoc <$> r .: "title"
 121                               <*> r .: "source"
 122                               <*> r .: "publication_year"
 123                               <*> r .: "publication_month"
 124                               <*> r .: "publication_day"
 125                               <*> r .: "abstract"
 126                               <*> r .: "authors"
 127
 128 instance ToNamedRecord CsvDoc where
 129   toNamedRecord (CsvDoc t s py pm pd abst aut) =
 130     namedRecord [ "title"  .= t
 131                 , "source" .= s
 132                 , "publication_year"  .= py
 133                 , "publication_month" .= pm
 134                 , "publication_day"   .= pd
 135                 , "abstract"          .= abst
 136                 , "authors"           .= aut
 137                 ]
 138
 139
 140 csvDecodeOptions :: DecodeOptions
 141 csvDecodeOptions = (defaultDecodeOptions
 142                       {decDelimiter = fromIntegral $ ord '\t'}
 143                     )
 144
 145 csvEncodeOptions :: EncodeOptions
 146 csvEncodeOptions = ( defaultEncodeOptions
 147                       {encDelimiter = fromIntegral $ ord '\t'}
 148                     )
 149
 150
 151 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
 152 readCsv fp = do
 153     csvData <- BL.readFile fp
 154     case decodeByNameWith csvDecodeOptions csvData of
 155       Left e    -> panic (pack e)
 156       Right csvDocs -> pure csvDocs
 157
 158
 159 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
 160 writeCsv fp (h, vs) = BL.writeFile fp $
 161                       encodeByNameWith csvEncodeOptions h (V.toList vs)
 162