{-|
Module      : Gargantext.Text.Parsers.CSV
Description : CSV parser for Gargantext corpus files
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

CSV parser for Gargantext corpus files.

-}

{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric     #-}

module Gargantext.Text.Parsers.CSV where

import GHC.Real (round)
import GHC.IO (FilePath)

import Control.Applicative

import Data.Char (ord)
import Data.Csv
import Data.Either (Either(Left, Right))
import Data.Text (Text, pack, length, intercalate)
import qualified Data.ByteString.Lazy as BL

import Data.Vector (Vector)
import qualified Data.Vector as V
import Safe (tailMay)

import Gargantext.Text
import Gargantext.Text.Context
import Gargantext.Prelude hiding (length)

---------------------------------------------------------------
-- | A corpus document as handled internally: the 'CsvDoc' fields
--   plus a document identifier.
data Doc = Doc
  { d_docId             :: !Int
  , d_title             :: !Text
  , d_source            :: !Text
  , d_publication_year  :: !Int
  , d_publication_month :: !Int
  , d_publication_day   :: !Int
  , d_abstract          :: !Text
  , d_authors           :: !Text
  }
  deriving (Show)
---------------------------------------------------------------
-- | Convert CSV rows into numbered 'Doc's, re-splitting long abstracts
--   first (see 'splitDoc').
toDocs :: Vector CsvDoc -> [Doc]
toDocs v = V.toList
         $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
                       -> Doc nId t s py pm pd abst auth)
                     (V.enumFromN 1 (V.length v'')) v''
  where
    v''  = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
    seps = V.fromList [Paragraphs 1, Sentences 3, Chars 3]

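-- Sketch of the pipeline above, for illustration only (the bindings are
-- hypothetical, not part of the module): every row is re-split by
-- @Paragraphs 1@, then @Sentences 3@, then @Chars 3@, and the results
-- are numbered from 1 in 'd_docId'.
--
-- > let rows = V.fromList [row1, row2]   -- row1, row2 :: CsvDoc
-- > in  map d_docId (toDocs rows)        -- [1, 2, ...] (more if rows were split)
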
---------------------------------------------------------------
fromDocs :: Vector Doc -> Vector CsvDoc
fromDocs docs = V.map fromDocs' docs
  where
    fromDocs' (Doc _ t s py pm pd abst auth) = CsvDoc t s py pm pd abst auth

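-- Note: 'fromDocs' simply drops the 'd_docId' field, so converting back
-- with 'toDocs' renumbers the documents (and may re-split them).
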
---------------------------------------------------------------
-- | Split a document into contexts.
--   Only abstracts longer than 1000 characters are candidates for splitting.
--   TODO: adapt the paragraph size to the corpus average.

splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc m splt doc =
  let docSize = length (csv_abstract doc) in
  if docSize > 1000
     then if mod (round m) docSize >= 10
             then splitDoc' splt doc
             else V.fromList [doc]
     else V.fromList [doc]


splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
  where
    firstDoc      = CsvDoc t s py pm pd firstAbstract auth
    firstAbstract = head' abstracts

    nextDocs = map (\txt -> CsvDoc (head' $ sentences txt)
                                   s py pm pd
                                   (unsentences $ tail' $ sentences txt)
                                   auth
                   ) (tail' abstracts)

    abstracts = splitBy contextSize abst
    head' x   = maybe ""   identity (head x)
    tail' x   = maybe [""] identity (tailMay x)

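-- Illustration (a sketch, not executable as written): with
-- @contextSize = Paragraphs 1@ and an abstract made of paragraphs
-- p1, p2, p3, 'splitDoc'' yields three 'CsvDoc's: the first keeps the
-- original title and gets p1 as abstract; each following one takes the
-- first sentence of its paragraph as title and the remaining sentences
-- as abstract.
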
---------------------------------------------------------------
---------------------------------------------------------------
type Mean = Double

-- | Mean length, in characters, of the abstracts of a corpus.
docsSize :: Vector CsvDoc -> Mean
docsSize csvDoc = mean ls
  where
    ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc


---------------------------------------------------------------
-- | One row of a Gargantext corpus file.
data CsvDoc = CsvDoc
  { csv_title             :: !Text
  , csv_source            :: !Text
  , csv_publication_year  :: !Int
  , csv_publication_month :: !Int
  , csv_publication_day   :: !Int
  , csv_abstract          :: !Text
  , csv_authors           :: !Text
  }
  deriving (Show)

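-- An illustrative value (the contents are made up):
--
-- > CsvDoc "Some title" "Some journal" 2017 6 1 "Some abstract." "Some authors"
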
instance FromNamedRecord CsvDoc where
  parseNamedRecord r = CsvDoc <$> r .: "title"
                              <*> r .: "source"
                              <*> r .: "publication_year"
                              <*> r .: "publication_month"
                              <*> r .: "publication_day"
                              <*> r .: "abstract"
                              <*> r .: "authors"

instance ToNamedRecord CsvDoc where
  toNamedRecord (CsvDoc t s py pm pd abst aut) =
    namedRecord [ "title"             .= t
                , "source"            .= s
                , "publication_year"  .= py
                , "publication_month" .= pm
                , "publication_day"   .= pd
                , "abstract"          .= abst
                , "authors"           .= aut
                ]


csvDecodeOptions :: DecodeOptions
csvDecodeOptions = defaultDecodeOptions
                     { decDelimiter = fromIntegral $ ord '\t' }

csvEncodeOptions :: EncodeOptions
csvEncodeOptions = defaultEncodeOptions
                     { encDelimiter = fromIntegral $ ord '\t' }


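-- Note: despite the CSV name, corpus files are tab-separated. Decoding
-- is done by column name ('decodeByNameWith'), so the header only has to
-- contain the names used in the 'FromNamedRecord' instance above, e.g.
-- (illustrative, tabs shown as \t):
--
-- > title\tsource\tpublication_year\tpublication_month\tpublication_day\tabstract\tauthors
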
------------------------------------------------------------------------
------------------------------------------------------------------------
-- | Read a corpus file and, for each row, join the given field
--   projections with a single space.
readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
readCsvOn fields fp = V.toList
                  <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
                  <$> snd
                  <$> readCsv fp

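-- Usage sketch (the file path is illustrative only):
--
-- > readCsvOn [csv_title, csv_abstract] "corpus.csv"
--
-- returns one 'Text' per row, containing its title and abstract joined
-- by a single space.
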
------------------------------------------------------------------------
-- | Read a corpus file; 'panic's if the file cannot be decoded.
readCsv :: FilePath -> IO (Header, Vector CsvDoc)
readCsv fp = do
  csvData <- BL.readFile fp
  case decodeByNameWith csvDecodeOptions csvData of
    Left  e       -> panic (pack e)
    Right csvDocs -> pure csvDocs


writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv fp (h, vs) = BL.writeFile fp $
                      encodeByNameWith csvEncodeOptions h (V.toList vs)

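-- Minimal read/write round-trip, as a sketch (the paths and the name
-- @copyCorpus@ are illustrative only; assumes a tab-separated input file
-- with the expected header):
--
-- > copyCorpus :: IO ()
-- > copyCorpus = do
-- >   (header, docs) <- readCsv "in.csv"
-- >   writeCsv "out.csv" (header, docs)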