src/Gargantext/Text/Parsers/CSV.hs

   1 {-|
   2 Module      : Gargantext.Text.Parsers.CSV
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 CSV parser for Gargantext corpus files.
  11
  12 -}
  13
  14 {-# LANGUAGE NoImplicitPrelude #-}
  15 {-# LANGUAGE OverloadedStrings #-}
  16 {-# LANGUAGE DeriveGeneric      #-}
  17
  18 module Gargantext.Text.Parsers.CSV where
  19
  20 import GHC.Real (round)
  21 import GHC.IO (FilePath)
  22
  23 import Control.Applicative
  24
  25 import Data.Char (ord)
  26 import Data.Csv
  27 import Data.Either (Either(Left, Right))
  28 import Data.String (IsString)
  29 import Data.Text (Text, pack, unpack, length)
  30 import qualified Data.ByteString.Lazy as BL
  31
  32 import Data.Vector (Vector)
  33 import qualified Data.Vector as V
  34 import Safe (tailMay)
  35 import Text.HTML.TagSoup
  36
  37 import Gargantext.Text
  38 import Gargantext.Prelude hiding (length)
  39
  40 ---------------------------------------------------------------
  41 data Doc = Doc
  42     { d_docId  :: !Int
  43     , d_title  :: !Text
  44     , d_source :: !Text
  45     , d_publication_year  :: !Int
  46     , d_publication_month :: !Int
  47     , d_publication_day   :: !Int
  48     , d_abstract          :: !Text
  49     , d_authors           :: !Text
  50     }
  51     deriving (Show)
  52 ---------------------------------------------------------------
  53 toDocs :: Vector CsvDoc -> [Doc]
  54 toDocs v = V.toList
  55          $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
  56                        -> Doc nId t s py pm pd abst auth )
  57                        (V.enumFromN 1 (V.length v'')) v''
  58           where
  59             v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
  60             seps= (V.fromList [Paragraph, Sentences, Chars])
  61
  62 ---------------------------------------------------------------
  63 fromDocs :: Vector Doc -> Vector CsvDoc
  64 fromDocs docs = V.map fromDocs' docs
  65   where
  66     fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
  67
  68 ---------------------------------------------------------------
  69 -- | Split a document in its context
  70 -- TODO adapt the size of the paragraph according to the corpus average
  71
  72 data SplitBy = Paragraph | Sentences | Chars
  73
  74 splitDoc :: Mean -> SplitBy -> CsvDoc -> Vector CsvDoc
  75 splitDoc m splt doc = let docSize = (length $ c_abstract doc) in
  76                           if docSize > 1000
  77                             then
  78                               if (mod (round m) docSize) >= 10
  79                                 then
  80                                   splitDoc' splt doc
  81                                 else
  82                                   V.fromList [doc]
  83                             else
  84                               V.fromList [doc]
  85
  86
  87 splitDoc' :: SplitBy -> CsvDoc -> Vector CsvDoc
  88 splitDoc' splt (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
  89     where
  90       firstDoc = CsvDoc t s py pm pd firstAbstract auth
  91       firstAbstract = head' abstracts
  92
  93       nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
  94
  95       abstracts    = (splitBy splt) abst
  96       head' x = maybe ""   identity (head x)
  97       tail' x = maybe [""] identity (tailMay x)
  98
  99
 100 splitBy :: SplitBy -> Text -> [Text]
 101 splitBy Chars     = map pack . chunkAlong 1000 1 . unpack
 102 splitBy Sentences = map unsentences . chunkAlong 20 1  . sentences
 103 splitBy Paragraph = map removeTag   . filter isTagText . parseTags
 104   where
 105     removeTag :: IsString p => Tag p -> p
 106     removeTag (TagText x) = x
 107     removeTag (TagComment x) = x
 108     removeTag _          = ""
 109
 110 ---------------------------------------------------------------
 111 ---------------------------------------------------------------
 112 type Mean = Double
 113
 114 docsSize :: Vector CsvDoc -> Mean
 115 docsSize csvDoc = mean ls
 116   where
 117     ls = V.toList $ V.map (fromIntegral . length . c_abstract) csvDoc
 118
 119
 120 ---------------------------------------------------------------
 121 data CsvDoc = CsvDoc
 122     { c_title  :: !Text
 123     , c_source :: !Text
 124     , c_publication_year  :: !Int
 125     , c_publication_month :: !Int
 126     , c_publication_day   :: !Int
 127     , c_abstract          :: !Text
 128     , c_authors           :: !Text
 129     }
 130     deriving (Show)
 131
 132 instance FromNamedRecord CsvDoc where
 133   parseNamedRecord r = CsvDoc <$> r .: "title"
 134                               <*> r .: "source"
 135                               <*> r .: "publication_year"
 136                               <*> r .: "publication_month"
 137                               <*> r .: "publication_day"
 138                               <*> r .: "abstract"
 139                               <*> r .: "authors"
 140
 141 instance ToNamedRecord CsvDoc where
 142   toNamedRecord (CsvDoc t s py pm pd abst aut) =
 143     namedRecord [ "title"  .= t
 144                 , "source" .= s
 145                 , "publication_year"  .= py
 146                 , "publication_month" .= pm
 147                 , "publication_day"   .= pd
 148                 , "abstract"          .= abst
 149                 , "authors"           .= aut
 150                 ]
 151
 152
 153 csvDecodeOptions :: DecodeOptions
 154 csvDecodeOptions = (defaultDecodeOptions
 155                       {decDelimiter = fromIntegral $ ord '\t'}
 156                     )
 157
 158 csvEncodeOptions :: EncodeOptions
 159 csvEncodeOptions = ( defaultEncodeOptions
 160                       {encDelimiter = fromIntegral $ ord '\t'}
 161                     )
 162
 163
 164 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
 165 readCsv fp = do
 166     csvData <- BL.readFile fp
 167     case decodeByNameWith csvDecodeOptions csvData of
 168       Left e    -> panic (pack e)
 169       Right csvDocs -> pure csvDocs
 170
 171
 172 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
 173 writeCsv fp (h, vs) = BL.writeFile fp $
 174                       encodeByNameWith csvEncodeOptions h (V.toList vs)
 175