2 Module : Gargantext.Text.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
18 module Gargantext.Text.Parsers.CSV where
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
23 import Control.Applicative
25 import Data.Char (ord)
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length)
29 import qualified Data.ByteString.Lazy as BL
31 import Data.Vector (Vector)
32 import qualified Data.Vector as V
35 import Gargantext.Text
36 import Gargantext.Text.Context
37 import Gargantext.Prelude hiding (length)
39 ---------------------------------------------------------------
44 , d_publication_year :: !Int
45 , d_publication_month :: !Int
46 , d_publication_day :: !Int
51 ---------------------------------------------------------------
52 toDocs :: Vector CsvDoc -> [Doc]
54 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
55 -> Doc nId t s py pm pd abst auth )
56 (V.enumFromN 1 (V.length v'')) v''
58 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
59 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
61 ---------------------------------------------------------------
62 fromDocs :: Vector Doc -> Vector CsvDoc
63 fromDocs docs = V.map fromDocs' docs
65 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
67 ---------------------------------------------------------------
68 -- | Split a document in its context
69 -- TODO adapt the size of the paragraph according to the corpus average
72 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
73 splitDoc m splt doc = let docSize = (length $ c_abstract doc) in
76 if (mod (round m) docSize) >= 10
85 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
86 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
88 firstDoc = CsvDoc t s py pm pd firstAbstract auth
89 firstAbstract = head' abstracts
91 nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
93 abstracts = (splitBy $ contextSize) abst
94 head' x = maybe "" identity (head x)
95 tail' x = maybe [""] identity (tailMay x)
97 ---------------------------------------------------------------
98 ---------------------------------------------------------------
101 docsSize :: Vector CsvDoc -> Mean
102 docsSize csvDoc = mean ls
104 ls = V.toList $ V.map (fromIntegral . length . c_abstract) csvDoc
107 ---------------------------------------------------------------
111 , c_publication_year :: !Int
112 , c_publication_month :: !Int
113 , c_publication_day :: !Int
114 , c_abstract :: !Text
119 instance FromNamedRecord CsvDoc where
120 parseNamedRecord r = CsvDoc <$> r .: "title"
122 <*> r .: "publication_year"
123 <*> r .: "publication_month"
124 <*> r .: "publication_day"
128 instance ToNamedRecord CsvDoc where
129 toNamedRecord (CsvDoc t s py pm pd abst aut) =
130 namedRecord [ "title" .= t
132 , "publication_year" .= py
133 , "publication_month" .= pm
134 , "publication_day" .= pd
140 csvDecodeOptions :: DecodeOptions
141 csvDecodeOptions = (defaultDecodeOptions
142 {decDelimiter = fromIntegral $ ord '\t'}
145 csvEncodeOptions :: EncodeOptions
146 csvEncodeOptions = ( defaultEncodeOptions
147 {encDelimiter = fromIntegral $ ord '\t'}
151 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
153 csvData <- BL.readFile fp
154 case decodeByNameWith csvDecodeOptions csvData of
155 Left e -> panic (pack e)
156 Right csvDocs -> pure csvDocs
159 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
160 writeCsv fp (h, vs) = BL.writeFile fp $
161 encodeByNameWith csvEncodeOptions h (V.toList vs)