2 Module : Gargantext.Text.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
18 module Gargantext.Text.Parsers.CSV where
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
23 import Control.Applicative
25 import Data.Char (ord)
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length, intercalate)
29 import qualified Data.ByteString.Lazy as BL
31 import Data.Vector (Vector)
32 import qualified Data.Vector as V
35 import Gargantext.Core.Types.Node (HyperdataDocument(..))
36 import Gargantext.Text
37 import Gargantext.Text.Context
38 import Gargantext.Prelude hiding (length)
40 ---------------------------------------------------------------
45 , d_publication_year :: !Int
46 , d_publication_month :: !Int
47 , d_publication_day :: !Int
52 ---------------------------------------------------------------
53 -- | Doc 2 HyperdataDocument
54 doc2hyperdataDocument :: Doc -> HyperdataDocument
55 doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
56 HyperdataDocument (Just "CSV")
72 ---------------------------------------------------------------
73 -- | Types Conversions
74 toDocs :: Vector CsvDoc -> [Doc]
76 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
77 -> Doc nId t s py pm pd abst auth )
78 (V.enumFromN 1 (V.length v'')) v''
80 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
81 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
83 ---------------------------------------------------------------
84 fromDocs :: Vector Doc -> Vector CsvDoc
85 fromDocs docs = V.map fromDocs' docs
87 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
89 ---------------------------------------------------------------
90 -- | Split a document in its context
91 -- TODO adapt the size of the paragraph according to the corpus average
93 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
94 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
97 if (mod (round m) docSize) >= 10
106 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
107 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
109 firstDoc = CsvDoc t s py pm pd firstAbstract auth
110 firstAbstract = head' abstracts
112 nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
114 abstracts = (splitBy $ contextSize) abst
115 head' x = maybe "" identity (head x)
116 tail' x = maybe [""] identity (tailMay x)
118 ---------------------------------------------------------------
119 ---------------------------------------------------------------
122 docsSize :: Vector CsvDoc -> Mean
123 docsSize csvDoc = mean ls
125 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
128 ---------------------------------------------------------------
131 , csv_source :: !Text
132 , csv_publication_year :: !Int
133 , csv_publication_month :: !Int
134 , csv_publication_day :: !Int
135 , csv_abstract :: !Text
136 , csv_authors :: !Text
140 instance FromNamedRecord CsvDoc where
141 parseNamedRecord r = CsvDoc <$> r .: "title"
143 <*> r .: "publication_year"
144 <*> r .: "publication_month"
145 <*> r .: "publication_day"
149 instance ToNamedRecord CsvDoc where
150 toNamedRecord (CsvDoc t s py pm pd abst aut) =
151 namedRecord [ "title" .= t
153 , "publication_year" .= py
154 , "publication_month" .= pm
155 , "publication_day" .= pd
161 csvDecodeOptions :: DecodeOptions
162 csvDecodeOptions = (defaultDecodeOptions
163 {decDelimiter = fromIntegral $ ord '\t'}
166 csvEncodeOptions :: EncodeOptions
167 csvEncodeOptions = ( defaultEncodeOptions
168 {encDelimiter = fromIntegral $ ord '\t'}
171 ------------------------------------------------------------------------
172 ------------------------------------------------------------------------
173 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
174 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
178 ------------------------------------------------------------------------
179 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
181 csvData <- BL.readFile fp
182 case decodeByNameWith csvDecodeOptions csvData of
183 Left e -> panic (pack e)
184 Right csvDocs -> pure csvDocs
187 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
188 writeCsv fp (h, vs) = BL.writeFile fp $
189 encodeByNameWith csvEncodeOptions h (V.toList vs)