2 Module : Gargantext.Text.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
18 module Gargantext.Text.Parsers.CSV where
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
23 import Control.Applicative
25 import Data.Char (ord)
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length, intercalate)
29 import qualified Data.ByteString.Lazy as BL
31 import Data.Vector (Vector)
32 import qualified Data.Vector as V
35 import Gargantext.Text
36 import Gargantext.Text.Context
37 import Gargantext.Prelude hiding (length)
39 ---------------------------------------------------------------
44 , d_publication_year :: !Int
45 , d_publication_month :: !Int
46 , d_publication_day :: !Int
51 ---------------------------------------------------------------
52 toDocs :: Vector CsvDoc -> [Doc]
54 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
55 -> Doc nId t s py pm pd abst auth )
56 (V.enumFromN 1 (V.length v'')) v''
58 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
59 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
61 ---------------------------------------------------------------
62 fromDocs :: Vector Doc -> Vector CsvDoc
63 fromDocs docs = V.map fromDocs' docs
65 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
67 ---------------------------------------------------------------
68 -- | Split a document in its context
69 -- TODO adapt the size of the paragraph according to the corpus average
71 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
72 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
75 if (mod (round m) docSize) >= 10
84 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
85 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
87 firstDoc = CsvDoc t s py pm pd firstAbstract auth
88 firstAbstract = head' abstracts
90 nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
92 abstracts = (splitBy $ contextSize) abst
93 head' x = maybe "" identity (head x)
94 tail' x = maybe [""] identity (tailMay x)
96 ---------------------------------------------------------------
97 ---------------------------------------------------------------
100 docsSize :: Vector CsvDoc -> Mean
101 docsSize csvDoc = mean ls
103 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
106 ---------------------------------------------------------------
109 , csv_source :: !Text
110 , csv_publication_year :: !Int
111 , csv_publication_month :: !Int
112 , csv_publication_day :: !Int
113 , csv_abstract :: !Text
114 , csv_authors :: !Text
118 instance FromNamedRecord CsvDoc where
119 parseNamedRecord r = CsvDoc <$> r .: "title"
121 <*> r .: "publication_year"
122 <*> r .: "publication_month"
123 <*> r .: "publication_day"
127 instance ToNamedRecord CsvDoc where
128 toNamedRecord (CsvDoc t s py pm pd abst aut) =
129 namedRecord [ "title" .= t
131 , "publication_year" .= py
132 , "publication_month" .= pm
133 , "publication_day" .= pd
139 csvDecodeOptions :: DecodeOptions
140 csvDecodeOptions = (defaultDecodeOptions
141 {decDelimiter = fromIntegral $ ord '\t'}
144 csvEncodeOptions :: EncodeOptions
145 csvEncodeOptions = ( defaultEncodeOptions
146 {encDelimiter = fromIntegral $ ord '\t'}
149 ------------------------------------------------------------------------
150 ------------------------------------------------------------------------
151 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
152 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
156 ------------------------------------------------------------------------
157 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
159 csvData <- BL.readFile fp
160 case decodeByNameWith csvDecodeOptions csvData of
161 Left e -> panic (pack e)
162 Right csvDocs -> pure csvDocs
165 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
166 writeCsv fp (h, vs) = BL.writeFile fp $
167 encodeByNameWith csvEncodeOptions h (V.toList vs)