2 Module : Gargantext.Text.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
18 module Gargantext.Text.Parsers.CSV where
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
23 import Control.Applicative
25 import Data.Char (ord)
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length, intercalate)
29 import qualified Data.ByteString.Lazy as BL
31 import Data.Vector (Vector)
32 import qualified Data.Vector as V
35 import Gargantext.Database.Types.Node (HyperdataDocument(..))
36 import Gargantext.Text
37 import Gargantext.Text.Context
38 import Gargantext.Prelude hiding (length)
40 ---------------------------------------------------------------
45 , d_publication_year :: !Int
46 , d_publication_month :: !Int
47 , d_publication_day :: !Int
52 ---------------------------------------------------------------
53 -- | Doc 2 HyperdataDocument
54 doc2hyperdataDocument :: Doc -> HyperdataDocument
55 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
56 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
57 HyperdataDocument (Just "CSV")
58 (Just . pack . show $ did)
75 ---------------------------------------------------------------
-- | Convert parsed CSV rows into 'Doc's.
--
-- Every row is first run through 'splitDoc' once for each of the contexts
-- @Paragraphs 1@, @Sentences 3@ and @Chars 3@, in that order; the mean
-- abstract size given to 'splitDoc' is recomputed with 'docsSize' before each
-- pass. The rows that remain are then numbered consecutively starting at 1,
-- and that number becomes the first field of the corresponding 'Doc'.
toDocs :: Vector CsvDoc -> [Doc]
toDocs v = V.toList (V.zipWith mkDoc docIds splitted)
  where
    -- Build a 'Doc' by prepending its identifier to the CSV fields.
    mkDoc nId (CsvDoc t s py pm pd abst auth) = Doc nId t s py pm pd abst auth

    -- Identifiers 1..n, one for each row left after splitting.
    docIds = V.enumFromN 1 (V.length splitted)

    -- Strict left fold: the result of a pass is forced before the next one
    -- starts, so no chain of suspended passes is built up.
    splitted = V.foldl' splitPass v seps

    -- One splitting pass over the whole corpus for a given context.
    splitPass acc sep = V.concatMap (splitDoc (docsSize acc) sep) acc

    -- Contexts applied successively.
    seps = V.fromList [Paragraphs 1, Sentences 3, Chars 3]
86 ---------------------------------------------------------------
-- | Turn 'Doc's back into CSV rows: the leading identifier of each 'Doc' is
-- discarded and the remaining fields are copied, in order, into a 'CsvDoc'.
fromDocs :: Vector Doc -> Vector CsvDoc
fromDocs = V.map dropDocId
  where
    dropDocId (Doc _ ttl src yr mon dy abstr auths) =
      CsvDoc ttl src yr mon dy abstr auths
92 ---------------------------------------------------------------
93 -- | Split a document in its context
94 -- TODO adapt the size of the paragraph according to the corpus average
96 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
97 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
100 if (mod (round m) docSize) >= 10
-- | Split one CSV row into several rows, one per chunk of its abstract.
--
-- The abstract is cut with 'splitBy' according to the given context. The first
-- chunk becomes the abstract of a row that keeps the original title. Every
-- following chunk becomes a new row whose title is the chunk's first sentence
-- and whose abstract is the remaining sentences joined with 'unsentences'.
-- Source, publication date and authors are copied unchanged to every row.
splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc' contextSize (CsvDoc t s py pm pd abst auth) =
  V.fromList (leadDoc : map chunkToDoc (restOr chunks))
  where
    chunks = splitBy contextSize abst

    -- Row carrying the original title and the first chunk.
    leadDoc = CsvDoc t s py pm pd (firstOr chunks) auth

    -- Row built from a later chunk: first sentence as title, rest as abstract.
    chunkToDoc chunk =
      let sents = sentences chunk
      in  CsvDoc (firstOr sents) s py pm pd (unsentences (restOr sents)) auth

    -- Total replacements for head / tail that fall back to an empty text.
    firstOr xs = maybe "" identity (head xs)
    restOr xs = maybe [""] identity (tailMay xs)
121 ---------------------------------------------------------------
122 ---------------------------------------------------------------
-- | Apply 'mean' to the lengths (in characters) of the abstracts of all rows.
docsSize :: Vector CsvDoc -> Mean
docsSize csvDoc = mean abstractSizes
  where
    abstractSizes = map abstractSize (V.toList csvDoc)
    abstractSize doc = fromIntegral (length (csv_abstract doc))
131 ---------------------------------------------------------------
134 , csv_source :: !Text
135 , csv_publication_year :: !Int
136 , csv_publication_month :: !Int
137 , csv_publication_day :: !Int
138 , csv_abstract :: !Text
139 , csv_authors :: !Text
143 instance FromNamedRecord CsvDoc where
144 parseNamedRecord r = CsvDoc <$> r .: "title"
146 <*> r .: "publication_year"
147 <*> r .: "publication_month"
148 <*> r .: "publication_day"
152 instance ToNamedRecord CsvDoc where
153 toNamedRecord (CsvDoc t s py pm pd abst aut) =
154 namedRecord [ "title" .= t
156 , "publication_year" .= py
157 , "publication_month" .= pm
158 , "publication_day" .= pd
-- | Decoding options: the library defaults, except that fields are separated
-- by a tab character.
csvDecodeOptions :: DecodeOptions
csvDecodeOptions = defaultDecodeOptions { decDelimiter = tab }
  where
    tab = fromIntegral (ord '\t')
-- | Encoding options: the library defaults, except that fields are separated
-- by a tab character.
csvEncodeOptions :: EncodeOptions
csvEncodeOptions = defaultEncodeOptions { encDelimiter = tab }
  where
    tab = fromIntegral (ord '\t')
174 ------------------------------------------------------------------------
175 ------------------------------------------------------------------------
-- | Read a CSV file with 'readCsv' and project every row to a single 'Text':
-- each accessor in @fields@ is applied to the row and the results are joined
-- with one space, in the order the accessors are given.
--
-- The header returned by 'readCsv' is ignored.
readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
readCsvOn fields fp = do
  (_, rows) <- readCsv fp
  pure (V.toList (V.map joinFields rows))
  where
    joinFields row = intercalate (pack " ") (map ($ row) fields)
181 ------------------------------------------------------------------------
-- | Read a file and decode it by header name using 'csvDecodeOptions'.
--
-- Returns the header together with the decoded 'CsvDoc' rows. A decoding
-- failure is not returned to the caller: the decoder's message is passed to
-- 'panic'.
readCsv :: FilePath -> IO (Header, Vector CsvDoc)
readCsv fp = do
  raw <- BL.readFile fp
  let decoded = decodeByNameWith csvDecodeOptions raw
  case decoded of
    Right parsed -> pure parsed
    Left err     -> panic (pack err)
-- | Read a file and decode it by header name using 'csvDecodeOptions', with
-- rows decoded as 'CsvHal'.
--
-- Returns the header together with the decoded rows. A decoding failure is
-- not returned to the caller: the decoder's message is passed to 'panic'.
readHal :: FilePath -> IO (Header, Vector CsvHal)
readHal fp = do
  raw <- BL.readFile fp
  let decoded = decodeByNameWith csvDecodeOptions raw
  case decoded of
    Right parsed -> pure parsed
    Left err     -> panic (pack err)
196 ------------------------------------------------------------------------
-- | Encode the rows by header name using 'csvEncodeOptions' and write the
-- resulting bytes to the given file.
writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv fp (h, vs) = BL.writeFile fp encoded
  where
    encoded = encodeByNameWith csvEncodeOptions h (V.toList vs)
204 ------------------------------------------------------------------------
207 { csvHal_title :: !Text
208 , csvHal_source :: !Text
209 , csvHal_publication_year :: !Int
210 , csvHal_publication_month :: !Int
211 , csvHal_publication_day :: !Int
212 , csvHal_abstract :: !Text
213 , csvHal_authors :: !Text
215 , csvHal_url :: !Text
216 , csvHal_isbn_s :: !Text
217 , csvHal_issue_s :: !Text
218 , csvHal_journalPublisher_s:: !Text
219 , csvHal_language_s :: !Text
221 , csvHal_doiId_s :: !Text
222 , csvHal_authId_i :: !Text
223 , csvHal_instStructId_i :: !Text
224 , csvHal_deptStructId_i :: !Text
225 , csvHal_labStructId_i :: !Text
227 , csvHal_rteamStructId_i :: !Text
228 , csvHal_docType_s :: !Text
232 instance FromNamedRecord CsvHal where
233 parseNamedRecord r = CsvHal <$> r .: "title"
235 <*> r .: "publication_year"
236 <*> r .: "publication_month"
237 <*> r .: "publication_day"
244 <*> r .: "journalPublisher_s"
245 <*> r .: "language_s"
249 <*> r .: "instStructId_i"
250 <*> r .: "deptStructId_i"
251 <*> r .: "labStructId_i"
253 <*> r .: "rteamStructId_i"
256 instance ToNamedRecord CsvHal where
257 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss jour lang doi auth inst dept lab team doct) =
258 namedRecord [ "title" .= t
260 , "publication_year" .= py
261 , "publication_month" .= pm
262 , "publication_day" .= pd
269 , "journalPublisher_s" .= jour
270 , "language_s" .= lang
274 , "instStructId_i" .= inst
275 , "deptStructId_i" .= dept
276 , "labStructId_i" .= lab
278 , "rteamStructId_i" .= team
279 , "docType_s" .= doct