2 Module : Gargantext.Text.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
18 module Gargantext.Text.Parsers.CSV where
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
22 import Control.Applicative
24 import Data.Char (ord)
26 import Data.Either (Either(Left, Right))
27 import Data.Text (Text, pack, length, intercalate)
28 import qualified Data.ByteString.Lazy as BL
29 import Data.Time.Segment (jour)
31 import Data.Vector (Vector)
32 import qualified Data.Vector as V
34 import Gargantext.Database.Types.Node -- (HyperdataDocument(..))
35 import Gargantext.Text
36 import Gargantext.Text.Context
37 import Gargantext.Prelude hiding (length)
39 ---------------------------------------------------------------
-- | Column header handed to 'encodeByNameWith' when documents are written
-- out as CSV (see 'writeDocs2Csv' and 'hyperdataDocument2csv').
-- The first column is @title@; the remaining columns are listed on lines
-- that are not visible in this excerpt.
40 headerCsvGargV3 :: Header
41 headerCsvGargV3 = header [ "title"
49 ---------------------------------------------------------------
54 , d_publication_year :: !Int
55 , d_publication_month :: !Int
56 , d_publication_day :: !Int
61 ---------------------------------------------------------------
62 -- | Convert a 'Doc' into a 'HyperdataDocument'.
--
-- The first argument given to the 'HyperdataDocument' constructor is
-- @Just "CSV"@ and the second is the document id rendered with 'show' and
-- packed into 'Text'.  The third field of the 'Doc' is matched with a
-- wildcard and therefore not used.  The remaining constructor arguments
-- are on lines not visible in this excerpt.
63 doc2hyperdataDocument :: Doc -> HyperdataDocument
64 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
65 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
66 HyperdataDocument (Just "CSV")
67 (Just . pack . show $ did)
89 ---------------------------------------------------------------
90 -- | Types Conversions
--
-- Turn CSV rows into 'Doc's.  The rows are first expanded by running
-- 'splitDoc' over the whole vector once per entry of @seps@; the expanded
-- rows are then numbered consecutively starting at 1 and that number
-- becomes the first field of each 'Doc'.
91 toDocs :: Vector CsvDoc -> [Doc]
93 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
94 -> Doc nId t s py pm pd abst auth )
95 (V.enumFromN 1 (V.length v'')) v''
-- One pass per split context; the mean size given to 'splitDoc' is computed
-- from the vector as it stands at the start of that pass.
-- (@v@ is presumably the function argument, bound on a line not shown.)
97 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
-- Split contexts applied in this order.
98 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
100 ---------------------------------------------------------------
-- | Convert a vector of 'Doc's into 'CsvDoc's, element by element.
101 fromDocs :: Vector Doc -> Vector CsvDoc
102 fromDocs docs = V.map fromDocs' docs
-- Drop the first field of the 'Doc' and pass the other seven, in the same
-- order, to the 'CsvDoc' constructor.
104 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
106 ---------------------------------------------------------------
107 -- | Split a document into its contexts.
108 -- TODO adapt the size of the paragraph according to the corpus average
--
-- @m@ is a mean size (in 'toDocs' it is the result of 'docsSize') and
-- @docSize@ is the length of this document's abstract.  The two branches
-- selected by the condition below are on lines not visible in this excerpt.
--
-- NOTE(review): @mod (round m) docSize@ uses @docSize@ as the divisor.  A
-- document with an empty abstract gives @docSize == 0@, and 'mod' with a
-- zero divisor raises a divide-by-zero exception, unless a guard on the
-- lines not shown here prevents it.  TODO confirm.
110 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
111 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
114 if (mod (round m) docSize) >= 10
-- | Split one document into several: the abstract is cut into chunks with
-- @splitBy contextSize@, the first chunk stays with the original document
-- and every further chunk becomes a document of its own.
123 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
124 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
-- The original document, with its abstract replaced by the first chunk.
126 firstDoc = CsvDoc t s py pm pd firstAbstract auth
-- NOTE(review): head' and tail' take a label string, presumably used in the
-- error raised on an empty list.  TODO confirm against their definitions.
127 firstAbstract = head' "splitDoc'1" abstracts
-- For each later chunk, the first sentence is the first constructor
-- argument (the title position) and the remaining sentences are joined back
-- together for a later argument, presumably the abstract; the arguments in
-- between are on lines not visible in this excerpt.
129 nextDocs = map (\txt -> CsvDoc
130 (head' "splitDoc'2" $ sentences txt)
132 (unsentences $ tail' "splitDoc'1" $ sentences txt)
134 ) (tail' "splitDoc'2" abstracts)
-- Chunks of the abstract for the requested split context.
136 abstracts = (splitBy $ contextSize) abst
138 ---------------------------------------------------------------
139 ---------------------------------------------------------------
-- | Mean abstract length over a vector of CSV documents.
142 docsSize :: Vector CsvDoc -> Mean
143 docsSize csvDoc = mean ls
-- Length of every document's 'csv_abstract' (a 'Text', so this is
-- "Data.Text"'s @length@), converted with 'fromIntegral' for 'mean'.
145 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
148 ---------------------------------------------------------------
151 , csv_source :: !Text
152 , csv_publication_year :: !Int
153 , csv_publication_month :: !Int
154 , csv_publication_day :: !Int
155 , csv_abstract :: !Text
156 , csv_authors :: !Text
-- | Decode a named CSV record into a 'CsvDoc'; each constructor argument is
-- looked up by column name with '.:'.  The lookups visible here are
-- @title@, @publication_year@, @publication_month@ and @publication_day@;
-- the other columns are on lines not visible in this excerpt.
160 instance FromNamedRecord CsvDoc where
161 parseNamedRecord r = CsvDoc <$> r .: "title"
163 <*> r .: "publication_year"
164 <*> r .: "publication_month"
165 <*> r .: "publication_day"
-- | Encode a 'CsvDoc' as a named CSV record by pairing column names with
-- the constructor fields.  The pairs visible here are @title@,
-- @publication_year@, @publication_month@ and @publication_day@; the rest
-- of the list is on lines not visible in this excerpt.
169 instance ToNamedRecord CsvDoc where
170 toNamedRecord (CsvDoc t s py pm pd abst aut) =
171 namedRecord [ "title" .= t
173 , "publication_year" .= py
174 , "publication_month" .= pm
175 , "publication_day" .= pd
-- | Build a 'CsvDoc' from a 'HyperdataDocument'.  Every field read from the
-- document is optional; a missing value is replaced by a default: the empty
-- text for title, source, abstract and authors, and 0 for the publication
-- year, month and day.
180 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
181 hyperdataDocument2csvDoc h = CsvDoc (m $ _hyperdataDocument_title h)
182 (m $ _hyperdataDocument_source h)
183 (mI $ _hyperdataDocument_publication_year h)
184 (mI $ _hyperdataDocument_publication_month h)
185 (mI $ _hyperdataDocument_publication_day h)
186 (m $ _hyperdataDocument_abstract h)
187 (m $ _hyperdataDocument_authors h)
-- Default for a missing textual field: the empty text.
190 m = maybe "" identity
-- Default for a missing numeric field: zero.
191 mI = maybe 0 identity
-- | Decoding options used by the readers in this module: the defaults from
-- 'defaultDecodeOptions' with the field delimiter set to the tab character.
194 csvDecodeOptions :: DecodeOptions
195 csvDecodeOptions = (defaultDecodeOptions
196 {decDelimiter = fromIntegral $ ord '\t'}
-- | Encoding options used by the writers in this module: the defaults from
-- 'defaultEncodeOptions' with the field delimiter set to the tab character.
199 csvEncodeOptions :: EncodeOptions
200 csvEncodeOptions = ( defaultEncodeOptions
201 {encDelimiter = fromIntegral $ ord '\t'}
204 ------------------------------------------------------------------------
205 ------------------------------------------------------------------------
-- | Produce one 'Text' per CSV row: every accessor in @fields@ is applied to
-- the row and the results are joined with a single space.
-- The expression that supplies the rows (presumably read from @fp@)
-- continues on a line not visible in this excerpt.
206 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
207 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
211 ------------------------------------------------------------------------
-- | Read the file at @fp@ as a lazy 'BL.ByteString' and decode it by column
-- name using 'csvDecodeOptions'.  On success the decoded (header, rows)
-- pair is returned; on failure the decoder's error message is packed into
-- 'Text' and handed to 'panic'.
212 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
214 csvData <- BL.readFile fp
215 case decodeByNameWith csvDecodeOptions csvData of
216 Left e -> panic (pack e)
217 Right csvDocs -> pure csvDocs
-- | Same as 'readCsv' but decoding rows as 'CsvHal': the file at @fp@ is
-- read as a lazy 'BL.ByteString' and decoded by column name using
-- 'csvDecodeOptions'.  A decoding failure hands the packed error message to
-- 'panic'.
220 readHal :: FilePath -> IO (Header, Vector CsvHal)
222 csvData <- BL.readFile fp
223 case decodeByNameWith csvDecodeOptions csvData of
224 Left e -> panic (pack e)
225 Right csvDocs -> pure csvDocs
226 ------------------------------------------------------------------------
-- | Encode the given rows by column name, using the supplied header and
-- 'csvEncodeOptions', and write the resulting bytes to the file at @fp@.
writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv fp (h, vs) = BL.writeFile fp encoded
  where
    -- The encoder takes a list, so the vector is converted first.
    rows    = V.toList vs
    encoded = encodeByNameWith csvEncodeOptions h rows
-- | Write the documents to the file at @fp@ as CSV.
--
-- The bytes are produced by 'hyperdataDocument2csv', so the file written
-- here and the in-memory encoding always use the same header, options and
-- row conversion instead of two copies of the same expression.
writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Csv fp hs = BL.writeFile fp (hyperdataDocument2csv hs)
-- | Render documents as CSV bytes: each document is converted with
-- 'hyperdataDocument2csvDoc' and the rows are encoded by column name under
-- 'headerCsvGargV3' with 'csvEncodeOptions'.
hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
hyperdataDocument2csv hs =
  encodeByNameWith csvEncodeOptions headerCsvGargV3 rows
  where
    rows = map hyperdataDocument2csvDoc hs
238 ------------------------------------------------------------------------
241 { csvHal_title :: !Text
242 , csvHal_source :: !Text
243 , csvHal_publication_year :: !Integer
244 , csvHal_publication_month :: !Int
245 , csvHal_publication_day :: !Int
246 , csvHal_abstract :: !Text
247 , csvHal_authors :: !Text
249 , csvHal_url :: !Text
250 , csvHal_isbn_s :: !Text
251 , csvHal_issue_s :: !Text
252 , csvHal_journalPublisher_s:: !Text
253 , csvHal_language_s :: !Text
255 , csvHal_doiId_s :: !Text
256 , csvHal_authId_i :: !Text
257 , csvHal_instStructId_i :: !Text
258 , csvHal_deptStructId_i :: !Text
259 , csvHal_labStructId_i :: !Text
261 , csvHal_rteamStructId_i :: !Text
262 , csvHal_docType_s :: !Text
-- | Decode a named CSV record into a 'CsvHal'; each constructor argument is
-- looked up by column name with '.:'.  Only some of the lookups are visible
-- in this excerpt; the others are on lines not shown.
266 instance FromNamedRecord CsvHal where
267 parseNamedRecord r = CsvHal <$> r .: "title"
269 <*> r .: "publication_year"
270 <*> r .: "publication_month"
271 <*> r .: "publication_day"
278 <*> r .: "journalPublisher_s"
279 <*> r .: "language_s"
283 <*> r .: "instStructId_i"
284 <*> r .: "deptStructId_i"
285 <*> r .: "labStructId_i"
287 <*> r .: "rteamStructId_i"
-- | Encode a 'CsvHal' as a named CSV record by pairing column names with
-- the constructor fields bound in the pattern.  Only some of the pairs are
-- visible in this excerpt; the others are on lines not shown.
290 instance ToNamedRecord CsvHal where
291 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
292 namedRecord [ "title" .= t
295 , "publication_year" .= py
296 , "publication_month" .= pm
297 , "publication_day" .= pd
305 , "journalPublisher_s" .= j
306 , "language_s" .= lang
310 , "instStructId_i" .= inst
311 , "deptStructId_i" .= dept
312 , "labStructId_i" .= lab
314 , "rteamStructId_i" .= team
315 , "docType_s" .= doct
-- | Convert a 'CsvHal' row into a 'HyperdataDocument' whose first
-- constructor argument is @Just "CsvHal"@.  Most of the pattern and of the
-- constructor arguments are on lines not visible in this excerpt.
318 csvHal2doc :: CsvHal -> HyperdataDocument
319 csvHal2doc (CsvHal title source
320 pub_year pub_month pub_day
324 _ _ ) = HyperdataDocument (Just "CsvHal")
-- The value built by 'jour' from year, month and day, rendered with 'show'.
335 (Just $ pack . show $ jour pub_year pub_month pub_day)
-- 'csvHal_publication_year' is an 'Integer', hence the numeric conversion
-- (presumably to the 'Int' expected by 'HyperdataDocument'; TODO confirm).
336 (Just $ fromIntegral pub_year)
344 ------------------------------------------------------------------------
-- | Read a HAL CSV file with 'readHal', discard the header and convert every
-- row into a 'HyperdataDocument' with 'csvHal2doc'.
parseHal :: FilePath -> IO [HyperdataDocument]
parseHal fp = toDocuments <$> readHal fp
  where
    -- Keep only the rows of the (header, rows) pair, as a list.
    toDocuments = map csvHal2doc . V.toList . snd
347 ------------------------------------------------------------------------