2 Module : Gargantext.Text.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
18 module Gargantext.Text.Parsers.CSV where
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
23 import Control.Applicative
25 import Data.Char (ord)
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length, intercalate)
29 import qualified Data.ByteString.Lazy as BL
30 import Data.Time.Segment (jour)
32 import Data.Vector (Vector)
33 import qualified Data.Vector as V
35 import Gargantext.Database.Types.Node -- (HyperdataDocument(..))
36 import Gargantext.Text
37 import Gargantext.Text.Context
38 import Gargantext.Prelude hiding (length)
40 ---------------------------------------------------------------
41 headerCsvGargV3 :: Header
42 headerCsvGargV3 = header [ "title"
50 ---------------------------------------------------------------
55 , d_publication_year :: !Int
56 , d_publication_month :: !Int
57 , d_publication_day :: !Int
62 ---------------------------------------------------------------
63 -- | Doc 2 HyperdataDocument
64 doc2hyperdataDocument :: Doc -> HyperdataDocument
65 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
66 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
67 HyperdataDocument (Just "CSV")
68 (Just . pack . show $ did)
90 ---------------------------------------------------------------
91 -- | Types Conversions
-- Turn parsed CSV rows into 'Doc' values.
-- Each row of the vector v'' is rebuilt as a 'Doc' whose first field is an id
-- taken from V.enumFromN 1, so ids start at 1 and follow the order of v''.
-- NOTE(review): the equation head binding `v` is not visible in this excerpt;
-- `v` is presumably the function's argument - confirm.
92 toDocs :: Vector CsvDoc -> [Doc]
94 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
95 -> Doc nId t s py pm pd abst auth )
96 (V.enumFromN 1 (V.length v'')) v''
-- v'' is obtained by folding over seps: at each step every row of the current
-- vector v' goes through splitDoc, which is given docsSize v' (recomputed for
-- that step) and the current separator.
98 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
-- The separators are applied in this order.
99 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
101 ---------------------------------------------------------------
-- Map every 'Doc' to a 'CsvDoc'.
-- The first field of 'Doc' is discarded; the remaining seven fields are passed
-- to the 'CsvDoc' constructor in the same order.
102 fromDocs :: Vector Doc -> Vector CsvDoc
103 fromDocs docs = V.map fromDocs' docs
105 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
107 ---------------------------------------------------------------
108 -- | Split a document in its context
109 -- TODO adapt the size of the paragraph according to the corpus average
111 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
112 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
115 if (mod (round m) docSize) >= 10
-- Split one CSV row into several rows according to contextSize.
-- The abstract is cut into chunks with splitBy contextSize. The result is the
-- first document followed by one document per remaining chunk.
-- NOTE(review): some argument lines of the CsvDoc built in nextDocs are not
-- visible in this excerpt; confirm which fields they fill.
124 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
125 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
-- The first document keeps every original field except the abstract, which is
-- replaced by the first chunk.
127 firstDoc = CsvDoc t s py pm pd firstAbstract auth
128 firstAbstract = head' "splitDoc'1" abstracts
-- Each later chunk is split into sentences: the first sentence is the first
-- constructor argument, and the remaining sentences are joined back with
-- unsentences for a later argument.
130 nextDocs = map (\txt -> CsvDoc
131 (head' "splitDoc'2" $ sentences txt)
133 (unsentences $ tail' "splitDoc'1" $ sentences txt)
135 ) (tail' "splitDoc'2" abstracts)
137 abstracts = (splitBy $ contextSize) abst
139 ---------------------------------------------------------------
140 ---------------------------------------------------------------
-- Mean abstract length over the given rows.
-- The only `length` imported by name in this file is Data.Text's (the one from
-- Gargantext.Prelude is hidden), so each size is the abstract's character count.
-- NOTE(review): `mean` is defined outside this block; its result for an empty
-- vector cannot be determined from here.
143 docsSize :: Vector CsvDoc -> Mean
144 docsSize csvDoc = mean ls
146 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
149 ---------------------------------------------------------------
152 , csv_source :: !Text
153 , csv_publication_year :: !Int
154 , csv_publication_month :: !Int
155 , csv_publication_day :: !Int
156 , csv_abstract :: !Text
157 , csv_authors :: !Text
161 instance FromNamedRecord CsvDoc where
162 parseNamedRecord r = CsvDoc <$> r .: "title"
164 <*> r .: "publication_year"
165 <*> r .: "publication_month"
166 <*> r .: "publication_day"
170 instance ToNamedRecord CsvDoc where
171 toNamedRecord (CsvDoc t s py pm pd abst aut) =
172 namedRecord [ "title" .= t
174 , "publication_year" .= py
175 , "publication_month" .= pm
176 , "publication_day" .= pd
-- Build a 'CsvDoc' from the optional fields of a 'HyperdataDocument'.
-- Title, source, abstract and authors go through `m`; the three publication
-- date fields go through `mI`.
181 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
182 hyperdataDocument2csvDoc h = CsvDoc (m $ _hyperdataDocument_title h)
183 (m $ _hyperdataDocument_source h)
184 (mI $ _hyperdataDocument_publication_year h)
185 (mI $ _hyperdataDocument_publication_month h)
186 (mI $ _hyperdataDocument_publication_day h)
187 (m $ _hyperdataDocument_abstract h)
188 (m $ _hyperdataDocument_authors h)
-- m: a missing value becomes the empty string.
191 m = maybe "" identity
-- mI: a missing value becomes 0.
192 mI = maybe 0 identity
-- Default decode options with the field delimiter overridden to the tab
-- character ('\t'), i.e. input is read as tab-separated values.
195 csvDecodeOptions :: DecodeOptions
196 csvDecodeOptions = (defaultDecodeOptions
197 {decDelimiter = fromIntegral $ ord '\t'}
-- Default encode options with the field delimiter overridden to the tab
-- character ('\t'), i.e. output is written as tab-separated values.
200 csvEncodeOptions :: EncodeOptions
201 csvEncodeOptions = ( defaultEncodeOptions
202 {encDelimiter = fromIntegral $ ord '\t'}
205 ------------------------------------------------------------------------
206 ------------------------------------------------------------------------
-- Produce one Text per row: every accessor in `fields` is applied to the row
-- and the results are joined with a single space.
-- NOTE(review): the rest of the expression (where the rows come from) is not
-- visible in this excerpt; presumably they are read from fp - confirm.
207 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
208 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
212 ------------------------------------------------------------------------
-- Read the file at fp as a lazy ByteString and decode it by header name using
-- csvDecodeOptions, yielding the header together with the decoded rows.
-- A decoding failure is not returned to the caller: the error message is
-- packed into Text and handed to panic.
213 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
215 csvData <- BL.readFile fp
216 case decodeByNameWith csvDecodeOptions csvData of
217 Left e -> panic (pack e)
218 Right csvDocs -> pure csvDocs
-- Same reading procedure as for CsvDoc rows, but decoding into 'CsvHal' rows:
-- the file at fp is read as a lazy ByteString and decoded by header name using
-- csvDecodeOptions.
-- A decoding failure is not returned to the caller: the error message is
-- packed into Text and handed to panic.
221 readHal :: FilePath -> IO (Header, Vector CsvHal)
223 csvData <- BL.readFile fp
224 case decodeByNameWith csvDecodeOptions csvData of
225 Left e -> panic (pack e)
226 Right csvDocs -> pure csvDocs
227 ------------------------------------------------------------------------
-- | Encode the rows by header name with 'csvEncodeOptions' and write the
-- resulting lazy ByteString to @fp@.
writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv fp (hdr, rows) = BL.writeFile fp encoded
  where
    -- The encoder takes a list, so the vector is converted first.
    encoded = encodeByNameWith csvEncodeOptions hdr (V.toList rows)
-- | Convert every 'HyperdataDocument' with 'hyperdataDocument2csvDoc' and write
-- the rows to @fp@ under the 'headerCsvGargV3' header, using 'csvEncodeOptions'.
writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Csv fp hs = BL.writeFile fp encoded
  where
    csvDocs = map hyperdataDocument2csvDoc hs
    encoded = encodeByNameWith csvEncodeOptions headerCsvGargV3 csvDocs
235 ------------------------------------------------------------------------
238 { csvHal_title :: !Text
239 , csvHal_source :: !Text
240 , csvHal_publication_year :: !Integer
241 , csvHal_publication_month :: !Int
242 , csvHal_publication_day :: !Int
243 , csvHal_abstract :: !Text
244 , csvHal_authors :: !Text
246 , csvHal_url :: !Text
247 , csvHal_isbn_s :: !Text
248 , csvHal_issue_s :: !Text
249 , csvHal_journalPublisher_s:: !Text
250 , csvHal_language_s :: !Text
252 , csvHal_doiId_s :: !Text
253 , csvHal_authId_i :: !Text
254 , csvHal_instStructId_i :: !Text
255 , csvHal_deptStructId_i :: !Text
256 , csvHal_labStructId_i :: !Text
258 , csvHal_rteamStructId_i :: !Text
259 , csvHal_docType_s :: !Text
263 instance FromNamedRecord CsvHal where
264 parseNamedRecord r = CsvHal <$> r .: "title"
266 <*> r .: "publication_year"
267 <*> r .: "publication_month"
268 <*> r .: "publication_day"
275 <*> r .: "journalPublisher_s"
276 <*> r .: "language_s"
280 <*> r .: "instStructId_i"
281 <*> r .: "deptStructId_i"
282 <*> r .: "labStructId_i"
284 <*> r .: "rteamStructId_i"
287 instance ToNamedRecord CsvHal where
288 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
289 namedRecord [ "title" .= t
292 , "publication_year" .= py
293 , "publication_month" .= pm
294 , "publication_day" .= pd
302 , "journalPublisher_s" .= j
303 , "language_s" .= lang
307 , "instStructId_i" .= inst
308 , "deptStructId_i" .= dept
309 , "labStructId_i" .= lab
311 , "rteamStructId_i" .= team
312 , "docType_s" .= doct
315 csvHal2doc :: CsvHal -> HyperdataDocument
316 csvHal2doc (CsvHal title source
317 pub_year pub_month pub_day
321 _ _ ) = HyperdataDocument (Just "CsvHal")
332 (Just $ pack . show $ jour pub_year pub_month pub_day)
333 (Just $ fromIntegral pub_year)
341 ------------------------------------------------------------------------
-- | Read the file at @fp@ with 'readHal' and convert every decoded row with
-- 'csvHal2doc'. The header returned by 'readHal' is dropped.
parseHal :: FilePath -> IO [HyperdataDocument]
parseHal fp = toHyperdata <$> readHal fp
  where
    -- Keep only the rows (second component), as a list of converted documents.
    toHyperdata result = map csvHal2doc (V.toList (snd result))
344 ------------------------------------------------------------------------