2 Module : Gargantext.Text.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
18 module Gargantext.Text.Parsers.CSV where
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
23 import Control.Applicative
25 import Data.Char (ord)
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length, intercalate)
29 import qualified Data.ByteString.Lazy as BL
30 import Data.Time.Segment (jour)
32 import Data.Vector (Vector)
33 import qualified Data.Vector as V
36 import Gargantext.Database.Types.Node (HyperdataDocument(..))
37 import Gargantext.Text
38 import Gargantext.Text.Context
39 import Gargantext.Prelude hiding (length)
41 ---------------------------------------------------------------
46 , d_publication_year :: !Int
47 , d_publication_month :: !Int
48 , d_publication_day :: !Int
53 ---------------------------------------------------------------
54 -- | Doc 2 HyperdataDocument
55 doc2hyperdataDocument :: Doc -> HyperdataDocument
56 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
57 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
58 HyperdataDocument (Just "CSV")
59 (Just . pack . show $ did)
77 ---------------------------------------------------------------
-- | Convert parsed CSV rows into numbered 'Doc's.
--
-- The rows are first run through 'splitDoc' once per entry of @seps@, in
-- order; each pass receives the mean abstract length ('docsSize') of the rows
-- produced by the previous pass. The final rows are then numbered
-- consecutively starting from 1, and that number becomes the first field of
-- the resulting 'Doc'.
toDocs :: Vector CsvDoc -> [Doc]
toDocs v = V.toList (V.zipWith number ids splitted)
  where
    -- Attach an identifier to a row; the seven CSV fields pass through as is.
    number nId (CsvDoc t s py pm pd abst auth) = Doc nId t s py pm pd abst auth

    ids = V.enumFromN 1 (V.length splitted)

    -- Strict left fold: the result of each splitting pass is forced before
    -- the next one starts, instead of building a chain of thunks.
    splitted = V.foldl' step v seps
    step acc sep = V.concatMap (splitDoc (docsSize acc) sep) acc

    seps = V.fromList [Paragraphs 1, Sentences 3, Chars 3]
88 ---------------------------------------------------------------
-- | Turn 'Doc's back into 'CsvDoc's: the first field of every 'Doc' is
-- dropped and the remaining seven fields are copied over unchanged.
fromDocs :: Vector Doc -> Vector CsvDoc
fromDocs = V.map withoutId
  where
    withoutId (Doc _ ttl src yr mo dy abstr auts) =
      CsvDoc ttl src yr mo dy abstr auts
94 ---------------------------------------------------------------
-- | Split a document into its contexts
96 -- TODO adapt the size of the paragraph according to the corpus average
98 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
99 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
102 if (mod (round m) docSize) >= 10
-- | Cut one 'CsvDoc' into several, one per chunk that 'splitBy' produces from
-- its abstract.
--
-- The first chunk keeps the original title. Every following chunk becomes a
-- document whose title is the chunk's first sentence and whose abstract is the
-- chunk's remaining sentences. Source, date fields and authors are copied to
-- every resulting document.
splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc' contextSize (CsvDoc t s py pm pd abst auth) =
  V.fromList (firstDoc : nextDocs)
  where
    abstracts = splitBy contextSize abst

    firstDoc = CsvDoc t s py pm pd (headOrEmpty abstracts) auth

    nextDocs = map continuation (tailOrBlank abstracts)

    -- Build a follow-up document from one chunk of the abstract.
    continuation txt =
      let sents = sentences txt
      in  CsvDoc (headOrEmpty sents) s py pm pd (unsentences (tailOrBlank sents)) auth

    -- Fallbacks used when 'head' / 'tailMay' yield nothing.
    headOrEmpty xs = maybe "" identity (head xs)
    tailOrBlank xs = maybe [""] identity (tailMay xs)
123 ---------------------------------------------------------------
124 ---------------------------------------------------------------
-- | Mean length of the abstracts of the given documents, i.e. 'mean' applied
-- to the 'Data.Text.length' of every 'csv_abstract'.
docsSize :: Vector CsvDoc -> Mean
docsSize csvDoc = mean (V.toList abstractLengths)
  where
    abstractLengths = V.map (\d -> fromIntegral (length (csv_abstract d))) csvDoc
133 ---------------------------------------------------------------
136 , csv_source :: !Text
137 , csv_publication_year :: !Int
138 , csv_publication_month :: !Int
139 , csv_publication_day :: !Int
140 , csv_abstract :: !Text
141 , csv_authors :: !Text
145 instance FromNamedRecord CsvDoc where
146 parseNamedRecord r = CsvDoc <$> r .: "title"
148 <*> r .: "publication_year"
149 <*> r .: "publication_month"
150 <*> r .: "publication_day"
154 instance ToNamedRecord CsvDoc where
155 toNamedRecord (CsvDoc t s py pm pd abst aut) =
156 namedRecord [ "title" .= t
158 , "publication_year" .= py
159 , "publication_month" .= pm
160 , "publication_day" .= pd
166 csvDecodeOptions :: DecodeOptions
167 csvDecodeOptions = (defaultDecodeOptions
168 {decDelimiter = fromIntegral $ ord '\t'}
171 csvEncodeOptions :: EncodeOptions
172 csvEncodeOptions = ( defaultEncodeOptions
173 {encDelimiter = fromIntegral $ ord '\t'}
176 ------------------------------------------------------------------------
177 ------------------------------------------------------------------------
178 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
179 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
183 ------------------------------------------------------------------------
-- | Read the file at @fp@ and decode it by header name, using
-- 'csvDecodeOptions', into a header and a vector of 'CsvDoc's.
--
-- If decoding fails, 'panic' is called with the decoder's error message.
readCsv :: FilePath -> IO (Header, Vector CsvDoc)
readCsv fp = do
  raw <- BL.readFile fp
  case decodeByNameWith csvDecodeOptions raw of
    Right parsed -> pure parsed
    Left  err    -> panic (pack err)
-- | Read the file at @fp@ and decode it by header name, using
-- 'csvDecodeOptions', into a header and a vector of 'CsvHal' rows.
--
-- If decoding fails, 'panic' is called with the decoder's error message.
readHal :: FilePath -> IO (Header, Vector CsvHal)
readHal fp = do
  contents <- BL.readFile fp
  orPanic (decodeByNameWith csvDecodeOptions contents)
  where
    -- Unwrap a decoding result, aborting on failure.
    orPanic (Left e)     = panic (pack e)
    orPanic (Right rows) = pure rows
198 ------------------------------------------------------------------------
-- | Encode the documents under the given header with 'csvEncodeOptions'
-- (tab as field delimiter) and write the result to @fp@.
writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv fp (h, vs) = BL.writeFile fp encoded
  where
    encoded = encodeByNameWith csvEncodeOptions h (V.toList vs)
204 ------------------------------------------------------------------------
207 { csvHal_title :: !Text
208 , csvHal_source :: !Text
209 , csvHal_publication_year :: !Integer
210 , csvHal_publication_month :: !Int
211 , csvHal_publication_day :: !Int
212 , csvHal_abstract :: !Text
213 , csvHal_authors :: !Text
215 , csvHal_url :: !Text
216 , csvHal_isbn_s :: !Text
217 , csvHal_issue_s :: !Text
218 , csvHal_journalPublisher_s:: !Text
219 , csvHal_language_s :: !Text
221 , csvHal_doiId_s :: !Text
222 , csvHal_authId_i :: !Text
223 , csvHal_instStructId_i :: !Text
224 , csvHal_deptStructId_i :: !Text
225 , csvHal_labStructId_i :: !Text
227 , csvHal_rteamStructId_i :: !Text
228 , csvHal_docType_s :: !Text
232 instance FromNamedRecord CsvHal where
233 parseNamedRecord r = CsvHal <$> r .: "title"
235 <*> r .: "publication_year"
236 <*> r .: "publication_month"
237 <*> r .: "publication_day"
244 <*> r .: "journalPublisher_s"
245 <*> r .: "language_s"
249 <*> r .: "instStructId_i"
250 <*> r .: "deptStructId_i"
251 <*> r .: "labStructId_i"
253 <*> r .: "rteamStructId_i"
256 instance ToNamedRecord CsvHal where
257 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
258 namedRecord [ "title" .= t
261 , "publication_year" .= py
262 , "publication_month" .= pm
263 , "publication_day" .= pd
271 , "journalPublisher_s" .= j
272 , "language_s" .= lang
276 , "instStructId_i" .= inst
277 , "deptStructId_i" .= dept
278 , "labStructId_i" .= lab
280 , "rteamStructId_i" .= team
281 , "docType_s" .= doct
284 csvHal2doc :: CsvHal -> HyperdataDocument
285 csvHal2doc (CsvHal title source
286 pub_year pub_month pub_day
290 _ _ ) = HyperdataDocument (Just "CsvHal")
301 (Just $ pack . show $ jour pub_year pub_month pub_day)
302 (Just $ fromIntegral pub_year)
310 ------------------------------------------------------------------------
-- | Read a file with 'readHal', discard the header, and convert every row to
-- a 'HyperdataDocument' with 'csvHal2doc'.
parseHal :: FilePath -> IO [HyperdataDocument]
parseHal fp = map csvHal2doc . V.toList . snd <$> readHal fp
313 ------------------------------------------------------------------------