2 Module : Gargantext.Text.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
18 module Gargantext.Text.Parsers.CSV where
20 import Control.Applicative
21 import Data.Char (ord)
23 import Data.Either (Either(Left, Right))
24 import Data.Text (Text, pack, length, intercalate)
25 import Data.Time.Segment (jour)
26 import Data.Vector (Vector)
27 import GHC.IO (FilePath)
28 import GHC.Real (round)
29 import GHC.Word (Word8)
30 import Gargantext.Database.Types.Node -- (HyperdataDocument(..))
31 import Gargantext.Prelude hiding (length)
32 import Gargantext.Text
33 import Gargantext.Text.Context
34 import qualified Data.ByteString.Lazy as BL
35 import qualified Data.ByteString as BS
36 import qualified Data.Vector as V
38 ---------------------------------------------------------------
39 headerCsvGargV3 :: Header
40 headerCsvGargV3 = header [ "title"
48 ---------------------------------------------------------------
53 , d_publication_year :: !Int
54 , d_publication_month :: !Int
55 , d_publication_day :: !Int
60 ---------------------------------------------------------------
61 -- | Doc 2 HyperdataDocument
62 doc2hyperdataDocument :: Doc -> HyperdataDocument
63 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
64 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
65 HyperdataDocument (Just "CSV")
66 (Just . pack . show $ did)
88 ---------------------------------------------------------------
89 -- | Types Conversions
90 toDocs :: Vector CsvDoc -> [Doc]
92 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
93 -> Doc nId t s py pm pd abst auth )
94 (V.enumFromN 1 (V.length v'')) v''
96 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
97 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
99 ---------------------------------------------------------------
100 fromDocs :: Vector Doc -> Vector CsvDoc
101 fromDocs docs = V.map fromDocs' docs
103 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
105 ---------------------------------------------------------------
106 -- | Split a document in its context
107 -- TODO adapt the size of the paragraph according to the corpus average
109 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
110 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
113 if (mod (round m) docSize) >= 10
122 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
123 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
125 firstDoc = CsvDoc t s py pm pd firstAbstract auth
126 firstAbstract = head' "splitDoc'1" abstracts
128 nextDocs = map (\txt -> CsvDoc
129 (head' "splitDoc'2" $ sentences txt)
131 (unsentences $ tail' "splitDoc'1" $ sentences txt)
133 ) (tail' "splitDoc'2" abstracts)
135 abstracts = (splitBy $ contextSize) abst
137 ---------------------------------------------------------------
138 ---------------------------------------------------------------
141 docsSize :: Vector CsvDoc -> Mean
142 docsSize csvDoc = mean ls
144 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
147 ---------------------------------------------------------------
150 , csv_source :: !Text
151 , csv_publication_year :: !Int
152 , csv_publication_month :: !Int
153 , csv_publication_day :: !Int
154 , csv_abstract :: !Text
155 , csv_authors :: !Text
159 instance FromNamedRecord CsvDoc where
160 parseNamedRecord r = CsvDoc <$> r .: "title"
162 <*> r .: "publication_year"
163 <*> r .: "publication_month"
164 <*> r .: "publication_day"
168 instance ToNamedRecord CsvDoc where
169 toNamedRecord (CsvDoc t s py pm pd abst aut) =
170 namedRecord [ "title" .= t
172 , "publication_year" .= py
173 , "publication_month" .= pm
174 , "publication_day" .= pd
179 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
180 hyperdataDocument2csvDoc h = CsvDoc (m $ _hyperdataDocument_title h)
181 (m $ _hyperdataDocument_source h)
182 (mI $ _hyperdataDocument_publication_year h)
183 (mI $ _hyperdataDocument_publication_month h)
184 (mI $ _hyperdataDocument_publication_day h)
185 (m $ _hyperdataDocument_abstract h)
186 (m $ _hyperdataDocument_authors h)
189 m = maybe "" identity
190 mI = maybe 0 identity
-- | Options used when decoding CSV input: 'defaultDecodeOptions' with the
-- field separator replaced by the module-wide 'delimiter'.
csvDecodeOptions :: DecodeOptions
csvDecodeOptions = baseOptions { decDelimiter = delimiter }
  where
    baseOptions = defaultDecodeOptions
-- | Options used when encoding CSV output: 'defaultEncodeOptions' with the
-- field separator replaced by the module-wide 'delimiter', so that written
-- files use the same separator as the one expected by 'csvDecodeOptions'.
csvEncodeOptions :: EncodeOptions
csvEncodeOptions = baseOptions { encDelimiter = delimiter }
  where
    baseOptions = defaultEncodeOptions
-- Field separator shared by the decode and encode options: the character
-- code of TAB, converted to the numeric type those options expect.
delimiter = fromIntegral (ord '\t')
201 ------------------------------------------------------------------------
202 ------------------------------------------------------------------------
203 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
204 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
208 ------------------------------------------------------------------------
-- | Read the file at the given path with 'BL.readFile' and decode its
-- contents with 'readByteStringLazy'.
--
-- The first argument is handed unchanged to 'readByteStringLazy'.
readFileLazy :: (FromNamedRecord a) => a -> FilePath -> IO (Header, Vector a)
readFileLazy witness path = fmap (readByteStringLazy witness) (BL.readFile path)
-- | Read the file at the given path with the strict 'BS.readFile' and decode
-- its contents with 'readByteStringStrict'.
--
-- The first argument is handed unchanged to 'readByteStringStrict'.
readFileStrict :: (FromNamedRecord a) => a -> FilePath -> IO (Header, Vector a)
readFileStrict witness path = fmap (readByteStringStrict witness) (BS.readFile path)
-- | Decode a lazy 'BL.ByteString' by column name, using 'csvDecodeOptions'.
--
-- The first argument is never inspected by this function.
-- NOTE(review): it presumably only pins the record type at call sites - confirm.
--
-- On a decoding failure this calls 'panic' with the decoder's error message;
-- on success it returns the header together with the decoded rows.
readByteStringLazy :: (FromNamedRecord a) => a -> BL.ByteString -> (Header, Vector a)
readByteStringLazy _ input = unwrap (decodeByNameWith csvDecodeOptions input)
  where
    -- Turn the decoder's 'Either' into a plain value, panicking on 'Left'.
    unwrap (Right decoded) = decoded
    unwrap (Left message)  = panic (pack message)
-- | Strict-'BS.ByteString' variant of 'readByteStringLazy': the input is
-- converted with 'BL.fromStrict' and then decoded by 'readByteStringLazy'.
readByteStringStrict :: (FromNamedRecord a) => a -> BS.ByteString -> (Header, Vector a)
readByteStringStrict ff strictInput = readByteStringLazy ff (BL.fromStrict strictInput)
225 ------------------------------------------------------------------------
-- | Read the file at the given path with 'BL.readFile' and decode it into
-- 'CsvDoc' rows via 'readCsvLazyBS'.
--
-- TODO use readFileLazy
readFile :: FilePath -> IO (Header, Vector CsvDoc)
readFile path = fmap readCsvLazyBS (BL.readFile path)
-- | Decode a lazy 'BL.ByteString' into a header and 'CsvDoc' rows, by column
-- name, using 'csvDecodeOptions'.  Calls 'panic' with the decoder's error
-- message when decoding fails.
--
-- TODO use readByteStringLazy
readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
readCsvLazyBS input = unwrap (decodeByNameWith csvDecodeOptions input)
  where
    -- Turn the decoder's 'Either' into a plain value, panicking on 'Left'.
    unwrap (Right decoded) = decoded
    unwrap (Left message)  = panic (pack message)
236 ------------------------------------------------------------------------
-- | Read the file at the given path with 'BL.readFile' and decode it into
-- 'CsvHal' rows via 'readCsvHalLazyBS'.
--
-- TODO use readFileLazy
readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
readCsvHal path = fmap readCsvHalLazyBS (BL.readFile path)
-- | Decode a lazy 'BL.ByteString' into a header and 'CsvHal' rows, by column
-- name, using 'csvDecodeOptions'.  Calls 'panic' with the decoder's error
-- message when decoding fails.
--
-- TODO use readByteStringLazy
readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
readCsvHalLazyBS input = unwrap (decodeByNameWith csvDecodeOptions input)
  where
    -- Turn the decoder's 'Either' into a plain value, panicking on 'Left'.
    unwrap (Right decoded) = decoded
    unwrap (Left message)  = panic (pack message)
-- | Strict-'BS.ByteString' variant of 'readCsvHalLazyBS': the input is
-- converted with 'BL.fromStrict' and then decoded by 'readCsvHalLazyBS'.
readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
readCsvHalBSStrict strictInput = readCsvHalLazyBS (BL.fromStrict strictInput)
250 ------------------------------------------------------------------------
-- | Encode a header and its 'CsvDoc' rows by column name with
-- 'csvEncodeOptions' and write the resulting bytes to the given path.
writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeFile path (hdr, rows) = BL.writeFile path encoded
  where
    -- The encoder is given a list, so the vector is converted first.
    encoded = encodeByNameWith csvEncodeOptions hdr (V.toList rows)
-- | Serialise the documents with 'hyperdataDocument2csv' and write the
-- resulting bytes to the given path.
writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Csv fp hs = BL.writeFile fp (hyperdataDocument2csv hs)
-- | Render documents as CSV bytes: each document is converted with
-- 'hyperdataDocument2csvDoc', then the rows are encoded by column name
-- using 'csvEncodeOptions' and the 'headerCsvGargV3' header.
hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
hyperdataDocument2csv docs = encodeByNameWith csvEncodeOptions headerCsvGargV3 rows
  where
    rows = map hyperdataDocument2csvDoc docs
261 ------------------------------------------------------------------------
264 { csvHal_title :: !Text
265 , csvHal_source :: !Text
266 , csvHal_publication_year :: !Integer
267 , csvHal_publication_month :: !Int
268 , csvHal_publication_day :: !Int
269 , csvHal_abstract :: !Text
270 , csvHal_authors :: !Text
272 , csvHal_url :: !Text
273 , csvHal_isbn_s :: !Text
274 , csvHal_issue_s :: !Text
275 , csvHal_journalPublisher_s:: !Text
276 , csvHal_language_s :: !Text
278 , csvHal_doiId_s :: !Text
279 , csvHal_authId_i :: !Text
280 , csvHal_instStructId_i :: !Text
281 , csvHal_deptStructId_i :: !Text
282 , csvHal_labStructId_i :: !Text
284 , csvHal_rteamStructId_i :: !Text
285 , csvHal_docType_s :: !Text
289 instance FromNamedRecord CsvHal where
290 parseNamedRecord r = CsvHal <$> r .: "title"
292 <*> r .: "publication_year"
293 <*> r .: "publication_month"
294 <*> r .: "publication_day"
301 <*> r .: "journalPublisher_s"
302 <*> r .: "language_s"
306 <*> r .: "instStructId_i"
307 <*> r .: "deptStructId_i"
308 <*> r .: "labStructId_i"
310 <*> r .: "rteamStructId_i"
313 instance ToNamedRecord CsvHal where
314 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
315 namedRecord [ "title" .= t
318 , "publication_year" .= py
319 , "publication_month" .= pm
320 , "publication_day" .= pd
328 , "journalPublisher_s" .= j
329 , "language_s" .= lang
333 , "instStructId_i" .= inst
334 , "deptStructId_i" .= dept
335 , "labStructId_i" .= lab
337 , "rteamStructId_i" .= team
338 , "docType_s" .= doct
341 csvHal2doc :: CsvHal -> HyperdataDocument
342 csvHal2doc (CsvHal title source
343 pub_year pub_month pub_day
347 _ _ ) = HyperdataDocument (Just "CsvHal")
358 (Just $ pack . show $ jour pub_year pub_month pub_day)
359 (Just $ fromIntegral pub_year)
367 ------------------------------------------------------------------------
-- | Read a file with 'readCsvHal', drop the header, and convert every
-- decoded row to a 'HyperdataDocument' with 'csvHal2doc'.
parseHal :: FilePath -> IO [HyperdataDocument]
parseHal fp = toDocuments <$> readCsvHal fp
  where
    -- Keep only the rows (second component) and convert them one by one.
    toDocuments parsed = map csvHal2doc (V.toList (snd parsed))
370 ------------------------------------------------------------------------