2 Module : Gargantext.Core.Text.Corpus.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
15 module Gargantext.Core.Text.Corpus.Parsers.CSV where
17 import Control.Applicative
18 import qualified Data.ByteString as BS
19 import qualified Data.ByteString.Lazy as BL
20 import Data.Char (ord)
22 import Data.Either (Either(Left, Right))
23 import Data.Text (Text, pack, length, intercalate)
24 import Data.Time.Segment (jour)
25 import qualified Data.Vector as V
26 import Data.Vector (Vector)
27 import GHC.IO (FilePath)
28 import GHC.Word (Word8)
30 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
31 import Gargantext.Prelude hiding (length)
32 import Gargantext.Core.Text
33 import Gargantext.Core.Text.Context
35 ---------------------------------------------------------------
36 headerCsvGargV3 :: Header
37 headerCsvGargV3 = header [ "title"
45 ---------------------------------------------------------------
46 data CsvGargV3 = CsvGargV3
50 , d_publication_year :: !Int
51 , d_publication_month :: !Int
52 , d_publication_day :: !Int
57 ---------------------------------------------------------------
58 -- | Doc 2 HyperdataDocument
59 toDoc :: CsvGargV3 -> HyperdataDocument
60 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
61 HyperdataDocument (Just "CSV")
62 (Just . pack . show $ did)
81 ---------------------------------------------------------------
82 -- | Types Conversions
83 toDocs :: Vector CsvDoc -> [CsvGargV3]
85 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
86 -> CsvGargV3 nId t s py pm pd abst auth )
87 (V.enumFromN 1 (V.length v'')) v''
89 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
90 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
92 ---------------------------------------------------------------
-- | Convert every 'CsvGargV3' row back into a plain 'CsvDoc' by discarding
-- its leading identifier field and keeping the remaining seven fields in
-- order.
fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
fromDocs = V.map dropId
  where
    -- The first constructor field is ignored; everything else is copied.
    dropId (CsvGargV3 _ title src year month day abstract authors) =
      CsvDoc title src year month day abstract authors
98 ---------------------------------------------------------------
99 -- | Split a document in its context
100 -- TODO adapt the size of the paragraph according to the corpus average
101 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
102 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
105 if (mod (round m) docSize) >= 10
113 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
114 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
116 firstDoc = CsvDoc t s py pm pd firstAbstract auth
117 firstAbstract = head' "splitDoc'1" abstracts
119 nextDocs = map (\txt -> CsvDoc
120 (head' "splitDoc'2" $ sentences txt)
122 (unsentences $ tail' "splitDoc'1" $ sentences txt)
124 ) (tail' "splitDoc'2" abstracts)
126 abstracts = (splitBy $ contextSize) abst
128 ---------------------------------------------------------------
129 ---------------------------------------------------------------
-- | Apply 'mean' to the abstract lengths (in characters, via the imported
-- 'Data.Text.length') of all documents in the vector.
docsSize :: Vector CsvDoc -> Mean
docsSize docs = mean abstractLengths
  where
    -- One numeric length per document, in vector order.
    abstractLengths = map (fromIntegral . length . csv_abstract) (V.toList docs)
138 ---------------------------------------------------------------
141 , csv_source :: !Text
142 , csv_publication_year :: !Int
143 , csv_publication_month :: !Int
144 , csv_publication_day :: !Int
145 , csv_abstract :: !Text
146 , csv_authors :: !Text
150 instance FromNamedRecord CsvDoc where
151 parseNamedRecord r = CsvDoc <$> r .: "title"
153 <*> r .: "publication_year"
154 <*> r .: "publication_month"
155 <*> r .: "publication_day"
159 instance ToNamedRecord CsvDoc where
160 toNamedRecord (CsvDoc t s py pm pd abst aut) =
161 namedRecord [ "title" .= t
163 , "publication_year" .= py
164 , "publication_month" .= pm
165 , "publication_day" .= pd
170 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
171 hyperdataDocument2csvDoc h = CsvDoc (m $ _hd_title h)
173 (mI $ _hd_publication_year h)
174 (mI $ _hd_publication_month h)
175 (mI $ _hd_publication_day h)
180 m = maybe "" identity
181 mI = maybe 0 identity
-- | Decoding options used by every reader in this module:
-- 'defaultDecodeOptions' with the field separator overridden by 'delimiter'.
csvDecodeOptions :: DecodeOptions
csvDecodeOptions =
  defaultDecodeOptions
    { decDelimiter = delimiter
    }
-- | Encoding options used by every writer in this module:
-- 'defaultEncodeOptions' with the field separator overridden by 'delimiter'.
csvEncodeOptions :: EncodeOptions
csvEncodeOptions =
  defaultEncodeOptions
    { encDelimiter = delimiter
    }
-- Field separator shared by 'csvDecodeOptions' and 'csvEncodeOptions':
-- the code point of the TAB character.
delimiter = fromIntegral (ord '\t')
192 ------------------------------------------------------------------------
193 ------------------------------------------------------------------------
194 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO [Text]
195 readCsvOn' fields fp = V.toList
196 <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
200 ------------------------------------------------------------------------
-- | Read a file with lazy 'BL.readFile' and decode it with
-- 'readByteStringLazy'. The @proxy a@ argument is only forwarded to select
-- the record type. Decoding failures surface through 'readByteStringLazy',
-- which calls 'panic'.
readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
readFileLazy recordProxy path = readByteStringLazy recordProxy <$> BL.readFile path
-- | Read a file with strict 'BS.readFile' and decode it with
-- 'readByteStringStrict'. The @proxy a@ argument is only forwarded to
-- select the record type.
readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
readFileStrict recordProxy path = readByteStringStrict recordProxy <$> BS.readFile path
-- | Decode a lazy ByteString by header name using 'csvDecodeOptions'.
--
-- The proxy argument is never inspected; it only fixes the record type @a@.
-- A decoding error is not returned to the caller: the error message is
-- passed to 'panic'.
readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a)
readByteStringLazy _recordProxy input = unwrap (decodeByNameWith csvDecodeOptions input)
  where
    unwrap (Left err)     = panic (pack err)
    unwrap (Right parsed) = parsed
-- | Strict-ByteString variant of 'readByteStringLazy': the input is
-- converted with 'BL.fromStrict' and then decoded by the lazy reader.
readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a)
readByteStringStrict recordProxy strictInput =
  readByteStringLazy recordProxy (BL.fromStrict strictInput)
216 ------------------------------------------------------------------------
-- | Read a CSV file of 'CsvDoc' records (header plus rows).
--
-- Implemented on top of the generic 'readFileLazy' so that file reading and
-- decoding live in one place. Behaviour is unchanged: the file is read with
-- lazy 'BL.readFile', decoded with 'csvDecodeOptions', and a decoding error
-- ends in 'panic'.
readFile :: FilePath -> IO (Header, Vector CsvDoc)
readFile = readFileLazy docProxy
  where
    -- 'readFileLazy' accepts any @proxy a@ and never inspects it, so a
    -- 'Nothing' annotated with the record type serves as the type witness.
    docProxy = Nothing :: Maybe CsvDoc
-- | Decode a lazy ByteString into 'CsvDoc' records (header plus rows).
--
-- Delegates to the generic 'readByteStringLazy' instead of duplicating its
-- decode-and-panic logic. Behaviour is unchanged: decoding uses
-- 'csvDecodeOptions' and a decoding error ends in 'panic'.
readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
readCsvLazyBS = readByteStringLazy docProxy
  where
    -- 'readByteStringLazy' accepts any @proxy a@ and ignores its value, so a
    -- 'Nothing' annotated with the record type serves as the type witness.
    docProxy = Nothing :: Maybe CsvDoc
228 ------------------------------------------------------------------------
-- | Read a CSV file of 'CsvHal' records (header plus rows).
--
-- Implemented on top of the generic 'readFileLazy' so that file reading and
-- decoding live in one place. Behaviour is unchanged: the file is read with
-- lazy 'BL.readFile', decoded with 'csvDecodeOptions', and a decoding error
-- ends in 'panic'.
readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
readCsvHal = readFileLazy halProxy
  where
    -- 'readFileLazy' accepts any @proxy a@ and never inspects it, so a
    -- 'Nothing' annotated with the record type serves as the type witness.
    halProxy = Nothing :: Maybe CsvHal
-- | Decode a lazy ByteString into 'CsvHal' records (header plus rows).
--
-- Delegates to the generic 'readByteStringLazy' instead of duplicating its
-- decode-and-panic logic. Behaviour is unchanged: decoding uses
-- 'csvDecodeOptions' and a decoding error ends in 'panic'.
readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
readCsvHalLazyBS = readByteStringLazy halProxy
  where
    -- 'readByteStringLazy' accepts any @proxy a@ and ignores its value, so a
    -- 'Nothing' annotated with the record type serves as the type witness.
    halProxy = Nothing :: Maybe CsvHal
-- | Strict-ByteString variant of 'readCsvHalLazyBS': the input is converted
-- with 'BL.fromStrict' and then decoded by the lazy reader.
readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
readCsvHalBSStrict strictInput = readCsvHalLazyBS (BL.fromStrict strictInput)
243 ------------------------------------------------------------------------
-- | Encode the given records under the given header with 'csvEncodeOptions'
-- and write the resulting lazy ByteString to @fp@.
writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeFile fp (hdr, docs) = BL.writeFile fp encoded
  where
    -- The encoder is fed a list, so the vector is converted first.
    encoded = encodeByNameWith csvEncodeOptions hdr (V.toList docs)
-- | Serialise the documents with 'hyperdataDocument2csv' and write the
-- result to @fp@.
writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Csv fp = BL.writeFile fp . hyperdataDocument2csv
-- | Render documents as CSV bytes: each document is converted with
-- 'hyperdataDocument2csvDoc', then the rows are encoded under
-- 'headerCsvGargV3' using 'csvEncodeOptions'.
hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
hyperdataDocument2csv =
  encodeByNameWith csvEncodeOptions headerCsvGargV3
    . map hyperdataDocument2csvDoc
254 ------------------------------------------------------------------------
257 { csvHal_title :: !Text
258 , csvHal_source :: !Text
259 , csvHal_publication_year :: !Integer
260 , csvHal_publication_month :: !Int
261 , csvHal_publication_day :: !Int
262 , csvHal_abstract :: !Text
263 , csvHal_authors :: !Text
265 , csvHal_url :: !Text
266 , csvHal_isbn_s :: !Text
267 , csvHal_issue_s :: !Text
268 , csvHal_journalPublisher_s:: !Text
269 , csvHal_language_s :: !Text
271 , csvHal_doiId_s :: !Text
272 , csvHal_authId_i :: !Text
273 , csvHal_instStructId_i :: !Text
274 , csvHal_deptStructId_i :: !Text
275 , csvHal_labStructId_i :: !Text
277 , csvHal_rteamStructId_i :: !Text
278 , csvHal_docType_s :: !Text
282 instance FromNamedRecord CsvHal where
283 parseNamedRecord r = CsvHal <$> r .: "title"
285 <*> r .: "publication_year"
286 <*> r .: "publication_month"
287 <*> r .: "publication_day"
294 <*> r .: "journalPublisher_s"
295 <*> r .: "language_s"
299 <*> r .: "instStructId_i"
300 <*> r .: "deptStructId_i"
301 <*> r .: "labStructId_i"
303 <*> r .: "rteamStructId_i"
306 instance ToNamedRecord CsvHal where
307 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
308 namedRecord [ "title" .= t
311 , "publication_year" .= py
312 , "publication_month" .= pm
313 , "publication_day" .= pd
321 , "journalPublisher_s" .= j
322 , "language_s" .= lang
326 , "instStructId_i" .= inst
327 , "deptStructId_i" .= dept
328 , "labStructId_i" .= lab
330 , "rteamStructId_i" .= team
331 , "docType_s" .= doct
334 csvHal2doc :: CsvHal -> HyperdataDocument
335 csvHal2doc (CsvHal title source
336 pub_year pub_month pub_day
340 _ _ ) = HyperdataDocument (Just "CsvHal")
351 (Just $ pack . show $ jour pub_year pub_month pub_day)
352 (Just $ fromIntegral pub_year)
361 csv2doc :: CsvDoc -> HyperdataDocument
362 csv2doc (CsvDoc title source
363 pub_year pub_month pub_day
364 abstract authors ) = HyperdataDocument (Just "CsvHal")
375 (Just $ pack . show $ jour (fromIntegral pub_year) pub_month pub_day)
376 (Just $ fromIntegral pub_year)
384 ------------------------------------------------------------------------
-- | Read a HAL CSV file with 'readCsvHal', drop the header, and convert each
-- row to a 'HyperdataDocument' with 'csvHal2doc'.
parseHal :: FilePath -> IO [HyperdataDocument]
parseHal fp = fmap (V.toList . V.map csvHal2doc . snd) (readCsvHal fp)
-- | Pure counterpart of 'parseHal': decode the bytes with
-- 'readCsvHalLazyBS', drop the header, and convert each row with
-- 'csvHal2doc'.
parseHal' :: BL.ByteString -> [HyperdataDocument]
parseHal' bytes = V.toList (V.map csvHal2doc rows)
  where
    -- Only the record vector is needed; the header is discarded.
    (_, rows) = readCsvHalLazyBS bytes
391 ------------------------------------------------------------------------
-- | Read a CSV file with this module's 'readFile', drop the header, and
-- convert each row to a 'HyperdataDocument' with 'csv2doc'.
parseCsv :: FilePath -> IO [HyperdataDocument]
parseCsv fp = fmap (V.toList . V.map csv2doc . snd) (readFile fp)
-- | Pure counterpart of 'parseCsv': decode the bytes with 'readCsvLazyBS',
-- drop the header, and convert each row with 'csv2doc'.
parseCsv' :: BL.ByteString -> [HyperdataDocument]
parseCsv' = V.toList . V.map csv2doc . snd . readCsvLazyBS