]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Corpus/Parsers/CSV.hs
[hyperdata] refactor code to add hyperdata graph metrics
[gargantext.git] / src / Gargantext / Text / Corpus / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Text.Corpus.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
17
18 module Gargantext.Text.Corpus.Parsers.CSV where
19
20 import Control.Applicative
21 import qualified Data.ByteString as BS
22 import qualified Data.ByteString.Lazy as BL
23 import Data.Char (ord)
24 import Data.Csv
25 import Data.Either (Either(Left, Right))
26 import Data.Text (Text, pack, length, intercalate)
27 import Data.Time.Segment (jour)
28 import qualified Data.Vector as V
29 import Data.Vector (Vector)
30 import GHC.IO (FilePath)
31 import GHC.Real (round)
32 import GHC.Word (Word8)
33
34 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
35 import Gargantext.Prelude hiding (length)
36 import Gargantext.Text
37 import Gargantext.Text.Context
38
39 ---------------------------------------------------------------
40 headerCsvGargV3 :: Header
41 headerCsvGargV3 = header [ "title"
42 , "source"
43 , "publication_year"
44 , "publication_month"
45 , "publication_day"
46 , "abstract"
47 , "authors"
48 ]
49 ---------------------------------------------------------------
50 data CsvGargV3 = CsvGargV3
51 { d_docId :: !Int
52 , d_title :: !Text
53 , d_source :: !Text
54 , d_publication_year :: !Int
55 , d_publication_month :: !Int
56 , d_publication_day :: !Int
57 , d_abstract :: !Text
58 , d_authors :: !Text
59 }
60 deriving (Show)
61 ---------------------------------------------------------------
62 -- | Doc 2 HyperdataDocument
63 toDoc :: CsvGargV3 -> HyperdataDocument
64 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
65 HyperdataDocument (Just "CSV")
66 (Just . pack . show $ did)
67 Nothing
68 Nothing
69 Nothing
70 Nothing
71 (Just dt)
72 Nothing
73 (Just dau)
74 (Just dab)
75 (Nothing)
76 Nothing
77 (Just dpy)
78 (Just dpm)
79 (Just dpd)
80 Nothing
81 Nothing
82 Nothing
83 Nothing
84
85 ---------------------------------------------------------------
86 -- | Types Conversions
87 toDocs :: Vector CsvDoc -> [CsvGargV3]
88 toDocs v = V.toList
89 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
90 -> CsvGargV3 nId t s py pm pd abst auth )
91 (V.enumFromN 1 (V.length v'')) v''
92 where
93 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
94 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
95
96 ---------------------------------------------------------------
97 fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
98 fromDocs docs = V.map fromDocs' docs
99 where
100 fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
101
102 ---------------------------------------------------------------
103 -- | Split a document in its context
104 -- TODO adapt the size of the paragraph according to the corpus average
105 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
106 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
107 if docSize > 1000
108 then
109 if (mod (round m) docSize) >= 10
110 then
111 splitDoc' splt doc
112 else
113 V.fromList [doc]
114 else
115 V.fromList [doc]
116 where
117 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
118 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
119 where
120 firstDoc = CsvDoc t s py pm pd firstAbstract auth
121 firstAbstract = head' "splitDoc'1" abstracts
122
123 nextDocs = map (\txt -> CsvDoc
124 (head' "splitDoc'2" $ sentences txt)
125 s py pm pd
126 (unsentences $ tail' "splitDoc'1" $ sentences txt)
127 auth
128 ) (tail' "splitDoc'2" abstracts)
129
130 abstracts = (splitBy $ contextSize) abst
131
132 ---------------------------------------------------------------
133 ---------------------------------------------------------------
134 type Mean = Double
135
136 docsSize :: Vector CsvDoc -> Mean
137 docsSize csvDoc = mean ls
138 where
139 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
140
141
142 ---------------------------------------------------------------
143 data CsvDoc = CsvDoc
144 { csv_title :: !Text
145 , csv_source :: !Text
146 , csv_publication_year :: !Int
147 , csv_publication_month :: !Int
148 , csv_publication_day :: !Int
149 , csv_abstract :: !Text
150 , csv_authors :: !Text
151 }
152 deriving (Show)
153
154 instance FromNamedRecord CsvDoc where
155 parseNamedRecord r = CsvDoc <$> r .: "title"
156 <*> r .: "source"
157 <*> r .: "publication_year"
158 <*> r .: "publication_month"
159 <*> r .: "publication_day"
160 <*> r .: "abstract"
161 <*> r .: "authors"
162
163 instance ToNamedRecord CsvDoc where
164 toNamedRecord (CsvDoc t s py pm pd abst aut) =
165 namedRecord [ "title" .= t
166 , "source" .= s
167 , "publication_year" .= py
168 , "publication_month" .= pm
169 , "publication_day" .= pd
170 , "abstract" .= abst
171 , "authors" .= aut
172 ]
173
174 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
175 hyperdataDocument2csvDoc h = CsvDoc (m $ _hyperdataDocument_title h)
176 (m $ _hyperdataDocument_source h)
177 (mI $ _hyperdataDocument_publication_year h)
178 (mI $ _hyperdataDocument_publication_month h)
179 (mI $ _hyperdataDocument_publication_day h)
180 (m $ _hyperdataDocument_abstract h)
181 (m $ _hyperdataDocument_authors h)
182
183 where
184 m = maybe "" identity
185 mI = maybe 0 identity
186
187
188 csvDecodeOptions :: DecodeOptions
189 csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
190
191 csvEncodeOptions :: EncodeOptions
192 csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
193
194 delimiter :: Word8
195 delimiter = fromIntegral $ ord '\t'
196 ------------------------------------------------------------------------
197 ------------------------------------------------------------------------
198 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO [Text]
199 readCsvOn' fields fp = V.toList
200 <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
201 <$> snd
202 <$> readFile fp
203
204 ------------------------------------------------------------------------
205
206 readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
207 readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
208
209 readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
210 readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
211
212 readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a)
213 readByteStringLazy _f bs = case decodeByNameWith csvDecodeOptions bs of
214 Left e -> panic (pack e)
215 Right csvDocs -> csvDocs
216
217 readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a)
218 readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
219
220 ------------------------------------------------------------------------
221 -- | TODO use readFileLazy
222 readFile :: FilePath -> IO (Header, Vector CsvDoc)
223 readFile = fmap readCsvLazyBS . BL.readFile
224
225
226 -- | TODO use readByteStringLazy
227 readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
228 readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
229 Left e -> panic (pack e)
230 Right csvDocs -> csvDocs
231
232 ------------------------------------------------------------------------
233
234 -- | TODO use readFileLazy
235 readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
236 readCsvHal = fmap readCsvHalLazyBS . BL.readFile
237
238 -- | TODO use readByteStringLazy
239 readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
240 readCsvHalLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
241 Left e -> panic (pack e)
242 Right csvDocs -> csvDocs
243
244 readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
245 readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
246
247 ------------------------------------------------------------------------
248 writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
249 writeFile fp (h, vs) = BL.writeFile fp $
250 encodeByNameWith csvEncodeOptions h (V.toList vs)
251
252 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
253 writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
254
255 hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
256 hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
257
258 ------------------------------------------------------------------------
259 -- Hal Format
260 data CsvHal = CsvHal
261 { csvHal_title :: !Text
262 , csvHal_source :: !Text
263 , csvHal_publication_year :: !Integer
264 , csvHal_publication_month :: !Int
265 , csvHal_publication_day :: !Int
266 , csvHal_abstract :: !Text
267 , csvHal_authors :: !Text
268
269 , csvHal_url :: !Text
270 , csvHal_isbn_s :: !Text
271 , csvHal_issue_s :: !Text
272 , csvHal_journalPublisher_s:: !Text
273 , csvHal_language_s :: !Text
274
275 , csvHal_doiId_s :: !Text
276 , csvHal_authId_i :: !Text
277 , csvHal_instStructId_i :: !Text
278 , csvHal_deptStructId_i :: !Text
279 , csvHal_labStructId_i :: !Text
280
281 , csvHal_rteamStructId_i :: !Text
282 , csvHal_docType_s :: !Text
283 }
284 deriving (Show)
285
286 instance FromNamedRecord CsvHal where
287 parseNamedRecord r = CsvHal <$> r .: "title"
288 <*> r .: "source"
289 <*> r .: "publication_year"
290 <*> r .: "publication_month"
291 <*> r .: "publication_day"
292 <*> r .: "abstract"
293 <*> r .: "authors"
294
295 <*> r .: "url"
296 <*> r .: "isbn_s"
297 <*> r .: "issue_s"
298 <*> r .: "journalPublisher_s"
299 <*> r .: "language_s"
300
301 <*> r .: "doiId_s"
302 <*> r .: "authId_i"
303 <*> r .: "instStructId_i"
304 <*> r .: "deptStructId_i"
305 <*> r .: "labStructId_i"
306
307 <*> r .: "rteamStructId_i"
308 <*> r .: "docType_s"
309
310 instance ToNamedRecord CsvHal where
311 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
312 namedRecord [ "title" .= t
313 , "source" .= s
314
315 , "publication_year" .= py
316 , "publication_month" .= pm
317 , "publication_day" .= pd
318
319 , "abstract" .= abst
320 , "authors" .= aut
321
322 , "url" .= url
323 , "isbn_s" .= isbn
324 , "issue_s" .= iss
325 , "journalPublisher_s" .= j
326 , "language_s" .= lang
327
328 , "doiId_s" .= doi
329 , "authId_i" .= auth
330 , "instStructId_i" .= inst
331 , "deptStructId_i" .= dept
332 , "labStructId_i" .= lab
333
334 , "rteamStructId_i" .= team
335 , "docType_s" .= doct
336 ]
337
338 csvHal2doc :: CsvHal -> HyperdataDocument
339 csvHal2doc (CsvHal title source
340 pub_year pub_month pub_day
341 abstract authors
342 url _ _ _ _
343 doi _ inst _ _
344 _ _ ) = HyperdataDocument (Just "CsvHal")
345 (Just doi)
346 (Just url)
347 Nothing
348 Nothing
349 Nothing
350 (Just title)
351 (Just authors)
352 (Just inst)
353 (Just source)
354 (Just abstract)
355 (Just $ pack . show $ jour pub_year pub_month pub_day)
356 (Just $ fromIntegral pub_year)
357 (Just pub_month)
358 (Just pub_day)
359 Nothing
360 Nothing
361 Nothing
362 Nothing
363
364
365 csv2doc :: CsvDoc -> HyperdataDocument
366 csv2doc (CsvDoc title source
367 pub_year pub_month pub_day
368 abstract authors ) = HyperdataDocument (Just "CsvHal")
369 Nothing
370 Nothing
371 Nothing
372 Nothing
373 Nothing
374 (Just title)
375 (Just authors)
376 Nothing
377 (Just source)
378 (Just abstract)
379 (Just $ pack . show $ jour (fromIntegral pub_year) pub_month pub_day)
380 (Just $ fromIntegral pub_year)
381 (Just pub_month)
382 (Just pub_day)
383 Nothing
384 Nothing
385 Nothing
386 Nothing
387
388 ------------------------------------------------------------------------
389 parseHal :: FilePath -> IO [HyperdataDocument]
390 parseHal fp = V.toList <$> V.map csvHal2doc <$> snd <$> readCsvHal fp
391
392 parseHal' :: BL.ByteString -> [HyperdataDocument]
393 parseHal' = V.toList . V.map csvHal2doc . snd . readCsvHalLazyBS
394
395 ------------------------------------------------------------------------
396
397 parseCsv :: FilePath -> IO [HyperdataDocument]
398 parseCsv fp = V.toList <$> V.map csv2doc <$> snd <$> readFile fp
399
400 parseCsv' :: BL.ByteString -> [HyperdataDocument]
401 parseCsv' bs = V.toList $ V.map csv2doc $ snd $ readCsvLazyBS bs
402