]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
Merge branch 'dev' into dev-ngrams-groups
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14
15 module Gargantext.Core.Text.Corpus.Parsers.CSV where
16
17 import Control.Applicative
18 import qualified Data.ByteString as BS
19 import qualified Data.ByteString.Lazy as BL
20 import Data.Char (ord)
21 import Data.Csv
22 import Data.Either (Either(Left, Right))
23 import Data.Text (Text, pack, length, intercalate)
24 import Data.Time.Segment (jour)
25 import qualified Data.Vector as V
26 import Data.Vector (Vector)
27 import GHC.IO (FilePath)
28 import GHC.Word (Word8)
29
30 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
31 import Gargantext.Prelude hiding (length)
32 import Gargantext.Core.Text
33 import Gargantext.Core.Text.Context
34
35 ---------------------------------------------------------------
36 headerCsvGargV3 :: Header
37 headerCsvGargV3 = header [ "title"
38 , "source"
39 , "publication_year"
40 , "publication_month"
41 , "publication_day"
42 , "abstract"
43 , "authors"
44 ]
45 ---------------------------------------------------------------
46 data CsvGargV3 = CsvGargV3
47 { d_docId :: !Int
48 , d_title :: !Text
49 , d_source :: !Text
50 , d_publication_year :: !Int
51 , d_publication_month :: !Int
52 , d_publication_day :: !Int
53 , d_abstract :: !Text
54 , d_authors :: !Text
55 }
56 deriving (Show)
57 ---------------------------------------------------------------
58 -- | Doc 2 HyperdataDocument
59 toDoc :: CsvGargV3 -> HyperdataDocument
60 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
61 HyperdataDocument (Just "CSV")
62 (Just . pack . show $ did)
63 Nothing
64 Nothing
65 Nothing
66 Nothing
67 (Just dt)
68 Nothing
69 (Just dau)
70 (Just dab)
71 (Nothing)
72 Nothing
73 (Just dpy)
74 (Just dpm)
75 (Just dpd)
76 Nothing
77 Nothing
78 Nothing
79 Nothing
80
81 ---------------------------------------------------------------
82 -- | Types Conversions
83 toDocs :: Vector CsvDoc -> [CsvGargV3]
84 toDocs v = V.toList
85 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
86 -> CsvGargV3 nId t s py pm pd abst auth )
87 (V.enumFromN 1 (V.length v'')) v''
88 where
89 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
90 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
91
92 ---------------------------------------------------------------
93 fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
94 fromDocs docs = V.map fromDocs' docs
95 where
96 fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
97
98 ---------------------------------------------------------------
99 -- | Split a document in its context
100 -- TODO adapt the size of the paragraph according to the corpus average
101 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
102 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
103 if docSize > 1000
104 then
105 if (mod (round m) docSize) >= 10
106 then
107 splitDoc' splt doc
108 else
109 V.fromList [doc]
110 else
111 V.fromList [doc]
112 where
113 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
114 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
115 where
116 firstDoc = CsvDoc t s py pm pd firstAbstract auth
117 firstAbstract = head' "splitDoc'1" abstracts
118
119 nextDocs = map (\txt -> CsvDoc
120 (head' "splitDoc'2" $ sentences txt)
121 s py pm pd
122 (unsentences $ tail' "splitDoc'1" $ sentences txt)
123 auth
124 ) (tail' "splitDoc'2" abstracts)
125
126 abstracts = (splitBy $ contextSize) abst
127
128 ---------------------------------------------------------------
129 ---------------------------------------------------------------
130 type Mean = Double
131
132 docsSize :: Vector CsvDoc -> Mean
133 docsSize csvDoc = mean ls
134 where
135 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
136
137
138 ---------------------------------------------------------------
139 data CsvDoc = CsvDoc
140 { csv_title :: !Text
141 , csv_source :: !Text
142 , csv_publication_year :: !Int
143 , csv_publication_month :: !Int
144 , csv_publication_day :: !Int
145 , csv_abstract :: !Text
146 , csv_authors :: !Text
147 }
148 deriving (Show)
149
150 instance FromNamedRecord CsvDoc where
151 parseNamedRecord r = CsvDoc <$> r .: "title"
152 <*> r .: "source"
153 <*> r .: "publication_year"
154 <*> r .: "publication_month"
155 <*> r .: "publication_day"
156 <*> r .: "abstract"
157 <*> r .: "authors"
158
159 instance ToNamedRecord CsvDoc where
160 toNamedRecord (CsvDoc t s py pm pd abst aut) =
161 namedRecord [ "title" .= t
162 , "source" .= s
163 , "publication_year" .= py
164 , "publication_month" .= pm
165 , "publication_day" .= pd
166 , "abstract" .= abst
167 , "authors" .= aut
168 ]
169
170 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
171 hyperdataDocument2csvDoc h = CsvDoc (m $ _hd_title h)
172 (m $ _hd_source h)
173 (mI $ _hd_publication_year h)
174 (mI $ _hd_publication_month h)
175 (mI $ _hd_publication_day h)
176 (m $ _hd_abstract h)
177 (m $ _hd_authors h)
178
179 where
180 m = maybe "" identity
181 mI = maybe 0 identity
182
183
184 csvDecodeOptions :: DecodeOptions
185 csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
186
187 csvEncodeOptions :: EncodeOptions
188 csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
189
190 delimiter :: Word8
191 delimiter = fromIntegral $ ord '\t'
192 ------------------------------------------------------------------------
193 ------------------------------------------------------------------------
194 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO [Text]
195 readCsvOn' fields fp = V.toList
196 <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
197 <$> snd
198 <$> readFile fp
199
200 ------------------------------------------------------------------------
201
202 readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
203 readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
204
205 readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
206 readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
207
208 readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a)
209 readByteStringLazy _f bs = case decodeByNameWith csvDecodeOptions bs of
210 Left e -> panic (pack e)
211 Right csvDocs -> csvDocs
212
213 readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a)
214 readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
215
216 ------------------------------------------------------------------------
217 -- | TODO use readFileLazy
218 readFile :: FilePath -> IO (Header, Vector CsvDoc)
219 readFile = fmap readCsvLazyBS . BL.readFile
220
221
222 -- | TODO use readByteStringLazy
223 readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
224 readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
225 Left e -> panic (pack e)
226 Right csvDocs -> csvDocs
227
228 ------------------------------------------------------------------------
229
230 -- | TODO use readFileLazy
231 readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
232 readCsvHal = fmap readCsvHalLazyBS . BL.readFile
233
234 -- | TODO use readByteStringLazy
235 readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
236 readCsvHalLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
237 Left e -> panic (pack e)
238 Right csvDocs -> csvDocs
239
240 readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
241 readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
242
243 ------------------------------------------------------------------------
244 writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
245 writeFile fp (h, vs) = BL.writeFile fp $
246 encodeByNameWith csvEncodeOptions h (V.toList vs)
247
248 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
249 writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
250
251 hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
252 hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
253
254 ------------------------------------------------------------------------
255 -- Hal Format
256 data CsvHal = CsvHal
257 { csvHal_title :: !Text
258 , csvHal_source :: !Text
259 , csvHal_publication_year :: !Integer
260 , csvHal_publication_month :: !Int
261 , csvHal_publication_day :: !Int
262 , csvHal_abstract :: !Text
263 , csvHal_authors :: !Text
264
265 , csvHal_url :: !Text
266 , csvHal_isbn_s :: !Text
267 , csvHal_issue_s :: !Text
268 , csvHal_journalPublisher_s:: !Text
269 , csvHal_language_s :: !Text
270
271 , csvHal_doiId_s :: !Text
272 , csvHal_authId_i :: !Text
273 , csvHal_instStructId_i :: !Text
274 , csvHal_deptStructId_i :: !Text
275 , csvHal_labStructId_i :: !Text
276
277 , csvHal_rteamStructId_i :: !Text
278 , csvHal_docType_s :: !Text
279 }
280 deriving (Show)
281
282 instance FromNamedRecord CsvHal where
283 parseNamedRecord r = CsvHal <$> r .: "title"
284 <*> r .: "source"
285 <*> r .: "publication_year"
286 <*> r .: "publication_month"
287 <*> r .: "publication_day"
288 <*> r .: "abstract"
289 <*> r .: "authors"
290
291 <*> r .: "url"
292 <*> r .: "isbn_s"
293 <*> r .: "issue_s"
294 <*> r .: "journalPublisher_s"
295 <*> r .: "language_s"
296
297 <*> r .: "doiId_s"
298 <*> r .: "authId_i"
299 <*> r .: "instStructId_i"
300 <*> r .: "deptStructId_i"
301 <*> r .: "labStructId_i"
302
303 <*> r .: "rteamStructId_i"
304 <*> r .: "docType_s"
305
306 instance ToNamedRecord CsvHal where
307 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
308 namedRecord [ "title" .= t
309 , "source" .= s
310
311 , "publication_year" .= py
312 , "publication_month" .= pm
313 , "publication_day" .= pd
314
315 , "abstract" .= abst
316 , "authors" .= aut
317
318 , "url" .= url
319 , "isbn_s" .= isbn
320 , "issue_s" .= iss
321 , "journalPublisher_s" .= j
322 , "language_s" .= lang
323
324 , "doiId_s" .= doi
325 , "authId_i" .= auth
326 , "instStructId_i" .= inst
327 , "deptStructId_i" .= dept
328 , "labStructId_i" .= lab
329
330 , "rteamStructId_i" .= team
331 , "docType_s" .= doct
332 ]
333
334 csvHal2doc :: CsvHal -> HyperdataDocument
335 csvHal2doc (CsvHal title source
336 pub_year pub_month pub_day
337 abstract authors
338 url _ _ _ _
339 doi _ inst _ _
340 _ _ ) = HyperdataDocument (Just "CsvHal")
341 (Just doi)
342 (Just url)
343 Nothing
344 Nothing
345 Nothing
346 (Just title)
347 (Just authors)
348 (Just inst)
349 (Just source)
350 (Just abstract)
351 (Just $ pack . show $ jour pub_year pub_month pub_day)
352 (Just $ fromIntegral pub_year)
353 (Just pub_month)
354 (Just pub_day)
355 Nothing
356 Nothing
357 Nothing
358 Nothing
359
360
361 csv2doc :: CsvDoc -> HyperdataDocument
362 csv2doc (CsvDoc title source
363 pub_year pub_month pub_day
364 abstract authors ) = HyperdataDocument (Just "CsvHal")
365 Nothing
366 Nothing
367 Nothing
368 Nothing
369 Nothing
370 (Just title)
371 (Just authors)
372 Nothing
373 (Just source)
374 (Just abstract)
375 (Just $ pack . show $ jour (fromIntegral pub_year) pub_month pub_day)
376 (Just $ fromIntegral pub_year)
377 (Just pub_month)
378 (Just pub_day)
379 Nothing
380 Nothing
381 Nothing
382 Nothing
383
384 ------------------------------------------------------------------------
385 parseHal :: FilePath -> IO [HyperdataDocument]
386 parseHal fp = V.toList <$> V.map csvHal2doc <$> snd <$> readCsvHal fp
387
388 parseHal' :: BL.ByteString -> [HyperdataDocument]
389 parseHal' = V.toList . V.map csvHal2doc . snd . readCsvHalLazyBS
390
391 ------------------------------------------------------------------------
392
393 parseCsv :: FilePath -> IO [HyperdataDocument]
394 parseCsv fp = V.toList <$> V.map csv2doc <$> snd <$> readFile fp
395
396 parseCsv' :: BL.ByteString -> [HyperdataDocument]
397 parseCsv' bs = V.toList $ V.map csv2doc $ snd $ readCsvLazyBS bs
398