]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
[searx] first draft of searx parsing, updated stack to lts 18.4
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14
15 module Gargantext.Core.Text.Corpus.Parsers.CSV where
16
17 import Control.Applicative
18 import qualified Data.ByteString as BS
19 import qualified Data.ByteString.Lazy as BL
20 import Data.Char (ord)
21 import Data.Csv
22 import Data.Either (Either(Left, Right))
23 import Data.Text (Text, pack, length, intercalate)
24 import Data.Time.Segment (jour)
25 import qualified Data.Vector as V
26 import Data.Vector (Vector)
27 import GHC.IO (FilePath)
28 import GHC.Word (Word8)
29
30 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
31 import Gargantext.Prelude hiding (length)
32 import Gargantext.Core.Text
33 import Gargantext.Core.Text.Context
34
35 ---------------------------------------------------------------
36 headerCsvGargV3 :: Header
37 headerCsvGargV3 =
38 header [ "title"
39 , "source"
40 , "publication_year"
41 , "publication_month"
42 , "publication_day"
43 , "abstract"
44 , "authors"
45 ]
46 ---------------------------------------------------------------
47 data CsvGargV3 = CsvGargV3
48 { d_docId :: !Int
49 , d_title :: !Text
50 , d_source :: !Text
51 , d_publication_year :: !Int
52 , d_publication_month :: !Int
53 , d_publication_day :: !Int
54 , d_abstract :: !Text
55 , d_authors :: !Text
56 }
57 deriving (Show)
58 ---------------------------------------------------------------
59 -- | Doc 2 HyperdataDocument
60 toDoc :: CsvGargV3 -> HyperdataDocument
61 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
62 HyperdataDocument (Just "CSV")
63 (Just . pack . show $ did)
64 Nothing
65 Nothing
66 Nothing
67 Nothing
68 (Just dt)
69 Nothing
70 (Just dau)
71 (Just dab)
72 (Nothing)
73 Nothing
74 (Just dpy)
75 (Just dpm)
76 (Just dpd)
77 Nothing
78 Nothing
79 Nothing
80 Nothing
81
82 ---------------------------------------------------------------
83 -- | Types Conversions
84 toDocs :: Vector CsvDoc -> [CsvGargV3]
85 toDocs v = V.toList
86 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
87 -> CsvGargV3 nId t s py pm pd abst auth )
88 (V.enumFromN 1 (V.length v'')) v''
89 where
90 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
91 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
92
93 ---------------------------------------------------------------
94 fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
95 fromDocs docs = V.map fromDocs' docs
96 where
97 fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
98
99 ---------------------------------------------------------------
100 -- | Split a document in its context
101 -- TODO adapt the size of the paragraph according to the corpus average
102 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
103 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
104 if docSize > 1000
105 then
106 if (mod (round m) docSize) >= 10
107 then
108 splitDoc' splt doc
109 else
110 V.fromList [doc]
111 else
112 V.fromList [doc]
113 where
114 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
115 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
116 where
117 firstDoc = CsvDoc t s py pm pd firstAbstract auth
118 firstAbstract = head' "splitDoc'1" abstracts
119
120 nextDocs = map (\txt -> CsvDoc
121 (head' "splitDoc'2" $ sentences txt)
122 s py pm pd
123 (unsentences $ tail' "splitDoc'1" $ sentences txt)
124 auth
125 ) (tail' "splitDoc'2" abstracts)
126
127 abstracts = (splitBy $ contextSize) abst
128
129 ---------------------------------------------------------------
130 ---------------------------------------------------------------
131 type Mean = Double
132
133 docsSize :: Vector CsvDoc -> Mean
134 docsSize csvDoc = mean ls
135 where
136 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
137
138
139 ---------------------------------------------------------------
140 data CsvDoc = CsvDoc
141 { csv_title :: !Text
142 , csv_source :: !Text
143 , csv_publication_year :: !Int
144 , csv_publication_month :: !Int
145 , csv_publication_day :: !Int
146 , csv_abstract :: !Text
147 , csv_authors :: !Text
148 }
149 deriving (Show)
150
151 instance FromNamedRecord CsvDoc where
152 parseNamedRecord r = CsvDoc <$> r .: "title"
153 <*> r .: "source"
154 <*> r .: "publication_year"
155 <*> r .: "publication_month"
156 <*> r .: "publication_day"
157 <*> r .: "abstract"
158 <*> r .: "authors"
159
160 instance ToNamedRecord CsvDoc where
161 toNamedRecord (CsvDoc t s py pm pd abst aut) =
162 namedRecord [ "title" .= t
163 , "source" .= s
164 , "publication_year" .= py
165 , "publication_month" .= pm
166 , "publication_day" .= pd
167 , "abstract" .= abst
168 , "authors" .= aut
169 ]
170
171 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
172 hyperdataDocument2csvDoc h = CsvDoc (m $ _hd_title h)
173 (m $ _hd_source h)
174 (mI $ _hd_publication_year h)
175 (mI $ _hd_publication_month h)
176 (mI $ _hd_publication_day h)
177 (m $ _hd_abstract h)
178 (m $ _hd_authors h)
179
180 where
181 m = maybe "" identity
182 mI = maybe 0 identity
183
184
185 csvDecodeOptions :: DecodeOptions
186 csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
187
188 csvEncodeOptions :: EncodeOptions
189 csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
190
191 delimiter :: Word8
192 delimiter = fromIntegral $ ord '\t'
193 ------------------------------------------------------------------------
194 ------------------------------------------------------------------------
195 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO [Text]
196 readCsvOn' fields fp = V.toList
197 <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
198 <$> snd
199 <$> readFile fp
200
201 ------------------------------------------------------------------------
202
203 readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
204 readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
205
206 readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
207 readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
208
209 readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a)
210 readByteStringLazy _f bs = case decodeByNameWith csvDecodeOptions bs of
211 Left e -> panic (pack e)
212 Right csvDocs -> csvDocs
213
214 readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a)
215 readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
216
217 ------------------------------------------------------------------------
218 -- | TODO use readFileLazy
219 readFile :: FilePath -> IO (Header, Vector CsvDoc)
220 readFile = fmap readCsvLazyBS . BL.readFile
221
222
223 -- | TODO use readByteStringLazy
224 readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
225 readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
226 Left e -> panic (pack e)
227 Right csvDocs -> csvDocs
228
229 ------------------------------------------------------------------------
230 -- | TODO use readFileLazy
231 readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
232 readCsvHal = fmap readCsvHalLazyBS . BL.readFile
233
234 -- | TODO use readByteStringLazy
235 readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
236 readCsvHalLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
237 Left e -> panic (pack e)
238 Right csvDocs -> csvDocs
239
240 readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
241 readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
242
243 ------------------------------------------------------------------------
244 writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
245 writeFile fp (h, vs) = BL.writeFile fp $
246 encodeByNameWith csvEncodeOptions h (V.toList vs)
247
248 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
249 writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
250
251 hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
252 hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
253
254 ------------------------------------------------------------------------
255 -- Hal Format
256 data CsvHal = CsvHal
257 { csvHal_title :: !Text
258 , csvHal_source :: !Text
259 , csvHal_publication_year :: !Integer
260 , csvHal_publication_month :: !Int
261 , csvHal_publication_day :: !Int
262 , csvHal_abstract :: !Text
263 , csvHal_authors :: !Text
264
265 , csvHal_url :: !Text
266 , csvHal_isbn_s :: !Text
267 , csvHal_issue_s :: !Text
268 , csvHal_journalPublisher_s:: !Text
269 , csvHal_language_s :: !Text
270
271 , csvHal_doiId_s :: !Text
272 , csvHal_authId_i :: !Text
273 , csvHal_instStructId_i :: !Text
274 , csvHal_deptStructId_i :: !Text
275 , csvHal_labStructId_i :: !Text
276
277 , csvHal_rteamStructId_i :: !Text
278 , csvHal_docType_s :: !Text
279 }
280 deriving (Show)
281
282 instance FromNamedRecord CsvHal where
283 parseNamedRecord r = CsvHal <$> r .: "title"
284 <*> r .: "source"
285 <*> r .: "publication_year"
286 <*> r .: "publication_month"
287 <*> r .: "publication_day"
288 <*> r .: "abstract"
289 <*> r .: "authors"
290
291 <*> r .: "url"
292 <*> r .: "isbn_s"
293 <*> r .: "issue_s"
294 <*> r .: "journalPublisher_s"
295 <*> r .: "language_s"
296
297 <*> r .: "doiId_s"
298 <*> r .: "authId_i"
299 <*> r .: "instStructId_i"
300 <*> r .: "deptStructId_i"
301 <*> r .: "labStructId_i"
302
303 <*> r .: "rteamStructId_i"
304 <*> r .: "docType_s"
305
306 instance ToNamedRecord CsvHal where
307 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
308 namedRecord [ "title" .= t
309 , "source" .= s
310
311 , "publication_year" .= py
312 , "publication_month" .= pm
313 , "publication_day" .= pd
314
315 , "abstract" .= abst
316 , "authors" .= aut
317
318 , "url" .= url
319 , "isbn_s" .= isbn
320 , "issue_s" .= iss
321 , "journalPublisher_s" .= j
322 , "language_s" .= lang
323
324 , "doiId_s" .= doi
325 , "authId_i" .= auth
326 , "instStructId_i" .= inst
327 , "deptStructId_i" .= dept
328 , "labStructId_i" .= lab
329
330 , "rteamStructId_i" .= team
331 , "docType_s" .= doct
332 ]
333
334 csvHal2doc :: CsvHal -> HyperdataDocument
335 csvHal2doc (CsvHal title source
336 pub_year pub_month pub_day
337 abstract authors
338 url _ _ _ _
339 doi _ inst _ _
340 _ _ ) = HyperdataDocument (Just "CsvHal")
341 (Just doi)
342 (Just url)
343 Nothing
344 Nothing
345 Nothing
346 (Just title)
347 (Just authors)
348 (Just inst)
349 (Just source)
350 (Just abstract)
351 (Just $ pack . show $ jour pub_year pub_month pub_day)
352 (Just $ fromIntegral pub_year)
353 (Just pub_month)
354 (Just pub_day)
355 Nothing
356 Nothing
357 Nothing
358 Nothing
359
360
361 csv2doc :: CsvDoc -> HyperdataDocument
362 csv2doc (CsvDoc title source
363 pub_year pub_month pub_day
364 abstract authors ) = HyperdataDocument (Just "CsvHal")
365 Nothing
366 Nothing
367 Nothing
368 Nothing
369 Nothing
370 (Just title)
371 (Just authors)
372 Nothing
373 (Just source)
374 (Just abstract)
375 (Just $ pack . show $ jour (fromIntegral pub_year) pub_month pub_day)
376 (Just $ fromIntegral pub_year)
377 (Just pub_month)
378 (Just pub_day)
379 Nothing
380 Nothing
381 Nothing
382 Nothing
383
384 ------------------------------------------------------------------------
385 parseHal :: FilePath -> IO [HyperdataDocument]
386 parseHal fp = V.toList <$> V.map csvHal2doc <$> snd <$> readCsvHal fp
387
388 parseHal' :: BL.ByteString -> [HyperdataDocument]
389 parseHal' = V.toList . V.map csvHal2doc . snd . readCsvHalLazyBS
390
391 ------------------------------------------------------------------------
392 parseCsv :: FilePath -> IO [HyperdataDocument]
393 parseCsv fp = V.toList <$> V.map csv2doc <$> snd <$> readFile fp
394
395 parseCsv' :: BL.ByteString -> [HyperdataDocument]
396 parseCsv' bs = V.toList $ V.map csv2doc $ snd $ readCsvLazyBS bs
397
398 ------------------------------------------------------------------------
399 -- Csv v3 weighted for phylo
400
401 data Csv' = Csv'
402 { csv'_title :: !Text
403 , csv'_source :: !Text
404 , csv'_publication_year :: !Int
405 , csv'_publication_month :: !Int
406 , csv'_publication_day :: !Int
407 , csv'_abstract :: !Text
408 , csv'_authors :: !Text
409 , csv'_weight :: !Double } deriving (Show)
410
411
412 instance FromNamedRecord Csv' where
413 parseNamedRecord r = Csv' <$> r .: "title"
414 <*> r .: "source"
415 <*> r .: "publication_year"
416 <*> r .: "publication_month"
417 <*> r .: "publication_day"
418 <*> r .: "abstract"
419 <*> r .: "authors"
420 <*> r .: "weight"
421
422 readWeightedCsv :: FilePath -> IO (Header, Vector Csv')
423 readWeightedCsv fp =
424 fmap (\bs ->
425 case decodeByNameWith csvDecodeOptions bs of
426 Left e -> panic (pack e)
427 Right corpus -> corpus
428 ) $ BL.readFile fp