]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Corpus/Parsers/CSV.hs
[FIX] List with TFICF.
[gargantext.git] / src / Gargantext / Text / Corpus / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Text.Corpus.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
17
18 module Gargantext.Text.Corpus.Parsers.CSV where
19
20 import Control.Applicative
21 import Data.Char (ord)
22 import Data.Csv
23 import Data.Either (Either(Left, Right))
24 import Data.Text (Text, pack, length, intercalate)
25 import Data.Time.Segment (jour)
26 import Data.Vector (Vector)
27 import GHC.IO (FilePath)
28 import GHC.Real (round)
29 import GHC.Word (Word8)
30 import Gargantext.Database.Types.Node -- (HyperdataDocument(..))
31 import Gargantext.Prelude hiding (length)
32 import Gargantext.Text
33 import Gargantext.Text.Context
34 import qualified Data.ByteString.Lazy as BL
35 import qualified Data.ByteString as BS
36 import qualified Data.Vector as V
37
38 ---------------------------------------------------------------
39 headerCsvGargV3 :: Header
40 headerCsvGargV3 = header [ "title"
41 , "source"
42 , "publication_year"
43 , "publication_month"
44 , "publication_day"
45 , "abstract"
46 , "authors"
47 ]
48 ---------------------------------------------------------------
49 data CsvGargV3 = CsvGargV3
50 { d_docId :: !Int
51 , d_title :: !Text
52 , d_source :: !Text
53 , d_publication_year :: !Int
54 , d_publication_month :: !Int
55 , d_publication_day :: !Int
56 , d_abstract :: !Text
57 , d_authors :: !Text
58 }
59 deriving (Show)
60 ---------------------------------------------------------------
61 -- | Doc 2 HyperdataDocument
62 toDoc :: CsvGargV3 -> HyperdataDocument
63 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
64 HyperdataDocument (Just "CSV")
65 (Just . pack . show $ did)
66 Nothing
67 Nothing
68 Nothing
69 Nothing
70 (Just dt)
71 Nothing
72 (Just dau)
73 (Just dab)
74 (Nothing)
75 Nothing
76 (Just dpy)
77 (Just dpm)
78 (Just dpd)
79 Nothing
80 Nothing
81 Nothing
82 Nothing
83
84 ---------------------------------------------------------------
85 -- | Types Conversions
86 toDocs :: Vector CsvDoc -> [CsvGargV3]
87 toDocs v = V.toList
88 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
89 -> CsvGargV3 nId t s py pm pd abst auth )
90 (V.enumFromN 1 (V.length v'')) v''
91 where
92 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
93 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
94
95 ---------------------------------------------------------------
96 fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
97 fromDocs docs = V.map fromDocs' docs
98 where
99 fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
100
101 ---------------------------------------------------------------
102 -- | Split a document in its context
103 -- TODO adapt the size of the paragraph according to the corpus average
104 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
105 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
106 if docSize > 1000
107 then
108 if (mod (round m) docSize) >= 10
109 then
110 splitDoc' splt doc
111 else
112 V.fromList [doc]
113 else
114 V.fromList [doc]
115 where
116 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
117 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
118 where
119 firstDoc = CsvDoc t s py pm pd firstAbstract auth
120 firstAbstract = head' "splitDoc'1" abstracts
121
122 nextDocs = map (\txt -> CsvDoc
123 (head' "splitDoc'2" $ sentences txt)
124 s py pm pd
125 (unsentences $ tail' "splitDoc'1" $ sentences txt)
126 auth
127 ) (tail' "splitDoc'2" abstracts)
128
129 abstracts = (splitBy $ contextSize) abst
130
131 ---------------------------------------------------------------
132 ---------------------------------------------------------------
133 type Mean = Double
134
135 docsSize :: Vector CsvDoc -> Mean
136 docsSize csvDoc = mean ls
137 where
138 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
139
140
141 ---------------------------------------------------------------
142 data CsvDoc = CsvDoc
143 { csv_title :: !Text
144 , csv_source :: !Text
145 , csv_publication_year :: !Int
146 , csv_publication_month :: !Int
147 , csv_publication_day :: !Int
148 , csv_abstract :: !Text
149 , csv_authors :: !Text
150 }
151 deriving (Show)
152
153 instance FromNamedRecord CsvDoc where
154 parseNamedRecord r = CsvDoc <$> r .: "title"
155 <*> r .: "source"
156 <*> r .: "publication_year"
157 <*> r .: "publication_month"
158 <*> r .: "publication_day"
159 <*> r .: "abstract"
160 <*> r .: "authors"
161
162 instance ToNamedRecord CsvDoc where
163 toNamedRecord (CsvDoc t s py pm pd abst aut) =
164 namedRecord [ "title" .= t
165 , "source" .= s
166 , "publication_year" .= py
167 , "publication_month" .= pm
168 , "publication_day" .= pd
169 , "abstract" .= abst
170 , "authors" .= aut
171 ]
172
173 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
174 hyperdataDocument2csvDoc h = CsvDoc (m $ _hyperdataDocument_title h)
175 (m $ _hyperdataDocument_source h)
176 (mI $ _hyperdataDocument_publication_year h)
177 (mI $ _hyperdataDocument_publication_month h)
178 (mI $ _hyperdataDocument_publication_day h)
179 (m $ _hyperdataDocument_abstract h)
180 (m $ _hyperdataDocument_authors h)
181
182 where
183 m = maybe "" identity
184 mI = maybe 0 identity
185
186
187 csvDecodeOptions :: DecodeOptions
188 csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
189
190 csvEncodeOptions :: EncodeOptions
191 csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
192
193 delimiter :: Word8
194 delimiter = fromIntegral $ ord '\t'
195 ------------------------------------------------------------------------
196 ------------------------------------------------------------------------
197 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO [Text]
198 readCsvOn' fields fp = V.toList
199 <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
200 <$> snd
201 <$> readFile fp
202
203 ------------------------------------------------------------------------
204
205 readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
206 readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
207
208 readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
209 readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
210
211 readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a)
212 readByteStringLazy _f bs = case decodeByNameWith csvDecodeOptions bs of
213 Left e -> panic (pack e)
214 Right csvDocs -> csvDocs
215
216 readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a)
217 readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
218
219 ------------------------------------------------------------------------
220 -- | TODO use readFileLazy
221 readFile :: FilePath -> IO (Header, Vector CsvDoc)
222 readFile = fmap readCsvLazyBS . BL.readFile
223
224
225 -- | TODO use readByteStringLazy
226 readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
227 readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
228 Left e -> panic (pack e)
229 Right csvDocs -> csvDocs
230
231 ------------------------------------------------------------------------
232
233 -- | TODO use readFileLazy
234 readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
235 readCsvHal = fmap readCsvHalLazyBS . BL.readFile
236
237 -- | TODO use readByteStringLazy
238 readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
239 readCsvHalLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
240 Left e -> panic (pack e)
241 Right csvDocs -> csvDocs
242
243 readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
244 readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
245
246 ------------------------------------------------------------------------
247 writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
248 writeFile fp (h, vs) = BL.writeFile fp $
249 encodeByNameWith csvEncodeOptions h (V.toList vs)
250
251 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
252 writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
253
254 hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
255 hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
256
257 ------------------------------------------------------------------------
258 -- Hal Format
259 data CsvHal = CsvHal
260 { csvHal_title :: !Text
261 , csvHal_source :: !Text
262 , csvHal_publication_year :: !Integer
263 , csvHal_publication_month :: !Int
264 , csvHal_publication_day :: !Int
265 , csvHal_abstract :: !Text
266 , csvHal_authors :: !Text
267
268 , csvHal_url :: !Text
269 , csvHal_isbn_s :: !Text
270 , csvHal_issue_s :: !Text
271 , csvHal_journalPublisher_s:: !Text
272 , csvHal_language_s :: !Text
273
274 , csvHal_doiId_s :: !Text
275 , csvHal_authId_i :: !Text
276 , csvHal_instStructId_i :: !Text
277 , csvHal_deptStructId_i :: !Text
278 , csvHal_labStructId_i :: !Text
279
280 , csvHal_rteamStructId_i :: !Text
281 , csvHal_docType_s :: !Text
282 }
283 deriving (Show)
284
285 instance FromNamedRecord CsvHal where
286 parseNamedRecord r = CsvHal <$> r .: "title"
287 <*> r .: "source"
288 <*> r .: "publication_year"
289 <*> r .: "publication_month"
290 <*> r .: "publication_day"
291 <*> r .: "abstract"
292 <*> r .: "authors"
293
294 <*> r .: "url"
295 <*> r .: "isbn_s"
296 <*> r .: "issue_s"
297 <*> r .: "journalPublisher_s"
298 <*> r .: "language_s"
299
300 <*> r .: "doiId_s"
301 <*> r .: "authId_i"
302 <*> r .: "instStructId_i"
303 <*> r .: "deptStructId_i"
304 <*> r .: "labStructId_i"
305
306 <*> r .: "rteamStructId_i"
307 <*> r .: "docType_s"
308
309 instance ToNamedRecord CsvHal where
310 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
311 namedRecord [ "title" .= t
312 , "source" .= s
313
314 , "publication_year" .= py
315 , "publication_month" .= pm
316 , "publication_day" .= pd
317
318 , "abstract" .= abst
319 , "authors" .= aut
320
321 , "url" .= url
322 , "isbn_s" .= isbn
323 , "issue_s" .= iss
324 , "journalPublisher_s" .= j
325 , "language_s" .= lang
326
327 , "doiId_s" .= doi
328 , "authId_i" .= auth
329 , "instStructId_i" .= inst
330 , "deptStructId_i" .= dept
331 , "labStructId_i" .= lab
332
333 , "rteamStructId_i" .= team
334 , "docType_s" .= doct
335 ]
336
337 csvHal2doc :: CsvHal -> HyperdataDocument
338 csvHal2doc (CsvHal title source
339 pub_year pub_month pub_day
340 abstract authors
341 url _ _ _ _
342 doi _ inst _ _
343 _ _ ) = HyperdataDocument (Just "CsvHal")
344 (Just doi)
345 (Just url)
346 Nothing
347 Nothing
348 Nothing
349 (Just title)
350 (Just authors)
351 (Just inst)
352 (Just source)
353 (Just abstract)
354 (Just $ pack . show $ jour pub_year pub_month pub_day)
355 (Just $ fromIntegral pub_year)
356 (Just pub_month)
357 (Just pub_day)
358 Nothing
359 Nothing
360 Nothing
361 Nothing
362
363
364 csv2doc :: CsvDoc -> HyperdataDocument
365 csv2doc (CsvDoc title source
366 pub_year pub_month pub_day
367 abstract authors ) = HyperdataDocument (Just "CsvHal")
368 Nothing
369 Nothing
370 Nothing
371 Nothing
372 Nothing
373 (Just title)
374 (Just authors)
375 Nothing
376 (Just source)
377 (Just abstract)
378 (Just $ pack . show $ jour (fromIntegral pub_year) pub_month pub_day)
379 (Just $ fromIntegral pub_year)
380 (Just pub_month)
381 (Just pub_day)
382 Nothing
383 Nothing
384 Nothing
385 Nothing
386
387 ------------------------------------------------------------------------
388 parseHal :: FilePath -> IO [HyperdataDocument]
389 parseHal fp = V.toList <$> V.map csvHal2doc <$> snd <$> readCsvHal fp
390 ------------------------------------------------------------------------
391
392 parseCsv :: FilePath -> IO [HyperdataDocument]
393 parseCsv fp = V.toList <$> V.map csv2doc <$> snd <$> readFile fp
394