]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers/CSV.hs
Merge branch 'dev-corpus-with-filetype' into dev
[gargantext.git] / src / Gargantext / Text / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Text.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
17
18 module Gargantext.Text.Parsers.CSV where
19
20 import Control.Applicative
21 import Data.Char (ord)
22 import Data.Csv
23 import Data.Either (Either(Left, Right))
24 import Data.Text (Text, pack, length, intercalate)
25 import Data.Time.Segment (jour)
26 import Data.Vector (Vector)
27 import GHC.IO (FilePath)
28 import GHC.Real (round)
29 import GHC.Word (Word8)
30 import Gargantext.Database.Types.Node -- (HyperdataDocument(..))
31 import Gargantext.Prelude hiding (length)
32 import Gargantext.Text
33 import Gargantext.Text.Context
34 import qualified Data.ByteString.Lazy as BL
35 import qualified Data.ByteString as BS
36 import qualified Data.Vector as V
37
38 ---------------------------------------------------------------
39 headerCsvGargV3 :: Header
40 headerCsvGargV3 = header [ "title"
41 , "source"
42 , "publication_year"
43 , "publication_month"
44 , "publication_day"
45 , "abstract"
46 , "authors"
47 ]
48 ---------------------------------------------------------------
49 data CsvGargV3 = CsvGargV3
50 { d_docId :: !Int
51 , d_title :: !Text
52 , d_source :: !Text
53 , d_publication_year :: !Int
54 , d_publication_month :: !Int
55 , d_publication_day :: !Int
56 , d_abstract :: !Text
57 , d_authors :: !Text
58 }
59 deriving (Show)
60 ---------------------------------------------------------------
61 -- | Doc 2 HyperdataDocument
62 toDoc :: CsvGargV3 -> HyperdataDocument
63 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
64 HyperdataDocument (Just "CSV")
65 (Just . pack . show $ did)
66 Nothing
67 Nothing
68 Nothing
69 Nothing
70 (Just dt)
71 Nothing
72 (Just dau)
73 (Just dab)
74 (Nothing)
75 Nothing
76 (Just dpy)
77 (Just dpm)
78 (Just dpd)
79 Nothing
80 Nothing
81 Nothing
82 Nothing
83
84 ---------------------------------------------------------------
85 -- | Types Conversions
86 toDocs :: Vector CsvDoc -> [CsvGargV3]
87 toDocs v = V.toList
88 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
89 -> CsvGargV3 nId t s py pm pd abst auth )
90 (V.enumFromN 1 (V.length v'')) v''
91 where
92 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
93 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
94
95 ---------------------------------------------------------------
96 fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
97 fromDocs docs = V.map fromDocs' docs
98 where
99 fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
100
101 ---------------------------------------------------------------
102 -- | Split a document in its context
103 -- TODO adapt the size of the paragraph according to the corpus average
104
105 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
106 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
107 if docSize > 1000
108 then
109 if (mod (round m) docSize) >= 10
110 then
111 splitDoc' splt doc
112 else
113 V.fromList [doc]
114 else
115 V.fromList [doc]
116
117
118 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
119 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
120 where
121 firstDoc = CsvDoc t s py pm pd firstAbstract auth
122 firstAbstract = head' "splitDoc'1" abstracts
123
124 nextDocs = map (\txt -> CsvDoc
125 (head' "splitDoc'2" $ sentences txt)
126 s py pm pd
127 (unsentences $ tail' "splitDoc'1" $ sentences txt)
128 auth
129 ) (tail' "splitDoc'2" abstracts)
130
131 abstracts = (splitBy $ contextSize) abst
132
133 ---------------------------------------------------------------
134 ---------------------------------------------------------------
135 type Mean = Double
136
137 docsSize :: Vector CsvDoc -> Mean
138 docsSize csvDoc = mean ls
139 where
140 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
141
142
143 ---------------------------------------------------------------
144 data CsvDoc = CsvDoc
145 { csv_title :: !Text
146 , csv_source :: !Text
147 , csv_publication_year :: !Int
148 , csv_publication_month :: !Int
149 , csv_publication_day :: !Int
150 , csv_abstract :: !Text
151 , csv_authors :: !Text
152 }
153 deriving (Show)
154
155 instance FromNamedRecord CsvDoc where
156 parseNamedRecord r = CsvDoc <$> r .: "title"
157 <*> r .: "source"
158 <*> r .: "publication_year"
159 <*> r .: "publication_month"
160 <*> r .: "publication_day"
161 <*> r .: "abstract"
162 <*> r .: "authors"
163
164 instance ToNamedRecord CsvDoc where
165 toNamedRecord (CsvDoc t s py pm pd abst aut) =
166 namedRecord [ "title" .= t
167 , "source" .= s
168 , "publication_year" .= py
169 , "publication_month" .= pm
170 , "publication_day" .= pd
171 , "abstract" .= abst
172 , "authors" .= aut
173 ]
174
175 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
176 hyperdataDocument2csvDoc h = CsvDoc (m $ _hyperdataDocument_title h)
177 (m $ _hyperdataDocument_source h)
178 (mI $ _hyperdataDocument_publication_year h)
179 (mI $ _hyperdataDocument_publication_month h)
180 (mI $ _hyperdataDocument_publication_day h)
181 (m $ _hyperdataDocument_abstract h)
182 (m $ _hyperdataDocument_authors h)
183
184 where
185 m = maybe "" identity
186 mI = maybe 0 identity
187
188
189 csvDecodeOptions :: DecodeOptions
190 csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
191
192 csvEncodeOptions :: EncodeOptions
193 csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
194
195 delimiter :: Word8
196 delimiter = fromIntegral $ ord '\t'
197 ------------------------------------------------------------------------
198 ------------------------------------------------------------------------
199 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
200 readCsvOn fields fp = V.toList
201 <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
202 <$> snd
203 <$> readFile fp
204
205 ------------------------------------------------------------------------
206
207 readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
208 readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
209
210 readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
211 readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
212
213 readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a)
214 readByteStringLazy _f bs = case decodeByNameWith csvDecodeOptions bs of
215 Left e -> panic (pack e)
216 Right csvDocs -> csvDocs
217
218 readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a)
219 readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
220
221 ------------------------------------------------------------------------
222 -- | TODO use readFileLazy
223 readFile :: FilePath -> IO (Header, Vector CsvDoc)
224 readFile = fmap readCsvLazyBS . BL.readFile
225
226
227 -- | TODO use readByteStringLazy
228 readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
229 readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
230 Left e -> panic (pack e)
231 Right csvDocs -> csvDocs
232
233 ------------------------------------------------------------------------
234 -- | TODO use readFileLazy
235 readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
236 readCsvHal = fmap readCsvHalLazyBS . BL.readFile
237
238 -- | TODO use readByteStringLazy
239 readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
240 readCsvHalLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
241 Left e -> panic (pack e)
242 Right csvDocs -> csvDocs
243
244 readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
245 readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
246
247 ------------------------------------------------------------------------
248 writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
249 writeFile fp (h, vs) = BL.writeFile fp $
250 encodeByNameWith csvEncodeOptions h (V.toList vs)
251
252 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
253 writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
254
255 hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
256 hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
257
258 ------------------------------------------------------------------------
259 -- Hal Format
260 data CsvHal = CsvHal
261 { csvHal_title :: !Text
262 , csvHal_source :: !Text
263 , csvHal_publication_year :: !Integer
264 , csvHal_publication_month :: !Int
265 , csvHal_publication_day :: !Int
266 , csvHal_abstract :: !Text
267 , csvHal_authors :: !Text
268
269 , csvHal_url :: !Text
270 , csvHal_isbn_s :: !Text
271 , csvHal_issue_s :: !Text
272 , csvHal_journalPublisher_s:: !Text
273 , csvHal_language_s :: !Text
274
275 , csvHal_doiId_s :: !Text
276 , csvHal_authId_i :: !Text
277 , csvHal_instStructId_i :: !Text
278 , csvHal_deptStructId_i :: !Text
279 , csvHal_labStructId_i :: !Text
280
281 , csvHal_rteamStructId_i :: !Text
282 , csvHal_docType_s :: !Text
283 }
284 deriving (Show)
285
286 instance FromNamedRecord CsvHal where
287 parseNamedRecord r = CsvHal <$> r .: "title"
288 <*> r .: "source"
289 <*> r .: "publication_year"
290 <*> r .: "publication_month"
291 <*> r .: "publication_day"
292 <*> r .: "abstract"
293 <*> r .: "authors"
294
295 <*> r .: "url"
296 <*> r .: "isbn_s"
297 <*> r .: "issue_s"
298 <*> r .: "journalPublisher_s"
299 <*> r .: "language_s"
300
301 <*> r .: "doiId_s"
302 <*> r .: "authId_i"
303 <*> r .: "instStructId_i"
304 <*> r .: "deptStructId_i"
305 <*> r .: "labStructId_i"
306
307 <*> r .: "rteamStructId_i"
308 <*> r .: "docType_s"
309
310 instance ToNamedRecord CsvHal where
311 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
312 namedRecord [ "title" .= t
313 , "source" .= s
314
315 , "publication_year" .= py
316 , "publication_month" .= pm
317 , "publication_day" .= pd
318
319 , "abstract" .= abst
320 , "authors" .= aut
321
322 , "url" .= url
323 , "isbn_s" .= isbn
324 , "issue_s" .= iss
325 , "journalPublisher_s" .= j
326 , "language_s" .= lang
327
328 , "doiId_s" .= doi
329 , "authId_i" .= auth
330 , "instStructId_i" .= inst
331 , "deptStructId_i" .= dept
332 , "labStructId_i" .= lab
333
334 , "rteamStructId_i" .= team
335 , "docType_s" .= doct
336 ]
337
338 csvHal2doc :: CsvHal -> HyperdataDocument
339 csvHal2doc (CsvHal title source
340 pub_year pub_month pub_day
341 abstract authors
342 url _ _ _ _
343 doi _ inst _ _
344 _ _ ) = HyperdataDocument (Just "CsvHal")
345 (Just doi)
346 (Just url)
347 Nothing
348 Nothing
349 Nothing
350 (Just title)
351 (Just authors)
352 (Just inst)
353 (Just source)
354 (Just abstract)
355 (Just $ pack . show $ jour pub_year pub_month pub_day)
356 (Just $ fromIntegral pub_year)
357 (Just pub_month)
358 (Just pub_day)
359 Nothing
360 Nothing
361 Nothing
362 Nothing
363
364 ------------------------------------------------------------------------
365 parseHal :: FilePath -> IO [HyperdataDocument]
366 parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readCsvHal fp
367 ------------------------------------------------------------------------
368