]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
Adding ngrams to the table now has a proper patch!
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14
15 module Gargantext.Core.Text.Corpus.Parsers.CSV where
16
17 import Control.Applicative
18 import qualified Data.ByteString as BS
19 import qualified Data.ByteString.Lazy as BL
20 import Data.Char (ord)
21 import Data.Csv
22 import Data.Either (Either(Left, Right))
23 import Data.Text (Text, pack, length, intercalate)
24 import Data.Time.Segment (jour)
25 import qualified Data.Vector as V
26 import Data.Vector (Vector)
27 import GHC.IO (FilePath)
28 import GHC.Real (round)
29 import GHC.Word (Word8)
30
31 import Gargantext.Database.Admin.Types.Hyperdata (HyperdataDocument(..))
32 import Gargantext.Prelude hiding (length)
33 import Gargantext.Core.Text
34 import Gargantext.Core.Text.Context
35
36 ---------------------------------------------------------------
37 headerCsvGargV3 :: Header
38 headerCsvGargV3 = header [ "title"
39 , "source"
40 , "publication_year"
41 , "publication_month"
42 , "publication_day"
43 , "abstract"
44 , "authors"
45 ]
46 ---------------------------------------------------------------
47 data CsvGargV3 = CsvGargV3
48 { d_docId :: !Int
49 , d_title :: !Text
50 , d_source :: !Text
51 , d_publication_year :: !Int
52 , d_publication_month :: !Int
53 , d_publication_day :: !Int
54 , d_abstract :: !Text
55 , d_authors :: !Text
56 }
57 deriving (Show)
58 ---------------------------------------------------------------
59 -- | Doc 2 HyperdataDocument
60 toDoc :: CsvGargV3 -> HyperdataDocument
61 toDoc (CsvGargV3 did dt _ dpy dpm dpd dab dau) =
62 HyperdataDocument (Just "CSV")
63 (Just . pack . show $ did)
64 Nothing
65 Nothing
66 Nothing
67 Nothing
68 (Just dt)
69 Nothing
70 (Just dau)
71 (Just dab)
72 (Nothing)
73 Nothing
74 (Just dpy)
75 (Just dpm)
76 (Just dpd)
77 Nothing
78 Nothing
79 Nothing
80 Nothing
81
82 ---------------------------------------------------------------
83 -- | Types Conversions
84 toDocs :: Vector CsvDoc -> [CsvGargV3]
85 toDocs v = V.toList
86 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
87 -> CsvGargV3 nId t s py pm pd abst auth )
88 (V.enumFromN 1 (V.length v'')) v''
89 where
90 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
91 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
92
93 ---------------------------------------------------------------
94 fromDocs :: Vector CsvGargV3 -> Vector CsvDoc
95 fromDocs docs = V.map fromDocs' docs
96 where
97 fromDocs' (CsvGargV3 _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
98
99 ---------------------------------------------------------------
100 -- | Split a document in its context
101 -- TODO adapt the size of the paragraph according to the corpus average
102 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
103 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
104 if docSize > 1000
105 then
106 if (mod (round m) docSize) >= 10
107 then
108 splitDoc' splt doc
109 else
110 V.fromList [doc]
111 else
112 V.fromList [doc]
113 where
114 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
115 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
116 where
117 firstDoc = CsvDoc t s py pm pd firstAbstract auth
118 firstAbstract = head' "splitDoc'1" abstracts
119
120 nextDocs = map (\txt -> CsvDoc
121 (head' "splitDoc'2" $ sentences txt)
122 s py pm pd
123 (unsentences $ tail' "splitDoc'1" $ sentences txt)
124 auth
125 ) (tail' "splitDoc'2" abstracts)
126
127 abstracts = (splitBy $ contextSize) abst
128
129 ---------------------------------------------------------------
130 ---------------------------------------------------------------
131 type Mean = Double
132
133 docsSize :: Vector CsvDoc -> Mean
134 docsSize csvDoc = mean ls
135 where
136 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
137
138
139 ---------------------------------------------------------------
140 data CsvDoc = CsvDoc
141 { csv_title :: !Text
142 , csv_source :: !Text
143 , csv_publication_year :: !Int
144 , csv_publication_month :: !Int
145 , csv_publication_day :: !Int
146 , csv_abstract :: !Text
147 , csv_authors :: !Text
148 }
149 deriving (Show)
150
151 instance FromNamedRecord CsvDoc where
152 parseNamedRecord r = CsvDoc <$> r .: "title"
153 <*> r .: "source"
154 <*> r .: "publication_year"
155 <*> r .: "publication_month"
156 <*> r .: "publication_day"
157 <*> r .: "abstract"
158 <*> r .: "authors"
159
160 instance ToNamedRecord CsvDoc where
161 toNamedRecord (CsvDoc t s py pm pd abst aut) =
162 namedRecord [ "title" .= t
163 , "source" .= s
164 , "publication_year" .= py
165 , "publication_month" .= pm
166 , "publication_day" .= pd
167 , "abstract" .= abst
168 , "authors" .= aut
169 ]
170
171 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
172 hyperdataDocument2csvDoc h = CsvDoc (m $ _hd_title h)
173 (m $ _hd_source h)
174 (mI $ _hd_publication_year h)
175 (mI $ _hd_publication_month h)
176 (mI $ _hd_publication_day h)
177 (m $ _hd_abstract h)
178 (m $ _hd_authors h)
179
180 where
181 m = maybe "" identity
182 mI = maybe 0 identity
183
184
185 csvDecodeOptions :: DecodeOptions
186 csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
187
188 csvEncodeOptions :: EncodeOptions
189 csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
190
191 delimiter :: Word8
192 delimiter = fromIntegral $ ord '\t'
193 ------------------------------------------------------------------------
194 ------------------------------------------------------------------------
195 readCsvOn' :: [CsvDoc -> Text] -> FilePath -> IO [Text]
196 readCsvOn' fields fp = V.toList
197 <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
198 <$> snd
199 <$> readFile fp
200
201 ------------------------------------------------------------------------
202
203 readFileLazy :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
204 readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
205
206 readFileStrict :: (FromNamedRecord a) => proxy a -> FilePath -> IO (Header, Vector a)
207 readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
208
209 readByteStringLazy :: (FromNamedRecord a) => proxy a -> BL.ByteString -> (Header, Vector a)
210 readByteStringLazy _f bs = case decodeByNameWith csvDecodeOptions bs of
211 Left e -> panic (pack e)
212 Right csvDocs -> csvDocs
213
214 readByteStringStrict :: (FromNamedRecord a) => proxy a -> BS.ByteString -> (Header, Vector a)
215 readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
216
217 ------------------------------------------------------------------------
218 -- | TODO use readFileLazy
219 readFile :: FilePath -> IO (Header, Vector CsvDoc)
220 readFile = fmap readCsvLazyBS . BL.readFile
221
222
223 -- | TODO use readByteStringLazy
224 readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
225 readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
226 Left e -> panic (pack e)
227 Right csvDocs -> csvDocs
228
229 ------------------------------------------------------------------------
230
231 -- | TODO use readFileLazy
232 readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
233 readCsvHal = fmap readCsvHalLazyBS . BL.readFile
234
235 -- | TODO use readByteStringLazy
236 readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
237 readCsvHalLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
238 Left e -> panic (pack e)
239 Right csvDocs -> csvDocs
240
241 readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
242 readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
243
244 ------------------------------------------------------------------------
245 writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
246 writeFile fp (h, vs) = BL.writeFile fp $
247 encodeByNameWith csvEncodeOptions h (V.toList vs)
248
249 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
250 writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
251
252 hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
253 hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
254
255 ------------------------------------------------------------------------
256 -- Hal Format
257 data CsvHal = CsvHal
258 { csvHal_title :: !Text
259 , csvHal_source :: !Text
260 , csvHal_publication_year :: !Integer
261 , csvHal_publication_month :: !Int
262 , csvHal_publication_day :: !Int
263 , csvHal_abstract :: !Text
264 , csvHal_authors :: !Text
265
266 , csvHal_url :: !Text
267 , csvHal_isbn_s :: !Text
268 , csvHal_issue_s :: !Text
269 , csvHal_journalPublisher_s:: !Text
270 , csvHal_language_s :: !Text
271
272 , csvHal_doiId_s :: !Text
273 , csvHal_authId_i :: !Text
274 , csvHal_instStructId_i :: !Text
275 , csvHal_deptStructId_i :: !Text
276 , csvHal_labStructId_i :: !Text
277
278 , csvHal_rteamStructId_i :: !Text
279 , csvHal_docType_s :: !Text
280 }
281 deriving (Show)
282
283 instance FromNamedRecord CsvHal where
284 parseNamedRecord r = CsvHal <$> r .: "title"
285 <*> r .: "source"
286 <*> r .: "publication_year"
287 <*> r .: "publication_month"
288 <*> r .: "publication_day"
289 <*> r .: "abstract"
290 <*> r .: "authors"
291
292 <*> r .: "url"
293 <*> r .: "isbn_s"
294 <*> r .: "issue_s"
295 <*> r .: "journalPublisher_s"
296 <*> r .: "language_s"
297
298 <*> r .: "doiId_s"
299 <*> r .: "authId_i"
300 <*> r .: "instStructId_i"
301 <*> r .: "deptStructId_i"
302 <*> r .: "labStructId_i"
303
304 <*> r .: "rteamStructId_i"
305 <*> r .: "docType_s"
306
307 instance ToNamedRecord CsvHal where
308 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
309 namedRecord [ "title" .= t
310 , "source" .= s
311
312 , "publication_year" .= py
313 , "publication_month" .= pm
314 , "publication_day" .= pd
315
316 , "abstract" .= abst
317 , "authors" .= aut
318
319 , "url" .= url
320 , "isbn_s" .= isbn
321 , "issue_s" .= iss
322 , "journalPublisher_s" .= j
323 , "language_s" .= lang
324
325 , "doiId_s" .= doi
326 , "authId_i" .= auth
327 , "instStructId_i" .= inst
328 , "deptStructId_i" .= dept
329 , "labStructId_i" .= lab
330
331 , "rteamStructId_i" .= team
332 , "docType_s" .= doct
333 ]
334
335 csvHal2doc :: CsvHal -> HyperdataDocument
336 csvHal2doc (CsvHal title source
337 pub_year pub_month pub_day
338 abstract authors
339 url _ _ _ _
340 doi _ inst _ _
341 _ _ ) = HyperdataDocument (Just "CsvHal")
342 (Just doi)
343 (Just url)
344 Nothing
345 Nothing
346 Nothing
347 (Just title)
348 (Just authors)
349 (Just inst)
350 (Just source)
351 (Just abstract)
352 (Just $ pack . show $ jour pub_year pub_month pub_day)
353 (Just $ fromIntegral pub_year)
354 (Just pub_month)
355 (Just pub_day)
356 Nothing
357 Nothing
358 Nothing
359 Nothing
360
361
362 csv2doc :: CsvDoc -> HyperdataDocument
363 csv2doc (CsvDoc title source
364 pub_year pub_month pub_day
365 abstract authors ) = HyperdataDocument (Just "CsvHal")
366 Nothing
367 Nothing
368 Nothing
369 Nothing
370 Nothing
371 (Just title)
372 (Just authors)
373 Nothing
374 (Just source)
375 (Just abstract)
376 (Just $ pack . show $ jour (fromIntegral pub_year) pub_month pub_day)
377 (Just $ fromIntegral pub_year)
378 (Just pub_month)
379 (Just pub_day)
380 Nothing
381 Nothing
382 Nothing
383 Nothing
384
385 ------------------------------------------------------------------------
386 parseHal :: FilePath -> IO [HyperdataDocument]
387 parseHal fp = V.toList <$> V.map csvHal2doc <$> snd <$> readCsvHal fp
388
389 parseHal' :: BL.ByteString -> [HyperdataDocument]
390 parseHal' = V.toList . V.map csvHal2doc . snd . readCsvHalLazyBS
391
392 ------------------------------------------------------------------------
393
394 parseCsv :: FilePath -> IO [HyperdataDocument]
395 parseCsv fp = V.toList <$> V.map csv2doc <$> snd <$> readFile fp
396
397 parseCsv' :: BL.ByteString -> [HyperdataDocument]
398 parseCsv' bs = V.toList $ V.map csv2doc $ snd $ readCsvLazyBS bs
399