]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers/CSV.hs
Merge branch 'dev' of ssh://gitlab.iscpif.fr:20022/gargantext/haskell-gargantext...
[gargantext.git] / src / Gargantext / Text / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Text.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
17
18 module Gargantext.Text.Parsers.CSV where
19
20 import Control.Applicative
21 import Data.Char (ord)
22 import Data.Csv
23 import Data.Either (Either(Left, Right))
24 import Data.Text (Text, pack, length, intercalate)
25 import Data.Time.Segment (jour)
26 import Data.Vector (Vector)
27 import GHC.IO (FilePath)
28 import GHC.Real (round)
29 import GHC.Word (Word8)
30 import Gargantext.Database.Types.Node -- (HyperdataDocument(..))
31 import Gargantext.Prelude hiding (length)
32 import Gargantext.Text
33 import Gargantext.Text.Context
34 import qualified Data.ByteString.Lazy as BL
35 import qualified Data.ByteString as BS
36 import qualified Data.Vector as V
37
38 ---------------------------------------------------------------
39 headerCsvGargV3 :: Header
40 headerCsvGargV3 = header [ "title"
41 , "source"
42 , "publication_year"
43 , "publication_month"
44 , "publication_day"
45 , "abstract"
46 , "authors"
47 ]
48 ---------------------------------------------------------------
49 data Doc = Doc
50 { d_docId :: !Int
51 , d_title :: !Text
52 , d_source :: !Text
53 , d_publication_year :: !Int
54 , d_publication_month :: !Int
55 , d_publication_day :: !Int
56 , d_abstract :: !Text
57 , d_authors :: !Text
58 }
59 deriving (Show)
60 ---------------------------------------------------------------
61 -- | Doc 2 HyperdataDocument
62 doc2hyperdataDocument :: Doc -> HyperdataDocument
63 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
64 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
65 HyperdataDocument (Just "CSV")
66 (Just . pack . show $ did)
67 Nothing
68 Nothing
69 Nothing
70 Nothing
71 (Just dt)
72 Nothing
73 (Just dau)
74 (Just dab)
75 (Nothing)
76 Nothing
77 (Just dpy)
78 (Just dpm)
79 (Just dpd)
80 Nothing
81 Nothing
82 Nothing
83 Nothing
84
85
86
87
88 ---------------------------------------------------------------
89 -- | Types Conversions
90 toDocs :: Vector CsvDoc -> [Doc]
91 toDocs v = V.toList
92 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
93 -> Doc nId t s py pm pd abst auth )
94 (V.enumFromN 1 (V.length v'')) v''
95 where
96 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
97 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
98
99 ---------------------------------------------------------------
100 fromDocs :: Vector Doc -> Vector CsvDoc
101 fromDocs docs = V.map fromDocs' docs
102 where
103 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
104
105 ---------------------------------------------------------------
106 -- | Split a document in its context
107 -- TODO adapt the size of the paragraph according to the corpus average
108
109 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
110 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
111 if docSize > 1000
112 then
113 if (mod (round m) docSize) >= 10
114 then
115 splitDoc' splt doc
116 else
117 V.fromList [doc]
118 else
119 V.fromList [doc]
120
121
122 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
123 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
124 where
125 firstDoc = CsvDoc t s py pm pd firstAbstract auth
126 firstAbstract = head' "splitDoc'1" abstracts
127
128 nextDocs = map (\txt -> CsvDoc
129 (head' "splitDoc'2" $ sentences txt)
130 s py pm pd
131 (unsentences $ tail' "splitDoc'1" $ sentences txt)
132 auth
133 ) (tail' "splitDoc'2" abstracts)
134
135 abstracts = (splitBy $ contextSize) abst
136
137 ---------------------------------------------------------------
138 ---------------------------------------------------------------
139 type Mean = Double
140
141 docsSize :: Vector CsvDoc -> Mean
142 docsSize csvDoc = mean ls
143 where
144 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
145
146
147 ---------------------------------------------------------------
148 data CsvDoc = CsvDoc
149 { csv_title :: !Text
150 , csv_source :: !Text
151 , csv_publication_year :: !Int
152 , csv_publication_month :: !Int
153 , csv_publication_day :: !Int
154 , csv_abstract :: !Text
155 , csv_authors :: !Text
156 }
157 deriving (Show)
158
159 instance FromNamedRecord CsvDoc where
160 parseNamedRecord r = CsvDoc <$> r .: "title"
161 <*> r .: "source"
162 <*> r .: "publication_year"
163 <*> r .: "publication_month"
164 <*> r .: "publication_day"
165 <*> r .: "abstract"
166 <*> r .: "authors"
167
168 instance ToNamedRecord CsvDoc where
169 toNamedRecord (CsvDoc t s py pm pd abst aut) =
170 namedRecord [ "title" .= t
171 , "source" .= s
172 , "publication_year" .= py
173 , "publication_month" .= pm
174 , "publication_day" .= pd
175 , "abstract" .= abst
176 , "authors" .= aut
177 ]
178
179 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
180 hyperdataDocument2csvDoc h = CsvDoc (m $ _hyperdataDocument_title h)
181 (m $ _hyperdataDocument_source h)
182 (mI $ _hyperdataDocument_publication_year h)
183 (mI $ _hyperdataDocument_publication_month h)
184 (mI $ _hyperdataDocument_publication_day h)
185 (m $ _hyperdataDocument_abstract h)
186 (m $ _hyperdataDocument_authors h)
187
188 where
189 m = maybe "" identity
190 mI = maybe 0 identity
191
192
193 csvDecodeOptions :: DecodeOptions
194 csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}
195
196 csvEncodeOptions :: EncodeOptions
197 csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
198
199 delimiter :: Word8
200 delimiter = fromIntegral $ ord '\t'
201 ------------------------------------------------------------------------
202 ------------------------------------------------------------------------
203 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
204 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
205 <$> snd
206 <$> readFile fp
207
208 ------------------------------------------------------------------------
209
210 readFileLazy :: (FromNamedRecord a) => a -> FilePath -> IO (Header, Vector a)
211 readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
212
213 readFileStrict :: (FromNamedRecord a) => a -> FilePath -> IO (Header, Vector a)
214 readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
215
216
217 readByteStringLazy :: (FromNamedRecord a) => a -> BL.ByteString -> (Header, Vector a)
218 readByteStringLazy f bs = case decodeByNameWith csvDecodeOptions bs of
219 Left e -> panic (pack e)
220 Right csvDocs -> csvDocs
221
222 readByteStringStrict :: (FromNamedRecord a) => a -> BS.ByteString -> (Header, Vector a)
223 readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict
224
225 ------------------------------------------------------------------------
226 -- | TODO use readFileLazy
227 readFile :: FilePath -> IO (Header, Vector CsvDoc)
228 readFile = fmap readCsvLazyBS . BL.readFile
229
230 -- | TODO use readByteStringLazy
231 readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
232 readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
233 Left e -> panic (pack e)
234 Right csvDocs -> csvDocs
235
236 ------------------------------------------------------------------------
237 -- | TODO use readFileLazy
238 readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
239 readCsvHal = fmap readCsvHalLazyBS . BL.readFile
240
241 -- | TODO use readByteStringLazy
242 readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
243 readCsvHalLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
244 Left e -> panic (pack e)
245 Right csvDocs -> csvDocs
246
247 readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
248 readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict
249
250 ------------------------------------------------------------------------
251 writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
252 writeFile fp (h, vs) = BL.writeFile fp $
253 encodeByNameWith csvEncodeOptions h (V.toList vs)
254
255 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
256 writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
257
258 hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
259 hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
260
261 ------------------------------------------------------------------------
262 -- Hal Format
263 data CsvHal = CsvHal
264 { csvHal_title :: !Text
265 , csvHal_source :: !Text
266 , csvHal_publication_year :: !Integer
267 , csvHal_publication_month :: !Int
268 , csvHal_publication_day :: !Int
269 , csvHal_abstract :: !Text
270 , csvHal_authors :: !Text
271
272 , csvHal_url :: !Text
273 , csvHal_isbn_s :: !Text
274 , csvHal_issue_s :: !Text
275 , csvHal_journalPublisher_s:: !Text
276 , csvHal_language_s :: !Text
277
278 , csvHal_doiId_s :: !Text
279 , csvHal_authId_i :: !Text
280 , csvHal_instStructId_i :: !Text
281 , csvHal_deptStructId_i :: !Text
282 , csvHal_labStructId_i :: !Text
283
284 , csvHal_rteamStructId_i :: !Text
285 , csvHal_docType_s :: !Text
286 }
287 deriving (Show)
288
289 instance FromNamedRecord CsvHal where
290 parseNamedRecord r = CsvHal <$> r .: "title"
291 <*> r .: "source"
292 <*> r .: "publication_year"
293 <*> r .: "publication_month"
294 <*> r .: "publication_day"
295 <*> r .: "abstract"
296 <*> r .: "authors"
297
298 <*> r .: "url"
299 <*> r .: "isbn_s"
300 <*> r .: "issue_s"
301 <*> r .: "journalPublisher_s"
302 <*> r .: "language_s"
303
304 <*> r .: "doiId_s"
305 <*> r .: "authId_i"
306 <*> r .: "instStructId_i"
307 <*> r .: "deptStructId_i"
308 <*> r .: "labStructId_i"
309
310 <*> r .: "rteamStructId_i"
311 <*> r .: "docType_s"
312
313 instance ToNamedRecord CsvHal where
314 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
315 namedRecord [ "title" .= t
316 , "source" .= s
317
318 , "publication_year" .= py
319 , "publication_month" .= pm
320 , "publication_day" .= pd
321
322 , "abstract" .= abst
323 , "authors" .= aut
324
325 , "url" .= url
326 , "isbn_s" .= isbn
327 , "issue_s" .= iss
328 , "journalPublisher_s" .= j
329 , "language_s" .= lang
330
331 , "doiId_s" .= doi
332 , "authId_i" .= auth
333 , "instStructId_i" .= inst
334 , "deptStructId_i" .= dept
335 , "labStructId_i" .= lab
336
337 , "rteamStructId_i" .= team
338 , "docType_s" .= doct
339 ]
340
341 csvHal2doc :: CsvHal -> HyperdataDocument
342 csvHal2doc (CsvHal title source
343 pub_year pub_month pub_day
344 abstract authors
345 url _ _ _ _
346 doi _ inst _ _
347 _ _ ) = HyperdataDocument (Just "CsvHal")
348 (Just doi)
349 (Just url)
350 Nothing
351 Nothing
352 Nothing
353 (Just title)
354 (Just authors)
355 (Just inst)
356 (Just source)
357 (Just abstract)
358 (Just $ pack . show $ jour pub_year pub_month pub_day)
359 (Just $ fromIntegral pub_year)
360 (Just pub_month)
361 (Just pub_day)
362 Nothing
363 Nothing
364 Nothing
365 Nothing
366
367 ------------------------------------------------------------------------
368 parseHal :: FilePath -> IO [HyperdataDocument]
369 parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readCsvHal fp
370 ------------------------------------------------------------------------
371