]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers/CSV.hs
[API] Upload csv.
[gargantext.git] / src / Gargantext / Text / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Text.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
17
18 module Gargantext.Text.Parsers.CSV where
19
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
22 import Control.Applicative
23
24 import Data.Char (ord)
25 import Data.Csv
26 import Data.Either (Either(Left, Right))
27 import Data.Text (Text, pack, length, intercalate)
28 import qualified Data.ByteString.Lazy as BL
29 import Data.Time.Segment (jour)
30
31 import Data.Vector (Vector)
32 import qualified Data.Vector as V
33
34 import Gargantext.Database.Types.Node -- (HyperdataDocument(..))
35 import Gargantext.Text
36 import Gargantext.Text.Context
37 import Gargantext.Prelude hiding (length)
38
39 ---------------------------------------------------------------
40 headerCsvGargV3 :: Header
41 headerCsvGargV3 = header [ "title"
42 , "source"
43 , "publication_year"
44 , "publication_month"
45 , "publication_day"
46 , "abstract"
47 , "authors"
48 ]
49 ---------------------------------------------------------------
50 data Doc = Doc
51 { d_docId :: !Int
52 , d_title :: !Text
53 , d_source :: !Text
54 , d_publication_year :: !Int
55 , d_publication_month :: !Int
56 , d_publication_day :: !Int
57 , d_abstract :: !Text
58 , d_authors :: !Text
59 }
60 deriving (Show)
61 ---------------------------------------------------------------
62 -- | Doc 2 HyperdataDocument
63 doc2hyperdataDocument :: Doc -> HyperdataDocument
64 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
65 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
66 HyperdataDocument (Just "CSV")
67 (Just . pack . show $ did)
68 Nothing
69 Nothing
70 Nothing
71 Nothing
72 (Just dt)
73 Nothing
74 (Just dau)
75 (Just dab)
76 (Nothing)
77 Nothing
78 (Just dpy)
79 (Just dpm)
80 (Just dpd)
81 Nothing
82 Nothing
83 Nothing
84 Nothing
85
86
87
88
89 ---------------------------------------------------------------
90 -- | Types Conversions
91 toDocs :: Vector CsvDoc -> [Doc]
92 toDocs v = V.toList
93 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
94 -> Doc nId t s py pm pd abst auth )
95 (V.enumFromN 1 (V.length v'')) v''
96 where
97 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
98 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
99
100 ---------------------------------------------------------------
101 fromDocs :: Vector Doc -> Vector CsvDoc
102 fromDocs docs = V.map fromDocs' docs
103 where
104 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
105
106 ---------------------------------------------------------------
107 -- | Split a document in its context
108 -- TODO adapt the size of the paragraph according to the corpus average
109
110 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
111 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
112 if docSize > 1000
113 then
114 if (mod (round m) docSize) >= 10
115 then
116 splitDoc' splt doc
117 else
118 V.fromList [doc]
119 else
120 V.fromList [doc]
121
122
123 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
124 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
125 where
126 firstDoc = CsvDoc t s py pm pd firstAbstract auth
127 firstAbstract = head' "splitDoc'1" abstracts
128
129 nextDocs = map (\txt -> CsvDoc
130 (head' "splitDoc'2" $ sentences txt)
131 s py pm pd
132 (unsentences $ tail' "splitDoc'1" $ sentences txt)
133 auth
134 ) (tail' "splitDoc'2" abstracts)
135
136 abstracts = (splitBy $ contextSize) abst
137
138 ---------------------------------------------------------------
139 ---------------------------------------------------------------
140 type Mean = Double
141
142 docsSize :: Vector CsvDoc -> Mean
143 docsSize csvDoc = mean ls
144 where
145 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
146
147
148 ---------------------------------------------------------------
149 data CsvDoc = CsvDoc
150 { csv_title :: !Text
151 , csv_source :: !Text
152 , csv_publication_year :: !Int
153 , csv_publication_month :: !Int
154 , csv_publication_day :: !Int
155 , csv_abstract :: !Text
156 , csv_authors :: !Text
157 }
158 deriving (Show)
159
160 instance FromNamedRecord CsvDoc where
161 parseNamedRecord r = CsvDoc <$> r .: "title"
162 <*> r .: "source"
163 <*> r .: "publication_year"
164 <*> r .: "publication_month"
165 <*> r .: "publication_day"
166 <*> r .: "abstract"
167 <*> r .: "authors"
168
169 instance ToNamedRecord CsvDoc where
170 toNamedRecord (CsvDoc t s py pm pd abst aut) =
171 namedRecord [ "title" .= t
172 , "source" .= s
173 , "publication_year" .= py
174 , "publication_month" .= pm
175 , "publication_day" .= pd
176 , "abstract" .= abst
177 , "authors" .= aut
178 ]
179
180 hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
181 hyperdataDocument2csvDoc h = CsvDoc (m $ _hyperdataDocument_title h)
182 (m $ _hyperdataDocument_source h)
183 (mI $ _hyperdataDocument_publication_year h)
184 (mI $ _hyperdataDocument_publication_month h)
185 (mI $ _hyperdataDocument_publication_day h)
186 (m $ _hyperdataDocument_abstract h)
187 (m $ _hyperdataDocument_authors h)
188
189 where
190 m = maybe "" identity
191 mI = maybe 0 identity
192
193
194 csvDecodeOptions :: DecodeOptions
195 csvDecodeOptions = (defaultDecodeOptions
196 {decDelimiter = fromIntegral $ ord '\t'}
197 )
198
199 csvEncodeOptions :: EncodeOptions
200 csvEncodeOptions = ( defaultEncodeOptions
201 {encDelimiter = fromIntegral $ ord '\t'}
202 )
203
204 ------------------------------------------------------------------------
205 ------------------------------------------------------------------------
206 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
207 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
208 <$> snd
209 <$> readCsv fp
210
211 ------------------------------------------------------------------------
212 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
213 readCsv fp = do
214 csvData <- BL.readFile fp
215 case decodeByNameWith csvDecodeOptions csvData of
216 Left e -> panic (pack e)
217 Right csvDocs -> pure csvDocs
218
219
220 readHal :: FilePath -> IO (Header, Vector CsvHal)
221 readHal fp = do
222 csvData <- BL.readFile fp
223 case decodeByNameWith csvDecodeOptions csvData of
224 Left e -> panic (pack e)
225 Right csvDocs -> pure csvDocs
226 ------------------------------------------------------------------------
227 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
228 writeCsv fp (h, vs) = BL.writeFile fp $
229 encodeByNameWith csvEncodeOptions h (V.toList vs)
230
231 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
232 writeDocs2Csv fp hs = BL.writeFile fp $
233 encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
234
235 hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
236 hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
237
238 ------------------------------------------------------------------------
239 -- Hal Format
240 data CsvHal = CsvHal
241 { csvHal_title :: !Text
242 , csvHal_source :: !Text
243 , csvHal_publication_year :: !Integer
244 , csvHal_publication_month :: !Int
245 , csvHal_publication_day :: !Int
246 , csvHal_abstract :: !Text
247 , csvHal_authors :: !Text
248
249 , csvHal_url :: !Text
250 , csvHal_isbn_s :: !Text
251 , csvHal_issue_s :: !Text
252 , csvHal_journalPublisher_s:: !Text
253 , csvHal_language_s :: !Text
254
255 , csvHal_doiId_s :: !Text
256 , csvHal_authId_i :: !Text
257 , csvHal_instStructId_i :: !Text
258 , csvHal_deptStructId_i :: !Text
259 , csvHal_labStructId_i :: !Text
260
261 , csvHal_rteamStructId_i :: !Text
262 , csvHal_docType_s :: !Text
263 }
264 deriving (Show)
265
266 instance FromNamedRecord CsvHal where
267 parseNamedRecord r = CsvHal <$> r .: "title"
268 <*> r .: "source"
269 <*> r .: "publication_year"
270 <*> r .: "publication_month"
271 <*> r .: "publication_day"
272 <*> r .: "abstract"
273 <*> r .: "authors"
274
275 <*> r .: "url"
276 <*> r .: "isbn_s"
277 <*> r .: "issue_s"
278 <*> r .: "journalPublisher_s"
279 <*> r .: "language_s"
280
281 <*> r .: "doiId_s"
282 <*> r .: "authId_i"
283 <*> r .: "instStructId_i"
284 <*> r .: "deptStructId_i"
285 <*> r .: "labStructId_i"
286
287 <*> r .: "rteamStructId_i"
288 <*> r .: "docType_s"
289
290 instance ToNamedRecord CsvHal where
291 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
292 namedRecord [ "title" .= t
293 , "source" .= s
294
295 , "publication_year" .= py
296 , "publication_month" .= pm
297 , "publication_day" .= pd
298
299 , "abstract" .= abst
300 , "authors" .= aut
301
302 , "url" .= url
303 , "isbn_s" .= isbn
304 , "issue_s" .= iss
305 , "journalPublisher_s" .= j
306 , "language_s" .= lang
307
308 , "doiId_s" .= doi
309 , "authId_i" .= auth
310 , "instStructId_i" .= inst
311 , "deptStructId_i" .= dept
312 , "labStructId_i" .= lab
313
314 , "rteamStructId_i" .= team
315 , "docType_s" .= doct
316 ]
317
318 csvHal2doc :: CsvHal -> HyperdataDocument
319 csvHal2doc (CsvHal title source
320 pub_year pub_month pub_day
321 abstract authors
322 url _ _ _ _
323 doi _ inst _ _
324 _ _ ) = HyperdataDocument (Just "CsvHal")
325 (Just doi)
326 (Just url)
327 Nothing
328 Nothing
329 Nothing
330 (Just title)
331 (Just authors)
332 (Just inst)
333 (Just source)
334 (Just abstract)
335 (Just $ pack . show $ jour pub_year pub_month pub_day)
336 (Just $ fromIntegral pub_year)
337 (Just pub_month)
338 (Just pub_day)
339 Nothing
340 Nothing
341 Nothing
342 Nothing
343
344 ------------------------------------------------------------------------
345 parseHal :: FilePath -> IO [HyperdataDocument]
346 parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readHal fp
347 ------------------------------------------------------------------------
348
349