]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers/CSV.hs
[PARSERS] RIS/PRESSE fix title and abstract field.
[gargantext.git] / src / Gargantext / Text / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Text.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
17
18 module Gargantext.Text.Parsers.CSV where
19
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
22
23 import Control.Applicative
24
25 import Data.Char (ord)
26 import Data.Csv
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length, intercalate)
29 import qualified Data.ByteString.Lazy as BL
30 import Data.Time.Segment (jour)
31
32 import Data.Vector (Vector)
33 import qualified Data.Vector as V
34
35 import Gargantext.Database.Types.Node -- (HyperdataDocument(..))
36 import Gargantext.Text
37 import Gargantext.Text.Context
38 import Gargantext.Prelude hiding (length)
39
40 ---------------------------------------------------------------
-- | Column names of the CSV files handled by this module; this is the
-- header 'writeDocs2Csv' passes to the encoder.
headerCsvGargV3 :: Header
headerCsvGargV3 = header columns
  where
    columns =
      [ "title"
      , "source"
      , "publication_year"
      , "publication_month"
      , "publication_day"
      , "abstract"
      , "authors"
      ]
50 ---------------------------------------------------------------
-- | A 'CsvDoc' extended with a numeric identifier.
data Doc = Doc
  { d_docId             :: !Int   -- ^ identifier; 'toDocs' numbers documents starting at 1
  , d_title             :: !Text  -- ^ same content as 'csv_title'
  , d_source            :: !Text  -- ^ same content as 'csv_source'
  , d_publication_year  :: !Int   -- ^ same content as 'csv_publication_year'
  , d_publication_month :: !Int   -- ^ same content as 'csv_publication_month'
  , d_publication_day   :: !Int   -- ^ same content as 'csv_publication_day'
  , d_abstract          :: !Text  -- ^ same content as 'csv_abstract'
  , d_authors           :: !Text  -- ^ same content as 'csv_authors'
  }
  deriving (Show)
62 ---------------------------------------------------------------
-- | Convert a 'Doc' into a 'HyperdataDocument' whose first field is @"CSV"@.
--
-- The positional slots are filled in the same order 'csvHal2doc' uses in
-- this module (… title, authors, institutes, source, abstract, …).  The
-- previous version put the authors in the institutes slot and the abstract
-- in the source slot, left authors and abstract empty, and dropped the CSV
-- source altogether.
--
-- NOTE(review): slot meanings are taken from 'csvHal2doc'; confirm against
-- the 'HyperdataDocument' declaration.
doc2hyperdataDocument :: Doc -> HyperdataDocument
doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
  HyperdataDocument (Just "CSV")
                    (Just . pack . show $ did) -- document id rendered as text
                    Nothing
                    Nothing
                    Nothing
                    Nothing
                    (Just dt)   -- title
                    (Just dau)  -- authors
                    Nothing     -- institutes: no such column in this CSV format
                    (Just ds)   -- source
                    (Just dab)  -- abstract
                    Nothing
                    (Just dpy)  -- publication year
                    (Just dpm)  -- publication month
                    (Just dpd)  -- publication day
                    Nothing
                    Nothing
                    Nothing
                    Nothing
86
87
88
89
90 ---------------------------------------------------------------
-- | Types Conversions
--
-- Runs one splitting pass per context in @seps@ (each pass feeds on the
-- output of the previous one), then numbers the resulting documents from 1.
toDocs :: Vector CsvDoc -> [Doc]
toDocs v = V.toList $ V.zipWith number (V.enumFromN 1 (V.length splitDocs)) splitDocs
  where
    -- Attach the identifier to the fields of a 'CsvDoc'.
    number nId (CsvDoc t s py pm pd abst auth) = Doc nId t s py pm pd abst auth

    -- Strict left fold: the accumulator of each pass is evaluated before the
    -- next one starts instead of piling up as a chain of thunks.
    splitDocs = V.foldl' splitPass v seps

    -- One pass: split every document, using the mean abstract size of the
    -- vector being processed in this pass.
    splitPass docs sep = V.concatMap (splitDoc (docsSize docs) sep) docs

    seps = V.fromList [Paragraphs 1, Sentences 3, Chars 3]
100
101 ---------------------------------------------------------------
-- | Turn every 'Doc' back into a 'CsvDoc' by discarding its identifier.
fromDocs :: Vector Doc -> Vector CsvDoc
fromDocs = V.map stripId
  where
    stripId (Doc _ title source year month day abstract authors) =
      CsvDoc title source year month day abstract authors
106
107 ---------------------------------------------------------------
-- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average
--
-- A document is handed to 'splitDoc'' only when its abstract is longer than
-- 1000 characters and the remainder test below passes; otherwise it is
-- returned unchanged as a one-element vector.
splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc m splt doc =
  if docSize > 1000
    then (if remainder >= 10 then splitDoc' splt doc else unsplit)
    else unsplit
  where
    docSize   = length $ csv_abstract doc
    -- NOTE(review): this is the rounded mean modulo the document size, not
    -- the other way round -- confirm this is the intended test.
    remainder = mod (round m) docSize
    unsplit   = V.singleton doc
122
123
-- | Cut the abstract with 'splitBy' and build one 'CsvDoc' per chunk.
--
-- The first chunk keeps the original title.  Every following chunk takes
-- its first sentence as title and its remaining sentences as abstract.
-- Source, date and authors are copied to every resulting document.
splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc' contextSize (CsvDoc t s py pm pd abst auth) =
  V.fromList (leadingDoc : map chunkToDoc (tail' "splitDoc'2" chunks))
  where
    chunks = splitBy contextSize abst

    leadingDoc = CsvDoc t s py pm pd (head' "splitDoc'1" chunks) auth

    chunkToDoc txt =
      let sents = sentences txt
      in  CsvDoc (head' "splitDoc'2" sents)
                 s py pm pd
                 (unsentences $ tail' "splitDoc'1" sents)
                 auth
138
139 ---------------------------------------------------------------
140 ---------------------------------------------------------------
-- | A mean value, represented as a 'Double'.
type Mean = Double

-- | Mean length of the abstracts of the given documents.
docsSize :: Vector CsvDoc -> Mean
docsSize csvDoc =
  mean [ fromIntegral (length (csv_abstract d)) | d <- V.toList csvDoc ]
147
148
149 ---------------------------------------------------------------
-- | One row of a CSV file; each field is read from and written to the
-- column named in its comment (see the 'FromNamedRecord' and
-- 'ToNamedRecord' instances).
data CsvDoc = CsvDoc
  { csv_title             :: !Text  -- ^ column @title@
  , csv_source            :: !Text  -- ^ column @source@
  , csv_publication_year  :: !Int   -- ^ column @publication_year@
  , csv_publication_month :: !Int   -- ^ column @publication_month@
  , csv_publication_day   :: !Int   -- ^ column @publication_day@
  , csv_abstract          :: !Text  -- ^ column @abstract@
  , csv_authors           :: !Text  -- ^ column @authors@
  }
  deriving (Show)
160
-- | Decode a row by looking up each column by name.
instance FromNamedRecord CsvDoc where
  parseNamedRecord r = do
    title    <- r .: "title"
    source   <- r .: "source"
    year     <- r .: "publication_year"
    month    <- r .: "publication_month"
    day      <- r .: "publication_day"
    abstract <- r .: "abstract"
    authors  <- r .: "authors"
    pure $ CsvDoc title source year month day abstract authors
169
-- | Encode a row, one named column per field.
instance ToNamedRecord CsvDoc where
  -- The constructor pattern keeps the argument evaluated up front, as a
  -- positional pattern match would.
  toNamedRecord doc@CsvDoc{} =
    namedRecord
      [ "title"             .= csv_title doc
      , "source"            .= csv_source doc
      , "publication_year"  .= csv_publication_year doc
      , "publication_month" .= csv_publication_month doc
      , "publication_day"   .= csv_publication_day doc
      , "abstract"          .= csv_abstract doc
      , "authors"           .= csv_authors doc
      ]
180
-- | Build a 'CsvDoc' from a 'HyperdataDocument'.
--
-- Absent text fields become the empty text and absent date components
-- become 0.
hyperdataDocument2csvDoc :: HyperdataDocument -> CsvDoc
hyperdataDocument2csvDoc h =
  CsvDoc (orEmpty _hyperdataDocument_title)
         (orEmpty _hyperdataDocument_source)
         (orZero  _hyperdataDocument_publication_year)
         (orZero  _hyperdataDocument_publication_month)
         (orZero  _hyperdataDocument_publication_day)
         (orEmpty _hyperdataDocument_abstract)
         (orEmpty _hyperdataDocument_authors)
  where
    -- Read an optional field of @h@, falling back to a default value.
    orEmpty field = maybe "" identity (field h)
    orZero  field = maybe 0 identity (field h)
193
194
-- | Decoding options: the defaults, with a tab character as field delimiter.
csvDecodeOptions :: DecodeOptions
csvDecodeOptions = defaultDecodeOptions { decDelimiter = tab }
  where
    tab = fromIntegral (ord '\t')
199
-- | Encoding options: the defaults, with a tab character as field delimiter.
csvEncodeOptions :: EncodeOptions
csvEncodeOptions = defaultEncodeOptions { encDelimiter = tab }
  where
    tab = fromIntegral (ord '\t')
204
205 ------------------------------------------------------------------------
206 ------------------------------------------------------------------------
-- | Read a CSV file and, for every row, apply each projection of @fields@
-- and join the resulting texts with a single space.
readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
readCsvOn fields fp = do
  rows <- snd <$> readCsv fp
  pure $ V.toList $ V.map joinFields rows
  where
    joinFields row = intercalate (pack " ") [ field row | field <- fields ]
211
212 ------------------------------------------------------------------------
-- | Read a file and decode it by column name with 'csvDecodeOptions'.
--
-- A decoding failure is passed to 'panic' with the decoder's message.
readCsv :: FilePath -> IO (Header, Vector CsvDoc)
readCsv fp = do
  bytes <- BL.readFile fp
  unwrap (decodeByNameWith csvDecodeOptions bytes)
  where
    unwrap (Left err)     = panic (pack err)
    unwrap (Right parsed) = pure parsed
219
220
-- | Read a file and decode its rows as 'CsvHal', by column name, with
-- 'csvDecodeOptions'.
--
-- A decoding failure is passed to 'panic' with the decoder's message.
readHal :: FilePath -> IO (Header, Vector CsvHal)
readHal fp = do
  bytes <- BL.readFile fp
  orPanic (decodeByNameWith csvDecodeOptions bytes)
  where
    orPanic (Right halDocs) = pure halDocs
    orPanic (Left msg)      = panic (pack msg)
227 ------------------------------------------------------------------------
-- | Encode the documents under the given header with 'csvEncodeOptions'
-- and write the result to a file.
writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv fp (h, vs) = BL.writeFile fp encoded
  where
    encoded = encodeByNameWith csvEncodeOptions h (V.toList vs)
231
-- | Convert the documents with 'hyperdataDocument2csvDoc' and write them to
-- a file under the 'headerCsvGargV3' header.
writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
writeDocs2Csv fp hs =
  BL.writeFile fp (encodeByNameWith csvEncodeOptions headerCsvGargV3 rows)
  where
    rows = [ hyperdataDocument2csvDoc doc | doc <- hs ]
235 ------------------------------------------------------------------------
-- Hal Format

-- | One row of a Hal CSV file.  Every field is read from and written to the
-- column whose name follows the @csvHal_@ prefix (see the 'FromNamedRecord'
-- and 'ToNamedRecord' instances).
data CsvHal = CsvHal
  { csvHal_title              :: !Text
  , csvHal_source             :: !Text
  , csvHal_publication_year   :: !Integer  -- ^ an 'Integer' here, unlike 'csv_publication_year'
  , csvHal_publication_month  :: !Int
  , csvHal_publication_day    :: !Int
  , csvHal_abstract           :: !Text
  , csvHal_authors            :: !Text

  , csvHal_url                :: !Text
  , csvHal_isbn_s             :: !Text
  , csvHal_issue_s            :: !Text
  , csvHal_journalPublisher_s :: !Text
  , csvHal_language_s         :: !Text

  , csvHal_doiId_s            :: !Text
  , csvHal_authId_i           :: !Text
  , csvHal_instStructId_i     :: !Text
  , csvHal_deptStructId_i     :: !Text
  , csvHal_labStructId_i      :: !Text

  , csvHal_rteamStructId_i    :: !Text
  , csvHal_docType_s          :: !Text
  }
  deriving (Show)
262
-- | Decode a Hal row by looking up each column by name.
instance FromNamedRecord CsvHal where
  parseNamedRecord r = do
    title     <- r .: "title"
    source    <- r .: "source"
    year      <- r .: "publication_year"
    month     <- r .: "publication_month"
    day       <- r .: "publication_day"
    abstract  <- r .: "abstract"
    authors   <- r .: "authors"

    url       <- r .: "url"
    isbn      <- r .: "isbn_s"
    issue     <- r .: "issue_s"
    publisher <- r .: "journalPublisher_s"
    language  <- r .: "language_s"

    doi       <- r .: "doiId_s"
    authId    <- r .: "authId_i"
    instId    <- r .: "instStructId_i"
    deptId    <- r .: "deptStructId_i"
    labId     <- r .: "labStructId_i"

    rteamId   <- r .: "rteamStructId_i"
    docType   <- r .: "docType_s"

    pure $ CsvHal title source year month day abstract authors
                  url isbn issue publisher language
                  doi authId instId deptId labId
                  rteamId docType
286
-- | Encode a Hal row, one named column per field.
instance ToNamedRecord CsvHal where
  -- The constructor pattern keeps the argument evaluated up front, as a
  -- positional pattern match would.
  toNamedRecord hal@CsvHal{} =
    namedRecord
      [ "title"              .= csvHal_title hal
      , "source"             .= csvHal_source hal

      , "publication_year"   .= csvHal_publication_year hal
      , "publication_month"  .= csvHal_publication_month hal
      , "publication_day"    .= csvHal_publication_day hal

      , "abstract"           .= csvHal_abstract hal
      , "authors"            .= csvHal_authors hal

      , "url"                .= csvHal_url hal
      , "isbn_s"             .= csvHal_isbn_s hal
      , "issue_s"            .= csvHal_issue_s hal
      , "journalPublisher_s" .= csvHal_journalPublisher_s hal
      , "language_s"         .= csvHal_language_s hal

      , "doiId_s"            .= csvHal_doiId_s hal
      , "authId_i"           .= csvHal_authId_i hal
      , "instStructId_i"     .= csvHal_instStructId_i hal
      , "deptStructId_i"     .= csvHal_deptStructId_i hal
      , "labStructId_i"      .= csvHal_labStructId_i hal

      , "rteamStructId_i"    .= csvHal_rteamStructId_i hal
      , "docType_s"          .= csvHal_docType_s hal
      ]
314
-- | Convert a Hal row into a 'HyperdataDocument' whose first field is
-- @"CsvHal"@.
--
-- Only title, source, date, abstract, authors, url, DOI and the
-- @instStructId_i@ column are carried over; the other columns are ignored.
csvHal2doc :: CsvHal -> HyperdataDocument
csvHal2doc (CsvHal hTitle hSource
                   hYear hMonth hDay
                   hAbstract hAuthors
                   hUrl _ _ _ _
                   hDoi _ hInst _ _
                   _ _ ) =
  HyperdataDocument (Just "CsvHal")
                    (Just hDoi)
                    (Just hUrl)
                    Nothing
                    Nothing
                    Nothing
                    (Just hTitle)
                    (Just hAuthors)
                    (Just hInst)
                    (Just hSource)
                    (Just hAbstract)
                    (Just pubDate)
                    (Just $ fromIntegral hYear)
                    (Just hMonth)
                    (Just hDay)
                    Nothing
                    Nothing
                    Nothing
                    Nothing
  where
    -- Textual rendering ('show') of the value 'jour' builds from the date.
    pubDate = pack . show $ jour hYear hMonth hDay
340
341 ------------------------------------------------------------------------
-- | Read a Hal CSV file with 'readHal' and convert every row with
-- 'csvHal2doc'.
parseHal :: FilePath -> IO [HyperdataDocument]
parseHal fp = do
  rows <- snd <$> readHal fp
  pure [ csvHal2doc row | row <- V.toList rows ]
344 ------------------------------------------------------------------------
345
346