]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers/CSV.hs
[TYPE] HyperdataDocument : adding institutes as new field.
[gargantext.git] / src / Gargantext / Text / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Text.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
17
18 module Gargantext.Text.Parsers.CSV where
19
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
22
23 import Control.Applicative
24
25 import Data.Char (ord)
26 import Data.Csv
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length, intercalate)
29 import qualified Data.ByteString.Lazy as BL
30 import Data.Time.Segment (jour)
31
32 import Data.Vector (Vector)
33 import qualified Data.Vector as V
34 import Safe (tailMay)
35
36 import Gargantext.Database.Types.Node (HyperdataDocument(..))
37 import Gargantext.Text
38 import Gargantext.Text.Context
39 import Gargantext.Prelude hiding (length)
40
41 ---------------------------------------------------------------
42 data Doc = Doc
43 { d_docId :: !Int
44 , d_title :: !Text
45 , d_source :: !Text
46 , d_publication_year :: !Int
47 , d_publication_month :: !Int
48 , d_publication_day :: !Int
49 , d_abstract :: !Text
50 , d_authors :: !Text
51 }
52 deriving (Show)
53 ---------------------------------------------------------------
54 -- | Doc 2 HyperdataDocument
55 doc2hyperdataDocument :: Doc -> HyperdataDocument
56 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
57 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
58 HyperdataDocument (Just "CSV")
59 (Just . pack . show $ did)
60 Nothing
61 Nothing
62 Nothing
63 Nothing
64 (Just dt)
65 Nothing
66 (Just dau)
67 (Just dab)
68 (Nothing)
69 Nothing
70 (Just dpy)
71 (Just dpm)
72 (Just dpd)
73 Nothing
74 Nothing
75 Nothing
76 Nothing
77 ---------------------------------------------------------------
78 -- | Types Conversions
79 toDocs :: Vector CsvDoc -> [Doc]
80 toDocs v = V.toList
81 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
82 -> Doc nId t s py pm pd abst auth )
83 (V.enumFromN 1 (V.length v'')) v''
84 where
85 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
86 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
87
88 ---------------------------------------------------------------
89 fromDocs :: Vector Doc -> Vector CsvDoc
90 fromDocs docs = V.map fromDocs' docs
91 where
92 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
93
94 ---------------------------------------------------------------
95 -- | Split a document in its context
96 -- TODO adapt the size of the paragraph according to the corpus average
97
98 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
99 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
100 if docSize > 1000
101 then
102 if (mod (round m) docSize) >= 10
103 then
104 splitDoc' splt doc
105 else
106 V.fromList [doc]
107 else
108 V.fromList [doc]
109
110
111 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
112 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
113 where
114 firstDoc = CsvDoc t s py pm pd firstAbstract auth
115 firstAbstract = head' abstracts
116
117 nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
118
119 abstracts = (splitBy $ contextSize) abst
120 head' x = maybe "" identity (head x)
121 tail' x = maybe [""] identity (tailMay x)
122
123 ---------------------------------------------------------------
124 ---------------------------------------------------------------
125 type Mean = Double
126
127 docsSize :: Vector CsvDoc -> Mean
128 docsSize csvDoc = mean ls
129 where
130 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
131
132
133 ---------------------------------------------------------------
134 data CsvDoc = CsvDoc
135 { csv_title :: !Text
136 , csv_source :: !Text
137 , csv_publication_year :: !Int
138 , csv_publication_month :: !Int
139 , csv_publication_day :: !Int
140 , csv_abstract :: !Text
141 , csv_authors :: !Text
142 }
143 deriving (Show)
144
145 instance FromNamedRecord CsvDoc where
146 parseNamedRecord r = CsvDoc <$> r .: "title"
147 <*> r .: "source"
148 <*> r .: "publication_year"
149 <*> r .: "publication_month"
150 <*> r .: "publication_day"
151 <*> r .: "abstract"
152 <*> r .: "authors"
153
154 instance ToNamedRecord CsvDoc where
155 toNamedRecord (CsvDoc t s py pm pd abst aut) =
156 namedRecord [ "title" .= t
157 , "source" .= s
158 , "publication_year" .= py
159 , "publication_month" .= pm
160 , "publication_day" .= pd
161 , "abstract" .= abst
162 , "authors" .= aut
163 ]
164
165
166 csvDecodeOptions :: DecodeOptions
167 csvDecodeOptions = (defaultDecodeOptions
168 {decDelimiter = fromIntegral $ ord '\t'}
169 )
170
171 csvEncodeOptions :: EncodeOptions
172 csvEncodeOptions = ( defaultEncodeOptions
173 {encDelimiter = fromIntegral $ ord '\t'}
174 )
175
176 ------------------------------------------------------------------------
177 ------------------------------------------------------------------------
178 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
179 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
180 <$> snd
181 <$> readCsv fp
182
183 ------------------------------------------------------------------------
184 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
185 readCsv fp = do
186 csvData <- BL.readFile fp
187 case decodeByNameWith csvDecodeOptions csvData of
188 Left e -> panic (pack e)
189 Right csvDocs -> pure csvDocs
190
191
192 readHal :: FilePath -> IO (Header, Vector CsvHal)
193 readHal fp = do
194 csvData <- BL.readFile fp
195 case decodeByNameWith csvDecodeOptions csvData of
196 Left e -> panic (pack e)
197 Right csvDocs -> pure csvDocs
198 ------------------------------------------------------------------------
199 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
200 writeCsv fp (h, vs) = BL.writeFile fp $
201 encodeByNameWith csvEncodeOptions h (V.toList vs)
202
203
204 ------------------------------------------------------------------------
205 -- Hal Format
206 data CsvHal = CsvHal
207 { csvHal_title :: !Text
208 , csvHal_source :: !Text
209 , csvHal_publication_year :: !Integer
210 , csvHal_publication_month :: !Int
211 , csvHal_publication_day :: !Int
212 , csvHal_abstract :: !Text
213 , csvHal_authors :: !Text
214
215 , csvHal_url :: !Text
216 , csvHal_isbn_s :: !Text
217 , csvHal_issue_s :: !Text
218 , csvHal_journalPublisher_s:: !Text
219 , csvHal_language_s :: !Text
220
221 , csvHal_doiId_s :: !Text
222 , csvHal_authId_i :: !Text
223 , csvHal_instStructId_i :: !Text
224 , csvHal_deptStructId_i :: !Text
225 , csvHal_labStructId_i :: !Text
226
227 , csvHal_rteamStructId_i :: !Text
228 , csvHal_docType_s :: !Text
229 }
230 deriving (Show)
231
232 instance FromNamedRecord CsvHal where
233 parseNamedRecord r = CsvHal <$> r .: "title"
234 <*> r .: "source"
235 <*> r .: "publication_year"
236 <*> r .: "publication_month"
237 <*> r .: "publication_day"
238 <*> r .: "abstract"
239 <*> r .: "authors"
240
241 <*> r .: "url"
242 <*> r .: "isbn_s"
243 <*> r .: "issue_s"
244 <*> r .: "journalPublisher_s"
245 <*> r .: "language_s"
246
247 <*> r .: "doiId_s"
248 <*> r .: "authId_i"
249 <*> r .: "instStructId_i"
250 <*> r .: "deptStructId_i"
251 <*> r .: "labStructId_i"
252
253 <*> r .: "rteamStructId_i"
254 <*> r .: "docType_s"
255
256 instance ToNamedRecord CsvHal where
257 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
258 namedRecord [ "title" .= t
259 , "source" .= s
260
261 , "publication_year" .= py
262 , "publication_month" .= pm
263 , "publication_day" .= pd
264
265 , "abstract" .= abst
266 , "authors" .= aut
267
268 , "url" .= url
269 , "isbn_s" .= isbn
270 , "issue_s" .= iss
271 , "journalPublisher_s" .= j
272 , "language_s" .= lang
273
274 , "doiId_s" .= doi
275 , "authId_i" .= auth
276 , "instStructId_i" .= inst
277 , "deptStructId_i" .= dept
278 , "labStructId_i" .= lab
279
280 , "rteamStructId_i" .= team
281 , "docType_s" .= doct
282 ]
283
284 csvHal2doc :: CsvHal -> HyperdataDocument
285 csvHal2doc (CsvHal title source
286 pub_year pub_month pub_day
287 abstract authors
288 url _ _ _ _
289 doi _ inst _ _
290 _ _ ) = HyperdataDocument (Just "CsvHal")
291 (Just doi)
292 (Just url)
293 Nothing
294 Nothing
295 Nothing
296 (Just title)
297 (Just inst)
298 (Just authors)
299 (Just source)
300 (Just abstract)
301 (Just $ pack . show $ jour pub_year pub_month pub_day)
302 (Just $ fromIntegral pub_year)
303 (Just pub_month)
304 (Just pub_day)
305 Nothing
306 Nothing
307 Nothing
308 Nothing
309
310 ------------------------------------------------------------------------
311 parseHal :: FilePath -> IO [HyperdataDocument]
312 parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readHal fp
313 ------------------------------------------------------------------------
314
315