]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers/CSV.hs
[PHYLO][API] Get implemented.
[gargantext.git] / src / Gargantext / Text / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Text.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
17
18 module Gargantext.Text.Parsers.CSV where
19
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
22
23 import Control.Applicative
24
25 import Data.Char (ord)
26 import Data.Csv
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length, intercalate)
29 import qualified Data.ByteString.Lazy as BL
30 import Data.Time.Segment (jour)
31
32 import Data.Vector (Vector)
33 import qualified Data.Vector as V
34 import Safe (tailMay)
35
36 import Gargantext.Database.Types.Node (HyperdataDocument(..))
37 import Gargantext.Text
38 import Gargantext.Text.Context
39 import Gargantext.Prelude hiding (length)
40
41 ---------------------------------------------------------------
42 headerCsvGargV3 :: Header
43 headerCsvGargV3 = header [ "title"
44 , "source"
45 , "publication_year"
46 , "publication_month"
47 , "publication_day"
48 , "abstract"
49 , "authors"
50 ]
51 ---------------------------------------------------------------
52 data Doc = Doc
53 { d_docId :: !Int
54 , d_title :: !Text
55 , d_source :: !Text
56 , d_publication_year :: !Int
57 , d_publication_month :: !Int
58 , d_publication_day :: !Int
59 , d_abstract :: !Text
60 , d_authors :: !Text
61 }
62 deriving (Show)
63 ---------------------------------------------------------------
64 -- | Doc 2 HyperdataDocument
65 doc2hyperdataDocument :: Doc -> HyperdataDocument
66 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
67 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
68 HyperdataDocument (Just "CSV")
69 (Just . pack . show $ did)
70 Nothing
71 Nothing
72 Nothing
73 Nothing
74 (Just dt)
75 Nothing
76 (Just dau)
77 (Just dab)
78 (Nothing)
79 Nothing
80 (Just dpy)
81 (Just dpm)
82 (Just dpd)
83 Nothing
84 Nothing
85 Nothing
86 Nothing
87 ---------------------------------------------------------------
88 -- | Types Conversions
89 toDocs :: Vector CsvDoc -> [Doc]
90 toDocs v = V.toList
91 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
92 -> Doc nId t s py pm pd abst auth )
93 (V.enumFromN 1 (V.length v'')) v''
94 where
95 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
96 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
97
98 ---------------------------------------------------------------
99 fromDocs :: Vector Doc -> Vector CsvDoc
100 fromDocs docs = V.map fromDocs' docs
101 where
102 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
103
104 ---------------------------------------------------------------
105 -- | Split a document in its context
106 -- TODO adapt the size of the paragraph according to the corpus average
107
108 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
109 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
110 if docSize > 1000
111 then
112 if (mod (round m) docSize) >= 10
113 then
114 splitDoc' splt doc
115 else
116 V.fromList [doc]
117 else
118 V.fromList [doc]
119
120
121 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
122 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
123 where
124 firstDoc = CsvDoc t s py pm pd firstAbstract auth
125 firstAbstract = head' "splitDoc'1" abstracts
126
127 nextDocs = map (\txt -> CsvDoc (head' "splitDoc'2" $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
128
129 abstracts = (splitBy $ contextSize) abst
130 tail' x = maybe [""] identity (tailMay x)
131
132 ---------------------------------------------------------------
133 ---------------------------------------------------------------
134 type Mean = Double
135
136 docsSize :: Vector CsvDoc -> Mean
137 docsSize csvDoc = mean ls
138 where
139 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
140
141
142 ---------------------------------------------------------------
143 data CsvDoc = CsvDoc
144 { csv_title :: !Text
145 , csv_source :: !Text
146 , csv_publication_year :: !Int
147 , csv_publication_month :: !Int
148 , csv_publication_day :: !Int
149 , csv_abstract :: !Text
150 , csv_authors :: !Text
151 }
152 deriving (Show)
153
154 instance FromNamedRecord CsvDoc where
155 parseNamedRecord r = CsvDoc <$> r .: "title"
156 <*> r .: "source"
157 <*> r .: "publication_year"
158 <*> r .: "publication_month"
159 <*> r .: "publication_day"
160 <*> r .: "abstract"
161 <*> r .: "authors"
162
163 instance ToNamedRecord CsvDoc where
164 toNamedRecord (CsvDoc t s py pm pd abst aut) =
165 namedRecord [ "title" .= t
166 , "source" .= s
167 , "publication_year" .= py
168 , "publication_month" .= pm
169 , "publication_day" .= pd
170 , "abstract" .= abst
171 , "authors" .= aut
172 ]
173
174
175 csvDecodeOptions :: DecodeOptions
176 csvDecodeOptions = (defaultDecodeOptions
177 {decDelimiter = fromIntegral $ ord '\t'}
178 )
179
180 csvEncodeOptions :: EncodeOptions
181 csvEncodeOptions = ( defaultEncodeOptions
182 {encDelimiter = fromIntegral $ ord '\t'}
183 )
184
185 ------------------------------------------------------------------------
186 ------------------------------------------------------------------------
187 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
188 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
189 <$> snd
190 <$> readCsv fp
191
192 ------------------------------------------------------------------------
193 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
194 readCsv fp = do
195 csvData <- BL.readFile fp
196 case decodeByNameWith csvDecodeOptions csvData of
197 Left e -> panic (pack e)
198 Right csvDocs -> pure csvDocs
199
200
201 readHal :: FilePath -> IO (Header, Vector CsvHal)
202 readHal fp = do
203 csvData <- BL.readFile fp
204 case decodeByNameWith csvDecodeOptions csvData of
205 Left e -> panic (pack e)
206 Right csvDocs -> pure csvDocs
207 ------------------------------------------------------------------------
208 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
209 writeCsv fp (h, vs) = BL.writeFile fp $
210 encodeByNameWith csvEncodeOptions h (V.toList vs)
211
212
213 ------------------------------------------------------------------------
214 -- Hal Format
215 data CsvHal = CsvHal
216 { csvHal_title :: !Text
217 , csvHal_source :: !Text
218 , csvHal_publication_year :: !Integer
219 , csvHal_publication_month :: !Int
220 , csvHal_publication_day :: !Int
221 , csvHal_abstract :: !Text
222 , csvHal_authors :: !Text
223
224 , csvHal_url :: !Text
225 , csvHal_isbn_s :: !Text
226 , csvHal_issue_s :: !Text
227 , csvHal_journalPublisher_s:: !Text
228 , csvHal_language_s :: !Text
229
230 , csvHal_doiId_s :: !Text
231 , csvHal_authId_i :: !Text
232 , csvHal_instStructId_i :: !Text
233 , csvHal_deptStructId_i :: !Text
234 , csvHal_labStructId_i :: !Text
235
236 , csvHal_rteamStructId_i :: !Text
237 , csvHal_docType_s :: !Text
238 }
239 deriving (Show)
240
241 instance FromNamedRecord CsvHal where
242 parseNamedRecord r = CsvHal <$> r .: "title"
243 <*> r .: "source"
244 <*> r .: "publication_year"
245 <*> r .: "publication_month"
246 <*> r .: "publication_day"
247 <*> r .: "abstract"
248 <*> r .: "authors"
249
250 <*> r .: "url"
251 <*> r .: "isbn_s"
252 <*> r .: "issue_s"
253 <*> r .: "journalPublisher_s"
254 <*> r .: "language_s"
255
256 <*> r .: "doiId_s"
257 <*> r .: "authId_i"
258 <*> r .: "instStructId_i"
259 <*> r .: "deptStructId_i"
260 <*> r .: "labStructId_i"
261
262 <*> r .: "rteamStructId_i"
263 <*> r .: "docType_s"
264
265 instance ToNamedRecord CsvHal where
266 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
267 namedRecord [ "title" .= t
268 , "source" .= s
269
270 , "publication_year" .= py
271 , "publication_month" .= pm
272 , "publication_day" .= pd
273
274 , "abstract" .= abst
275 , "authors" .= aut
276
277 , "url" .= url
278 , "isbn_s" .= isbn
279 , "issue_s" .= iss
280 , "journalPublisher_s" .= j
281 , "language_s" .= lang
282
283 , "doiId_s" .= doi
284 , "authId_i" .= auth
285 , "instStructId_i" .= inst
286 , "deptStructId_i" .= dept
287 , "labStructId_i" .= lab
288
289 , "rteamStructId_i" .= team
290 , "docType_s" .= doct
291 ]
292
293 csvHal2doc :: CsvHal -> HyperdataDocument
294 csvHal2doc (CsvHal title source
295 pub_year pub_month pub_day
296 abstract authors
297 url _ _ _ _
298 doi _ inst _ _
299 _ _ ) = HyperdataDocument (Just "CsvHal")
300 (Just doi)
301 (Just url)
302 Nothing
303 Nothing
304 Nothing
305 (Just title)
306 (Just authors)
307 (Just inst)
308 (Just source)
309 (Just abstract)
310 (Just $ pack . show $ jour pub_year pub_month pub_day)
311 (Just $ fromIntegral pub_year)
312 (Just pub_month)
313 (Just pub_day)
314 Nothing
315 Nothing
316 Nothing
317 Nothing
318
319 ------------------------------------------------------------------------
320 parseHal :: FilePath -> IO [HyperdataDocument]
321 parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readHal fp
322 ------------------------------------------------------------------------
323
324