]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers/CSV.hs
[FIX] build ok.
[gargantext.git] / src / Gargantext / Text / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Text.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
17
18 module Gargantext.Text.Parsers.CSV where
19
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
22
23 import Control.Applicative
24
25 import Data.Char (ord)
26 import Data.Csv
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length, intercalate)
29 import qualified Data.ByteString.Lazy as BL
30 import Data.Time.Segment (jour)
31
32 import Data.Vector (Vector)
33 import qualified Data.Vector as V
34 import Safe (tailMay)
35
36 import Gargantext.Database.Types.Node (HyperdataDocument(..))
37 import Gargantext.Text
38 import Gargantext.Text.Context
39 import Gargantext.Prelude hiding (length)
40
41 ---------------------------------------------------------------
42 data Doc = Doc
43 { d_docId :: !Int
44 , d_title :: !Text
45 , d_source :: !Text
46 , d_publication_year :: !Int
47 , d_publication_month :: !Int
48 , d_publication_day :: !Int
49 , d_abstract :: !Text
50 , d_authors :: !Text
51 }
52 deriving (Show)
53 ---------------------------------------------------------------
54 -- | Doc 2 HyperdataDocument
55 doc2hyperdataDocument :: Doc -> HyperdataDocument
56 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
57 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
58 HyperdataDocument (Just "CSV")
59 (Just . pack . show $ did)
60 Nothing
61 Nothing
62 Nothing
63 Nothing
64 (Just dt)
65 (Just dau)
66 (Just dab)
67 (Nothing)
68 Nothing
69 (Just dpy)
70 (Just dpm)
71 (Just dpd)
72 Nothing
73 Nothing
74 Nothing
75 Nothing
76 ---------------------------------------------------------------
77 -- | Types Conversions
78 toDocs :: Vector CsvDoc -> [Doc]
79 toDocs v = V.toList
80 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
81 -> Doc nId t s py pm pd abst auth )
82 (V.enumFromN 1 (V.length v'')) v''
83 where
84 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
85 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
86
87 ---------------------------------------------------------------
88 fromDocs :: Vector Doc -> Vector CsvDoc
89 fromDocs docs = V.map fromDocs' docs
90 where
91 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
92
93 ---------------------------------------------------------------
94 -- | Split a document in its context
95 -- TODO adapt the size of the paragraph according to the corpus average
96
97 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
98 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
99 if docSize > 1000
100 then
101 if (mod (round m) docSize) >= 10
102 then
103 splitDoc' splt doc
104 else
105 V.fromList [doc]
106 else
107 V.fromList [doc]
108
109
110 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
111 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
112 where
113 firstDoc = CsvDoc t s py pm pd firstAbstract auth
114 firstAbstract = head' abstracts
115
116 nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
117
118 abstracts = (splitBy $ contextSize) abst
119 head' x = maybe "" identity (head x)
120 tail' x = maybe [""] identity (tailMay x)
121
122 ---------------------------------------------------------------
123 ---------------------------------------------------------------
124 type Mean = Double
125
126 docsSize :: Vector CsvDoc -> Mean
127 docsSize csvDoc = mean ls
128 where
129 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
130
131
132 ---------------------------------------------------------------
133 data CsvDoc = CsvDoc
134 { csv_title :: !Text
135 , csv_source :: !Text
136 , csv_publication_year :: !Int
137 , csv_publication_month :: !Int
138 , csv_publication_day :: !Int
139 , csv_abstract :: !Text
140 , csv_authors :: !Text
141 }
142 deriving (Show)
143
144 instance FromNamedRecord CsvDoc where
145 parseNamedRecord r = CsvDoc <$> r .: "title"
146 <*> r .: "source"
147 <*> r .: "publication_year"
148 <*> r .: "publication_month"
149 <*> r .: "publication_day"
150 <*> r .: "abstract"
151 <*> r .: "authors"
152
153 instance ToNamedRecord CsvDoc where
154 toNamedRecord (CsvDoc t s py pm pd abst aut) =
155 namedRecord [ "title" .= t
156 , "source" .= s
157 , "publication_year" .= py
158 , "publication_month" .= pm
159 , "publication_day" .= pd
160 , "abstract" .= abst
161 , "authors" .= aut
162 ]
163
164
165 csvDecodeOptions :: DecodeOptions
166 csvDecodeOptions = (defaultDecodeOptions
167 {decDelimiter = fromIntegral $ ord '\t'}
168 )
169
170 csvEncodeOptions :: EncodeOptions
171 csvEncodeOptions = ( defaultEncodeOptions
172 {encDelimiter = fromIntegral $ ord '\t'}
173 )
174
175 ------------------------------------------------------------------------
176 ------------------------------------------------------------------------
177 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
178 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
179 <$> snd
180 <$> readCsv fp
181
182 ------------------------------------------------------------------------
183 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
184 readCsv fp = do
185 csvData <- BL.readFile fp
186 case decodeByNameWith csvDecodeOptions csvData of
187 Left e -> panic (pack e)
188 Right csvDocs -> pure csvDocs
189
190
191 readHal :: FilePath -> IO (Header, Vector CsvHal)
192 readHal fp = do
193 csvData <- BL.readFile fp
194 case decodeByNameWith csvDecodeOptions csvData of
195 Left e -> panic (pack e)
196 Right csvDocs -> pure csvDocs
197 ------------------------------------------------------------------------
198 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
199 writeCsv fp (h, vs) = BL.writeFile fp $
200 encodeByNameWith csvEncodeOptions h (V.toList vs)
201
202
203 ------------------------------------------------------------------------
204 -- Hal Format
205 data CsvHal = CsvHal
206 { csvHal_title :: !Text
207 , csvHal_source :: !Text
208 , csvHal_publication_year :: !Integer
209 , csvHal_publication_month :: !Int
210 , csvHal_publication_day :: !Int
211 , csvHal_abstract :: !Text
212 , csvHal_authors :: !Text
213
214 , csvHal_url :: !Text
215 , csvHal_isbn_s :: !Text
216 , csvHal_issue_s :: !Text
217 , csvHal_journalPublisher_s:: !Text
218 , csvHal_language_s :: !Text
219
220 , csvHal_doiId_s :: !Text
221 , csvHal_authId_i :: !Text
222 , csvHal_instStructId_i :: !Text
223 , csvHal_deptStructId_i :: !Text
224 , csvHal_labStructId_i :: !Text
225
226 , csvHal_rteamStructId_i :: !Text
227 , csvHal_docType_s :: !Text
228 }
229 deriving (Show)
230
231 instance FromNamedRecord CsvHal where
232 parseNamedRecord r = CsvHal <$> r .: "title"
233 <*> r .: "source"
234 <*> r .: "publication_year"
235 <*> r .: "publication_month"
236 <*> r .: "publication_day"
237 <*> r .: "abstract"
238 <*> r .: "authors"
239
240 <*> r .: "url"
241 <*> r .: "isbn_s"
242 <*> r .: "issue_s"
243 <*> r .: "journalPublisher_s"
244 <*> r .: "language_s"
245
246 <*> r .: "doiId_s"
247 <*> r .: "authId_i"
248 <*> r .: "instStructId_i"
249 <*> r .: "deptStructId_i"
250 <*> r .: "labStructId_i"
251
252 <*> r .: "rteamStructId_i"
253 <*> r .: "docType_s"
254
255 instance ToNamedRecord CsvHal where
256 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
257 namedRecord [ "title" .= t
258 , "source" .= s
259
260 , "publication_year" .= py
261 , "publication_month" .= pm
262 , "publication_day" .= pd
263
264 , "abstract" .= abst
265 , "authors" .= aut
266
267 , "url" .= url
268 , "isbn_s" .= isbn
269 , "issue_s" .= iss
270 , "journalPublisher_s" .= j
271 , "language_s" .= lang
272
273 , "doiId_s" .= doi
274 , "authId_i" .= auth
275 , "instStructId_i" .= inst
276 , "deptStructId_i" .= dept
277 , "labStructId_i" .= lab
278
279 , "rteamStructId_i" .= team
280 , "docType_s" .= doct
281 ]
282
283 csvHal2doc :: CsvHal -> HyperdataDocument
284 csvHal2doc (CsvHal title source
285 pub_year pub_month pub_day
286 abstract authors
287 url _ _ _ _
288 doi _ _ _ _
289 _ _ ) = HyperdataDocument (Just "CsvHal")
290 (Just doi)
291 (Just url)
292 Nothing
293 Nothing
294 Nothing
295 (Just title)
296 (Just authors)
297 (Just source)
298 (Just abstract)
299 (Just $ pack . show $ jour pub_year pub_month pub_day)
300 (Just $ fromIntegral pub_year)
301 (Just pub_month)
302 (Just pub_day)
303 Nothing
304 Nothing
305 Nothing
306 Nothing
307
308 ------------------------------------------------------------------------
309 parseHal :: FilePath -> IO [HyperdataDocument]
310 parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readHal fp
311 ------------------------------------------------------------------------
312
313