]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers/CSV.hs
[BIN/PHYLO] first draft, needs phylo connection.
[gargantext.git] / src / Gargantext / Text / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Text.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
17
18 module Gargantext.Text.Parsers.CSV where
19
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
22
23 import Control.Applicative
24
25 import Data.Char (ord)
26 import Data.Csv
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length, intercalate)
29 import qualified Data.ByteString.Lazy as BL
30 import Data.Time.Segment (jour)
31
32 import Data.Vector (Vector)
33 import qualified Data.Vector as V
34
35 import Gargantext.Database.Types.Node (HyperdataDocument(..))
36 import Gargantext.Text
37 import Gargantext.Text.Context
38 import Gargantext.Prelude hiding (length)
39
40 ---------------------------------------------------------------
41 headerCsvGargV3 :: Header
42 headerCsvGargV3 = header [ "title"
43 , "source"
44 , "publication_year"
45 , "publication_month"
46 , "publication_day"
47 , "abstract"
48 , "authors"
49 ]
50 ---------------------------------------------------------------
51 data Doc = Doc
52 { d_docId :: !Int
53 , d_title :: !Text
54 , d_source :: !Text
55 , d_publication_year :: !Int
56 , d_publication_month :: !Int
57 , d_publication_day :: !Int
58 , d_abstract :: !Text
59 , d_authors :: !Text
60 }
61 deriving (Show)
62 ---------------------------------------------------------------
63 -- | Doc 2 HyperdataDocument
64 doc2hyperdataDocument :: Doc -> HyperdataDocument
65 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
66 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
67 HyperdataDocument (Just "CSV")
68 (Just . pack . show $ did)
69 Nothing
70 Nothing
71 Nothing
72 Nothing
73 (Just dt)
74 Nothing
75 (Just dau)
76 (Just dab)
77 (Nothing)
78 Nothing
79 (Just dpy)
80 (Just dpm)
81 (Just dpd)
82 Nothing
83 Nothing
84 Nothing
85 Nothing
86 ---------------------------------------------------------------
87 -- | Types Conversions
88 toDocs :: Vector CsvDoc -> [Doc]
89 toDocs v = V.toList
90 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
91 -> Doc nId t s py pm pd abst auth )
92 (V.enumFromN 1 (V.length v'')) v''
93 where
94 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
95 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
96
97 ---------------------------------------------------------------
98 fromDocs :: Vector Doc -> Vector CsvDoc
99 fromDocs docs = V.map fromDocs' docs
100 where
101 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
102
103 ---------------------------------------------------------------
104 -- | Split a document in its context
105 -- TODO adapt the size of the paragraph according to the corpus average
106
107 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
108 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
109 if docSize > 1000
110 then
111 if (mod (round m) docSize) >= 10
112 then
113 splitDoc' splt doc
114 else
115 V.fromList [doc]
116 else
117 V.fromList [doc]
118
119
120 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
121 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
122 where
123 firstDoc = CsvDoc t s py pm pd firstAbstract auth
124 firstAbstract = head' "splitDoc'1" abstracts
125
126 nextDocs = map (\txt -> CsvDoc
127 (head' "splitDoc'2" $ sentences txt)
128 s py pm pd
129 (unsentences $ tail' "splitDoc'1" $ sentences txt)
130 auth
131 ) (tail' "splitDoc'2" abstracts)
132
133 abstracts = (splitBy $ contextSize) abst
134
135 ---------------------------------------------------------------
136 ---------------------------------------------------------------
137 type Mean = Double
138
139 docsSize :: Vector CsvDoc -> Mean
140 docsSize csvDoc = mean ls
141 where
142 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
143
144
145 ---------------------------------------------------------------
146 data CsvDoc = CsvDoc
147 { csv_title :: !Text
148 , csv_source :: !Text
149 , csv_publication_year :: !Int
150 , csv_publication_month :: !Int
151 , csv_publication_day :: !Int
152 , csv_abstract :: !Text
153 , csv_authors :: !Text
154 }
155 deriving (Show)
156
157 instance FromNamedRecord CsvDoc where
158 parseNamedRecord r = CsvDoc <$> r .: "title"
159 <*> r .: "source"
160 <*> r .: "publication_year"
161 <*> r .: "publication_month"
162 <*> r .: "publication_day"
163 <*> r .: "abstract"
164 <*> r .: "authors"
165
166 instance ToNamedRecord CsvDoc where
167 toNamedRecord (CsvDoc t s py pm pd abst aut) =
168 namedRecord [ "title" .= t
169 , "source" .= s
170 , "publication_year" .= py
171 , "publication_month" .= pm
172 , "publication_day" .= pd
173 , "abstract" .= abst
174 , "authors" .= aut
175 ]
176
177
178 csvDecodeOptions :: DecodeOptions
179 csvDecodeOptions = (defaultDecodeOptions
180 {decDelimiter = fromIntegral $ ord '\t'}
181 )
182
183 csvEncodeOptions :: EncodeOptions
184 csvEncodeOptions = ( defaultEncodeOptions
185 {encDelimiter = fromIntegral $ ord '\t'}
186 )
187
188 ------------------------------------------------------------------------
189 ------------------------------------------------------------------------
190 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
191 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
192 <$> snd
193 <$> readCsv fp
194
195 ------------------------------------------------------------------------
196 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
197 readCsv fp = do
198 csvData <- BL.readFile fp
199 case decodeByNameWith csvDecodeOptions csvData of
200 Left e -> panic (pack e)
201 Right csvDocs -> pure csvDocs
202
203
204 readHal :: FilePath -> IO (Header, Vector CsvHal)
205 readHal fp = do
206 csvData <- BL.readFile fp
207 case decodeByNameWith csvDecodeOptions csvData of
208 Left e -> panic (pack e)
209 Right csvDocs -> pure csvDocs
210 ------------------------------------------------------------------------
211 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
212 writeCsv fp (h, vs) = BL.writeFile fp $
213 encodeByNameWith csvEncodeOptions h (V.toList vs)
214
215
216 ------------------------------------------------------------------------
217 -- Hal Format
218 data CsvHal = CsvHal
219 { csvHal_title :: !Text
220 , csvHal_source :: !Text
221 , csvHal_publication_year :: !Integer
222 , csvHal_publication_month :: !Int
223 , csvHal_publication_day :: !Int
224 , csvHal_abstract :: !Text
225 , csvHal_authors :: !Text
226
227 , csvHal_url :: !Text
228 , csvHal_isbn_s :: !Text
229 , csvHal_issue_s :: !Text
230 , csvHal_journalPublisher_s:: !Text
231 , csvHal_language_s :: !Text
232
233 , csvHal_doiId_s :: !Text
234 , csvHal_authId_i :: !Text
235 , csvHal_instStructId_i :: !Text
236 , csvHal_deptStructId_i :: !Text
237 , csvHal_labStructId_i :: !Text
238
239 , csvHal_rteamStructId_i :: !Text
240 , csvHal_docType_s :: !Text
241 }
242 deriving (Show)
243
244 instance FromNamedRecord CsvHal where
245 parseNamedRecord r = CsvHal <$> r .: "title"
246 <*> r .: "source"
247 <*> r .: "publication_year"
248 <*> r .: "publication_month"
249 <*> r .: "publication_day"
250 <*> r .: "abstract"
251 <*> r .: "authors"
252
253 <*> r .: "url"
254 <*> r .: "isbn_s"
255 <*> r .: "issue_s"
256 <*> r .: "journalPublisher_s"
257 <*> r .: "language_s"
258
259 <*> r .: "doiId_s"
260 <*> r .: "authId_i"
261 <*> r .: "instStructId_i"
262 <*> r .: "deptStructId_i"
263 <*> r .: "labStructId_i"
264
265 <*> r .: "rteamStructId_i"
266 <*> r .: "docType_s"
267
268 instance ToNamedRecord CsvHal where
269 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
270 namedRecord [ "title" .= t
271 , "source" .= s
272
273 , "publication_year" .= py
274 , "publication_month" .= pm
275 , "publication_day" .= pd
276
277 , "abstract" .= abst
278 , "authors" .= aut
279
280 , "url" .= url
281 , "isbn_s" .= isbn
282 , "issue_s" .= iss
283 , "journalPublisher_s" .= j
284 , "language_s" .= lang
285
286 , "doiId_s" .= doi
287 , "authId_i" .= auth
288 , "instStructId_i" .= inst
289 , "deptStructId_i" .= dept
290 , "labStructId_i" .= lab
291
292 , "rteamStructId_i" .= team
293 , "docType_s" .= doct
294 ]
295
296 csvHal2doc :: CsvHal -> HyperdataDocument
297 csvHal2doc (CsvHal title source
298 pub_year pub_month pub_day
299 abstract authors
300 url _ _ _ _
301 doi _ inst _ _
302 _ _ ) = HyperdataDocument (Just "CsvHal")
303 (Just doi)
304 (Just url)
305 Nothing
306 Nothing
307 Nothing
308 (Just title)
309 (Just authors)
310 (Just inst)
311 (Just source)
312 (Just abstract)
313 (Just $ pack . show $ jour pub_year pub_month pub_day)
314 (Just $ fromIntegral pub_year)
315 (Just pub_month)
316 (Just pub_day)
317 Nothing
318 Nothing
319 Nothing
320 Nothing
321
322 ------------------------------------------------------------------------
323 parseHal :: FilePath -> IO [HyperdataDocument]
324 parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readHal fp
325 ------------------------------------------------------------------------
326
327