]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers/CSV.hs
Merge branch 'dev-ngrams-repo' of ssh://delanoe.org/haskell-gargantext into dev-ngram...
[gargantext.git] / src / Gargantext / Text / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Text.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
17
18 module Gargantext.Text.Parsers.CSV where
19
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
22
23 import Control.Applicative
24
25 import Data.Char (ord)
26 import Data.Csv
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length, intercalate)
29 import qualified Data.ByteString.Lazy as BL
30 import Data.Time.Segment (jour)
31
32 import Data.Vector (Vector)
33 import qualified Data.Vector as V
34 import Safe (tailMay)
35
36 import Gargantext.Database.Types.Node (HyperdataDocument(..))
37 import Gargantext.Text
38 import Gargantext.Text.Context
39 import Gargantext.Prelude hiding (length)
40
41 ---------------------------------------------------------------
42 headerCsvGargV3 :: Header
43 headerCsvGargV3 = header [ "title"
44 , "source"
45 , "publication_year"
46 , "publication_month"
47 , "publication_day"
48 , "abstract"
49 , "authors"
50 ]
51 ---------------------------------------------------------------
52 data Doc = Doc
53 { d_docId :: !Int
54 , d_title :: !Text
55 , d_source :: !Text
56 , d_publication_year :: !Int
57 , d_publication_month :: !Int
58 , d_publication_day :: !Int
59 , d_abstract :: !Text
60 , d_authors :: !Text
61 }
62 deriving (Show)
63 ---------------------------------------------------------------
64 -- | Doc 2 HyperdataDocument
65 doc2hyperdataDocument :: Doc -> HyperdataDocument
66 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
67 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
68 HyperdataDocument (Just "CSV")
69 (Just . pack . show $ did)
70 Nothing
71 Nothing
72 Nothing
73 Nothing
74 (Just dt)
75 Nothing
76 (Just dau)
77 (Just dab)
78 (Nothing)
79 Nothing
80 (Just dpy)
81 (Just dpm)
82 (Just dpd)
83 Nothing
84 Nothing
85 Nothing
86 Nothing
87 ---------------------------------------------------------------
88 -- | Types Conversions
89 toDocs :: Vector CsvDoc -> [Doc]
90 toDocs v = V.toList
91 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
92 -> Doc nId t s py pm pd abst auth )
93 (V.enumFromN 1 (V.length v'')) v''
94 where
95 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
96 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
97
98 ---------------------------------------------------------------
99 fromDocs :: Vector Doc -> Vector CsvDoc
100 fromDocs docs = V.map fromDocs' docs
101 where
102 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
103
104 ---------------------------------------------------------------
105 -- | Split a document in its context
106 -- TODO adapt the size of the paragraph according to the corpus average
107
108 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
109 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
110 if docSize > 1000
111 then
112 if (mod (round m) docSize) >= 10
113 then
114 splitDoc' splt doc
115 else
116 V.fromList [doc]
117 else
118 V.fromList [doc]
119
120
121 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
122 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
123 where
124 firstDoc = CsvDoc t s py pm pd firstAbstract auth
125 firstAbstract = head' abstracts
126
127 nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
128
129 abstracts = (splitBy $ contextSize) abst
130 head' x = maybe "" identity (head x)
131 tail' x = maybe [""] identity (tailMay x)
132
133 ---------------------------------------------------------------
134 ---------------------------------------------------------------
135 type Mean = Double
136
137 docsSize :: Vector CsvDoc -> Mean
138 docsSize csvDoc = mean ls
139 where
140 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
141
142
143 ---------------------------------------------------------------
144 data CsvDoc = CsvDoc
145 { csv_title :: !Text
146 , csv_source :: !Text
147 , csv_publication_year :: !Int
148 , csv_publication_month :: !Int
149 , csv_publication_day :: !Int
150 , csv_abstract :: !Text
151 , csv_authors :: !Text
152 }
153 deriving (Show)
154
155 instance FromNamedRecord CsvDoc where
156 parseNamedRecord r = CsvDoc <$> r .: "title"
157 <*> r .: "source"
158 <*> r .: "publication_year"
159 <*> r .: "publication_month"
160 <*> r .: "publication_day"
161 <*> r .: "abstract"
162 <*> r .: "authors"
163
164 instance ToNamedRecord CsvDoc where
165 toNamedRecord (CsvDoc t s py pm pd abst aut) =
166 namedRecord [ "title" .= t
167 , "source" .= s
168 , "publication_year" .= py
169 , "publication_month" .= pm
170 , "publication_day" .= pd
171 , "abstract" .= abst
172 , "authors" .= aut
173 ]
174
175
176 csvDecodeOptions :: DecodeOptions
177 csvDecodeOptions = (defaultDecodeOptions
178 {decDelimiter = fromIntegral $ ord '\t'}
179 )
180
181 csvEncodeOptions :: EncodeOptions
182 csvEncodeOptions = ( defaultEncodeOptions
183 {encDelimiter = fromIntegral $ ord '\t'}
184 )
185
186 ------------------------------------------------------------------------
187 ------------------------------------------------------------------------
188 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
189 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
190 <$> snd
191 <$> readCsv fp
192
193 ------------------------------------------------------------------------
194 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
195 readCsv fp = do
196 csvData <- BL.readFile fp
197 case decodeByNameWith csvDecodeOptions csvData of
198 Left e -> panic (pack e)
199 Right csvDocs -> pure csvDocs
200
201
202 readHal :: FilePath -> IO (Header, Vector CsvHal)
203 readHal fp = do
204 csvData <- BL.readFile fp
205 case decodeByNameWith csvDecodeOptions csvData of
206 Left e -> panic (pack e)
207 Right csvDocs -> pure csvDocs
208 ------------------------------------------------------------------------
209 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
210 writeCsv fp (h, vs) = BL.writeFile fp $
211 encodeByNameWith csvEncodeOptions h (V.toList vs)
212
213
214 ------------------------------------------------------------------------
215 -- Hal Format
216 data CsvHal = CsvHal
217 { csvHal_title :: !Text
218 , csvHal_source :: !Text
219 , csvHal_publication_year :: !Integer
220 , csvHal_publication_month :: !Int
221 , csvHal_publication_day :: !Int
222 , csvHal_abstract :: !Text
223 , csvHal_authors :: !Text
224
225 , csvHal_url :: !Text
226 , csvHal_isbn_s :: !Text
227 , csvHal_issue_s :: !Text
228 , csvHal_journalPublisher_s:: !Text
229 , csvHal_language_s :: !Text
230
231 , csvHal_doiId_s :: !Text
232 , csvHal_authId_i :: !Text
233 , csvHal_instStructId_i :: !Text
234 , csvHal_deptStructId_i :: !Text
235 , csvHal_labStructId_i :: !Text
236
237 , csvHal_rteamStructId_i :: !Text
238 , csvHal_docType_s :: !Text
239 }
240 deriving (Show)
241
242 instance FromNamedRecord CsvHal where
243 parseNamedRecord r = CsvHal <$> r .: "title"
244 <*> r .: "source"
245 <*> r .: "publication_year"
246 <*> r .: "publication_month"
247 <*> r .: "publication_day"
248 <*> r .: "abstract"
249 <*> r .: "authors"
250
251 <*> r .: "url"
252 <*> r .: "isbn_s"
253 <*> r .: "issue_s"
254 <*> r .: "journalPublisher_s"
255 <*> r .: "language_s"
256
257 <*> r .: "doiId_s"
258 <*> r .: "authId_i"
259 <*> r .: "instStructId_i"
260 <*> r .: "deptStructId_i"
261 <*> r .: "labStructId_i"
262
263 <*> r .: "rteamStructId_i"
264 <*> r .: "docType_s"
265
266 instance ToNamedRecord CsvHal where
267 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
268 namedRecord [ "title" .= t
269 , "source" .= s
270
271 , "publication_year" .= py
272 , "publication_month" .= pm
273 , "publication_day" .= pd
274
275 , "abstract" .= abst
276 , "authors" .= aut
277
278 , "url" .= url
279 , "isbn_s" .= isbn
280 , "issue_s" .= iss
281 , "journalPublisher_s" .= j
282 , "language_s" .= lang
283
284 , "doiId_s" .= doi
285 , "authId_i" .= auth
286 , "instStructId_i" .= inst
287 , "deptStructId_i" .= dept
288 , "labStructId_i" .= lab
289
290 , "rteamStructId_i" .= team
291 , "docType_s" .= doct
292 ]
293
294 csvHal2doc :: CsvHal -> HyperdataDocument
295 csvHal2doc (CsvHal title source
296 pub_year pub_month pub_day
297 abstract authors
298 url _ _ _ _
299 doi _ inst _ _
300 _ _ ) = HyperdataDocument (Just "CsvHal")
301 (Just doi)
302 (Just url)
303 Nothing
304 Nothing
305 Nothing
306 (Just title)
307 (Just authors)
308 (Just inst)
309 (Just source)
310 (Just abstract)
311 (Just $ pack . show $ jour pub_year pub_month pub_day)
312 (Just $ fromIntegral pub_year)
313 (Just pub_month)
314 (Just pub_day)
315 Nothing
316 Nothing
317 Nothing
318 Nothing
319
320 ------------------------------------------------------------------------
321 parseHal :: FilePath -> IO [HyperdataDocument]
322 parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readHal fp
323 ------------------------------------------------------------------------
324
325