]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers/CSV.hs
[FIX] cleaning Database.Node + clearer error message.
[gargantext.git] / src / Gargantext / Text / Parsers / CSV.hs
1 {-|
2 Module : Gargantext.Text.Parsers.CSV
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 CSV parser for Gargantext corpus files.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
17
18 module Gargantext.Text.Parsers.CSV where
19
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
22
23 import Control.Applicative
24
25 import Data.Char (ord)
26 import Data.Csv
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length, intercalate)
29 import qualified Data.ByteString.Lazy as BL
30
31 import Data.Vector (Vector)
32 import qualified Data.Vector as V
33 import Safe (tailMay)
34
35 import Gargantext.Core.Types.Node (HyperdataDocument(..))
36 import Gargantext.Text
37 import Gargantext.Text.Context
38 import Gargantext.Prelude hiding (length)
39
40 ---------------------------------------------------------------
41 data Doc = Doc
42 { d_docId :: !Int
43 , d_title :: !Text
44 , d_source :: !Text
45 , d_publication_year :: !Int
46 , d_publication_month :: !Int
47 , d_publication_day :: !Int
48 , d_abstract :: !Text
49 , d_authors :: !Text
50 }
51 deriving (Show)
52 ---------------------------------------------------------------
53 -- | Doc 2 HyperdataDocument
54 doc2hyperdataDocument :: Doc -> HyperdataDocument
55 doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
56 HyperdataDocument (Just "CSV")
57 (Just did)
58 Nothing
59 Nothing
60 (Just dt)
61 (Just dau)
62 (Just ds)
63 (Just dab)
64 (Nothing)
65 Nothing
66 (Just dpy)
67 (Just dpm)
68 (Just dpd)
69 Nothing
70 Nothing
71 Nothing
72 ---------------------------------------------------------------
73 -- | Types Conversions
74 toDocs :: Vector CsvDoc -> [Doc]
75 toDocs v = V.toList
76 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
77 -> Doc nId t s py pm pd abst auth )
78 (V.enumFromN 1 (V.length v'')) v''
79 where
80 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
81 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
82
83 ---------------------------------------------------------------
84 fromDocs :: Vector Doc -> Vector CsvDoc
85 fromDocs docs = V.map fromDocs' docs
86 where
87 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
88
89 ---------------------------------------------------------------
90 -- | Split a document in its context
91 -- TODO adapt the size of the paragraph according to the corpus average
92
93 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
94 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
95 if docSize > 1000
96 then
97 if (mod (round m) docSize) >= 10
98 then
99 splitDoc' splt doc
100 else
101 V.fromList [doc]
102 else
103 V.fromList [doc]
104
105
106 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
107 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
108 where
109 firstDoc = CsvDoc t s py pm pd firstAbstract auth
110 firstAbstract = head' abstracts
111
112 nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
113
114 abstracts = (splitBy $ contextSize) abst
115 head' x = maybe "" identity (head x)
116 tail' x = maybe [""] identity (tailMay x)
117
118 ---------------------------------------------------------------
119 ---------------------------------------------------------------
120 type Mean = Double
121
122 docsSize :: Vector CsvDoc -> Mean
123 docsSize csvDoc = mean ls
124 where
125 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
126
127
128 ---------------------------------------------------------------
129 data CsvDoc = CsvDoc
130 { csv_title :: !Text
131 , csv_source :: !Text
132 , csv_publication_year :: !Int
133 , csv_publication_month :: !Int
134 , csv_publication_day :: !Int
135 , csv_abstract :: !Text
136 , csv_authors :: !Text
137 }
138 deriving (Show)
139
140 instance FromNamedRecord CsvDoc where
141 parseNamedRecord r = CsvDoc <$> r .: "title"
142 <*> r .: "source"
143 <*> r .: "publication_year"
144 <*> r .: "publication_month"
145 <*> r .: "publication_day"
146 <*> r .: "abstract"
147 <*> r .: "authors"
148
149 instance ToNamedRecord CsvDoc where
150 toNamedRecord (CsvDoc t s py pm pd abst aut) =
151 namedRecord [ "title" .= t
152 , "source" .= s
153 , "publication_year" .= py
154 , "publication_month" .= pm
155 , "publication_day" .= pd
156 , "abstract" .= abst
157 , "authors" .= aut
158 ]
159
160
161 csvDecodeOptions :: DecodeOptions
162 csvDecodeOptions = (defaultDecodeOptions
163 {decDelimiter = fromIntegral $ ord '\t'}
164 )
165
166 csvEncodeOptions :: EncodeOptions
167 csvEncodeOptions = ( defaultEncodeOptions
168 {encDelimiter = fromIntegral $ ord '\t'}
169 )
170
171 ------------------------------------------------------------------------
172 ------------------------------------------------------------------------
173 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
174 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
175 <$> snd
176 <$> readCsv fp
177
178 ------------------------------------------------------------------------
179 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
180 readCsv fp = do
181 csvData <- BL.readFile fp
182 case decodeByNameWith csvDecodeOptions csvData of
183 Left e -> panic (pack e)
184 Right csvDocs -> pure csvDocs
185
186
187 readHal :: FilePath -> IO (Header, Vector CsvHal)
188 readHal fp = do
189 csvData <- BL.readFile fp
190 case decodeByNameWith csvDecodeOptions csvData of
191 Left e -> panic (pack e)
192 Right csvDocs -> pure csvDocs
193 ------------------------------------------------------------------------
194
195
196 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
197 writeCsv fp (h, vs) = BL.writeFile fp $
198 encodeByNameWith csvEncodeOptions h (V.toList vs)
199
200
201 ------------------------------------------------------------------------
202 -- Hal Format
203 data CsvHal = CsvHal
204 { csvHal_title :: !Text
205 , csvHal_source :: !Text
206 , csvHal_publication_year :: !Int
207 , csvHal_publication_month :: !Int
208 , csvHal_publication_day :: !Int
209 , csvHal_abstract :: !Text
210 , csvHal_authors :: !Text
211
212 , csvHal_url :: !Text
213 , csvHal_isbn_s :: !Text
214 , csvHal_issue_s :: !Text
215 , csvHal_journalPublisher_s:: !Text
216 , csvHal_language_s :: !Text
217
218 , csvHal_doiId_s :: !Text
219 , csvHal_authId_i :: !Text
220 , csvHal_instStructId_i :: !Text
221 , csvHal_deptStructId_i :: !Text
222 , csvHal_labStructId_i :: !Text
223
224 , csvHal_rteamStructId_i :: !Text
225 , csvHal_docType_s :: !Text
226 }
227 deriving (Show)
228
229 instance FromNamedRecord CsvHal where
230 parseNamedRecord r = CsvHal <$> r .: "title"
231 <*> r .: "source"
232 <*> r .: "publication_year"
233 <*> r .: "publication_month"
234 <*> r .: "publication_day"
235 <*> r .: "abstract"
236 <*> r .: "authors"
237
238 <*> r .: "url"
239 <*> r .: "isbn_s"
240 <*> r .: "issue_s"
241 <*> r .: "journalPublisher_s"
242 <*> r .: "language_s"
243
244 <*> r .: "doiId_s"
245 <*> r .: "authId_i"
246 <*> r .: "instStructId_i"
247 <*> r .: "deptStructId_i"
248 <*> r .: "labStructId_i"
249
250 <*> r .: "rteamStructId_i"
251 <*> r .: "docType_s"
252
253 instance ToNamedRecord CsvHal where
254 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss jour lang doi auth inst dept lab team doct) =
255 namedRecord [ "title" .= t
256 , "source" .= s
257 , "publication_year" .= py
258 , "publication_month" .= pm
259 , "publication_day" .= pd
260 , "abstract" .= abst
261 , "authors" .= aut
262
263 , "url" .= url
264 , "isbn_s" .= isbn
265 , "issue_s" .= iss
266 , "journalPublisher_s" .= jour
267 , "language_s" .= lang
268
269 , "doiId_s" .= doi
270 , "authId_i" .= auth
271 , "instStructId_i" .= inst
272 , "deptStructId_i" .= dept
273 , "labStructId_i" .= lab
274
275 , "rteamStructId_i" .= team
276 , "docType_s" .= doct
277 ]