2 Module : Gargantext.Text.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
18 module Gargantext.Text.Parsers.CSV where
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
23 import Control.Applicative
25 import Data.Char (ord)
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length, intercalate)
29 import qualified Data.ByteString.Lazy as BL
30 import Data.Time.Segment (jour)
32 import Data.Vector (Vector)
33 import qualified Data.Vector as V
36 import Gargantext.Database.Types.Node (HyperdataDocument(..))
37 import Gargantext.Text
38 import Gargantext.Text.Context
39 import Gargantext.Prelude hiding (length)
41 ---------------------------------------------------------------
42 headerCsvGargV3 :: Header
43 headerCsvGargV3 = header [ "title"
51 ---------------------------------------------------------------
56 , d_publication_year :: !Int
57 , d_publication_month :: !Int
58 , d_publication_day :: !Int
63 ---------------------------------------------------------------
64 -- | Doc 2 HyperdataDocument
65 doc2hyperdataDocument :: Doc -> HyperdataDocument
66 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
67 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
68 HyperdataDocument (Just "CSV")
69 (Just . pack . show $ did)
87 ---------------------------------------------------------------
-- | Convert parsed CSV rows into numbered 'Doc's.
--
-- The rows are first passed through 'splitDoc' once per context listed in
-- @seps@ (paragraphs, then sentences, then characters), so a single row may
-- be replaced by several rows. 'docsSize' is recomputed on the current rows
-- before each pass. The resulting rows are then numbered consecutively,
-- starting at 1, and that number becomes the first field of each 'Doc'.
toDocs :: Vector CsvDoc -> [Doc]
toDocs v = V.toList
         $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
                       -> Doc nId t s py pm pd abst auth)
                     (V.enumFromN 1 (V.length v'')) v''
  where
    -- Strict left fold: every splitting pass is evaluated before the next
    -- one is built on top of it.
    v''  = V.foldl' (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
    seps = V.fromList [Paragraphs 1, Sentences 3, Chars 3]
98 ---------------------------------------------------------------
-- | Turn 'Doc's back into 'CsvDoc' rows: the first 'Doc' field is dropped
-- and the remaining seven fields are copied over in the same order.
fromDocs :: Vector Doc -> Vector CsvDoc
fromDocs = V.map dropFirstField
  where
    dropFirstField (Doc _ title source year month day abstract authors) =
      CsvDoc title source year month day abstract authors
104 ---------------------------------------------------------------
105 -- | Split a document in its context
106 -- TODO adapt the size of the paragraph according to the corpus average
108 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
109 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
112 if (mod (round m) docSize) >= 10
-- | Split one row into several rows by cutting its abstract with 'splitBy'.
--
-- The first chunk becomes the abstract of a row that keeps the original
-- title. Every following chunk becomes a new row whose title is the chunk's
-- first sentence and whose abstract is the rest of its sentences. Source,
-- publication date and authors are copied unchanged into every row.
-- Missing pieces fall back to the empty text.
splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc' contextSize (CsvDoc t s py pm pd abst auth) =
  V.fromList (leadingDoc : followingDocs)
  where
    chunks = splitBy contextSize abst

    leadingDoc = CsvDoc t s py pm pd (firstOr "" chunks) auth

    followingDocs = map chunkToDoc (restOr [""] chunks)

    -- Title = first sentence of the chunk, abstract = remaining sentences.
    chunkToDoc chunk =
      let parts = sentences chunk
       in CsvDoc (firstOr "" parts) s py pm pd (unsentences (restOr [""] parts)) auth

    -- Total head/tail with an explicit fallback value.
    firstOr fallback xs = maybe fallback identity (head xs)
    restOr  fallback xs = maybe fallback identity (tailMay xs)
133 ---------------------------------------------------------------
134 ---------------------------------------------------------------
-- | Average abstract size of a set of rows: 'mean' of the 'length' of each
-- row's 'csv_abstract'.
docsSize :: Vector CsvDoc -> Mean
docsSize csvDoc = mean abstractSizes
  where
    abstractSizes = map (fromIntegral . length . csv_abstract) (V.toList csvDoc)
143 ---------------------------------------------------------------
146 , csv_source :: !Text
147 , csv_publication_year :: !Int
148 , csv_publication_month :: !Int
149 , csv_publication_day :: !Int
150 , csv_abstract :: !Text
151 , csv_authors :: !Text
155 instance FromNamedRecord CsvDoc where
156 parseNamedRecord r = CsvDoc <$> r .: "title"
158 <*> r .: "publication_year"
159 <*> r .: "publication_month"
160 <*> r .: "publication_day"
164 instance ToNamedRecord CsvDoc where
165 toNamedRecord (CsvDoc t s py pm pd abst aut) =
166 namedRecord [ "title" .= t
168 , "publication_year" .= py
169 , "publication_month" .= pm
170 , "publication_day" .= pd
-- | Decoding options used by the readers of this module: the default
-- options with the field delimiter set to the tab character.
csvDecodeOptions :: DecodeOptions
csvDecodeOptions = defaultDecodeOptions { decDelimiter = tab }
  where
    tab = fromIntegral (ord '\t')
-- | Encoding options used by the writers of this module: the default
-- options with the field delimiter set to the tab character.
csvEncodeOptions :: EncodeOptions
csvEncodeOptions = defaultEncodeOptions { encDelimiter = tab }
  where
    tab = fromIntegral (ord '\t')
186 ------------------------------------------------------------------------
187 ------------------------------------------------------------------------
188 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
189 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
193 ------------------------------------------------------------------------
-- | Read the file at @fp@ and decode it by column name, using
-- 'csvDecodeOptions', into 'CsvDoc' rows.
--
-- Returns the header together with the decoded rows. A decoding failure
-- aborts through 'panic' with the decoder's error message.
readCsv :: FilePath -> IO (Header, Vector CsvDoc)
readCsv fp = do
  contents <- BL.readFile fp
  unwrap (decodeByNameWith csvDecodeOptions contents)
  where
    unwrap (Left err)     = panic (pack err)
    unwrap (Right parsed) = pure parsed
-- | Read the file at @fp@ and decode it by column name, using
-- 'csvDecodeOptions', into 'CsvHal' rows.
--
-- Returns the header together with the decoded rows. A decoding failure
-- aborts through 'panic' with the decoder's error message.
readHal :: FilePath -> IO (Header, Vector CsvHal)
readHal fp = do
  contents <- BL.readFile fp
  unwrap (decodeByNameWith csvDecodeOptions contents)
  where
    unwrap (Left err)     = panic (pack err)
    unwrap (Right parsed) = pure parsed
208 ------------------------------------------------------------------------
-- | Encode the rows by column name, using 'csvEncodeOptions' and the given
-- header, and write the result to the file at @fp@.
writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv fp (h, vs) = BL.writeFile fp payload
  where
    payload = encodeByNameWith csvEncodeOptions h (V.toList vs)
214 ------------------------------------------------------------------------
217 { csvHal_title :: !Text
218 , csvHal_source :: !Text
219 , csvHal_publication_year :: !Integer
220 , csvHal_publication_month :: !Int
221 , csvHal_publication_day :: !Int
222 , csvHal_abstract :: !Text
223 , csvHal_authors :: !Text
225 , csvHal_url :: !Text
226 , csvHal_isbn_s :: !Text
227 , csvHal_issue_s :: !Text
228 , csvHal_journalPublisher_s:: !Text
229 , csvHal_language_s :: !Text
231 , csvHal_doiId_s :: !Text
232 , csvHal_authId_i :: !Text
233 , csvHal_instStructId_i :: !Text
234 , csvHal_deptStructId_i :: !Text
235 , csvHal_labStructId_i :: !Text
237 , csvHal_rteamStructId_i :: !Text
238 , csvHal_docType_s :: !Text
242 instance FromNamedRecord CsvHal where
243 parseNamedRecord r = CsvHal <$> r .: "title"
245 <*> r .: "publication_year"
246 <*> r .: "publication_month"
247 <*> r .: "publication_day"
254 <*> r .: "journalPublisher_s"
255 <*> r .: "language_s"
259 <*> r .: "instStructId_i"
260 <*> r .: "deptStructId_i"
261 <*> r .: "labStructId_i"
263 <*> r .: "rteamStructId_i"
266 instance ToNamedRecord CsvHal where
267 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
268 namedRecord [ "title" .= t
271 , "publication_year" .= py
272 , "publication_month" .= pm
273 , "publication_day" .= pd
281 , "journalPublisher_s" .= j
282 , "language_s" .= lang
286 , "instStructId_i" .= inst
287 , "deptStructId_i" .= dept
288 , "labStructId_i" .= lab
290 , "rteamStructId_i" .= team
291 , "docType_s" .= doct
294 csvHal2doc :: CsvHal -> HyperdataDocument
295 csvHal2doc (CsvHal title source
296 pub_year pub_month pub_day
300 _ _ ) = HyperdataDocument (Just "CsvHal")
311 (Just $ pack . show $ jour pub_year pub_month pub_day)
312 (Just $ fromIntegral pub_year)
320 ------------------------------------------------------------------------
-- | Read the file at @fp@ with 'readHal', discard the header, and convert
-- every 'CsvHal' row to a 'HyperdataDocument' with 'csvHal2doc'.
parseHal :: FilePath -> IO [HyperdataDocument]
parseHal fp = map csvHal2doc . V.toList . snd <$> readHal fp
323 ------------------------------------------------------------------------