2 Module : Gargantext.Text.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
18 module Gargantext.Text.Parsers.CSV where
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
23 import Control.Applicative
25 import Data.Char (ord)
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length, intercalate)
29 import qualified Data.ByteString.Lazy as BL
30 import Data.Time.Segment (jour)
32 import Data.Vector (Vector)
33 import qualified Data.Vector as V
36 import Gargantext.Database.Types.Node (HyperdataDocument(..))
37 import Gargantext.Text
38 import Gargantext.Text.Context
39 import Gargantext.Prelude hiding (length)
41 ---------------------------------------------------------------
46 , d_publication_year :: !Int
47 , d_publication_month :: !Int
48 , d_publication_day :: !Int
53 ---------------------------------------------------------------
54 -- | Doc 2 HyperdataDocument
--
-- Builds a 'HyperdataDocument' from a 'Doc'. The first two constructor
-- arguments visible here are the literal tag @Just "CSV"@ and the document
-- id rendered with 'show' and packed to 'Text'. The third 'Doc' field is
-- ignored by the pattern (bound to @_@).
--
-- NOTE(review): the remaining constructor arguments are not visible in this
-- excerpt; how the other bound fields are used must be checked there.
55 doc2hyperdataDocument :: Doc -> HyperdataDocument
56 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
57 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
58 HyperdataDocument (Just "CSV")
59 (Just . pack . show $ did)
76 ---------------------------------------------------------------
77 -- | Types Conversions
--
-- Turns a vector of 'CsvDoc' rows into 'Doc's. The input is first passed
-- through 'splitDoc' once per separator listed in @seps@, then each
-- resulting row is zipped with a running identifier that starts at 1.
78 toDocs :: Vector CsvDoc -> [Doc]
-- Pair each (already split) row with its identifier and rebuild it as a
-- 'Doc', keeping the seven 'CsvDoc' fields in order.
80 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
81 -> Doc nId t s py pm pd abst auth )
82 (V.enumFromN 1 (V.length v'')) v''
-- Fold over the separators in order; 'docsSize' is recomputed on the vector
-- produced by the previous pass before it is handed to 'splitDoc'.
84 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
-- Split contexts applied successively, in this order.
85 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
87 ---------------------------------------------------------------
-- | Convert every 'Doc' back into a 'CsvDoc'.
--
-- The first 'Doc' field is discarded; the remaining seven fields are copied
-- unchanged and in the same order.
88 fromDocs :: Vector Doc -> Vector CsvDoc
89 fromDocs docs = V.map fromDocs' docs
-- Per-element conversion: drop the leading field, keep the rest.
91 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
93 ---------------------------------------------------------------
94 -- | Split a document in its context
95 -- TODO adapt the size of the paragraph according to the corpus average
--
-- @m@ is a mean size, @splt@ the split context and @doc@ the row to split.
-- @docSize@ is the length of the row's abstract, measured with
-- 'Data.Text.length'.
--
-- NOTE(review): only one condition of this function is visible in this
-- excerpt. @mod (round m) docSize@ uses the abstract length as divisor, so
-- unless an enclosing guard (not visible here) excludes @docSize == 0@, an
-- empty abstract would raise a divide-by-zero exception. Confirm.
97 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
98 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
101 if (mod (round m) docSize) >= 10
-- | Split one 'CsvDoc' into several, one per chunk of its abstract.
--
-- The abstract is cut with @splitBy contextSize@. The first resulting
-- document keeps the original title and takes the first chunk as abstract.
-- Every following chunk becomes a document whose title is the chunk's first
-- sentence and whose abstract is the remaining sentences joined back with
-- 'unsentences'. Source, date fields and authors are copied to every
-- resulting document.
--
-- NOTE(review): 'splitBy', 'sentences' and 'unsentences' are defined outside
-- this excerpt; their exact splitting rules are not shown here.
110 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
111 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
-- First document: original title, first chunk as abstract.
113 firstDoc = CsvDoc t s py pm pd firstAbstract auth
114 firstAbstract = head' abstracts
-- Remaining chunks: first sentence becomes the title, the rest the abstract.
116 nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
118 abstracts = (splitBy $ contextSize) abst
-- Fallbacks: "" when there is no head, [""] when 'tailMay' yields Nothing.
119 head' x = maybe "" identity (head x)
120 tail' x = maybe [""] identity (tailMay x)
122 ---------------------------------------------------------------
123 ---------------------------------------------------------------
-- | Mean abstract length over a vector of 'CsvDoc's.
--
-- Each abstract is measured with 'Data.Text.length', converted with
-- 'fromIntegral', and the resulting list is passed to 'mean'.
126 docsSize :: Vector CsvDoc -> Mean
127 docsSize csvDoc = mean ls
-- One length per row, as a list.
129 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
132 ---------------------------------------------------------------
135 , csv_source :: !Text
136 , csv_publication_year :: !Int
137 , csv_publication_month :: !Int
138 , csv_publication_day :: !Int
139 , csv_abstract :: !Text
140 , csv_authors :: !Text
-- | Decode a 'CsvDoc' from a named record by looking fields up by column
-- name. The columns visible in this excerpt are @title@,
-- @publication_year@, @publication_month@ and @publication_day@.
144 instance FromNamedRecord CsvDoc where
145 parseNamedRecord r = CsvDoc <$> r .: "title"
147 <*> r .: "publication_year"
148 <*> r .: "publication_month"
149 <*> r .: "publication_day"
-- | Encode a 'CsvDoc' as a named record. The column names used here mirror
-- those read by the 'FromNamedRecord' instance (@title@,
-- @publication_year@, @publication_month@, @publication_day@ are the ones
-- visible in this excerpt).
153 instance ToNamedRecord CsvDoc where
154 toNamedRecord (CsvDoc t s py pm pd abst aut) =
155 namedRecord [ "title" .= t
157 , "publication_year" .= py
158 , "publication_month" .= pm
159 , "publication_day" .= pd
-- | Decoding options: 'defaultDecodeOptions' with the field delimiter
-- overridden to the tab character, so input files are tab-separated.
165 csvDecodeOptions :: DecodeOptions
166 csvDecodeOptions = (defaultDecodeOptions
-- 'ord' gives the code point of the tab; 'fromIntegral' converts it to the
-- delimiter field's type.
167 {decDelimiter = fromIntegral $ ord '\t'}
-- | Encoding options: 'defaultEncodeOptions' with the field delimiter
-- overridden to the tab character, matching 'csvDecodeOptions'.
170 csvEncodeOptions :: EncodeOptions
171 csvEncodeOptions = ( defaultEncodeOptions
-- Same tab delimiter as used when decoding.
172 {encDelimiter = fromIntegral $ ord '\t'}
175 ------------------------------------------------------------------------
176 ------------------------------------------------------------------------
-- | Produce one 'Text' per row of the file at @fp@.
--
-- For each row, every accessor in @fields@ is applied to it and the results
-- are joined with a single space.
--
-- NOTE(review): the expression that supplies the rows continues on lines
-- not visible in this excerpt.
177 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
178 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
182 ------------------------------------------------------------------------
-- | Read the file at the given path and decode it into a header and a
-- vector of 'CsvDoc' rows.
--
-- The file is read as a lazy 'BL.ByteString' and decoded by column name
-- using 'csvDecodeOptions'. A decoding failure is not returned to the
-- caller: the error message is passed to 'panic'.
183 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
185 csvData <- BL.readFile fp
186 case decodeByNameWith csvDecodeOptions csvData of
-- Decoding failed: abort with the decoder's message.
187 Left e -> panic (pack e)
188 Right csvDocs -> pure csvDocs
-- | Read the file at the given path and decode it into a header and a
-- vector of 'CsvHal' rows.
--
-- Same procedure as 'readCsv': lazy read, decode by column name using
-- 'csvDecodeOptions', and 'panic' with the decoder's message on failure.
191 readHal :: FilePath -> IO (Header, Vector CsvHal)
193 csvData <- BL.readFile fp
194 case decodeByNameWith csvDecodeOptions csvData of
-- Decoding failed: abort with the decoder's message.
195 Left e -> panic (pack e)
196 Right csvDocs -> pure csvDocs
197 ------------------------------------------------------------------------
-- | Serialise a header and its rows to the file at @fp@.
--
-- The rows are encoded by column name with 'csvEncodeOptions' and the
-- resulting lazy 'BL.ByteString' is written with 'BL.writeFile'.
writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv fp (header, rows) = BL.writeFile fp encoded
  where
    -- Encoded payload; the row vector is converted to a list for the encoder.
    encoded = encodeByNameWith csvEncodeOptions header (V.toList rows)
203 ------------------------------------------------------------------------
206 { csvHal_title :: !Text
207 , csvHal_source :: !Text
208 , csvHal_publication_year :: !Integer
209 , csvHal_publication_month :: !Int
210 , csvHal_publication_day :: !Int
211 , csvHal_abstract :: !Text
212 , csvHal_authors :: !Text
214 , csvHal_url :: !Text
215 , csvHal_isbn_s :: !Text
216 , csvHal_issue_s :: !Text
217 , csvHal_journalPublisher_s:: !Text
218 , csvHal_language_s :: !Text
220 , csvHal_doiId_s :: !Text
221 , csvHal_authId_i :: !Text
222 , csvHal_instStructId_i :: !Text
223 , csvHal_deptStructId_i :: !Text
224 , csvHal_labStructId_i :: !Text
226 , csvHal_rteamStructId_i :: !Text
227 , csvHal_docType_s :: !Text
-- | Decode a 'CsvHal' from a named record by looking fields up by column
-- name, in constructor order. Only part of the column list is visible in
-- this excerpt.
231 instance FromNamedRecord CsvHal where
232 parseNamedRecord r = CsvHal <$> r .: "title"
234 <*> r .: "publication_year"
235 <*> r .: "publication_month"
236 <*> r .: "publication_day"
243 <*> r .: "journalPublisher_s"
244 <*> r .: "language_s"
248 <*> r .: "instStructId_i"
249 <*> r .: "deptStructId_i"
250 <*> r .: "labStructId_i"
252 <*> r .: "rteamStructId_i"
-- | Encode a 'CsvHal' as a named record. All nineteen constructor fields
-- are bound by the pattern; the column names visible in this excerpt match
-- those read by the 'FromNamedRecord' instance.
255 instance ToNamedRecord CsvHal where
256 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
257 namedRecord [ "title" .= t
260 , "publication_year" .= py
261 , "publication_month" .= pm
262 , "publication_day" .= pd
270 , "journalPublisher_s" .= j
271 , "language_s" .= lang
275 , "instStructId_i" .= inst
276 , "deptStructId_i" .= dept
277 , "labStructId_i" .= lab
279 , "rteamStructId_i" .= team
280 , "docType_s" .= doct
-- | Convert a 'CsvHal' row into a 'HyperdataDocument' tagged
-- @Just "CsvHal"@.
--
-- NOTE(review): most of the pattern and of the constructor arguments are
-- not visible in this excerpt; only the two date-related arguments below
-- can be described.
283 csvHal2doc :: CsvHal -> HyperdataDocument
284 csvHal2doc (CsvHal title source
285 pub_year pub_month pub_day
289 _ _ ) = HyperdataDocument (Just "CsvHal")
-- Date built by 'jour' (from Data.Time.Segment) out of year, month and day,
-- then rendered with 'show' and packed to 'Text'.
299 (Just $ pack . show $ jour pub_year pub_month pub_day)
-- The year is an 'Integer' in 'CsvHal'; 'fromIntegral' converts it to the
-- numeric type expected here.
300 (Just $ fromIntegral pub_year)
308 ------------------------------------------------------------------------
-- | Read the HAL export at @fp@ and convert every row to a
-- 'HyperdataDocument'. The header returned by 'readHal' is discarded.
parseHal :: FilePath -> IO [HyperdataDocument]
parseHal fp = rowsToDocs <$> readHal fp
  where
    -- Drop the header, list the rows, convert each one.
    rowsToDocs = map csvHal2doc . V.toList . snd
311 ------------------------------------------------------------------------