2 Module : Gargantext.Text.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
18 module Gargantext.Text.Parsers.CSV where
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
23 import Control.Applicative
25 import Data.Char (ord)
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length, intercalate)
29 import qualified Data.ByteString.Lazy as BL
31 import Data.Vector (Vector)
32 import qualified Data.Vector as V
35 import Gargantext.Database.Types.Node (HyperdataDocument(..))
36 import Gargantext.Text
37 import Gargantext.Text.Context
38 import Gargantext.Prelude hiding (length)
40 ---------------------------------------------------------------
45 , d_publication_year :: !Int
46 , d_publication_month :: !Int
47 , d_publication_day :: !Int
52 ---------------------------------------------------------------
53 -- | Convert a 'Doc' into a 'HyperdataDocument'
54 doc2hyperdataDocument :: Doc -> HyperdataDocument
55 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
56 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
57 HyperdataDocument (Just "CSV")
58 (Just . pack . show $ did)
74 ---------------------------------------------------------------
-- | Convert parsed CSV rows into numbered 'Doc's.
--
-- The rows are passed through 'splitDoc' once per split context, in the
-- order @Paragraphs 1@, @Sentences 3@, @Chars 3@. For each pass the mean
-- given to 'splitDoc' is recomputed with 'docsSize' from the rows produced
-- by the previous pass. The resulting rows are numbered consecutively,
-- starting at 1, and returned as a list.
toDocs :: Vector CsvDoc -> [Doc]
toDocs v = V.toList
         $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
                       -> Doc nId t s py pm pd abst auth)
                     (V.enumFromN 1 (V.length splitRows)) splitRows
  where
    -- Strict left fold: the result of each pass is forced before the next
    -- pass starts, instead of accumulating a chain of thunks.
    splitRows = V.foldl' splitPass v seps

    -- One pass: split every row according to @sep@.
    splitPass rows sep = V.concatMap (splitDoc (docsSize rows) sep) rows

    seps = V.fromList [Paragraphs 1, Sentences 3, Chars 3]
85 ---------------------------------------------------------------
-- | Turn every 'Doc' back into a 'CsvDoc' by dropping its first field and
-- keeping the remaining fields in the same order.
fromDocs :: Vector Doc -> Vector CsvDoc
fromDocs = V.map stripId
  where
    stripId (Doc _ t s py pm pd abst auth) = CsvDoc t s py pm pd abst auth
91 ---------------------------------------------------------------
92 -- | Split a document according to the given split context
93 -- TODO adapt the size of the paragraph according to the corpus average
95 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
96 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
99 if (mod (round m) docSize) >= 10
-- | Split one row into several rows by cutting its abstract with 'splitBy'.
--
-- The first chunk becomes the abstract of a row that keeps the original
-- title. Every later chunk becomes its own row whose title is the chunk's
-- first sentence and whose abstract is the remaining sentences joined with
-- 'unsentences'. Source, publication date and authors are copied unchanged.
splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
splitDoc' contextSize (CsvDoc t s py pm pd abst auth) =
  V.fromList (leadingDoc : followingDocs)
  where
    chunks = splitBy contextSize abst

    leadingDoc = CsvDoc t s py pm pd (firstOr chunks) auth

    followingDocs = map chunkToDoc (restOr chunks)

    -- Split the chunk into sentences once and reuse it for title and body.
    chunkToDoc chunk =
      let ss = sentences chunk
      in  CsvDoc (firstOr ss) s py pm pd (unsentences (restOr ss)) auth

    -- First element, or the empty string when there is none.
    firstOr xs = maybe "" identity (head xs)

    -- Everything after the first element, or @[""]@ when the list is empty.
    restOr xs = maybe [""] identity (tailMay xs)
120 ---------------------------------------------------------------
121 ---------------------------------------------------------------
-- | Mean of the abstract lengths of the given rows, where each length is
-- measured with 'Data.Text.length'.
docsSize :: Vector CsvDoc -> Mean
docsSize = mean . map abstractSize . V.toList
  where
    abstractSize doc = fromIntegral (length (csv_abstract doc))
130 ---------------------------------------------------------------
133 , csv_source :: !Text
134 , csv_publication_year :: !Int
135 , csv_publication_month :: !Int
136 , csv_publication_day :: !Int
137 , csv_abstract :: !Text
138 , csv_authors :: !Text
142 instance FromNamedRecord CsvDoc where
143 parseNamedRecord r = CsvDoc <$> r .: "title"
145 <*> r .: "publication_year"
146 <*> r .: "publication_month"
147 <*> r .: "publication_day"
151 instance ToNamedRecord CsvDoc where
152 toNamedRecord (CsvDoc t s py pm pd abst aut) =
153 namedRecord [ "title" .= t
155 , "publication_year" .= py
156 , "publication_month" .= pm
157 , "publication_day" .= pd
-- | Decoding options: the defaults, with the field delimiter replaced by
-- the tab character.
csvDecodeOptions :: DecodeOptions
csvDecodeOptions = defaultDecodeOptions { decDelimiter = tab }
  where
    tab = fromIntegral (ord '\t')
-- | Encoding options: the defaults, with the field delimiter replaced by
-- the tab character.
csvEncodeOptions :: EncodeOptions
csvEncodeOptions = defaultEncodeOptions { encDelimiter = tab }
  where
    tab = fromIntegral (ord '\t')
173 ------------------------------------------------------------------------
174 ------------------------------------------------------------------------
-- | Read the file at @fp@ with 'readCsv' and, for every row, apply each of
-- the given field accessors and join the resulting texts with one space.
readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
readCsvOn fields fp = do
  (_, rows) <- readCsv fp
  pure [ intercalate (pack " ") [ field row | field <- fields ]
       | row <- V.toList rows
       ]
180 ------------------------------------------------------------------------
-- | Read the file at @fp@ and decode it by header name with
-- 'csvDecodeOptions' (tab as the field delimiter).
--
-- Shared implementation of 'readCsv' and 'readHal', which differ only in
-- the record type being decoded. On a decoding failure this calls 'panic'
-- with the file path followed by the decoder's error message, so the
-- offending file can be identified.
readByNameWithTabs :: FromNamedRecord a => FilePath -> IO (Header, Vector a)
readByNameWithTabs fp = do
    csvData <- BL.readFile fp
    case decodeByNameWith csvDecodeOptions csvData of
      Left e        -> panic (pack (fp <> ": " <> e))
      Right decoded -> pure decoded

-- | Read a file of 'CsvDoc' rows; returns the header and the rows.
readCsv :: FilePath -> IO (Header, Vector CsvDoc)
readCsv = readByNameWithTabs

-- | Read a file of 'CsvHal' rows; returns the header and the rows.
readHal :: FilePath -> IO (Header, Vector CsvHal)
readHal = readByNameWithTabs
195 ------------------------------------------------------------------------
-- | Encode the rows by header name with 'csvEncodeOptions' and write the
-- result to the file at @fp@.
writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv fp (header, rows) = BL.writeFile fp encoded
  where
    encoded = encodeByNameWith csvEncodeOptions header (V.toList rows)
203 ------------------------------------------------------------------------
206 { csvHal_title :: !Text
207 , csvHal_source :: !Text
208 , csvHal_publication_year :: !Int
209 , csvHal_publication_month :: !Int
210 , csvHal_publication_day :: !Int
211 , csvHal_abstract :: !Text
212 , csvHal_authors :: !Text
214 , csvHal_url :: !Text
215 , csvHal_isbn_s :: !Text
216 , csvHal_issue_s :: !Text
217 , csvHal_journalPublisher_s:: !Text
218 , csvHal_language_s :: !Text
220 , csvHal_doiId_s :: !Text
221 , csvHal_authId_i :: !Text
222 , csvHal_instStructId_i :: !Text
223 , csvHal_deptStructId_i :: !Text
224 , csvHal_labStructId_i :: !Text
226 , csvHal_rteamStructId_i :: !Text
227 , csvHal_docType_s :: !Text
231 instance FromNamedRecord CsvHal where
232 parseNamedRecord r = CsvHal <$> r .: "title"
234 <*> r .: "publication_year"
235 <*> r .: "publication_month"
236 <*> r .: "publication_day"
243 <*> r .: "journalPublisher_s"
244 <*> r .: "language_s"
248 <*> r .: "instStructId_i"
249 <*> r .: "deptStructId_i"
250 <*> r .: "labStructId_i"
252 <*> r .: "rteamStructId_i"
255 instance ToNamedRecord CsvHal where
256 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss jour lang doi auth inst dept lab team doct) =
257 namedRecord [ "title" .= t
259 , "publication_year" .= py
260 , "publication_month" .= pm
261 , "publication_day" .= pd
268 , "journalPublisher_s" .= jour
269 , "language_s" .= lang
273 , "instStructId_i" .= inst
274 , "deptStructId_i" .= dept
275 , "labStructId_i" .= lab
277 , "rteamStructId_i" .= team
278 , "docType_s" .= doct