2 Module : Gargantext.Text.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
18 module Gargantext.Text.Parsers.CSV where
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
23 import Control.Applicative
25 import Data.Char (ord)
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length, intercalate)
29 import qualified Data.ByteString.Lazy as BL
31 import Data.Vector (Vector)
32 import qualified Data.Vector as V
35 import Gargantext.Core.Types.Node (HyperdataDocument(..))
36 import Gargantext.Text
37 import Gargantext.Text.Context
38 import Gargantext.Prelude hiding (length)
40 ---------------------------------------------------------------
45 , d_publication_year :: !Int
46 , d_publication_month :: !Int
47 , d_publication_day :: !Int
52 ---------------------------------------------------------------
53 -- | Doc 2 HyperdataDocument
54 doc2hyperdataDocument :: Doc -> HyperdataDocument
55 doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
56 HyperdataDocument (Just "CSV")
72 ---------------------------------------------------------------
73 -- | Types Conversions
74 toDocs :: Vector CsvDoc -> [Doc]
76 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
77 -> Doc nId t s py pm pd abst auth )
78 (V.enumFromN 1 (V.length v'')) v''
80 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
81 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
83 ---------------------------------------------------------------
84 fromDocs :: Vector Doc -> Vector CsvDoc
85 fromDocs docs = V.map fromDocs' docs
87 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
89 ---------------------------------------------------------------
90 -- | Split a document in its context
91 -- TODO adapt the size of the paragraph according to the corpus average
93 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
94 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
97 if (mod (round m) docSize) >= 10
106 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
107 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
109 firstDoc = CsvDoc t s py pm pd firstAbstract auth
110 firstAbstract = head' abstracts
112 nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
114 abstracts = (splitBy $ contextSize) abst
115 head' x = maybe "" identity (head x)
116 tail' x = maybe [""] identity (tailMay x)
118 ---------------------------------------------------------------
119 ---------------------------------------------------------------
122 docsSize :: Vector CsvDoc -> Mean
123 docsSize csvDoc = mean ls
125 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
128 ---------------------------------------------------------------
131 , csv_source :: !Text
132 , csv_publication_year :: !Int
133 , csv_publication_month :: !Int
134 , csv_publication_day :: !Int
135 , csv_abstract :: !Text
136 , csv_authors :: !Text
140 instance FromNamedRecord CsvDoc where
141 parseNamedRecord r = CsvDoc <$> r .: "title"
143 <*> r .: "publication_year"
144 <*> r .: "publication_month"
145 <*> r .: "publication_day"
149 instance ToNamedRecord CsvDoc where
150 toNamedRecord (CsvDoc t s py pm pd abst aut) =
151 namedRecord [ "title" .= t
153 , "publication_year" .= py
154 , "publication_month" .= pm
155 , "publication_day" .= pd
161 csvDecodeOptions :: DecodeOptions
162 csvDecodeOptions = (defaultDecodeOptions
163 {decDelimiter = fromIntegral $ ord '\t'}
166 csvEncodeOptions :: EncodeOptions
167 csvEncodeOptions = ( defaultEncodeOptions
168 {encDelimiter = fromIntegral $ ord '\t'}
171 ------------------------------------------------------------------------
172 ------------------------------------------------------------------------
173 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
174 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
178 ------------------------------------------------------------------------
179 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
181 csvData <- BL.readFile fp
182 case decodeByNameWith csvDecodeOptions csvData of
183 Left e -> panic (pack e)
184 Right csvDocs -> pure csvDocs
187 readHal :: FilePath -> IO (Header, Vector CsvHal)
189 csvData <- BL.readFile fp
190 case decodeByNameWith csvDecodeOptions csvData of
191 Left e -> panic (pack e)
192 Right csvDocs -> pure csvDocs
193 ------------------------------------------------------------------------
196 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
197 writeCsv fp (h, vs) = BL.writeFile fp $
198 encodeByNameWith csvEncodeOptions h (V.toList vs)
201 ------------------------------------------------------------------------
204 { csvHal_title :: !Text
205 , csvHal_source :: !Text
206 , csvHal_publication_year :: !Int
207 , csvHal_publication_month :: !Int
208 , csvHal_publication_day :: !Int
209 , csvHal_abstract :: !Text
210 , csvHal_authors :: !Text
212 , csvHal_url :: !Text
213 , csvHal_isbn_s :: !Text
214 , csvHal_issue_s :: !Text
215 , csvHal_journalPublisher_s:: !Text
216 , csvHal_language_s :: !Text
218 , csvHal_doiId_s :: !Text
219 , csvHal_authId_i :: !Text
220 , csvHal_instStructId_i :: !Text
221 , csvHal_deptStructId_i :: !Text
222 , csvHal_labStructId_i :: !Text
224 , csvHal_rteamStructId_i :: !Text
225 , csvHal_docType_s :: !Text
229 instance FromNamedRecord CsvHal where
230 parseNamedRecord r = CsvHal <$> r .: "title"
232 <*> r .: "publication_year"
233 <*> r .: "publication_month"
234 <*> r .: "publication_day"
241 <*> r .: "journalPublisher_s"
242 <*> r .: "language_s"
246 <*> r .: "instStructId_i"
247 <*> r .: "deptStructId_i"
248 <*> r .: "labStructId_i"
250 <*> r .: "rteamStructId_i"
253 instance ToNamedRecord CsvHal where
254 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss jour lang doi auth inst dept lab team doct) =
255 namedRecord [ "title" .= t
257 , "publication_year" .= py
258 , "publication_month" .= pm
259 , "publication_day" .= pd
266 , "journalPublisher_s" .= jour
267 , "language_s" .= lang
271 , "instStructId_i" .= inst
272 , "deptStructId_i" .= dept
273 , "labStructId_i" .= lab
275 , "rteamStructId_i" .= team
276 , "docType_s" .= doct