2 Module : Gargantext.Text.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
18 module Gargantext.Text.Parsers.CSV where
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
23 import Control.Applicative
25 import Data.Char (ord)
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length, intercalate)
29 import qualified Data.ByteString.Lazy as BL
30 import Data.Time.Segment (jour)
32 import Data.Vector (Vector)
33 import qualified Data.Vector as V
35 import Gargantext.Database.Types.Node (HyperdataDocument(..))
36 import Gargantext.Text
37 import Gargantext.Text.Context
38 import Gargantext.Prelude hiding (length)
40 ---------------------------------------------------------------
41 headerCsvGargV3 :: Header
42 headerCsvGargV3 = header [ "title"
50 ---------------------------------------------------------------
55 , d_publication_year :: !Int
56 , d_publication_month :: !Int
57 , d_publication_day :: !Int
62 ---------------------------------------------------------------
-- | Convert a 'Doc' into a 'HyperdataDocument'.
64 doc2hyperdataDocument :: Doc -> HyperdataDocument
65 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
66 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
67 HyperdataDocument (Just "CSV")
68 (Just . pack . show $ did)
86 ---------------------------------------------------------------
87 -- | Types Conversions
88 toDocs :: Vector CsvDoc -> [Doc]
90 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
91 -> Doc nId t s py pm pd abst auth )
92 (V.enumFromN 1 (V.length v'')) v''
94 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
95 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
97 ---------------------------------------------------------------
-- | Turn every 'Doc' back into a 'CsvDoc', keeping the order of the vector.
fromDocs :: Vector Doc -> Vector CsvDoc
fromDocs = V.map dropId
  where
    -- The first field of 'Doc' (its identifier) is discarded; the remaining
    -- seven fields are copied positionally into 'CsvDoc'.
    dropId (Doc _ title source year month day abstract authors) =
      CsvDoc title source year month day abstract authors
103 ---------------------------------------------------------------
-- | Split a document into its contexts
105 -- TODO adapt the size of the paragraph according to the corpus average
107 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
108 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
111 if (mod (round m) docSize) >= 10
120 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
121 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
123 firstDoc = CsvDoc t s py pm pd firstAbstract auth
124 firstAbstract = head' "splitDoc'1" abstracts
126 nextDocs = map (\txt -> CsvDoc
127 (head' "splitDoc'2" $ sentences txt)
129 (unsentences $ tail' "splitDoc'1" $ sentences txt)
131 ) (tail' "splitDoc'2" abstracts)
133 abstracts = (splitBy $ contextSize) abst
135 ---------------------------------------------------------------
136 ---------------------------------------------------------------
-- | Mean abstract length of a collection of documents.
--
-- The 'Data.Text.length' of every @csv_abstract@ is converted with
-- 'fromIntegral' and the resulting list is handed to 'mean'.
docsSize :: Vector CsvDoc -> Mean
docsSize = mean . V.toList . V.map abstractLength
  where
    abstractLength doc = fromIntegral (length (csv_abstract doc))
145 ---------------------------------------------------------------
148 , csv_source :: !Text
149 , csv_publication_year :: !Int
150 , csv_publication_month :: !Int
151 , csv_publication_day :: !Int
152 , csv_abstract :: !Text
153 , csv_authors :: !Text
157 instance FromNamedRecord CsvDoc where
158 parseNamedRecord r = CsvDoc <$> r .: "title"
160 <*> r .: "publication_year"
161 <*> r .: "publication_month"
162 <*> r .: "publication_day"
166 instance ToNamedRecord CsvDoc where
167 toNamedRecord (CsvDoc t s py pm pd abst aut) =
168 namedRecord [ "title" .= t
170 , "publication_year" .= py
171 , "publication_month" .= pm
172 , "publication_day" .= pd
-- | Decoding options used by the readers of this module:
-- 'defaultDecodeOptions' with the field delimiter replaced by a tab.
csvDecodeOptions :: DecodeOptions
csvDecodeOptions = defaultDecodeOptions { decDelimiter = tab }
  where
    -- Code point of the tab character, converted to the delimiter's type.
    tab = fromIntegral (ord '\t')
-- | Encoding options used by the writers of this module:
-- 'defaultEncodeOptions' with the field delimiter replaced by a tab.
csvEncodeOptions :: EncodeOptions
csvEncodeOptions = defaultEncodeOptions { encDelimiter = tab }
  where
    -- Code point of the tab character, converted to the delimiter's type.
    tab = fromIntegral (ord '\t')
188 ------------------------------------------------------------------------
189 ------------------------------------------------------------------------
190 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
191 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
195 ------------------------------------------------------------------------
-- | Read a file and decode it by column name into 'CsvDoc' rows, using
-- 'csvDecodeOptions'.
--
-- Returns the header together with the decoded rows.  A decoding failure is
-- not recoverable here: the error message is passed to 'panic'.
readCsv :: FilePath -> IO (Header, Vector CsvDoc)
readCsv fp = do
    raw <- BL.readFile fp
    orPanic (decodeByNameWith csvDecodeOptions raw)
  where
    -- Abort with the decoder's message, or hand back the decoded result.
    orPanic (Left err)     = panic (pack err)
    orPanic (Right parsed) = pure parsed
-- | Read a file and decode it by column name into 'CsvHal' rows, using
-- 'csvDecodeOptions'.
--
-- Returns the header together with the decoded rows.  If decoding fails,
-- the decoder's error message is passed to 'panic'.
readHal :: FilePath -> IO (Header, Vector CsvHal)
readHal fp = do
    bytes <- BL.readFile fp
    let decoded = decodeByNameWith csvDecodeOptions bytes
    case decoded of
      Right rows  -> pure rows
      Left reason -> panic (pack reason)
210 ------------------------------------------------------------------------
-- | Encode the documents by column name with 'csvEncodeOptions' and the
-- given header, then write the result to the file at the given path.
writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv path (hdr, docs) = BL.writeFile path encoded
  where
    -- 'encodeByNameWith' is given a list here, hence the conversion from 'Vector'.
    encoded = encodeByNameWith csvEncodeOptions hdr (V.toList docs)
216 ------------------------------------------------------------------------
219 { csvHal_title :: !Text
220 , csvHal_source :: !Text
221 , csvHal_publication_year :: !Integer
222 , csvHal_publication_month :: !Int
223 , csvHal_publication_day :: !Int
224 , csvHal_abstract :: !Text
225 , csvHal_authors :: !Text
227 , csvHal_url :: !Text
228 , csvHal_isbn_s :: !Text
229 , csvHal_issue_s :: !Text
230 , csvHal_journalPublisher_s:: !Text
231 , csvHal_language_s :: !Text
233 , csvHal_doiId_s :: !Text
234 , csvHal_authId_i :: !Text
235 , csvHal_instStructId_i :: !Text
236 , csvHal_deptStructId_i :: !Text
237 , csvHal_labStructId_i :: !Text
239 , csvHal_rteamStructId_i :: !Text
240 , csvHal_docType_s :: !Text
244 instance FromNamedRecord CsvHal where
245 parseNamedRecord r = CsvHal <$> r .: "title"
247 <*> r .: "publication_year"
248 <*> r .: "publication_month"
249 <*> r .: "publication_day"
256 <*> r .: "journalPublisher_s"
257 <*> r .: "language_s"
261 <*> r .: "instStructId_i"
262 <*> r .: "deptStructId_i"
263 <*> r .: "labStructId_i"
265 <*> r .: "rteamStructId_i"
268 instance ToNamedRecord CsvHal where
269 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
270 namedRecord [ "title" .= t
273 , "publication_year" .= py
274 , "publication_month" .= pm
275 , "publication_day" .= pd
283 , "journalPublisher_s" .= j
284 , "language_s" .= lang
288 , "instStructId_i" .= inst
289 , "deptStructId_i" .= dept
290 , "labStructId_i" .= lab
292 , "rteamStructId_i" .= team
293 , "docType_s" .= doct
296 csvHal2doc :: CsvHal -> HyperdataDocument
297 csvHal2doc (CsvHal title source
298 pub_year pub_month pub_day
302 _ _ ) = HyperdataDocument (Just "CsvHal")
313 (Just $ pack . show $ jour pub_year pub_month pub_day)
314 (Just $ fromIntegral pub_year)
322 ------------------------------------------------------------------------
-- | Read a file with 'readHal', drop the header, and convert every 'CsvHal'
-- row to a 'HyperdataDocument' with 'csvHal2doc'.
parseHal :: FilePath -> IO [HyperdataDocument]
parseHal fp = toHyperdata <$> readHal fp
  where
    -- Keep only the rows (second component), as a list of converted documents.
    toHyperdata = map csvHal2doc . V.toList . snd
325 ------------------------------------------------------------------------