2 Module : Gargantext.Text.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
18 module Gargantext.Text.Parsers.CSV where
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
23 import Control.Applicative
25 import Data.Char (ord)
27 import Data.Either (Either(Left, Right))
28 import Data.Text (Text, pack, length, intercalate)
29 import qualified Data.ByteString.Lazy as BL
30 import Data.Time.Segment (jour)
32 import Data.Vector (Vector)
33 import qualified Data.Vector as V
36 import Gargantext.Database.Types.Node (HyperdataDocument(..))
37 import Gargantext.Text
38 import Gargantext.Text.Context
39 import Gargantext.Prelude hiding (length)
41 ---------------------------------------------------------------
42 headerCsvGargV3 :: Header
43 headerCsvGargV3 = header [ "title"
51 ---------------------------------------------------------------
56 , d_publication_year :: !Int
57 , d_publication_month :: !Int
58 , d_publication_day :: !Int
63 ---------------------------------------------------------------
64 -- | Doc 2 HyperdataDocument
65 doc2hyperdataDocument :: Doc -> HyperdataDocument
66 --doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
67 doc2hyperdataDocument (Doc did dt _ dpy dpm dpd dab dau) =
68 HyperdataDocument (Just "CSV")
69 (Just . pack . show $ did)
87 ---------------------------------------------------------------
88 -- | Types Conversions
89 toDocs :: Vector CsvDoc -> [Doc]
91 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
92 -> Doc nId t s py pm pd abst auth )
93 (V.enumFromN 1 (V.length v'')) v''
95 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
96 seps= (V.fromList [Paragraphs 1, Sentences 3, Chars 3])
98 ---------------------------------------------------------------
99 fromDocs :: Vector Doc -> Vector CsvDoc
100 fromDocs docs = V.map fromDocs' docs
102 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
104 ---------------------------------------------------------------
105 -- | Split a document in its context
106 -- TODO adapt the size of the paragraph according to the corpus average
108 splitDoc :: Mean -> SplitContext -> CsvDoc -> Vector CsvDoc
109 splitDoc m splt doc = let docSize = (length $ csv_abstract doc) in
112 if (mod (round m) docSize) >= 10
121 splitDoc' :: SplitContext -> CsvDoc -> Vector CsvDoc
122 splitDoc' contextSize (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
124 firstDoc = CsvDoc t s py pm pd firstAbstract auth
125 firstAbstract = head' "splitDoc'1" abstracts
127 nextDocs = map (\txt -> CsvDoc (head' "splitDoc'2" $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
129 abstracts = (splitBy $ contextSize) abst
130 tail' x = maybe [""] identity (tailMay x)
132 ---------------------------------------------------------------
133 ---------------------------------------------------------------
136 docsSize :: Vector CsvDoc -> Mean
137 docsSize csvDoc = mean ls
139 ls = V.toList $ V.map (fromIntegral . length . csv_abstract) csvDoc
142 ---------------------------------------------------------------
145 , csv_source :: !Text
146 , csv_publication_year :: !Int
147 , csv_publication_month :: !Int
148 , csv_publication_day :: !Int
149 , csv_abstract :: !Text
150 , csv_authors :: !Text
154 instance FromNamedRecord CsvDoc where
155 parseNamedRecord r = CsvDoc <$> r .: "title"
157 <*> r .: "publication_year"
158 <*> r .: "publication_month"
159 <*> r .: "publication_day"
163 instance ToNamedRecord CsvDoc where
164 toNamedRecord (CsvDoc t s py pm pd abst aut) =
165 namedRecord [ "title" .= t
167 , "publication_year" .= py
168 , "publication_month" .= pm
169 , "publication_day" .= pd
175 csvDecodeOptions :: DecodeOptions
176 csvDecodeOptions = (defaultDecodeOptions
177 {decDelimiter = fromIntegral $ ord '\t'}
180 csvEncodeOptions :: EncodeOptions
181 csvEncodeOptions = ( defaultEncodeOptions
182 {encDelimiter = fromIntegral $ ord '\t'}
185 ------------------------------------------------------------------------
186 ------------------------------------------------------------------------
187 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
188 readCsvOn fields fp = V.toList <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
192 ------------------------------------------------------------------------
193 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
195 csvData <- BL.readFile fp
196 case decodeByNameWith csvDecodeOptions csvData of
197 Left e -> panic (pack e)
198 Right csvDocs -> pure csvDocs
201 readHal :: FilePath -> IO (Header, Vector CsvHal)
203 csvData <- BL.readFile fp
204 case decodeByNameWith csvDecodeOptions csvData of
205 Left e -> panic (pack e)
206 Right csvDocs -> pure csvDocs
207 ------------------------------------------------------------------------
208 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
209 writeCsv fp (h, vs) = BL.writeFile fp $
210 encodeByNameWith csvEncodeOptions h (V.toList vs)
213 ------------------------------------------------------------------------
216 { csvHal_title :: !Text
217 , csvHal_source :: !Text
218 , csvHal_publication_year :: !Integer
219 , csvHal_publication_month :: !Int
220 , csvHal_publication_day :: !Int
221 , csvHal_abstract :: !Text
222 , csvHal_authors :: !Text
224 , csvHal_url :: !Text
225 , csvHal_isbn_s :: !Text
226 , csvHal_issue_s :: !Text
227 , csvHal_journalPublisher_s:: !Text
228 , csvHal_language_s :: !Text
230 , csvHal_doiId_s :: !Text
231 , csvHal_authId_i :: !Text
232 , csvHal_instStructId_i :: !Text
233 , csvHal_deptStructId_i :: !Text
234 , csvHal_labStructId_i :: !Text
236 , csvHal_rteamStructId_i :: !Text
237 , csvHal_docType_s :: !Text
241 instance FromNamedRecord CsvHal where
242 parseNamedRecord r = CsvHal <$> r .: "title"
244 <*> r .: "publication_year"
245 <*> r .: "publication_month"
246 <*> r .: "publication_day"
253 <*> r .: "journalPublisher_s"
254 <*> r .: "language_s"
258 <*> r .: "instStructId_i"
259 <*> r .: "deptStructId_i"
260 <*> r .: "labStructId_i"
262 <*> r .: "rteamStructId_i"
265 instance ToNamedRecord CsvHal where
266 toNamedRecord (CsvHal t s py pm pd abst aut url isbn iss j lang doi auth inst dept lab team doct) =
267 namedRecord [ "title" .= t
270 , "publication_year" .= py
271 , "publication_month" .= pm
272 , "publication_day" .= pd
280 , "journalPublisher_s" .= j
281 , "language_s" .= lang
285 , "instStructId_i" .= inst
286 , "deptStructId_i" .= dept
287 , "labStructId_i" .= lab
289 , "rteamStructId_i" .= team
290 , "docType_s" .= doct
293 csvHal2doc :: CsvHal -> HyperdataDocument
294 csvHal2doc (CsvHal title source
295 pub_year pub_month pub_day
299 _ _ ) = HyperdataDocument (Just "CsvHal")
310 (Just $ pack . show $ jour pub_year pub_month pub_day)
311 (Just $ fromIntegral pub_year)
319 ------------------------------------------------------------------------
320 parseHal :: FilePath -> IO [HyperdataDocument]
321 parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readHal fp
322 ------------------------------------------------------------------------