2 Module : Gargantext.Text.Parsers.CSV
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 CSV parser for Gargantext corpus files.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE DeriveGeneric #-}
18 module Gargantext.Text.Parsers.CSV where
20 import GHC.Real (round)
21 import GHC.IO (FilePath)
23 import Control.Applicative
25 import Data.Char (ord)
27 import Data.Either (Either(Left, Right))
28 import Data.String (IsString)
29 import Data.Text (Text, pack, unpack, length)
30 import qualified Data.ByteString.Lazy as BL
32 import Data.Vector (Vector)
33 import qualified Data.Vector as V
35 import Text.HTML.TagSoup
37 import Gargantext.Text
38 import Gargantext.Prelude hiding (length)
40 ---------------------------------------------------------------
45 , d_publication_year :: !Int
46 , d_publication_month :: !Int
47 , d_publication_day :: !Int
52 ---------------------------------------------------------------
53 toDocs :: Vector CsvDoc -> [Doc]
55 $ V.zipWith (\nId (CsvDoc t s py pm pd abst auth)
56 -> Doc nId t s py pm pd abst auth )
57 (V.enumFromN 1 (V.length v'')) v''
59 v'' = V.foldl (\v' sep -> V.concatMap (splitDoc (docsSize v') sep) v') v seps
60 seps= (V.fromList [Paragraph, Sentences, Chars])
62 ---------------------------------------------------------------
63 fromDocs :: Vector Doc -> Vector CsvDoc
64 fromDocs docs = V.map fromDocs' docs
66 fromDocs' (Doc _ t s py pm pd abst auth) = (CsvDoc t s py pm pd abst auth)
68 ---------------------------------------------------------------
69 -- | Split a document in its context
70 -- TODO adapt the size of the paragraph according to the corpus average
72 data SplitBy = Paragraph | Sentences | Chars
74 splitDoc :: Mean -> SplitBy -> CsvDoc -> Vector CsvDoc
75 splitDoc m splt doc = let docSize = (length $ c_abstract doc) in
78 if (mod (round m) docSize) >= 10
87 splitDoc' :: SplitBy -> CsvDoc -> Vector CsvDoc
88 splitDoc' splt (CsvDoc t s py pm pd abst auth) = V.fromList $ [firstDoc] <> nextDocs
90 firstDoc = CsvDoc t s py pm pd firstAbstract auth
91 firstAbstract = head' abstracts
93 nextDocs = map (\txt -> CsvDoc (head' $ sentences txt) s py pm pd (unsentences $ tail' $ sentences txt) auth) (tail' abstracts)
95 abstracts = (splitBy splt) abst
96 head' x = maybe "" identity (head x)
97 tail' x = maybe [""] identity (tailMay x)
100 splitBy :: SplitBy -> Text -> [Text]
101 splitBy Chars = map pack . chunkAlong 1000 1 . unpack
102 splitBy Sentences = map unsentences . chunkAlong 20 1 . sentences
103 splitBy Paragraph = map removeTag . filter isTagText . parseTags
105 removeTag :: IsString p => Tag p -> p
106 removeTag (TagText x) = x
107 removeTag (TagComment x) = x
110 ---------------------------------------------------------------
111 ---------------------------------------------------------------
114 docsSize :: Vector CsvDoc -> Mean
115 docsSize csvDoc = mean ls
117 ls = V.toList $ V.map (fromIntegral . length . c_abstract) csvDoc
120 ---------------------------------------------------------------
124 , c_publication_year :: !Int
125 , c_publication_month :: !Int
126 , c_publication_day :: !Int
127 , c_abstract :: !Text
132 instance FromNamedRecord CsvDoc where
133 parseNamedRecord r = CsvDoc <$> r .: "title"
135 <*> r .: "publication_year"
136 <*> r .: "publication_month"
137 <*> r .: "publication_day"
141 instance ToNamedRecord CsvDoc where
142 toNamedRecord (CsvDoc t s py pm pd abst aut) =
143 namedRecord [ "title" .= t
145 , "publication_year" .= py
146 , "publication_month" .= pm
147 , "publication_day" .= pd
153 csvDecodeOptions :: DecodeOptions
154 csvDecodeOptions = (defaultDecodeOptions
155 {decDelimiter = fromIntegral $ ord '\t'}
158 csvEncodeOptions :: EncodeOptions
159 csvEncodeOptions = ( defaultEncodeOptions
160 {encDelimiter = fromIntegral $ ord '\t'}
164 readCsv :: FilePath -> IO (Header, Vector CsvDoc)
166 csvData <- BL.readFile fp
167 case decodeByNameWith csvDecodeOptions csvData of
168 Left e -> panic (pack e)
169 Right csvDocs -> pure csvDocs
172 writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
173 writeCsv fp (h, vs) = BL.writeFile fp $
174 encodeByNameWith csvEncodeOptions h (V.toList vs)