]> Git — Sourcephile - haskell/symantic-xml.git/blob - Language/Symantic/XML/Read.hs
XML: fix union of EscapedText
[haskell/symantic-xml.git] / Language / Symantic / XML / Read.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# OPTIONS_GHC -fno-warn-orphans #-}
8 module Language.Symantic.XML.Read
9 ( module Language.Symantic.XML.Read.Parser
10 , module Language.Symantic.XML.Read
11 ) where
12
13 import Control.Arrow (left)
14 import Control.Applicative (Applicative(..), Alternative(..))
15 import Control.Monad (Monad(..), void, unless, forM, join)
16 import Data.Bool
17 import Data.Char (Char)
18 import Data.Default.Class (Default(..))
19 import Data.Either (Either(..))
20 import Data.Eq (Eq(..))
21 import Data.Foldable (Foldable(..))
22 import Data.Function (($), (.), const)
23 import Data.Functor ((<$>), (<$))
24 import Data.Maybe (Maybe(..), maybe)
25 import Data.Monoid (Monoid(..))
26 import Data.Ord (Ord(..))
27 import Data.Semigroup (Semigroup(..))
28 import Data.String (String)
29 import Data.TreeSeq.Strict (Tree(..))
30 import Data.Tuple (snd)
31 import Prelude (Num(..), Enum(..), Bounded(..), Integer, toInteger)
32 import System.IO (FilePath, IO)
33 import Text.Megaparsec ((<?>))
34 import Text.Show (Show(..))
35 import qualified Control.Exception as Exn
36 import qualified Control.Monad.Trans.Reader as R
37 import qualified Data.ByteString.Lazy as BSL
38 import qualified Data.Char as Char
39 import qualified Data.Char.Properties.XMLCharProps as XC
40 import qualified Data.HashMap.Strict as HM
41 import qualified Data.List as List
42 import qualified Data.Sequence as Seq
43 import qualified Data.Text.Encoding.Error as TL
44 import qualified Data.Text.Lazy as TL
45 import qualified Data.Text.Lazy.Encoding as TL
46 import qualified Data.TreeSeq.Strict as TS
47 import qualified System.IO.Error as IO
48 import qualified Text.Megaparsec as P
49 import qualified Text.Megaparsec.Char as P
50
51 import Language.Symantic.XML.Document
52 import Language.Symantic.XML.Read.Parser
53
54 readXML :: FilePath -> TL.Text -> Either (P.ParseError (P.Token TL.Text) Error) XMLs
55 readXML filePath stateInput =
56 snd $
57 P.runParser'
58 (R.runReaderT p_document def)
59 P.State
60 { P.stateInput
61 , P.statePos = pure $ P.initialPos filePath
62 , P.stateTabWidth = P.pos1 -- NOTE: do not expand tabs.
63 , P.stateTokensProcessed = 0
64 }
65
66 readFile :: FilePath -> IO (Either ErrorRead TL.Text)
67 readFile fp =
68 (left ErrorRead_Unicode . TL.decodeUtf8' <$> BSL.readFile fp)
69 `Exn.catch` \e ->
70 if IO.isAlreadyInUseError e
71 || IO.isDoesNotExistError e
72 || IO.isPermissionError e
73 then return $ Left $ ErrorRead_IO e
74 else IO.ioError e
75
76 -- * Type 'ErrorRead'
77 data ErrorRead
78 = ErrorRead_IO IO.IOError
79 | ErrorRead_Unicode TL.UnicodeException
80 deriving (Show)
81
82 -- * Document
83 p_document :: P.Tokens s ~ TL.Text => Parser Error s XMLs
84 p_document = do
85 ps <- p_prolog
86 e <- p_Element
87 ms <- P.many p_Misc
88 P.eof
89 return (ps <> pure e <> join (Seq.fromList ms))
90
91 -- ** Prolog
92 p_prolog :: P.Tokens s ~ TL.Text => Parser Error s XMLs
93 p_prolog = do
94 xmlDecl <- P.option Seq.empty $ pure <$> p_XMLDecl
95 ms <- P.many p_Misc
96 return (xmlDecl <> join (Seq.fromList ms))
97
98 -- ** Misc
99 p_Misc :: P.Tokens s ~ TL.Text => Parser Error s XMLs
100 p_Misc =
101 P.try (pure <$> p_Comment)
102 <|> P.try (pure <$> p_PI)
103 <|> pure <$> p_S
104
105 -- ** XMLDecl
106 p_XMLDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
107 p_XMLDecl = P.label "XMLDecl" $ do
108 Sourced src as <- p_Sourced $ P.between (P.string "<?xml") (P.string "?>") $ do
109 vi <- pure <$> p_VersionInfo
110 ed <- P.option Seq.empty $ pure <$> p_EncodingDecl
111 sd <- P.option Seq.empty $ pure <$> p_SDDecl
112 p_Spaces
113 return $ vi <> ed <> sd
114 return $ Tree (Sourced src $ NodePI "xml" "") as
115
116 p_VersionInfo :: P.Tokens s ~ TL.Text => Parser Error s XML
117 p_VersionInfo = P.label "VersionInfo" $ do
118 Sourced c v <- p_Sourced $ do
119 P.try (() <$ p_Spaces1 <* P.string "version")
120 p_Eq
121 p_quoted $ const $ p_Sourced $
122 (<>)
123 <$> P.string "1."
124 <*> P.takeWhile1P Nothing Char.isDigit
125 return $ Tree (Sourced c $ NodeAttr "version") $ pure $
126 TS.tree0 $ NodeText . EscapedText . pure . EscapedPlain <$> v
127
128 p_EncodingDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
129 p_EncodingDecl = P.label "EncodingDecl" $ do
130 Sourced c v <- p_Sourced $ do
131 P.try (() <$ p_Spaces1 <* P.string "encoding")
132 p_Eq
133 p_quoted $ const $ p_Sourced p_EncName
134 return $ Tree (Sourced c $ NodeAttr "encoding") $ pure $
135 TS.tree0 $ NodeText . EscapedText . pure . EscapedPlain <$> v
136
137 p_EncName :: P.Tokens s ~ TL.Text => Parser Error s TL.Text
138 p_EncName = P.label "EncName" $ do
139 P.notFollowedBy (P.satisfy $ not . isAlpha)
140 P.takeWhile1P Nothing $ \c ->
141 isAlpha c || Char.isDigit c ||
142 c=='.' || c=='_' || c=='-'
143 where isAlpha c = Char.isAsciiLower c || Char.isAsciiUpper c
144
145 -- *** SDDecl
146 p_SDDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
147 p_SDDecl = P.label "SDDecl" $ do
148 p_SourcedBegin $ do
149 Sourced ca () <- P.try (p_Sourced $ () <$ p_Spaces1 <* P.string "standalone")
150 p_Eq
151 v <- p_quoted $ const $ p_Sourced $ P.string "yes" <|> P.string "no"
152 return $ Tree (Sourced ca $ NodeAttr "standalone") $ pure $
153 TS.tree0 $ NodeText . EscapedText . pure . EscapedPlain <$> v
154
155 -- ** CharData
156 p_CharData :: P.Tokens s ~ TL.Text => Parser e s EscapedText
157 p_CharData =
158 escapeText
159 <$> p_until1 (\c -> XC.isXmlChar c && c/='<' && c/='&') (']',"]>")
160
161 -- ** Comment
162 p_Comment :: P.Tokens s ~ TL.Text => Parser Error s XML
163 p_Comment = p_SourcedBegin $ P.string "<!--" *> p_Comment__
164 p_Comment_ :: P.Tokens s ~ TL.Text => Parser Error s XML
165 p_Comment_ = P.string "--" *> p_Comment__
166 p_Comment__:: P.Tokens s ~ TL.Text => Parser Error s XML
167 p_Comment__ = P.label "Comment" $ do
168 c <- p_until XC.isXmlChar ('-', "-")
169 void $ P.string "-->"
170 cell <- p_SourcedEnd
171 return $ TS.tree0 (cell $ NodeComment c)
172
173 -- ** CDATA
174 p_CDSect :: P.Tokens s ~ TL.Text => Parser Error s XML
175 p_CDSect = p_SourcedBegin $ P.string "<![CDATA[" *> p_CDSect__
176 p_CDSect_ :: P.Tokens s ~ TL.Text => Parser Error s XML
177 p_CDSect_ = P.string "[CDATA[" *> p_CDSect__
178 p_CDSect__ :: P.Tokens s ~ TL.Text => Parser Error s XML
179 p_CDSect__ = P.label "CDSect" $ do
180 c <- p_until XC.isXmlChar (']', "]>")
181 void $ P.string "]]>"
182 cell <- p_SourcedEnd
183 return $ TS.tree0 $ cell $ NodeCDATA c
184
185 -- ** PI
186 p_PI :: P.Tokens s ~ TL.Text => Parser Error s XML
187 p_PI = p_SourcedBegin $ P.string "<?" *> p_PI__
188 p_PI_ :: P.Tokens s ~ TL.Text => Parser Error s XML
189 p_PI_ = P.char '?' *> p_PI__
190 p_PI__ :: P.Tokens s ~ TL.Text => Parser Error s XML
191 p_PI__ = P.label "PI" $ do
192 n <- p_PITarget
193 v <- P.option "" $ P.try $ p_Spaces1 *> p_until XC.isXmlChar ('?', ">")
194 void $ P.string "?>"
195 cell <- p_SourcedEnd
196 return $ TS.tree0 $ cell $ NodePI n v
197 p_PITarget :: P.Tokens s ~ TL.Text => Parser Error s PName
198 p_PITarget = do
199 n <- p_PName
200 case n of
201 PName{pNameSpace=Nothing, pNameLocal=NCName l}
202 | "xml" == TL.toLower l -> p_error $ Error_PI_reserved n
203 _ -> return n
204
205 -- ** Element
206 p_Element :: P.Tokens s ~ TL.Text => Parser Error s XML
207 p_Element = p_SourcedBegin $ (P.char '<' *> p_Element_)
208 p_Element_ :: P.Tokens s ~ TL.Text => Parser Error s XML
209 p_Element_ = P.label "Element" p_STag
210
211 -- *** STag
212 p_STag :: P.Tokens s ~ TL.Text => Parser Error s XML
213 p_STag = do
214 n <- p_PName
215 as <- P.many $ P.try $ p_Spaces1 *> p_Attribute
216 p_Spaces
217 ro <- R.ask
218 elemNS :: HM.HashMap NCName Namespace <-
219 (HM.fromList . List.concat <$>) $ forM as $ \case
220 Sourced _ (PName{..}, Sourced _ av)
221 | ns <- Namespace $ unescapeText av
222 , Nothing <- pNameSpace
223 , NCName "xmlns" <- pNameLocal ->
224 -- NOTE: default namespace declaration.
225 case ns of
226 _ | ns == xmlns_xml -- DOC: it MUST NOT be declared as the default namespace
227 || ns == xmlns_xmlns -- DOC: it MUST NOT be declared as the default namespace
228 -> p_error $ Error_Namespace_reserved ns
229 _ -> return [(NCName "" , ns)]
230 | ns <- Namespace $ unescapeText av
231 , Just (NCName "xmlns") <- pNameSpace ->
232 -- NOTE: namespace prefix declaration.
233 case unNCName pNameLocal of
234 "xml" -- DOC: It MAY, but need not, be declared,
235 -- and MUST NOT be bound to any other namespace name.
236 | ns == xmlns_xml -> return []
237 | otherwise -> p_error $ Error_Namespace_reserved_prefix pNameLocal
238 "xmlns" -- DOC: It MUST NOT be declared
239 -> p_error $ Error_Namespace_reserved_prefix pNameLocal
240 local | "xml" <- TL.toLower $ TL.take 3 local -> return []
241 -- DOC: All other prefixes beginning with the three-letter
242 -- sequence x, m, l, in any case combination, are reserved.
243 -- This means that: processors MUST NOT treat them as fatal errors.
244 _ | ns == xmlns_xml -- DOC: Other prefixes MUST NOT be bound to this namespace name.
245 || ns == xmlns_xmlns -- DOC: Other prefixes MUST NOT be bound to this namespace name.
246 -> p_error $ Error_Namespace_reserved ns
247 _ -> return [(pNameLocal, ns)]
248 | otherwise -> return []
249 let scopeNS = elemNS <> reader_ns_scope ro
250 let defaultNS = HM.lookupDefault (reader_ns_default ro) (NCName "") scopeNS
251 let lookupNamePrefix prefix =
252 maybe (p_error $ Error_Namespace_prefix_unknown prefix) return $
253 HM.lookup prefix scopeNS
254 elemName :: QName <-
255 -- NOTE: expand element's QName.
256 case pNameSpace n of
257 Nothing -> return QName{qNameSpace=defaultNS, qNameLocal=pNameLocal n}
258 -- DOC: If there is a default namespace declaration in scope,
259 -- the expanded name corresponding to an unprefixed element name
260 -- has the URI of the default namespace as its namespace name.
261 Just prefix
262 | NCName "xmlns" <- prefix ->
263 -- DOC: Element names MUST NOT have the prefix xmlns.
264 p_error $ Error_Namespace_reserved_prefix prefix
265 | otherwise -> do
266 ns <- lookupNamePrefix prefix
267 return QName{qNameSpace=ns, qNameLocal=pNameLocal n}
268 elemAttrs :: [Sourced FileSource (QName, Sourced FileSource EscapedText)] <-
269 -- NOTE: expand attributes' PName into QName.
270 forM as $ \s@Sourced{unSourced=(an, av)} -> do
271 ns <- maybe (return "") lookupNamePrefix $ pNameSpace an
272 let qn = QName{qNameSpace=ns, qNameLocal=pNameLocal an}
273 return s{unSourced=(qn, av)}
274 -- NOTE: check for attribute collision.
275 let attrsByQName :: HM.HashMap QName [Sourced FileSource (QName, Sourced FileSource EscapedText)] =
276 HM.fromListWith (<>) $ (<$> elemAttrs) $ \a@(Sourced _c (an, _av)) -> (an, [a])
277 case HM.toList $ HM.filter (\x -> length x > 1) attrsByQName of
278 (an, _):_ -> p_error $ Error_Attribute_collision an
279 _ -> return ()
280 elemAttrsXML :: XMLs <- (Seq.fromList <$>) $
281 forM elemAttrs $ \(Sourced sa (an, av)) -> do
282 return $ TS.Tree (Sourced sa $ NodeAttr an) $
283 pure $ TS.tree0 $ NodeText <$> av
284 content :: XMLs <-
285 elemAttrsXML <$ P.string "/>" <|>
286 R.local
287 (const ro
288 { reader_ns_scope = scopeNS
289 , reader_ns_default = defaultNS
290 })
291 ((elemAttrsXML <>) <$ P.char '>' <*> p_content <* p_ETag elemName)
292 cell <- p_SourcedEnd
293 return $ Tree (cell $ NodeElem elemName) content
294
295 -- *** Attribute
296 p_Attribute :: P.Tokens s ~ TL.Text => Parser Error s (Sourced FileSource (PName, Sourced FileSource EscapedText))
297 p_Attribute = p_Sourced $ (,) <$> p_PName <* p_Eq <*> p_AttValue
298
299 p_AttValue :: P.Tokens s ~ TL.Text => Parser Error s (Sourced FileSource EscapedText)
300 p_AttValue = P.label "AttValue" $ p_quoted p_AttValueText
301
302 p_AttValueText :: P.Tokens s ~ TL.Text => Char -> Parser Error s (Sourced FileSource EscapedText)
303 p_AttValueText q = p_Sourced $
304 EscapedText . Seq.fromList <$> P.many
305 ( p_Reference
306 <|> EscapedPlain <$> P.takeWhile1P Nothing (\c ->
307 XC.isXmlChar c &&
308 c `List.notElem` (q:"<&'\">"))
309 <|> EscapedEntityRef entityRef_gt <$ P.char '>'
310 <|> (if q == '\''
311 then EscapedEntityRef entityRef_quot <$ P.char '"'
312 else EscapedEntityRef entityRef_apos <$ P.char '\'')
313 )
314
315 -- * content
316 p_content :: P.Tokens s ~ TL.Text => Parser Error s XMLs
317 p_content =
318 (Seq.fromList <$>) $ P.many $
319 (p_SourcedBegin $ do
320 P.try $ P.char '<' *> P.notFollowedBy (P.char '/')
321 p_Element_ <|> p_PI_ <|> (P.char '!' *> (p_Comment_ <|> p_CDSect_))
322 )
323 <|> ((tree0 <$>) $ p_Sourced $ NodeText . mconcat
324 <$> P.some (p_CharData <|> EscapedText . pure <$> p_Reference))
325
326 -- *** ETag
327 p_ETag :: P.Tokens s ~ TL.Text => QName -> Parser Error s ()
328 p_ETag expected = do
329 got <- P.string "</" *> p_QName <* p_Spaces <* P.char '>'
330 unless (got == expected) $
331 p_error $ Error_Closing_tag_unexpected got expected
332
333 -- * Name
334 p_Name :: P.Tokens s ~ TL.Text => Parser Error s Name
335 p_Name = P.label "Name" $
336 Name
337 <$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNameStartChar)
338 <*> P.takeWhile1P Nothing XC.isXmlNameChar
339
340 -- * PName
341 p_PName :: P.Tokens s ~ TL.Text => Parser e s PName
342 p_PName = P.label "PName" $ do
343 n <- p_NCName
344 s <- P.optional $ P.try $ P.char ':' *> p_NCName
345 return $ case s of
346 Nothing -> PName{pNameSpace=Nothing, pNameLocal=n}
347 Just l -> PName{pNameSpace=Just n , pNameLocal=l}
348
349 -- * QName
350 p_QName :: P.Tokens s ~ TL.Text => Parser Error s QName
351 p_QName = P.label "QName" $ do
352 n <- p_NCName
353 s <- P.optional $ P.try $ P.char ':' *> p_NCName
354 Reader{..} <- R.ask
355 case s of
356 Nothing -> return QName{qNameSpace=reader_ns_default, qNameLocal=n}
357 Just l ->
358 case HM.lookup n reader_ns_scope of
359 Nothing -> p_error $ Error_Namespace_prefix_unknown n
360 Just ns -> return QName{qNameSpace=ns, qNameLocal=l}
361
362 -- ** NCName
363 p_NCName :: P.Tokens s ~ TL.Text => Parser e s NCName
364 p_NCName = P.label "NCName" $
365 NCName
366 <$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNCNameStartChar)
367 <*> P.takeWhile1P Nothing XC.isXmlNCNameChar
368
369 -- * Reference
370 p_Reference :: P.Tokens s ~ TL.Text => Parser Error s Escaped
371 p_Reference =
372 EscapedCharRef <$> p_CharRef <|>
373 EscapedEntityRef <$> p_EntityRef
374
375 -- ** EntityRef
376 p_EntityRef :: P.Tokens s ~ TL.Text => Parser Error s EntityRef
377 p_EntityRef = P.label "EntityRef" $ do
378 ref <- P.char '&' *> p_NCName <* P.char ';'
379 EntityRef ref <$> lookupEntityRef ref
380 where
381 lookupEntityRef (NCName "lt" ) = pure "<"
382 lookupEntityRef (NCName "gt" ) = pure ">"
383 lookupEntityRef (NCName "amp" ) = pure "&"
384 lookupEntityRef (NCName "apos") = pure "'"
385 lookupEntityRef (NCName "quot") = pure "\""
386 lookupEntityRef n = p_error $ Error_EntityRef_unknown n
387
388 -- ** CharRef
389 p_CharRef :: P.Tokens s ~ TL.Text => Parser Error s CharRef
390 p_CharRef = P.label "CharRef" $
391 do
392 ref <- readHexadecimal
393 <$ P.string "&#x"
394 <*> P.some P.hexDigitChar
395 <* P.char ';'
396 check ref
397 <|> do
398 ref <- readDecimal
399 <$ P.string "&#"
400 <*> P.some P.digitChar
401 <* P.char ';'
402 check ref
403 where
404 check i =
405 let c = toEnum (fromInteger i) in
406 if i <= toInteger (fromEnum (maxBound::Char))
407 && XC.isXmlChar c
408 then pure $ CharRef c
409 else p_error $ Error_CharRef_invalid i
410
411 readInt :: Integer -> String -> Integer
412 readInt base digits =
413 sign * List.foldl' acc 0 (List.concatMap digToInt digits1)
414 where
415 acc q r = q*base + r
416 (sign, digits1) =
417 case digits of
418 [] -> (1, digits)
419 c:ds | c == '-' -> (-1, ds)
420 | c == '+' -> ( 1, ds)
421 | otherwise -> ( 1, digits)
422 ord = toInteger . Char.ord
423 digToInt c
424 | Char.isDigit c = [ord c - ord '0']
425 | Char.isAsciiLower c = [ord c - ord 'a' + 10]
426 | Char.isAsciiUpper c = [ord c - ord 'A' + 10]
427 | otherwise = []
428
429 readDecimal :: String -> Integer
430 readDecimal = readInt 10
431
432 readHexadecimal :: String -> Integer
433 readHexadecimal = readInt 16
434
435 -- * Char
436 p_Char :: P.Tokens s ~ TL.Text => Parser e s Char
437 p_Char = P.label "Char" $ P.satisfy XC.isXmlCharCR <|> p_CRLF
438 {-# INLINE p_Char #-}
439
440 -- ** Space
441 -- | Map '\r' and '\r\n' to '\n'.
442 p_CRLF :: P.Tokens s ~ TL.Text => Parser e s Char
443 p_CRLF = P.label "CRLF" $
444 P.char '\r' *> P.option '\n' (P.char '\n')
445
446 p_Space :: P.Tokens s ~ TL.Text => Parser e s Char
447 p_Space = P.label "Space" $
448 P.satisfy XC.isXmlSpaceCharCR <|> p_CRLF
449 {-# INLINE p_Space #-}
450
451 p_Spaces :: P.Tokens s ~ TL.Text => Parser e s ()
452 p_Spaces = P.label "Spaces" $
453 void $ P.takeWhileP Nothing XC.isXmlSpaceChar
454 {-# INLINE p_Spaces #-}
455
456 p_S :: P.Tokens s ~ TL.Text => Parser Error s XML
457 p_S = P.label "Spaces" $
458 (\ts -> TS.tree0 (NodeText . EscapedText . pure . EscapedPlain . TL.concat <$> ts))
459 <$> p_Sourced (P.some $
460 P.takeWhile1P Nothing XC.isXmlSpaceCharCR <|>
461 TL.singleton <$> p_CRLF)
462
463 p_Spaces1 :: P.Tokens s ~ TL.Text => Parser e s ()
464 p_Spaces1 = P.label "Spaces1" $
465 void $ P.takeWhile1P Nothing XC.isXmlSpaceChar
466 {-# INLINE p_Spaces1 #-}
467
468 -- * Eq
469 p_separator :: P.Tokens s ~ TL.Text => Char -> Parser e s ()
470 p_separator c = P.try (() <$ p_Spaces <* P.char c) <* p_Spaces <?> [c]
471
472 p_Eq :: P.Tokens s ~ TL.Text => Parser e s ()
473 p_Eq = p_separator '=' <?> "Eq"