]> Git — Sourcephile - haskell/symantic-xml.git/blob - src/Symantic/XML/Tree/Read.hs
Rewrite to categorical symantic
[haskell/symantic-xml.git] / src / Symantic / XML / Tree / Read.hs
1 {-# LANGUAGE ConstraintKinds #-}
2 {-# LANGUAGE OverloadedStrings #-}
3 {-# LANGUAGE Rank2Types #-}
4 {-# OPTIONS_GHC -fno-warn-orphans #-}
5 module Symantic.XML.Tree.Read where
6
7 import Control.Arrow (left)
8 import Control.Applicative (Applicative(..), Alternative(..))
9 import Control.Monad (Monad(..), void, unless, forM)
10 import Data.Bool
11 import Data.Char (Char)
12 import Data.Either (Either(..))
13 import Data.Eq (Eq(..))
14 import Data.Foldable (Foldable(..))
15 import Data.Function (($), (.), const)
16 import Data.Functor ((<$>), (<$))
17 import Data.Maybe (Maybe(..), maybe, catMaybes)
18 import Data.Monoid (Monoid(..))
19 import Data.Ord (Ord(..))
20 import Data.Semigroup (Semigroup(..))
21 import Data.List.NonEmpty (NonEmpty(..))
22 import Data.String (String, IsString(..))
23 import Prelude (Num(..), Enum(..), Bounded(..), Integer, toInteger)
24 import System.IO (FilePath, IO)
25 import Text.Show (Show(..))
26 import qualified Control.Exception as Exn
27 import qualified Control.Monad.Trans.Reader as R
28 import qualified Data.ByteString.Lazy as BSL
29 import qualified Data.Char as Char
30 import qualified Data.Char.Properties.XMLCharProps as XC
31 import qualified Data.HashMap.Strict as HM
32 import qualified Data.List as List
33 import qualified Data.Set as Set
34 import qualified Data.Sequence as Seq
35 import qualified Data.Text.Encoding.Error as TL
36 import qualified Data.Text.Lazy as TL
37 import qualified Data.Text.Lazy.Encoding as TL
38 import qualified Data.TreeSeq.Strict as TS
39 import qualified System.IO.Error as IO
40 import qualified Text.Megaparsec as P
41 import qualified Text.Megaparsec.Char as P
42
43 import Symantic.Base ()
44 import Symantic.XML.Language hiding (void)
45 import Symantic.XML.Tree.Source
46 import Symantic.XML.Tree.Data
47
48 readTree :: FilePath -> IO (Either String FileSourcedTrees)
49 readTree path =
50 readUtf8 path >>= \case
51 Left err -> return $ Left $ show err
52 Right txt -> return $
53 case runReadTree path txt of
54 Right a -> Right a
55 Left err -> Left $ P.errorBundlePretty err
56
57 runReadTree ::
58 FilePath -> TL.Text ->
59 Either (P.ParseErrorBundle TL.Text Error)
60 FileSourcedTrees
61 runReadTree = P.runParser $ R.runReaderT p_document defaultReadTreeInh
62
63 -- * Type 'ErrorRead'
64 data ErrorRead
65 = ErrorRead_IO IO.IOError
66 | ErrorRead_Unicode TL.UnicodeException
67 deriving (Show)
68 readUtf8 :: FilePath -> IO (Either ErrorRead TL.Text)
69 readUtf8 path =
70 (left ErrorRead_Unicode . TL.decodeUtf8' <$> BSL.readFile path)
71 `Exn.catch` \e ->
72 if IO.isAlreadyInUseError e
73 || IO.isDoesNotExistError e
74 || IO.isPermissionError e
75 then return $ Left $ ErrorRead_IO e
76 else IO.ioError e
77
78 -- * Type 'ReadTree'
79 -- | Convenient alias.
80 type ReadTree e s a =
81 ReadTreeConstraints e s a =>
82 R.ReaderT ReadTreeInh (P.Parsec e s) a
83
84 -- ** Type 'ReadTreeConstraints'
85 type ReadTreeConstraints e s a =
86 ( P.Stream s
87 , P.Token s ~ Char
88 , Ord e
89 , IsString (P.Tokens s)
90 , P.ShowErrorComponent e
91 )
92
93 -- ** Type 'ReadTreeInh'
94 data ReadTreeInh
95 = ReadTreeInh
96 { readTreeInh_source :: FileSource Offset
97 , readTreeInh_ns_scope :: HM.HashMap NCName Namespace
98 , readTreeInh_ns_default :: Namespace
99 } deriving (Show)
100
101 defaultReadTreeInh :: ReadTreeInh
102 defaultReadTreeInh = ReadTreeInh
103 { readTreeInh_source = FileSource $ pure $
104 FileRange mempty mempty mempty
105 , readTreeInh_ns_scope = HM.fromList
106 [ ("xml" , xmlns_xml)
107 , ("xmlns", xmlns_xmlns)
108 ]
109 , readTreeInh_ns_default = ""
110 }
111
112 p_Offset :: ReadTree e s Offset
113 p_Offset = Offset <$> P.getOffset
114 {-# INLINE p_Offset #-}
115
116 p_Sourced :: ReadTree e s a -> ReadTree e s (Sourced (FileSource Offset) a)
117 p_Sourced pa = do
118 ReadTreeInh{readTreeInh_source} <- R.ask
119 b <- P.getParserState
120 let fileRange_path = P.sourceName $ P.pstateSourcePos $ P.statePosState b
121 let fileRange_begin = Offset $ P.stateOffset b
122 a <- pa
123 e <- P.getParserState
124 let fileRange_end = Offset $ P.stateOffset e
125 return $ Sourced (setSource FileRange{..} readTreeInh_source) a
126
127 setSource :: FileRange pos -> FileSource pos -> FileSource pos
128 setSource fileRange (FileSource (_curr:|next)) = FileSource (fileRange:|next)
129
130 -- | Like 'p_Sourced' but uncoupled (through the use of 'p_SourcedEnd') for more flexibility.
131 p_SourcedBegin :: ReadTree e s a -> ReadTree e s a
132 p_SourcedBegin pa = do
133 b <- P.getParserState
134 let fileRange_path = P.sourceName $ P.pstateSourcePos $ P.statePosState b
135 let fileRange_begin = Offset $ P.stateOffset b
136 let fileRange_end = fileRange_begin
137 (`R.local` pa) $ \inh@ReadTreeInh{..} ->
138 inh{ readTreeInh_source = setSource FileRange{..} readTreeInh_source }
139
140 -- | WARNING: only to be used within a 'p_SourcedBegin'.
141 p_SourcedEnd :: ReadTree e s (a -> Sourced (FileSource Offset) a)
142 p_SourcedEnd = do
143 ReadTreeInh{..} <- R.ask
144 e <- P.getParserState
145 let fileRange_end = Offset $ P.stateOffset e
146 return $ Sourced $
147 (\(FileSource (curr:|path)) -> FileSource (curr{fileRange_end}:|path))
148 readTreeInh_source
149
150 -- * Type 'Error'
151 data Error
152 = Error_CharRef_invalid Integer
153 -- ^ Well-formedness constraint: Legal Character.
154 --
155 -- Characters referred to using character references MUST match the production for Char.
156 | Error_EntityRef_unknown NCName
157 -- ^ Well-formedness constraint: Entity Declared
158 --
159 -- In a document without any DTD, a document with only an internal DTD
160 -- subset which contains no parameter entity references, or a document
161 -- with " standalone='yes' ", for an entity reference that does not occur
162 -- within the external subset or a parameter entity, the Name given in the
163 -- entity reference MUST match that in an entity declaration that does not
164 -- occur within the external subset or a parameter entity, except that
165 -- well-formed documents need not declare any of the following entities:
166 -- amp, lt, gt, apos, quot. The declaration of a general entity MUST
167 -- precede any reference to it which appears in a default value in an
168 -- attribute-list declaration.
169 --
170 -- Note that non-validating processors are not obligated to read and
171 -- process entity declarations occurring in parameter entities or in the
172 -- external subset; for such documents, the define that an entity must be
173 -- declared is a well-formedness constraint only if standalone='yes'.
174 | Error_Closing_tag_unexpected QName QName
175 -- ^ Well-formedness constraint: Element Type Match.
176 --
177 -- The Name in an element's end-tag MUST match the element type in the start-tag.
178 | Error_Attribute_collision QName
179 -- ^ Well-formedness constraint: Unique Att Spec.
180 --
181 -- An attribute name MUST NOT appear more than once in the same start-tag or empty-element tag.
182 | Error_PI_reserved PName
183 -- ^ The target names " XML ", " xml ", and so on are reserved for standardization.
184 | Error_Namespace_prefix_unknown NCName
185 -- ^ Namespace constraint: Prefix Declared
186 --
187 -- The namespace prefix, unless it is xml or xmlns, MUST have been declared in a namespace declaration attribute in either the start-tag of the element where the prefix is used or in an ancestor element (i.e., an element in whose content the prefixed markup occurs).
188 | Error_Namespace_empty NCName
189 -- ^ Namespace constraint: No Prefix Undeclaring
190 --
191 -- In a namespace declaration for a prefix (i.e., where the NSAttName is a PrefixedAttName), the attribute value MUST NOT be empty.
192 | Error_Namespace_reserved Namespace
193 | Error_Namespace_reserved_prefix NCName
194 -- ^ Namespace constraint: Reserved Prefixes and Namespace Names
195 --
196 -- The prefix xml is by definition bound to the namespace name
197 -- http://www.w3.org/XML/1998/namespace. It MAY, but need not, be
198 -- declared, and MUST NOT be bound to any other namespace name. Other
199 -- prefixes MUST NOT be bound to this namespace name, and it MUST NOT be
200 -- declared as the default namespace.
201 --
202 -- The prefix xmlns is used only to declare namespace bindings and is by
203 -- definition bound to the namespace name http://www.w3.org/2000/xmlns/.
204 -- It MUST NOT be declared . Other prefixes MUST NOT be bound to this
205 -- namespace name, and it MUST NOT be declared as the default namespace.
206 -- Element names MUST NOT have the prefix xmlns.
207 --
208 -- All other prefixes beginning with the three-letter sequence x, m, l, in
209 -- any case combination, are reserved. This means that:
210 --
211 -- - users SHOULD NOT use them except as defined by later specifications
212 -- - processors MUST NOT treat them as fatal errors.
213 deriving (Eq,Ord,Show)
214 instance P.ShowErrorComponent Error where
215 showErrorComponent = show
216
217 -- * Helpers
218 p_error :: e -> ReadTree e s a
219 p_error = P.fancyFailure . Set.singleton . P.ErrorCustom
220
221 p_quoted :: P.Tokens s ~ TL.Text => (Char -> ReadTree e s a) -> ReadTree e s a
222 p_quoted p =
223 P.between (P.char '"') (P.char '"') (p '"') <|>
224 P.between (P.char '\'') (P.char '\'') (p '\'')
225
226 p_until ::
227 P.Tokens s ~ TL.Text =>
228 (Char -> Bool) -> (Char, TL.Text) -> ReadTree e s TL.Text
229 p_until content (end, end_) =
230 (TL.concat <$>) $ P.many $
231 P.takeWhile1P Nothing (\c -> content c && c /= end) <|>
232 P.try (TL.singleton <$> P.char end <* P.notFollowedBy (P.string end_))
233
234 p_until1 ::
235 P.Tokens s ~ TL.Text =>
236 (Char -> Bool) -> (Char, TL.Text) -> ReadTree e s TL.Text
237 p_until1 content (end, end_) =
238 (TL.concat <$>) $ P.some $
239 P.takeWhile1P Nothing (\c -> content c && c /= end) <|>
240 P.try (TL.singleton <$> P.char end <* P.notFollowedBy (P.string end_))
241
242 -- * Document
243 p_document :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTrees
244 p_document = do
245 ps <- p_prolog
246 e <- p_Element
247 m <- p_Miscs
248 P.eof
249 return (ps <> pure e <> m)
250
251 -- ** Prolog
252 p_prolog :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTrees
253 p_prolog = (<>)
254 <$> P.option Seq.empty (pure <$> p_XMLDecl)
255 <*> p_Miscs
256
257 -- ** Misc
258 p_Miscs :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTrees
259 p_Miscs = (Seq.fromList . catMaybes <$>) $ P.many $
260 Just <$> p_Comment <|>
261 Just <$> p_PI <|>
262 Nothing <$ p_Spaces1
263
264 -- ** XMLDecl
265 p_XMLDecl :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
266 p_XMLDecl = do
267 Sourced src as <- p_Sourced $ P.between (P.string "<?xml") (P.string "?>") $ do
268 vi <- pure <$> p_VersionInfo
269 ed <- P.option Seq.empty $ pure <$> p_EncodingDecl
270 sd <- P.option Seq.empty $ pure <$> p_SDDecl
271 p_Spaces
272 return $ vi <> ed <> sd
273 return $ TS.Tree (Sourced src $ NodePI "xml" "") as
274
275 p_VersionInfo :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
276 p_VersionInfo = do
277 Sourced src v <- p_Sourced $ do
278 P.try $ p_Spaces1 <* P.string "version"
279 p_Eq
280 p_quoted $ const $
281 (<>)
282 <$> P.string "1."
283 <*> P.takeWhile1P Nothing Char.isDigit
284 return $ TS.tree0 $ Sourced src $ NodePI "version" v
285
286 p_EncodingDecl :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
287 p_EncodingDecl = do
288 Sourced src v <- p_Sourced $ do
289 P.try $ p_Spaces1 <* P.string "encoding"
290 p_Eq
291 p_quoted $ const p_EncName
292 return $ TS.tree0 $ Sourced src $ NodePI "encoding" v
293
294 p_EncName :: P.Tokens s ~ TL.Text => ReadTree Error s TL.Text
295 p_EncName = P.label "EncName" $ do
296 P.notFollowedBy (P.satisfy $ not . isAlpha)
297 P.takeWhile1P Nothing $ \c ->
298 isAlpha c || Char.isDigit c ||
299 c=='.' || c=='_' || c=='-'
300 where isAlpha c = Char.isAsciiLower c || Char.isAsciiUpper c
301
302 -- *** SDDecl
303 p_SDDecl :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
304 p_SDDecl = do
305 Sourced src v <- p_Sourced $ do
306 P.try $ p_Spaces1 <* P.string "standalone"
307 p_Eq
308 p_quoted $ const $ P.string "yes" <|> P.string "no"
309 return $ TS.tree0 $ Sourced src $ NodePI "standalone" v
310
311 -- ** CharData
312 p_CharData :: P.Tokens s ~ TL.Text => ReadTree e s EscapedText
313 p_CharData = P.label "[^<&]" $ escapeText <$>
314 p_until1 (\c -> XC.isXmlChar c && c/='<' && c/='&') (']',"]>")
315
316 -- ** Comment
317 p_Comment :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
318 p_Comment = p_SourcedBegin $ P.string "<!--" *> p_Comment__
319 p_Comment_ :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
320 p_Comment_ = P.string "--" *> p_Comment__
321 p_Comment__:: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
322 p_Comment__ = do
323 c <- p_until XC.isXmlChar ('-', "-")
324 void $ P.string "-->"
325 src <- p_SourcedEnd
326 return $ TS.tree0 $ src $ NodeComment c
327
328 -- ** CDATA
329 p_CDSect :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
330 p_CDSect = p_SourcedBegin $ P.string "<![CDATA[" *> p_CDSect__
331 p_CDSect_ :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
332 p_CDSect_ = P.string "[CDATA[" *> p_CDSect__
333 p_CDSect__ :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
334 p_CDSect__ = do
335 c <- p_until XC.isXmlChar (']', "]>")
336 void $ P.string "]]>"
337 src <- p_SourcedEnd
338 return $ TS.tree0 $ src $ NodeCDATA c
339
340 -- ** PI
341 p_PI :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
342 p_PI = p_SourcedBegin $ P.string "<?" *> p_PI__
343 p_PI_ :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
344 p_PI_ = P.char '?' *> p_PI__
345 p_PI__ :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
346 p_PI__ = do
347 n <- p_PITarget
348 v <- P.option "" $ p_Spaces1 *> p_until XC.isXmlChar ('?', ">")
349 void $ P.string "?>"
350 src <- p_SourcedEnd
351 return $ TS.tree0 $ src $ NodePI n v
352 p_PITarget :: P.Tokens s ~ TL.Text => ReadTree Error s PName
353 p_PITarget = do
354 n <- p_PName
355 case n of
356 PName{pNameSpace=Nothing, pNameLocal=NCName l}
357 | "xml" == TL.toLower l -> p_error $ Error_PI_reserved n
358 _ -> return n
359
360 -- ** Element
361 p_Element :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
362 p_Element = p_SourcedBegin $ (P.char '<' *> p_Element_)
363 p_Element_ :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
364 p_Element_ = p_STag
365
366 -- *** STag
367 p_STag :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTree
368 p_STag = do
369 n <- p_PName
370 attrs <- P.many $ p_Attribute
371 p_Spaces
372 ro <- R.ask
373 elemNS :: HM.HashMap NCName Namespace <-
374 (HM.fromList . List.concat <$>) $ forM attrs $ \case
375 (PName{..}, Sourced _ av)
376 | ns <- Namespace $ unescapeAttr av
377 , Nothing <- pNameSpace
378 , NCName "xmlns" <- pNameLocal ->
379 -- Default namespace declaration
380 case ns of
381 _ | ns == xmlns_xml -- DOC: it MUST NOT be declared as the default namespace
382 || ns == xmlns_xmlns -- DOC: it MUST NOT be declared as the default namespace
383 -> p_error $ Error_Namespace_reserved ns
384 _ -> return [(NCName "" , ns)]
385 | ns <- Namespace $ unescapeAttr av
386 , Just (NCName "xmlns") <- pNameSpace ->
387 -- Namespace prefix declaration
388 case unNCName pNameLocal of
389 "xml" -- DOC: It MAY, but need not, be declared,
390 -- and MUST NOT be bound to any other namespace name.
391 | ns == xmlns_xml -> return []
392 | otherwise -> p_error $ Error_Namespace_reserved_prefix pNameLocal
393 "xmlns" -- DOC: It MUST NOT be declared
394 -> p_error $ Error_Namespace_reserved_prefix pNameLocal
395 local | "xml" <- TL.toLower $ TL.take 3 local -> return []
396 -- DOC: All other prefixes beginning with the three-letter
397 -- sequence x, m, l, in any case combination, are reserved.
398 -- This means that: processors MUST NOT treat them as fatal errors.
399 _ | ns == xmlns_xml -- DOC: Other prefixes MUST NOT be bound to this namespace name.
400 || ns == xmlns_xmlns -- DOC: Other prefixes MUST NOT be bound to this namespace name.
401 -> p_error $ Error_Namespace_reserved ns
402 _ -> return [(pNameLocal, ns)]
403 | otherwise -> return []
404 let scopeNS = elemNS <> readTreeInh_ns_scope ro
405 let defaultNS = HM.lookupDefault (readTreeInh_ns_default ro) (NCName "") scopeNS
406 let
407 lookupNamePrefix prefix =
408 maybe (p_error $ Error_Namespace_prefix_unknown prefix) return $
409 HM.lookup prefix scopeNS
410 elemName :: QName <-
411 -- Expand element's QName
412 case pNameSpace n of
413 Nothing -> return QName{qNameSpace=defaultNS, qNameLocal=pNameLocal n}
414 -- DOC: If there is a default namespace declaration in scope,
415 -- the expanded name corresponding to an unprefixed element name
416 -- has the URI of the default namespace as its namespace name.
417 Just prefix
418 | NCName "xmlns" <- prefix ->
419 -- DOC: Element names MUST NOT have the prefix xmlns.
420 p_error $ Error_Namespace_reserved_prefix prefix
421 | otherwise -> do
422 ns <- lookupNamePrefix prefix
423 return QName{qNameSpace=ns, qNameLocal=pNameLocal n}
424 elemAttrs :: [(QName, FileSourced EscapedAttr)] <-
425 -- Expand attributes' PName into QName
426 forM attrs $ \(an, av) -> do
427 ns <- maybe (return "") lookupNamePrefix $ pNameSpace an
428 let qn = QName{qNameSpace=ns, qNameLocal=pNameLocal an}
429 return (qn, av)
430 -- Check for attribute collision
431 let
432 attrsByQName :: HM.HashMap QName [FileSourced EscapedAttr] =
433 HM.fromListWith (<>) $ (<$> elemAttrs) $
434 \(an, av) -> (an, [av])
435 case HM.toList $ HM.filter (\x -> length x > 1) attrsByQName of
436 (an, _):_ -> p_error $ Error_Attribute_collision an
437 _ -> return ()
438 content :: FileSourcedTrees <-
439 mempty <$ P.string "/>" <|>
440 R.local
441 (const ro
442 { readTreeInh_ns_scope = scopeNS
443 , readTreeInh_ns_default = defaultNS
444 })
445 (P.char '>' *> p_content <* p_ETag elemName)
446 src <- p_SourcedEnd
447 return $ TS.Tree (src $ NodeElem elemName (List.head <$> attrsByQName)) content
448
449 -- *** Attribute
450 -- | Note: despite the type, the returned 'FileSource'
451 -- encompasses also the attribute 'PName'.
452 -- It is pushed in the attribute value to fit the insertion
453 -- of the attribute into a 'HM.HashMap'.
454 p_Attribute :: P.Tokens s ~ TL.Text => ReadTree Error s (PName, FileSourced EscapedAttr)
455 p_Attribute =
456 p_SourcedBegin $ do
457 an <- P.try $ p_Spaces1 *> p_PName
458 void p_Eq
459 av <- p_AttrValue
460 src <- p_SourcedEnd
461 return (an, src av)
462
463 p_AttrValue :: P.Tokens s ~ TL.Text => ReadTree Error s EscapedAttr
464 p_AttrValue = p_quoted p_AttrValueText
465
466 p_AttrValueText :: P.Tokens s ~ TL.Text => Char -> ReadTree Error s EscapedAttr
467 p_AttrValueText q =
468 EscapedAttr . Seq.fromList <$> P.many (
469 p_Reference <|>
470 -- Supplementary alternative to always escape the quote
471 -- as expected by 'EscapedAttr'.
472 (if q /= '\"' then EscapedEntityRef entityRef_quot <$ P.char '"' else P.empty) <|>
473 EscapedPlain <$> P.label ("[^<&"<>[q]<>"]")
474 (P.takeWhile1P Nothing $ \c ->
475 XC.isXmlChar c &&
476 c `List.notElem` (q:"<&")
477 )
478 )
479
480 -- * content
481 p_content :: P.Tokens s ~ TL.Text => ReadTree Error s FileSourcedTrees
482 p_content =
483 (Seq.fromList <$>) $ P.many $
484 (p_SourcedBegin $ do
485 P.try $ P.char '<' *> P.notFollowedBy (P.char '/')
486 p_Element_ <|> p_PI_ <|> (P.char '!' *> (p_Comment_ <|> p_CDSect_))
487 )
488 <|> (
489 (TS.tree0 <$>) $
490 p_Sourced $ NodeText . EscapedText . foldMap unEscapedText
491 <$> P.some (
492 p_CharData <|>
493 EscapedText . pure <$> p_Reference
494 )
495 )
496
497 -- *** ETag
498 p_ETag :: P.Tokens s ~ TL.Text => QName -> ReadTree Error s ()
499 p_ETag expected = do
500 got <- P.string "</" *> p_QName <* p_Spaces <* P.char '>'
501 unless (got == expected) $
502 p_error $ Error_Closing_tag_unexpected got expected
503
504 -- * PName
505 p_PName :: P.Tokens s ~ TL.Text => ReadTree e s PName
506 p_PName = do
507 n <- p_NCName
508 s <- P.optional $ P.try $ P.char ':' *> p_NCName
509 return $ case s of
510 Nothing -> PName{pNameSpace=Nothing, pNameLocal=n}
511 Just l -> PName{pNameSpace=Just n , pNameLocal=l}
512
513 -- * QName
514 p_QName :: P.Tokens s ~ TL.Text => ReadTree Error s QName
515 p_QName = do
516 n <- p_NCName
517 s <- P.optional $ P.try $ P.char ':' *> p_NCName
518 ReadTreeInh{..} <- R.ask
519 case s of
520 Nothing -> return QName{qNameSpace=readTreeInh_ns_default, qNameLocal=n}
521 Just l ->
522 case HM.lookup n readTreeInh_ns_scope of
523 Nothing -> p_error $ Error_Namespace_prefix_unknown n
524 Just ns -> return QName{qNameSpace=ns, qNameLocal=l}
525
526 -- ** NCName
527 p_NCName :: P.Tokens s ~ TL.Text => ReadTree e s NCName
528 p_NCName = P.label "NCName" $ NCName
529 <$ P.notFollowedBy (P.satisfy (not . XC.isXmlNCNameStartChar))
530 <*> P.takeWhile1P Nothing XC.isXmlNCNameChar
531
532 -- * Reference
533 p_Reference :: P.Tokens s ~ TL.Text => ReadTree Error s Escaped
534 p_Reference =
535 EscapedCharRef <$> p_CharRef <|>
536 EscapedEntityRef <$> p_EntityRef
537
538 -- ** EntityRef
539 p_EntityRef :: P.Tokens s ~ TL.Text => ReadTree Error s EntityRef
540 p_EntityRef = do
541 ref <- P.char '&' *> p_NCName <* P.char ';'
542 EntityRef ref <$> lookupEntityRef ref
543 where
544 -- Because entities are declared in the (unimplemented) DTD,
545 -- only builtins entities are supported for now.
546 lookupEntityRef (NCName "lt" ) = pure "<"
547 lookupEntityRef (NCName "gt" ) = pure ">"
548 lookupEntityRef (NCName "amp" ) = pure "&"
549 lookupEntityRef (NCName "apos") = pure "'"
550 lookupEntityRef (NCName "quot") = pure "\""
551 lookupEntityRef n = p_error $ Error_EntityRef_unknown n
552
553 -- ** CharRef
554 p_CharRef :: P.Tokens s ~ TL.Text => ReadTree Error s CharRef
555 p_CharRef =
556 do
557 ref <- readHexadecimal
558 <$ P.string "&#x"
559 <*> P.some P.hexDigitChar
560 <* P.char ';'
561 check ref
562 <|> do
563 ref <- readDecimal
564 <$ P.string "&#"
565 <*> P.some P.digitChar
566 <* P.char ';'
567 check ref
568 where
569 check i =
570 let c = toEnum (fromInteger i) in
571 if i <= toInteger (fromEnum (maxBound::Char))
572 && XC.isXmlChar c
573 then pure $ CharRef c
574 else p_error $ Error_CharRef_invalid i
575
576 readInt :: Integer -> String -> Integer
577 readInt base digits =
578 sign * List.foldl' acc 0 (List.concatMap digToInt digits1)
579 where
580 acc q r = q*base + r
581 (sign, digits1) =
582 case digits of
583 [] -> (1, digits)
584 c:ds | c == '-' -> (-1, ds)
585 | c == '+' -> ( 1, ds)
586 | otherwise -> ( 1, digits)
587 ord = toInteger . Char.ord
588 digToInt c
589 | Char.isDigit c = [ord c - ord '0']
590 | Char.isAsciiLower c = [ord c - ord 'a' + 10]
591 | Char.isAsciiUpper c = [ord c - ord 'A' + 10]
592 | otherwise = []
593
594 readDecimal :: String -> Integer
595 readDecimal = readInt 10
596
597 readHexadecimal :: String -> Integer
598 readHexadecimal = readInt 16
599
600 -- * Char
601 p_Char :: P.Tokens s ~ TL.Text => ReadTree e s Char
602 p_Char = P.label "XmlChar" $ P.satisfy XC.isXmlCharCR <|> p_CRLF
603 {-# INLINE p_Char #-}
604
605 -- ** Space
606 -- | Map '\r' and '\r\n' to '\n'.
607 -- See: https://www.w3.org/TR/xml/#sec-line-ends
608 p_CRLF :: P.Tokens s ~ TL.Text => ReadTree e s Char
609 p_CRLF = P.char '\r' *> P.option '\n' (P.char '\n')
610
611 p_Space :: P.Tokens s ~ TL.Text => ReadTree e s Char
612 p_Space = P.label "space" $ P.satisfy XC.isXmlSpaceCharCR <|> p_CRLF
613 {-# INLINE p_Space #-}
614
615 p_Spaces :: P.Tokens s ~ TL.Text => ReadTree e s ()
616 p_Spaces = P.label "spaces" $ void $ P.takeWhileP Nothing XC.isXmlSpaceChar
617 {-# INLINE p_Spaces #-}
618
619 p_Spaces1 :: P.Tokens s ~ TL.Text => ReadTree e s ()
620 p_Spaces1 = P.label "spaces" $ void $ P.takeWhile1P Nothing XC.isXmlSpaceChar
621 {-# INLINE p_Spaces1 #-}
622
623 -- * Eq
624 p_separator :: P.Tokens s ~ TL.Text => Char -> ReadTree e s ()
625 p_separator c = P.try (() <$ p_Spaces <* P.char c) <* p_Spaces
626
627 p_Eq :: P.Tokens s ~ TL.Text => ReadTree e s ()
628 p_Eq = p_separator '='