1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# OPTIONS_GHC -fno-warn-orphans #-}
8 module Symantic.XML.Read
9 ( module Symantic.XML.Read.Parser
10 , module Symantic.XML.Read
13 import Control.Arrow (left)
14 import Control.Applicative (Applicative(..), Alternative(..))
15 import Control.Monad (Monad(..), void, unless, forM, join)
17 import Data.Char (Char)
18 import Data.Default.Class (Default(..))
19 import Data.Either (Either(..))
20 import Data.Eq (Eq(..))
21 import Data.Foldable (Foldable(..))
22 import Data.Function (($), (.), const)
23 import Data.Functor ((<$>), (<$))
24 import Data.Maybe (Maybe(..), maybe)
25 import Data.Monoid (Monoid(..))
26 import Data.Ord (Ord(..))
27 import Data.Semigroup (Semigroup(..))
28 import Data.String (String)
29 import Data.TreeSeq.Strict (Tree(..))
30 import Data.Tuple (snd)
31 import Prelude (Num(..), Enum(..), Bounded(..), Integer, toInteger)
32 import System.IO (FilePath, IO)
33 import Text.Megaparsec ((<?>))
34 import Text.Show (Show(..))
35 import qualified Control.Exception as Exn
36 import qualified Control.Monad.Trans.Reader as R
37 import qualified Data.ByteString.Lazy as BSL
38 import qualified Data.Char as Char
39 import qualified Data.Char.Properties.XMLCharProps as XC
40 import qualified Data.HashMap.Strict as HM
41 import qualified Data.List as List
42 import qualified Data.Sequence as Seq
43 import qualified Data.Text.Encoding.Error as TL
44 import qualified Data.Text.Lazy as TL
45 import qualified Data.Text.Lazy.Encoding as TL
46 import qualified Data.TreeSeq.Strict as TS
47 import qualified System.IO.Error as IO
48 import qualified Text.Megaparsec as P
49 import qualified Text.Megaparsec.Char as P
51 import Symantic.XML.Document hiding (XML, XMLs)
52 import Symantic.XML.Read.Parser
54 readXML :: FilePath -> TL.Text -> Either (P.ParseErrorBundle TL.Text Error) XMLs
55 readXML filePath stateInput =
58 (R.runReaderT p_document def)
62 , P.statePosState = P.PosState
63 { P.pstateInput = stateInput
65 , P.pstateSourcePos = P.initialPos filePath
66 , P.pstateTabWidth = P.pos1
67 , P.pstateLinePrefix = ""
71 readFile :: FilePath -> IO (Either ErrorRead TL.Text)
73 (left ErrorRead_Unicode . TL.decodeUtf8' <$> BSL.readFile fp)
75 if IO.isAlreadyInUseError e
76 || IO.isDoesNotExistError e
77 || IO.isPermissionError e
78 then return $ Left $ ErrorRead_IO e
83 = ErrorRead_IO IO.IOError
84 | ErrorRead_Unicode TL.UnicodeException
88 p_document :: P.Tokens s ~ TL.Text => Parser Error s XMLs
94 return (ps <> pure e <> join (Seq.fromList ms))
97 p_prolog :: P.Tokens s ~ TL.Text => Parser Error s XMLs
99 xmlDecl <- P.option Seq.empty $ pure <$> p_XMLDecl
101 return (xmlDecl <> join (Seq.fromList ms))
104 p_Misc :: P.Tokens s ~ TL.Text => Parser Error s XMLs
106 P.try (pure <$> p_Comment)
107 <|> P.try (pure <$> p_PI)
111 p_XMLDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
112 p_XMLDecl = P.label "XMLDecl" $ do
113 Sourced src as <- p_Sourced $ P.between (P.string "<?xml") (P.string "?>") $ do
114 vi <- pure <$> p_VersionInfo
115 ed <- P.option Seq.empty $ pure <$> p_EncodingDecl
116 sd <- P.option Seq.empty $ pure <$> p_SDDecl
118 return $ vi <> ed <> sd
119 return $ Tree (Sourced src $ NodePI "xml" "") as
121 p_VersionInfo :: P.Tokens s ~ TL.Text => Parser Error s XML
122 p_VersionInfo = P.label "VersionInfo" $ do
123 Sourced c v <- p_Sourced $ do
124 P.try (() <$ p_Spaces1 <* P.string "version")
126 p_quoted $ const $ p_Sourced $
129 <*> P.takeWhile1P Nothing Char.isDigit
130 return $ Tree (Sourced c $ NodeAttr "version") $ pure $
131 TS.tree0 $ NodeText . EscapedText . pure . EscapedPlain <$> v
133 p_EncodingDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
134 p_EncodingDecl = P.label "EncodingDecl" $ do
135 Sourced c v <- p_Sourced $ do
136 P.try (() <$ p_Spaces1 <* P.string "encoding")
138 p_quoted $ const $ p_Sourced p_EncName
139 return $ Tree (Sourced c $ NodeAttr "encoding") $ pure $
140 TS.tree0 $ NodeText . EscapedText . pure . EscapedPlain <$> v
142 p_EncName :: P.Tokens s ~ TL.Text => Parser Error s TL.Text
143 p_EncName = P.label "EncName" $ do
144 P.notFollowedBy (P.satisfy $ not . isAlpha)
145 P.takeWhile1P Nothing $ \c ->
146 isAlpha c || Char.isDigit c ||
147 c=='.' || c=='_' || c=='-'
148 where isAlpha c = Char.isAsciiLower c || Char.isAsciiUpper c
151 p_SDDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
152 p_SDDecl = P.label "SDDecl" $ do
154 Sourced ca () <- P.try (p_Sourced $ () <$ p_Spaces1 <* P.string "standalone")
156 v <- p_quoted $ const $ p_Sourced $ P.string "yes" <|> P.string "no"
157 return $ Tree (Sourced ca $ NodeAttr "standalone") $ pure $
158 TS.tree0 $ NodeText . EscapedText . pure . EscapedPlain <$> v
161 p_CharData :: P.Tokens s ~ TL.Text => Parser e s EscapedText
164 <$> p_until1 (\c -> XC.isXmlChar c && c/='<' && c/='&') (']',"]>")
167 p_Comment :: P.Tokens s ~ TL.Text => Parser Error s XML
168 p_Comment = p_SourcedBegin $ P.string "<!--" *> p_Comment__
169 p_Comment_ :: P.Tokens s ~ TL.Text => Parser Error s XML
170 p_Comment_ = P.string "--" *> p_Comment__
171 p_Comment__:: P.Tokens s ~ TL.Text => Parser Error s XML
172 p_Comment__ = P.label "Comment" $ do
173 c <- p_until XC.isXmlChar ('-', "-")
174 void $ P.string "-->"
176 return $ TS.tree0 (cell $ NodeComment c)
179 p_CDSect :: P.Tokens s ~ TL.Text => Parser Error s XML
180 p_CDSect = p_SourcedBegin $ P.string "<![CDATA[" *> p_CDSect__
181 p_CDSect_ :: P.Tokens s ~ TL.Text => Parser Error s XML
182 p_CDSect_ = P.string "[CDATA[" *> p_CDSect__
183 p_CDSect__ :: P.Tokens s ~ TL.Text => Parser Error s XML
184 p_CDSect__ = P.label "CDSect" $ do
185 c <- p_until XC.isXmlChar (']', "]>")
186 void $ P.string "]]>"
188 return $ TS.tree0 $ cell $ NodeCDATA c
191 p_PI :: P.Tokens s ~ TL.Text => Parser Error s XML
192 p_PI = p_SourcedBegin $ P.string "<?" *> p_PI__
193 p_PI_ :: P.Tokens s ~ TL.Text => Parser Error s XML
194 p_PI_ = P.char '?' *> p_PI__
195 p_PI__ :: P.Tokens s ~ TL.Text => Parser Error s XML
196 p_PI__ = P.label "PI" $ do
198 v <- P.option "" $ P.try $ p_Spaces1 *> p_until XC.isXmlChar ('?', ">")
201 return $ TS.tree0 $ cell $ NodePI n v
202 p_PITarget :: P.Tokens s ~ TL.Text => Parser Error s PName
206 PName{pNameSpace=Nothing, pNameLocal=NCName l}
207 | "xml" == TL.toLower l -> p_error $ Error_PI_reserved n
211 p_Element :: P.Tokens s ~ TL.Text => Parser Error s XML
212 p_Element = p_SourcedBegin $ (P.char '<' *> p_Element_)
213 p_Element_ :: P.Tokens s ~ TL.Text => Parser Error s XML
214 p_Element_ = P.label "Element" p_STag
217 p_STag :: P.Tokens s ~ TL.Text => Parser Error s XML
220 as <- P.many $ P.try $ p_Spaces1 *> p_Attribute
223 elemNS :: HM.HashMap NCName Namespace <-
224 (HM.fromList . List.concat <$>) $ forM as $ \case
225 Sourced _ (PName{..}, Sourced _ av)
226 | ns <- Namespace $ unescapeText av
227 , Nothing <- pNameSpace
228 , NCName "xmlns" <- pNameLocal ->
229 -- NOTE: default namespace declaration.
231 _ | ns == xmlns_xml -- DOC: it MUST NOT be declared as the default namespace
232 || ns == xmlns_xmlns -- DOC: it MUST NOT be declared as the default namespace
233 -> p_error $ Error_Namespace_reserved ns
234 _ -> return [(NCName "" , ns)]
235 | ns <- Namespace $ unescapeText av
236 , Just (NCName "xmlns") <- pNameSpace ->
237 -- NOTE: namespace prefix declaration.
238 case unNCName pNameLocal of
239 "xml" -- DOC: It MAY, but need not, be declared,
240 -- and MUST NOT be bound to any other namespace name.
241 | ns == xmlns_xml -> return []
242 | otherwise -> p_error $ Error_Namespace_reserved_prefix pNameLocal
243 "xmlns" -- DOC: It MUST NOT be declared
244 -> p_error $ Error_Namespace_reserved_prefix pNameLocal
245 local | "xml" <- TL.toLower $ TL.take 3 local -> return []
246 -- DOC: All other prefixes beginning with the three-letter
247 -- sequence x, m, l, in any case combination, are reserved.
248 -- This means that: processors MUST NOT treat them as fatal errors.
249 _ | ns == xmlns_xml -- DOC: Other prefixes MUST NOT be bound to this namespace name.
250 || ns == xmlns_xmlns -- DOC: Other prefixes MUST NOT be bound to this namespace name.
251 -> p_error $ Error_Namespace_reserved ns
252 _ -> return [(pNameLocal, ns)]
253 | otherwise -> return []
254 let scopeNS = elemNS <> reader_ns_scope ro
255 let defaultNS = HM.lookupDefault (reader_ns_default ro) (NCName "") scopeNS
256 let lookupNamePrefix prefix =
257 maybe (p_error $ Error_Namespace_prefix_unknown prefix) return $
258 HM.lookup prefix scopeNS
260 -- NOTE: expand element's QName.
262 Nothing -> return QName{qNameSpace=defaultNS, qNameLocal=pNameLocal n}
263 -- DOC: If there is a default namespace declaration in scope,
264 -- the expanded name corresponding to an unprefixed element name
265 -- has the URI of the default namespace as its namespace name.
267 | NCName "xmlns" <- prefix ->
268 -- DOC: Element names MUST NOT have the prefix xmlns.
269 p_error $ Error_Namespace_reserved_prefix prefix
271 ns <- lookupNamePrefix prefix
272 return QName{qNameSpace=ns, qNameLocal=pNameLocal n}
273 elemAttrs :: [FileSourced (QName, FileSourced EscapedText)] <-
274 -- NOTE: expand attributes' PName into QName.
275 forM as $ \s@Sourced{unSourced=(an, av)} -> do
276 ns <- maybe (return "") lookupNamePrefix $ pNameSpace an
277 let qn = QName{qNameSpace=ns, qNameLocal=pNameLocal an}
278 return s{unSourced=(qn, av)}
279 -- NOTE: check for attribute collision.
280 let attrsByQName :: HM.HashMap QName [FileSourced (QName, FileSourced EscapedText)] =
281 HM.fromListWith (<>) $ (<$> elemAttrs) $ \a@(Sourced _c (an, _av)) -> (an, [a])
282 case HM.toList $ HM.filter (\x -> length x > 1) attrsByQName of
283 (an, _):_ -> p_error $ Error_Attribute_collision an
285 elemAttrsXML :: XMLs <- (Seq.fromList <$>) $
286 forM elemAttrs $ \(Sourced sa (an, av)) -> do
287 return $ TS.Tree (Sourced sa $ NodeAttr an) $
288 pure $ TS.tree0 $ NodeText <$> av
290 elemAttrsXML <$ P.string "/>" <|>
293 { reader_ns_scope = scopeNS
294 , reader_ns_default = defaultNS
296 ((elemAttrsXML <>) <$ P.char '>' <*> p_content <* p_ETag elemName)
298 return $ Tree (cell $ NodeElem elemName) content
301 p_Attribute :: P.Tokens s ~ TL.Text => Parser Error s (FileSourced (PName, FileSourced EscapedText))
302 p_Attribute = p_Sourced $ (,) <$> p_PName <* p_Eq <*> p_AttValue
304 p_AttValue :: P.Tokens s ~ TL.Text => Parser Error s (FileSourced EscapedText)
305 p_AttValue = P.label "AttValue" $ p_quoted p_AttValueText
307 p_AttValueText :: P.Tokens s ~ TL.Text => Char -> Parser Error s (FileSourced EscapedText)
308 p_AttValueText q = p_Sourced $
309 EscapedText . Seq.fromList <$> P.many
311 <|> EscapedPlain <$> P.takeWhile1P Nothing (\c ->
313 c `List.notElem` (q:"<&'\">"))
314 <|> EscapedEntityRef entityRef_gt <$ P.char '>'
316 then EscapedEntityRef entityRef_quot <$ P.char '"'
317 else EscapedEntityRef entityRef_apos <$ P.char '\'')
321 p_content :: P.Tokens s ~ TL.Text => Parser Error s XMLs
323 (Seq.fromList <$>) $ P.many $
325 P.try $ P.char '<' *> P.notFollowedBy (P.char '/')
326 p_Element_ <|> p_PI_ <|> (P.char '!' *> (p_Comment_ <|> p_CDSect_))
328 <|> ((tree0 <$>) $ p_Sourced $ NodeText . mconcat
329 <$> P.some (p_CharData <|> EscapedText . pure <$> p_Reference))
332 p_ETag :: P.Tokens s ~ TL.Text => QName -> Parser Error s ()
334 got <- P.string "</" *> p_QName <* p_Spaces <* P.char '>'
335 unless (got == expected) $
336 p_error $ Error_Closing_tag_unexpected got expected
339 p_Name :: P.Tokens s ~ TL.Text => Parser Error s Name
340 p_Name = P.label "Name" $
342 <$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNameStartChar)
343 <*> P.takeWhile1P Nothing XC.isXmlNameChar
346 p_PName :: P.Tokens s ~ TL.Text => Parser e s PName
347 p_PName = P.label "PName" $ do
349 s <- P.optional $ P.try $ P.char ':' *> p_NCName
351 Nothing -> PName{pNameSpace=Nothing, pNameLocal=n}
352 Just l -> PName{pNameSpace=Just n , pNameLocal=l}
355 p_QName :: P.Tokens s ~ TL.Text => Parser Error s QName
356 p_QName = P.label "QName" $ do
358 s <- P.optional $ P.try $ P.char ':' *> p_NCName
361 Nothing -> return QName{qNameSpace=reader_ns_default, qNameLocal=n}
363 case HM.lookup n reader_ns_scope of
364 Nothing -> p_error $ Error_Namespace_prefix_unknown n
365 Just ns -> return QName{qNameSpace=ns, qNameLocal=l}
368 p_NCName :: P.Tokens s ~ TL.Text => Parser e s NCName
369 p_NCName = P.label "NCName" $
371 <$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNCNameStartChar)
372 <*> P.takeWhile1P Nothing XC.isXmlNCNameChar
375 p_Reference :: P.Tokens s ~ TL.Text => Parser Error s Escaped
377 EscapedCharRef <$> p_CharRef <|>
378 EscapedEntityRef <$> p_EntityRef
381 p_EntityRef :: P.Tokens s ~ TL.Text => Parser Error s EntityRef
382 p_EntityRef = P.label "EntityRef" $ do
383 ref <- P.char '&' *> p_NCName <* P.char ';'
384 EntityRef ref <$> lookupEntityRef ref
386 lookupEntityRef (NCName "lt" ) = pure "<"
387 lookupEntityRef (NCName "gt" ) = pure ">"
388 lookupEntityRef (NCName "amp" ) = pure "&"
389 lookupEntityRef (NCName "apos") = pure "'"
390 lookupEntityRef (NCName "quot") = pure "\""
391 lookupEntityRef n = p_error $ Error_EntityRef_unknown n
394 p_CharRef :: P.Tokens s ~ TL.Text => Parser Error s CharRef
395 p_CharRef = P.label "CharRef" $
397 ref <- readHexadecimal
399 <*> P.some P.hexDigitChar
405 <*> P.some P.digitChar
410 let c = toEnum (fromInteger i) in
411 if i <= toInteger (fromEnum (maxBound::Char))
413 then pure $ CharRef c
414 else p_error $ Error_CharRef_invalid i
416 readInt :: Integer -> String -> Integer
417 readInt base digits =
418 sign * List.foldl' acc 0 (List.concatMap digToInt digits1)
424 c:ds | c == '-' -> (-1, ds)
425 | c == '+' -> ( 1, ds)
426 | otherwise -> ( 1, digits)
427 ord = toInteger . Char.ord
429 | Char.isDigit c = [ord c - ord '0']
430 | Char.isAsciiLower c = [ord c - ord 'a' + 10]
431 | Char.isAsciiUpper c = [ord c - ord 'A' + 10]
434 readDecimal :: String -> Integer
435 readDecimal = readInt 10
437 readHexadecimal :: String -> Integer
438 readHexadecimal = readInt 16
441 p_Char :: P.Tokens s ~ TL.Text => Parser e s Char
442 p_Char = P.label "Char" $ P.satisfy XC.isXmlCharCR <|> p_CRLF
443 {-# INLINE p_Char #-}
446 -- | Map '\r' and '\r\n' to '\n'.
447 p_CRLF :: P.Tokens s ~ TL.Text => Parser e s Char
448 p_CRLF = P.label "CRLF" $
449 P.char '\r' *> P.option '\n' (P.char '\n')
451 p_Space :: P.Tokens s ~ TL.Text => Parser e s Char
452 p_Space = P.label "Space" $
453 P.satisfy XC.isXmlSpaceCharCR <|> p_CRLF
454 {-# INLINE p_Space #-}
456 p_Spaces :: P.Tokens s ~ TL.Text => Parser e s ()
457 p_Spaces = P.label "Spaces" $
458 void $ P.takeWhileP Nothing XC.isXmlSpaceChar
459 {-# INLINE p_Spaces #-}
461 p_S :: P.Tokens s ~ TL.Text => Parser Error s XML
462 p_S = P.label "Spaces" $
463 (\ts -> TS.tree0 (NodeText . EscapedText . pure . EscapedPlain . TL.concat <$> ts))
464 <$> p_Sourced (P.some $
465 P.takeWhile1P Nothing XC.isXmlSpaceCharCR <|>
466 TL.singleton <$> p_CRLF)
468 p_Spaces1 :: P.Tokens s ~ TL.Text => Parser e s ()
469 p_Spaces1 = P.label "Spaces1" $
470 void $ P.takeWhile1P Nothing XC.isXmlSpaceChar
471 {-# INLINE p_Spaces1 #-}
474 p_separator :: P.Tokens s ~ TL.Text => Char -> Parser e s ()
475 p_separator c = P.try (() <$ p_Spaces <* P.char c) <* p_Spaces <?> [c]
477 p_Eq :: P.Tokens s ~ TL.Text => Parser e s ()
478 p_Eq = p_separator '=' <?> "Eq"