1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# OPTIONS_GHC -fno-warn-orphans #-}
8 module Language.Symantic.XML.Read
9 ( module Language.Symantic.XML.Read.Parser
10 , module Language.Symantic.XML.Read
13 import Control.Arrow (left)
14 import Control.Applicative (Applicative(..), Alternative(..))
15 import Control.Monad (Monad(..), void, unless, forM, join)
17 import Data.Char (Char)
18 import Data.Default.Class (Default(..))
19 import Data.Either (Either(..))
20 import Data.Eq (Eq(..))
21 import Data.Foldable (Foldable(..))
22 import Data.Function (($), (.), const)
23 import Data.Functor ((<$>), (<$))
24 import Data.Maybe (Maybe(..), maybe)
25 import Data.Monoid (Monoid(..))
26 import Data.Ord (Ord(..))
27 import Data.Semigroup (Semigroup(..))
28 import Data.String (String)
29 import Data.TreeSeq.Strict (Tree(..))
30 import Data.Tuple (snd)
31 import Prelude (Num(..), Enum(..), Bounded(..), Integer, toInteger)
32 import System.IO (FilePath, IO)
33 import Text.Megaparsec ((<?>))
34 import Text.Show (Show(..))
35 import qualified Control.Exception as Exn
36 import qualified Control.Monad.Trans.Reader as R
37 import qualified Data.ByteString.Lazy as BSL
38 import qualified Data.Char as Char
39 import qualified Data.Char.Properties.XMLCharProps as XC
40 import qualified Data.HashMap.Strict as HM
41 import qualified Data.List as List
42 import qualified Data.Sequence as Seq
43 import qualified Data.Text.Encoding.Error as TL
44 import qualified Data.Text.Lazy as TL
45 import qualified Data.Text.Lazy.Encoding as TL
46 import qualified Data.TreeSeq.Strict as TS
47 import qualified System.IO.Error as IO
48 import qualified Text.Megaparsec as P
49 import qualified Text.Megaparsec.Char as P
51 import Language.Symantic.XML.Document
52 import Language.Symantic.XML.Read.Parser
54 readXML :: FilePath -> TL.Text -> Either (P.ParseError (P.Token TL.Text) Error) XMLs
55 readXML filePath stateInput =
58 (R.runReaderT p_document def)
61 , P.statePos = pure $ P.initialPos filePath
62 , P.stateTabWidth = P.pos1 -- NOTE: do not expand tabs.
63 , P.stateTokensProcessed = 0
66 readFile :: FilePath -> IO (Either ErrorRead TL.Text)
68 (left ErrorRead_Unicode . TL.decodeUtf8' <$> BSL.readFile fp)
70 if IO.isAlreadyInUseError e
71 || IO.isDoesNotExistError e
72 || IO.isPermissionError e
73 then return $ Left $ ErrorRead_IO e
78 = ErrorRead_IO IO.IOError
79 | ErrorRead_Unicode TL.UnicodeException
83 p_document :: P.Tokens s ~ TL.Text => Parser Error s XMLs
89 return (ps <> pure e <> join (Seq.fromList ms))
92 p_prolog :: P.Tokens s ~ TL.Text => Parser Error s XMLs
94 xmlDecl <- P.option Seq.empty $ pure <$> p_XMLDecl
96 return (xmlDecl <> join (Seq.fromList ms))
99 p_Misc :: P.Tokens s ~ TL.Text => Parser Error s XMLs
101 P.try (pure <$> p_Comment)
102 <|> P.try (pure <$> p_PI)
106 p_XMLDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
107 p_XMLDecl = P.label "XMLDecl" $ do
108 Sourced src as <- p_Sourced $ P.between (P.string "<?xml") (P.string "?>") $ do
109 vi <- pure <$> p_VersionInfo
110 ed <- P.option Seq.empty $ pure <$> p_EncodingDecl
111 sd <- P.option Seq.empty $ pure <$> p_SDDecl
113 return $ vi <> ed <> sd
114 return $ Tree (Sourced src $ NodePI "xml" "") as
116 p_VersionInfo :: P.Tokens s ~ TL.Text => Parser Error s XML
117 p_VersionInfo = P.label "VersionInfo" $ do
118 Sourced c v <- p_Sourced $ do
119 P.try (() <$ p_Spaces1 <* P.string "version")
121 p_quoted $ const $ p_Sourced $
124 <*> P.takeWhile1P Nothing Char.isDigit
125 return $ Tree (Sourced c $ NodeAttr "version") $ pure $
126 TS.tree0 $ NodeText . EscapedText . pure . EscapedPlain <$> v
128 p_EncodingDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
129 p_EncodingDecl = P.label "EncodingDecl" $ do
130 Sourced c v <- p_Sourced $ do
131 P.try (() <$ p_Spaces1 <* P.string "encoding")
133 p_quoted $ const $ p_Sourced p_EncName
134 return $ Tree (Sourced c $ NodeAttr "encoding") $ pure $
135 TS.tree0 $ NodeText . EscapedText . pure . EscapedPlain <$> v
137 p_EncName :: P.Tokens s ~ TL.Text => Parser Error s TL.Text
138 p_EncName = P.label "EncName" $ do
139 P.notFollowedBy (P.satisfy $ not . isAlpha)
140 P.takeWhile1P Nothing $ \c ->
141 isAlpha c || Char.isDigit c ||
142 c=='.' || c=='_' || c=='-'
143 where isAlpha c = Char.isAsciiLower c || Char.isAsciiUpper c
146 p_SDDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
147 p_SDDecl = P.label "SDDecl" $ do
149 Sourced ca () <- P.try (p_Sourced $ () <$ p_Spaces1 <* P.string "standalone")
151 v <- p_quoted $ const $ p_Sourced $ P.string "yes" <|> P.string "no"
152 return $ Tree (Sourced ca $ NodeAttr "standalone") $ pure $
153 TS.tree0 $ NodeText . EscapedText . pure . EscapedPlain <$> v
156 p_CharData :: P.Tokens s ~ TL.Text => Parser e s EscapedText
159 <$> p_until1 (\c -> XC.isXmlChar c && c/='<' && c/='&') (']',"]>")
162 p_Comment :: P.Tokens s ~ TL.Text => Parser Error s XML
163 p_Comment = p_SourcedBegin $ P.string "<!--" *> p_Comment__
164 p_Comment_ :: P.Tokens s ~ TL.Text => Parser Error s XML
165 p_Comment_ = P.string "--" *> p_Comment__
166 p_Comment__:: P.Tokens s ~ TL.Text => Parser Error s XML
167 p_Comment__ = P.label "Comment" $ do
168 c <- p_until XC.isXmlChar ('-', "-")
169 void $ P.string "-->"
171 return $ TS.tree0 (cell $ NodeComment c)
174 p_CDSect :: P.Tokens s ~ TL.Text => Parser Error s XML
175 p_CDSect = p_SourcedBegin $ P.string "<![CDATA[" *> p_CDSect__
176 p_CDSect_ :: P.Tokens s ~ TL.Text => Parser Error s XML
177 p_CDSect_ = P.string "[CDATA[" *> p_CDSect__
178 p_CDSect__ :: P.Tokens s ~ TL.Text => Parser Error s XML
179 p_CDSect__ = P.label "CDSect" $ do
180 c <- p_until XC.isXmlChar (']', "]>")
181 void $ P.string "]]>"
183 return $ TS.tree0 $ cell $ NodeCDATA c
186 p_PI :: P.Tokens s ~ TL.Text => Parser Error s XML
187 p_PI = p_SourcedBegin $ P.string "<?" *> p_PI__
188 p_PI_ :: P.Tokens s ~ TL.Text => Parser Error s XML
189 p_PI_ = P.char '?' *> p_PI__
190 p_PI__ :: P.Tokens s ~ TL.Text => Parser Error s XML
191 p_PI__ = P.label "PI" $ do
193 v <- P.option "" $ P.try $ p_Spaces1 *> p_until XC.isXmlChar ('?', ">")
196 return $ TS.tree0 $ cell $ NodePI n v
197 p_PITarget :: P.Tokens s ~ TL.Text => Parser Error s PName
201 PName{pNameSpace=Nothing, pNameLocal=NCName l}
202 | "xml" == TL.toLower l -> p_error $ Error_PI_reserved n
206 p_Element :: P.Tokens s ~ TL.Text => Parser Error s XML
207 p_Element = p_SourcedBegin $ (P.char '<' *> p_Element_)
208 p_Element_ :: P.Tokens s ~ TL.Text => Parser Error s XML
209 p_Element_ = P.label "Element" p_STag
212 p_STag :: P.Tokens s ~ TL.Text => Parser Error s XML
215 as <- P.many $ P.try $ p_Spaces1 *> p_Attribute
218 elemNS :: HM.HashMap NCName Namespace <-
219 (HM.fromList . List.concat <$>) $ forM as $ \case
220 Sourced _ (PName{..}, Sourced _ av)
221 | ns <- Namespace $ unescapeText av
222 , Nothing <- pNameSpace
223 , NCName "xmlns" <- pNameLocal ->
224 -- NOTE: default namespace declaration.
226 _ | ns == xmlns_xml -- DOC: it MUST NOT be declared as the default namespace
227 || ns == xmlns_xmlns -- DOC: it MUST NOT be declared as the default namespace
228 -> p_error $ Error_Namespace_reserved ns
229 _ -> return [(NCName "" , ns)]
230 | ns <- Namespace $ unescapeText av
231 , Just (NCName "xmlns") <- pNameSpace ->
232 -- NOTE: namespace prefix declaration.
233 case unNCName pNameLocal of
234 "xml" -- DOC: It MAY, but need not, be declared,
235 -- and MUST NOT be bound to any other namespace name.
236 | ns == xmlns_xml -> return []
237 | otherwise -> p_error $ Error_Namespace_reserved_prefix pNameLocal
238 "xmlns" -- DOC: It MUST NOT be declared
239 -> p_error $ Error_Namespace_reserved_prefix pNameLocal
240 local | "xml" <- TL.toLower $ TL.take 3 local -> return []
241 -- DOC: All other prefixes beginning with the three-letter
242 -- sequence x, m, l, in any case combination, are reserved.
243 -- This means that: processors MUST NOT treat them as fatal errors.
244 _ | ns == xmlns_xml -- DOC: Other prefixes MUST NOT be bound to this namespace name.
245 || ns == xmlns_xmlns -- DOC: Other prefixes MUST NOT be bound to this namespace name.
246 -> p_error $ Error_Namespace_reserved ns
247 _ -> return [(pNameLocal, ns)]
248 | otherwise -> return []
249 let scopeNS = elemNS <> reader_ns_scope ro
250 let defaultNS = HM.lookupDefault (reader_ns_default ro) (NCName "") scopeNS
251 let lookupNamePrefix prefix =
252 maybe (p_error $ Error_Namespace_prefix_unknown prefix) return $
253 HM.lookup prefix scopeNS
255 -- NOTE: expand element's QName.
257 Nothing -> return QName{qNameSpace=defaultNS, qNameLocal=pNameLocal n}
258 -- DOC: If there is a default namespace declaration in scope,
259 -- the expanded name corresponding to an unprefixed element name
260 -- has the URI of the default namespace as its namespace name.
262 | NCName "xmlns" <- prefix ->
263 -- DOC: Element names MUST NOT have the prefix xmlns.
264 p_error $ Error_Namespace_reserved_prefix prefix
266 ns <- lookupNamePrefix prefix
267 return QName{qNameSpace=ns, qNameLocal=pNameLocal n}
268 elemAttrs :: [Sourced FileSource (QName, Sourced FileSource EscapedText)] <-
269 -- NOTE: expand attributes' PName into QName.
270 forM as $ \s@Sourced{unSourced=(an, av)} -> do
271 ns <- maybe (return "") lookupNamePrefix $ pNameSpace an
272 let qn = QName{qNameSpace=ns, qNameLocal=pNameLocal an}
273 return s{unSourced=(qn, av)}
274 -- NOTE: check for attribute collision.
275 let attrsByQName :: HM.HashMap QName [Sourced FileSource (QName, Sourced FileSource EscapedText)] =
276 HM.fromListWith (<>) $ (<$> elemAttrs) $ \a@(Sourced _c (an, _av)) -> (an, [a])
277 case HM.toList $ HM.filter (\x -> length x > 1) attrsByQName of
278 (an, _):_ -> p_error $ Error_Attribute_collision an
280 elemAttrsXML :: XMLs <- (Seq.fromList <$>) $
281 forM elemAttrs $ \(Sourced sa (an, av)) -> do
282 return $ TS.Tree (Sourced sa $ NodeAttr an) $
283 pure $ TS.tree0 $ NodeText <$> av
285 elemAttrsXML <$ P.string "/>" <|>
288 { reader_ns_scope = scopeNS
289 , reader_ns_default = defaultNS
291 ((elemAttrsXML <>) <$ P.char '>' <*> p_content <* p_ETag elemName)
293 return $ Tree (cell $ NodeElem elemName) content
296 p_Attribute :: P.Tokens s ~ TL.Text => Parser Error s (Sourced FileSource (PName, Sourced FileSource EscapedText))
297 p_Attribute = p_Sourced $ (,) <$> p_PName <* p_Eq <*> p_AttValue
299 p_AttValue :: P.Tokens s ~ TL.Text => Parser Error s (Sourced FileSource EscapedText)
300 p_AttValue = P.label "AttValue" $ p_quoted p_AttValueText
302 p_AttValueText :: P.Tokens s ~ TL.Text => Char -> Parser Error s (Sourced FileSource EscapedText)
303 p_AttValueText q = p_Sourced $
304 EscapedText . Seq.fromList <$> P.many
306 <|> EscapedPlain <$> P.takeWhile1P Nothing (\c ->
308 c `List.notElem` (q:"<&'\">"))
309 <|> EscapedEntityRef entityRef_gt <$ P.char '>'
311 then EscapedEntityRef entityRef_quot <$ P.char '"'
312 else EscapedEntityRef entityRef_apos <$ P.char '\'')
316 p_content :: P.Tokens s ~ TL.Text => Parser Error s XMLs
318 (Seq.fromList <$>) $ P.many $
320 P.try $ P.char '<' *> P.notFollowedBy (P.char '/')
321 p_Element_ <|> p_PI_ <|> (P.char '!' *> (p_Comment_ <|> p_CDSect_))
323 <|> ((tree0 <$>) $ p_Sourced $ NodeText . mconcat
324 <$> P.some (p_CharData <|> EscapedText . pure <$> p_Reference))
327 p_ETag :: P.Tokens s ~ TL.Text => QName -> Parser Error s ()
329 got <- P.string "</" *> p_QName <* p_Spaces <* P.char '>'
330 unless (got == expected) $
331 p_error $ Error_Closing_tag_unexpected got expected
334 p_Name :: P.Tokens s ~ TL.Text => Parser Error s Name
335 p_Name = P.label "Name" $
337 <$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNameStartChar)
338 <*> P.takeWhile1P Nothing XC.isXmlNameChar
341 p_PName :: P.Tokens s ~ TL.Text => Parser e s PName
342 p_PName = P.label "PName" $ do
344 s <- P.optional $ P.try $ P.char ':' *> p_NCName
346 Nothing -> PName{pNameSpace=Nothing, pNameLocal=n}
347 Just l -> PName{pNameSpace=Just n , pNameLocal=l}
350 p_QName :: P.Tokens s ~ TL.Text => Parser Error s QName
351 p_QName = P.label "QName" $ do
353 s <- P.optional $ P.try $ P.char ':' *> p_NCName
356 Nothing -> return QName{qNameSpace=reader_ns_default, qNameLocal=n}
358 case HM.lookup n reader_ns_scope of
359 Nothing -> p_error $ Error_Namespace_prefix_unknown n
360 Just ns -> return QName{qNameSpace=ns, qNameLocal=l}
363 p_NCName :: P.Tokens s ~ TL.Text => Parser e s NCName
364 p_NCName = P.label "NCName" $
366 <$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNCNameStartChar)
367 <*> P.takeWhile1P Nothing XC.isXmlNCNameChar
370 p_Reference :: P.Tokens s ~ TL.Text => Parser Error s Escaped
372 EscapedCharRef <$> p_CharRef <|>
373 EscapedEntityRef <$> p_EntityRef
376 p_EntityRef :: P.Tokens s ~ TL.Text => Parser Error s EntityRef
377 p_EntityRef = P.label "EntityRef" $ do
378 ref <- P.char '&' *> p_NCName <* P.char ';'
379 EntityRef ref <$> lookupEntityRef ref
381 lookupEntityRef (NCName "lt" ) = pure "<"
382 lookupEntityRef (NCName "gt" ) = pure ">"
383 lookupEntityRef (NCName "amp" ) = pure "&"
384 lookupEntityRef (NCName "apos") = pure "'"
385 lookupEntityRef (NCName "quot") = pure "\""
386 lookupEntityRef n = p_error $ Error_EntityRef_unknown n
389 p_CharRef :: P.Tokens s ~ TL.Text => Parser Error s CharRef
390 p_CharRef = P.label "CharRef" $
392 ref <- readHexadecimal
394 <*> P.some P.hexDigitChar
400 <*> P.some P.digitChar
405 let c = toEnum (fromInteger i) in
406 if i <= toInteger (fromEnum (maxBound::Char))
408 then pure $ CharRef c
409 else p_error $ Error_CharRef_invalid i
411 readInt :: Integer -> String -> Integer
412 readInt base digits =
413 sign * List.foldl' acc 0 (List.concatMap digToInt digits1)
419 c:ds | c == '-' -> (-1, ds)
420 | c == '+' -> ( 1, ds)
421 | otherwise -> ( 1, digits)
422 ord = toInteger . Char.ord
424 | Char.isDigit c = [ord c - ord '0']
425 | Char.isAsciiLower c = [ord c - ord 'a' + 10]
426 | Char.isAsciiUpper c = [ord c - ord 'A' + 10]
429 readDecimal :: String -> Integer
430 readDecimal = readInt 10
432 readHexadecimal :: String -> Integer
433 readHexadecimal = readInt 16
436 p_Char :: P.Tokens s ~ TL.Text => Parser e s Char
437 p_Char = P.label "Char" $ P.satisfy XC.isXmlCharCR <|> p_CRLF
438 {-# INLINE p_Char #-}
441 -- | Map '\r' and '\r\n' to '\n'.
442 p_CRLF :: P.Tokens s ~ TL.Text => Parser e s Char
443 p_CRLF = P.label "CRLF" $
444 P.char '\r' *> P.option '\n' (P.char '\n')
446 p_Space :: P.Tokens s ~ TL.Text => Parser e s Char
447 p_Space = P.label "Space" $
448 P.satisfy XC.isXmlSpaceCharCR <|> p_CRLF
449 {-# INLINE p_Space #-}
451 p_Spaces :: P.Tokens s ~ TL.Text => Parser e s ()
452 p_Spaces = P.label "Spaces" $
453 void $ P.takeWhileP Nothing XC.isXmlSpaceChar
454 {-# INLINE p_Spaces #-}
456 p_S :: P.Tokens s ~ TL.Text => Parser Error s XML
457 p_S = P.label "Spaces" $
458 (\ts -> TS.tree0 (NodeText . EscapedText . pure . EscapedPlain . TL.concat <$> ts))
459 <$> p_Sourced (P.some $
460 P.takeWhile1P Nothing XC.isXmlSpaceCharCR <|>
461 TL.singleton <$> p_CRLF)
463 p_Spaces1 :: P.Tokens s ~ TL.Text => Parser e s ()
464 p_Spaces1 = P.label "Spaces1" $
465 void $ P.takeWhile1P Nothing XC.isXmlSpaceChar
466 {-# INLINE p_Spaces1 #-}
469 p_separator :: P.Tokens s ~ TL.Text => Char -> Parser e s ()
470 p_separator c = P.try (() <$ p_Spaces <* P.char c) <* p_Spaces <?> [c]
472 p_Eq :: P.Tokens s ~ TL.Text => Parser e s ()
473 p_Eq = p_separator '=' <?> "Eq"