1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# OPTIONS_GHC -fno-warn-orphans #-}
8 module Language.Symantic.XML.Read
9 ( module Language.Symantic.XML.Read.Parser
10 , module Language.Symantic.XML.Read
13 import Control.Arrow (left)
14 import Control.Applicative (Applicative(..), Alternative(..))
15 import Control.Monad (Monad(..), void, unless, forM, join)
17 import Data.Char (Char)
18 import Data.Default.Class (Default(..))
19 import Data.Either (Either(..))
20 import Data.Eq (Eq(..))
21 import Data.Foldable (Foldable(..))
22 import Data.Function (($), (.), const)
23 import Data.Functor ((<$>), (<$))
24 import Data.Maybe (Maybe(..), maybe)
25 import Data.Ord (Ord(..))
26 import Data.Semigroup (Semigroup(..))
27 import Data.String (String)
28 import Data.TreeSeq.Strict (Tree(..))
29 import Data.Tuple (snd)
30 import Prelude (Num(..), Enum(..), Bounded(..), Integer, toInteger)
31 import System.IO (FilePath, IO)
32 import Text.Megaparsec ((<?>))
33 import Text.Show (Show(..))
34 import qualified Control.Exception as Exn
35 import qualified Control.Monad.Trans.Reader as R
36 import qualified Data.ByteString.Lazy as BSL
37 import qualified Data.Char as Char
38 import qualified Data.Char.Properties.XMLCharProps as XC
39 import qualified Data.HashMap.Strict as HM
40 import qualified Data.List as List
41 import qualified Data.Sequence as Seq
42 import qualified Data.Text.Encoding.Error as TL
43 import qualified Data.Text.Lazy as TL
44 import qualified Data.Text.Lazy.Encoding as TL
45 import qualified Data.TreeSeq.Strict as TS
46 import qualified System.IO.Error as IO
47 import qualified Text.Megaparsec as P
48 import qualified Text.Megaparsec.Char as P
50 import Language.Symantic.XML.Document
51 import Language.Symantic.XML.Read.Parser
53 readXML :: FilePath -> TL.Text -> Either (P.ParseError (P.Token TL.Text) Error) XMLs
54 readXML filePath stateInput =
57 (R.runReaderT p_document def)
60 , P.statePos = pure $ P.initialPos filePath
61 , P.stateTabWidth = P.pos1 -- NOTE: do not expand tabs.
62 , P.stateTokensProcessed = 0
65 readFile :: FilePath -> IO (Either ErrorRead TL.Text)
67 (left ErrorRead_Unicode . TL.decodeUtf8' <$> BSL.readFile fp)
69 if IO.isAlreadyInUseError e
70 || IO.isDoesNotExistError e
71 || IO.isPermissionError e
72 then return $ Left $ ErrorRead_IO e
77 = ErrorRead_IO IO.IOError
78 | ErrorRead_Unicode TL.UnicodeException
82 p_document :: P.Tokens s ~ TL.Text => Parser Error s XMLs
88 return (ps <> pure e <> join (Seq.fromList ms))
91 p_prolog :: P.Tokens s ~ TL.Text => Parser Error s XMLs
93 xmlDecl <- P.option Seq.empty $ pure <$> p_XMLDecl
95 return (xmlDecl <> join (Seq.fromList ms))
98 p_Misc :: P.Tokens s ~ TL.Text => Parser Error s XMLs
100 P.try (pure <$> p_Comment)
101 <|> P.try (pure <$> p_PI)
105 p_XMLDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
106 p_XMLDecl = P.label "XMLDecl" $ do
107 Sourced src as <- p_Sourced $ P.between (P.string "<?xml") (P.string "?>") $ do
108 vi <- pure <$> p_VersionInfo
109 ed <- P.option Seq.empty $ pure <$> p_EncodingDecl
110 sd <- P.option Seq.empty $ pure <$> p_SDDecl
112 return $ vi <> ed <> sd
113 return $ Tree (Sourced src $ NodePI "xml" "") as
115 p_VersionInfo :: P.Tokens s ~ TL.Text => Parser Error s XML
116 p_VersionInfo = P.label "VersionInfo" $ do
117 Sourced c v <- p_Sourced $ do
118 P.try (() <$ p_Spaces1 <* P.string "version")
120 p_quoted $ const $ p_Sourced $
123 <*> P.takeWhile1P Nothing Char.isDigit
124 return $ Tree (Sourced c $ NodeAttr "version") $ pure $
125 TS.tree0 $ NodeText . pure . TextLexemePlain <$> v
127 p_EncodingDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
128 p_EncodingDecl = P.label "EncodingDecl" $ do
129 Sourced c v <- p_Sourced $ do
130 P.try (() <$ p_Spaces1 <* P.string "encoding")
132 p_quoted $ const $ p_Sourced p_EncName
133 return $ Tree (Sourced c $ NodeAttr "encoding") $ pure $
134 TS.tree0 $ NodeText . pure . TextLexemePlain <$> v
136 p_EncName :: P.Tokens s ~ TL.Text => Parser Error s TL.Text
137 p_EncName = P.label "EncName" $ do
138 P.notFollowedBy (P.satisfy $ not . isAlpha)
139 P.takeWhile1P Nothing $ \c ->
140 isAlpha c || Char.isDigit c ||
141 c=='.' || c=='_' || c=='-'
142 where isAlpha c = Char.isAsciiLower c || Char.isAsciiUpper c
145 p_SDDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
146 p_SDDecl = P.label "SDDecl" $ do
148 Sourced ca () <- P.try (p_Sourced $ () <$ p_Spaces1 <* P.string "standalone")
150 v <- p_quoted $ const $ p_Sourced $ P.string "yes" <|> P.string "no"
151 return $ Tree (Sourced ca $ NodeAttr "standalone") $ pure $
152 TS.tree0 $ NodeText . pure . TextLexemePlain <$> v
155 p_CharData :: P.Tokens s ~ TL.Text => Parser e s [TextLexeme]
158 <$> p_until1 (\c -> XC.isXmlChar c && c/='<' && c/='&') (']',"]>")
161 p_Comment :: P.Tokens s ~ TL.Text => Parser Error s XML
162 p_Comment = p_SourcedBegin $ P.string "<!--" *> p_Comment__
163 p_Comment_ :: P.Tokens s ~ TL.Text => Parser Error s XML
164 p_Comment_ = P.string "--" *> p_Comment__
165 p_Comment__:: P.Tokens s ~ TL.Text => Parser Error s XML
166 p_Comment__ = P.label "Comment" $ do
167 c <- p_until XC.isXmlChar ('-', "-")
168 void $ P.string "-->"
170 return $ TS.tree0 (cell $ NodeComment c)
173 p_CDSect :: P.Tokens s ~ TL.Text => Parser Error s XML
174 p_CDSect = p_SourcedBegin $ P.string "<![CDATA[" *> p_CDSect__
175 p_CDSect_ :: P.Tokens s ~ TL.Text => Parser Error s XML
176 p_CDSect_ = P.string "[CDATA[" *> p_CDSect__
177 p_CDSect__ :: P.Tokens s ~ TL.Text => Parser Error s XML
178 p_CDSect__ = P.label "CDSect" $ do
179 c <- p_until XC.isXmlChar (']', "]>")
180 void $ P.string "]]>"
182 return $ TS.tree0 $ cell $ NodeCDATA c
185 p_PI :: P.Tokens s ~ TL.Text => Parser Error s XML
186 p_PI = p_SourcedBegin $ P.string "<?" *> p_PI__
187 p_PI_ :: P.Tokens s ~ TL.Text => Parser Error s XML
188 p_PI_ = P.char '?' *> p_PI__
189 p_PI__ :: P.Tokens s ~ TL.Text => Parser Error s XML
190 p_PI__ = P.label "PI" $ do
192 v <- P.option "" $ P.try $ p_Spaces1 *> p_until XC.isXmlChar ('?', ">")
195 return $ TS.tree0 $ cell $ NodePI n v
196 p_PITarget :: P.Tokens s ~ TL.Text => Parser Error s PName
200 PName{pNameSpace=Nothing, pNameLocal=NCName l}
201 | "xml" == TL.toLower l -> p_error $ Error_PI_reserved n
205 p_Element :: P.Tokens s ~ TL.Text => Parser Error s XML
206 p_Element = p_SourcedBegin $ (P.char '<' *> p_Element_)
207 p_Element_ :: P.Tokens s ~ TL.Text => Parser Error s XML
208 p_Element_ = P.label "Element" p_STag
211 p_STag :: P.Tokens s ~ TL.Text => Parser Error s XML
214 as <- P.many $ P.try $ p_Spaces1 *> p_Attribute
217 elemNS :: HM.HashMap NCName Namespace <-
218 (HM.fromList . List.concat <$>) $ forM as $ \case
219 Sourced _ (PName{..}, Sourced _ av)
220 | ns <- Namespace $ flatText av
221 , Nothing <- pNameSpace
222 , NCName "xmlns" <- pNameLocal ->
223 -- NOTE: default namespace declaration.
225 _ | ns == xmlns_xml -- DOC: it MUST NOT be declared as the default namespace
226 || ns == xmlns_xmlns -- DOC: it MUST NOT be declared as the default namespace
227 -> p_error $ Error_Namespace_reserved ns
228 _ -> return [(NCName "" , ns)]
229 | ns <- Namespace $ flatText av
230 , Just (NCName "xmlns") <- pNameSpace ->
231 -- NOTE: namespace prefix declaration.
232 case unNCName pNameLocal of
233 "xml" -- DOC: It MAY, but need not, be declared,
234 -- and MUST NOT be bound to any other namespace name.
235 | ns == xmlns_xml -> return []
236 | otherwise -> p_error $ Error_Namespace_reserved_prefix pNameLocal
237 "xmlns" -- DOC: It MUST NOT be declared
238 -> p_error $ Error_Namespace_reserved_prefix pNameLocal
239 local | "xml" <- TL.toLower $ TL.take 3 local -> return []
240 -- DOC: All other prefixes beginning with the three-letter
241 -- sequence x, m, l, in any case combination, are reserved.
242 -- This means that: processors MUST NOT treat them as fatal errors.
243 _ | ns == xmlns_xml -- DOC: Other prefixes MUST NOT be bound to this namespace name.
244 || ns == xmlns_xmlns -- DOC: Other prefixes MUST NOT be bound to this namespace name.
245 -> p_error $ Error_Namespace_reserved ns
246 _ -> return [(pNameLocal, ns)]
247 | otherwise -> return []
248 let scopeNS = elemNS <> reader_ns_scope ro
249 let defaultNS = HM.lookupDefault (reader_ns_default ro) (NCName "") scopeNS
250 let lookupNamePrefix prefix =
251 maybe (p_error $ Error_Namespace_prefix_unknown prefix) return $
252 HM.lookup prefix scopeNS
254 -- NOTE: expand element's QName.
256 Nothing -> return QName{qNameSpace=defaultNS, qNameLocal=pNameLocal n}
257 -- DOC: If there is a default namespace declaration in scope,
258 -- the expanded name corresponding to an unprefixed element name
259 -- has the URI of the default namespace as its namespace name.
261 | NCName "xmlns" <- prefix ->
262 -- DOC: Element names MUST NOT have the prefix xmlns.
263 p_error $ Error_Namespace_reserved_prefix prefix
265 ns <- lookupNamePrefix prefix
266 return QName{qNameSpace=ns, qNameLocal=pNameLocal n}
267 elemAttrs :: [Sourced FileSource (QName, Sourced FileSource Text)] <-
268 -- NOTE: expand attributes' PName into QName.
269 forM as $ \s@Sourced{unSourced=(an, av)} -> do
270 ns <- maybe (return "") lookupNamePrefix $ pNameSpace an
271 let qn = QName{qNameSpace=ns, qNameLocal=pNameLocal an}
272 return s{unSourced=(qn, av)}
273 -- NOTE: check for attribute collision.
274 let attrsByQName :: HM.HashMap QName [Sourced FileSource (QName, Sourced FileSource Text)] =
275 HM.fromListWith (<>) $ (<$> elemAttrs) $ \a@(Sourced _c (an, _av)) -> (an, [a])
276 case HM.toList $ HM.filter (\x -> length x > 1) attrsByQName of
277 (an, _):_ -> p_error $ Error_Attribute_collision an
279 elemAttrsXML :: XMLs <- (Seq.fromList <$>) $
280 forM elemAttrs $ \(Sourced sa (an, av)) -> do
281 return $ TS.Tree (Sourced sa $ NodeAttr an) $
282 pure $ TS.tree0 $ NodeText <$> av
284 elemAttrsXML <$ P.string "/>" <|>
287 { reader_ns_scope = scopeNS
288 , reader_ns_default = defaultNS
290 ((elemAttrsXML <>) <$ P.char '>' <*> p_content <* p_ETag elemName)
292 return $ Tree (cell $ NodeElem elemName) content
295 p_Attribute :: P.Tokens s ~ TL.Text => Parser Error s (Sourced FileSource (PName, Sourced FileSource Text))
296 p_Attribute = p_Sourced $ (,) <$> p_PName <* p_Eq <*> p_AttValue
298 p_AttValue :: P.Tokens s ~ TL.Text => Parser Error s (Sourced FileSource Text)
299 p_AttValue = P.label "AttValue" $ p_quoted p_AttValueText
301 p_AttValueText :: P.Tokens s ~ TL.Text => Char -> Parser Error s (Sourced FileSource Text)
302 p_AttValueText q = p_Sourced $
305 <|> TextLexemePlain <$> P.takeWhile1P Nothing (\c ->
307 c `List.notElem` (q:"<&'\">"))
308 <|> TextLexemeEntityRef entityRef_gt <$ P.char '>'
310 then TextLexemeEntityRef entityRef_quot <$ P.char '"'
311 else TextLexemeEntityRef entityRef_apos <$ P.char '\'')
315 p_content :: P.Tokens s ~ TL.Text => Parser Error s XMLs
317 (Seq.fromList <$>) $ P.many $
319 P.try $ P.char '<' *> P.notFollowedBy (P.char '/')
320 p_Element_ <|> p_PI_ <|> (P.char '!' *> (p_Comment_ <|> p_CDSect_))
322 <|> ((tree0 <$>) $ p_Sourced $ NodeText . List.concat
323 <$> P.some (p_CharData <|> pure <$> p_Reference))
326 p_ETag :: P.Tokens s ~ TL.Text => QName -> Parser Error s ()
328 got <- P.string "</" *> p_QName <* p_Spaces <* P.char '>'
329 unless (got == expected) $
330 p_error $ Error_Closing_tag_unexpected got expected
333 p_Name :: P.Tokens s ~ TL.Text => Parser Error s Name
334 p_Name = P.label "Name" $
336 <$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNameStartChar)
337 <*> P.takeWhile1P Nothing XC.isXmlNameChar
340 p_PName :: P.Tokens s ~ TL.Text => Parser e s PName
341 p_PName = P.label "PName" $ do
343 s <- P.optional $ P.try $ P.char ':' *> p_NCName
345 Nothing -> PName{pNameSpace=Nothing, pNameLocal=n}
346 Just l -> PName{pNameSpace=Just n , pNameLocal=l}
349 p_QName :: P.Tokens s ~ TL.Text => Parser Error s QName
350 p_QName = P.label "QName" $ do
352 s <- P.optional $ P.try $ P.char ':' *> p_NCName
355 Nothing -> return QName{qNameSpace=reader_ns_default, qNameLocal=n}
357 case HM.lookup n reader_ns_scope of
358 Nothing -> p_error $ Error_Namespace_prefix_unknown n
359 Just ns -> return QName{qNameSpace=ns, qNameLocal=l}
362 p_NCName :: P.Tokens s ~ TL.Text => Parser e s NCName
363 p_NCName = P.label "NCName" $
365 <$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNCNameStartChar)
366 <*> P.takeWhile1P Nothing XC.isXmlNCNameChar
369 p_Reference :: P.Tokens s ~ TL.Text => Parser Error s TextLexeme
371 TextLexemeCharRef <$> p_CharRef <|>
372 TextLexemeEntityRef <$> p_EntityRef
375 p_EntityRef :: P.Tokens s ~ TL.Text => Parser Error s EntityRef
376 p_EntityRef = P.label "EntityRef" $ do
377 ref <- P.char '&' *> p_NCName <* P.char ';'
378 EntityRef ref <$> lookupEntityRef ref
380 lookupEntityRef (NCName "lt" ) = pure "<"
381 lookupEntityRef (NCName "gt" ) = pure ">"
382 lookupEntityRef (NCName "amp" ) = pure "&"
383 lookupEntityRef (NCName "apos") = pure "'"
384 lookupEntityRef (NCName "quot") = pure "\""
385 lookupEntityRef n = p_error $ Error_EntityRef_unknown n
388 p_CharRef :: P.Tokens s ~ TL.Text => Parser Error s CharRef
389 p_CharRef = P.label "CharRef" $
391 ref <- readHexadecimal
393 <*> P.some P.hexDigitChar
399 <*> P.some P.digitChar
404 let c = toEnum (fromInteger i) in
405 if i <= toInteger (fromEnum (maxBound::Char))
407 then pure $ CharRef c
408 else p_error $ Error_CharRef_invalid i
410 readInt :: Integer -> String -> Integer
411 readInt base digits =
412 sign * List.foldl' acc 0 (List.concatMap digToInt digits1)
418 c:ds | c == '-' -> (-1, ds)
419 | c == '+' -> ( 1, ds)
420 | otherwise -> ( 1, digits)
421 ord = toInteger . Char.ord
423 | Char.isDigit c = [ord c - ord '0']
424 | Char.isAsciiLower c = [ord c - ord 'a' + 10]
425 | Char.isAsciiUpper c = [ord c - ord 'A' + 10]
428 readDecimal :: String -> Integer
429 readDecimal = readInt 10
431 readHexadecimal :: String -> Integer
432 readHexadecimal = readInt 16
435 p_Char :: P.Tokens s ~ TL.Text => Parser e s Char
436 p_Char = P.label "Char" $ P.satisfy XC.isXmlCharCR <|> p_CRLF
437 {-# INLINE p_Char #-}
440 -- | Map '\r' and '\r\n' to '\n'.
441 p_CRLF :: P.Tokens s ~ TL.Text => Parser e s Char
442 p_CRLF = P.label "CRLF" $
443 P.char '\r' *> P.option '\n' (P.char '\n')
445 p_Space :: P.Tokens s ~ TL.Text => Parser e s Char
446 p_Space = P.label "Space" $
447 P.satisfy XC.isXmlSpaceCharCR <|> p_CRLF
448 {-# INLINE p_Space #-}
450 p_Spaces :: P.Tokens s ~ TL.Text => Parser e s ()
451 p_Spaces = P.label "Spaces" $
452 void $ P.takeWhileP Nothing XC.isXmlSpaceChar
453 {-# INLINE p_Spaces #-}
455 p_S :: P.Tokens s ~ TL.Text => Parser Error s XML
456 p_S = P.label "Spaces" $
457 (\ts -> TS.tree0 (NodeText . pure . TextLexemePlain . TL.concat <$> ts))
458 <$> p_Sourced (P.some $
459 P.takeWhile1P Nothing XC.isXmlSpaceCharCR <|>
460 TL.singleton <$> p_CRLF)
462 p_Spaces1 :: P.Tokens s ~ TL.Text => Parser e s ()
463 p_Spaces1 = P.label "Spaces1" $
464 void $ P.takeWhile1P Nothing XC.isXmlSpaceChar
465 {-# INLINE p_Spaces1 #-}
468 p_separator :: P.Tokens s ~ TL.Text => Char -> Parser e s ()
469 p_separator c = P.try (() <$ p_Spaces <* P.char c) <* p_Spaces <?> [c]
471 p_Eq :: P.Tokens s ~ TL.Text => Parser e s ()
472 p_Eq = p_separator '=' <?> "Eq"