1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# OPTIONS_GHC -fno-warn-orphans #-}
8 module Language.Symantic.XML.Read
9 ( module Language.Symantic.XML.Read.Parser
10 , module Language.Symantic.XML.Read
13 import Control.Applicative (Applicative(..), Alternative(..))
14 import Control.Monad (Monad(..), void, unless, forM, join)
16 import Data.Char (Char)
17 import Data.Default.Class (Default(..))
18 import Data.Either (Either(..))
19 import Data.Eq (Eq(..))
20 import Data.Foldable (Foldable(..))
21 import Data.Function (($), (.), const)
22 import Data.Functor ((<$>), (<$))
23 import Data.Maybe (Maybe(..), maybe)
24 import Data.Ord (Ord(..))
25 import Data.Semigroup (Semigroup(..))
26 import Data.String (String)
27 import Data.TreeSeq.Strict (Tree(..))
28 import Data.Tuple (snd)
29 import Prelude (Num(..), Enum(..), Bounded(..), Integer, toInteger)
30 import System.IO (FilePath, IO)
31 import Text.Megaparsec ((<?>))
32 import qualified Control.Monad.Trans.Reader as R
33 import qualified Data.ByteString.Lazy as BSL
34 import qualified Data.Char as Char
35 import qualified Data.Char.Properties.XMLCharProps as XC
36 import qualified Data.HashMap.Strict as HM
37 import qualified Data.List as List
38 import qualified Data.Sequence as Seq
39 import qualified Data.Text.Lazy as TL
40 import qualified Data.Text.Lazy.Encoding as TL
41 import qualified Data.TreeSeq.Strict as TS
42 import qualified System.IO.Error as IO
43 import qualified Text.Megaparsec as P
44 import qualified Text.Megaparsec.Char as P
46 import Language.Symantic.XML.Document
47 import Language.Symantic.XML.Read.Parser
49 readXML :: FilePath -> TL.Text -> Either (P.ParseError (P.Token TL.Text) Error) XMLs
50 readXML filePath stateInput =
53 (R.runReaderT p_document def)
56 , P.statePos = pure $ P.initialPos filePath
57 , P.stateTabWidth = P.pos1 -- NOTE: do not expand tabs.
58 , P.stateTokensProcessed = 0
61 readFile :: FilePath -> IO (Either IO.IOError TL.Text)
63 (Right . TL.decodeUtf8 <$> BSL.readFile fp)
64 `IO.catchIOError` \e ->
65 if IO.isAlreadyInUseError e
66 || IO.isDoesNotExistError e
67 || IO.isPermissionError e
72 p_document :: P.Tokens s ~ TL.Text => Parser Error s XMLs
78 return (ps <> pure e <> join (Seq.fromList ms))
81 p_prolog :: P.Tokens s ~ TL.Text => Parser Error s XMLs
83 xmlDecl <- P.option Seq.empty $ pure <$> p_XMLDecl
85 return (xmlDecl <> join (Seq.fromList ms))
88 p_Misc :: P.Tokens s ~ TL.Text => Parser Error s XMLs
90 P.try (pure <$> p_Comment)
91 <|> P.try (pure <$> p_PI)
95 p_XMLDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
96 p_XMLDecl = P.label "XMLDecl" $ do
97 Sourced src as <- p_Sourced $ P.between (P.string "<?xml") (P.string "?>") $ do
98 vi <- pure <$> p_VersionInfo
99 ed <- P.option Seq.empty $ pure <$> p_EncodingDecl
100 sd <- P.option Seq.empty $ pure <$> p_SDDecl
102 return $ vi <> ed <> sd
103 return $ Tree (Sourced src $ NodePI "xml" "") as
105 p_VersionInfo :: P.Tokens s ~ TL.Text => Parser Error s XML
106 p_VersionInfo = P.label "VersionInfo" $ do
107 Sourced c v <- p_Sourced $ do
108 P.try (() <$ p_Spaces <* P.string "version")
110 p_quoted $ const $ p_Sourced $
113 <*> P.takeWhile1P Nothing Char.isDigit
114 return $ Tree (Sourced c $ NodeAttr "version") $ pure $
115 TS.tree0 $ NodeText . pure . TextLexemePlain <$> v
117 p_EncodingDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
118 p_EncodingDecl = P.label "EncodingDecl" $ do
119 Sourced c v <- p_Sourced $ do
120 P.try (() <$ p_Spaces <* P.string "encoding")
122 p_quoted $ const $ p_Sourced p_EncName
123 return $ Tree (Sourced c $ NodeAttr "encoding") $ pure $
124 TS.tree0 $ NodeText . pure . TextLexemePlain <$> v
126 p_EncName :: P.Tokens s ~ TL.Text => Parser Error s TL.Text
127 p_EncName = P.label "EncName" $ do
128 P.notFollowedBy (P.satisfy $ not . isAlpha)
129 P.takeWhile1P Nothing $ \c ->
130 isAlpha c || Char.isDigit c ||
131 c=='.' || c=='_' || c=='-'
132 where isAlpha c = Char.isAsciiLower c || Char.isAsciiUpper c
135 p_SDDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
136 p_SDDecl = P.label "SDDecl" $ do
138 Sourced ca () <- P.try (p_Sourced $ () <$ p_Spaces <* P.string "standalone")
140 v <- p_quoted $ const $ p_Sourced $ P.string "yes" <|> P.string "no"
141 return $ Tree (Sourced ca $ NodeAttr "standalone") $ pure $
142 TS.tree0 $ NodeText . pure . TextLexemePlain <$> v
145 p_CharData :: P.Tokens s ~ TL.Text => Parser e s XML
147 (\t -> TS.tree0 (NodeText . escapeText <$> t))
148 <$> p_Sourced (p_until1 (\c -> XC.isXmlChar c && c/='<' && c/='&') (']',"]>"))
151 p_Comment :: P.Tokens s ~ TL.Text => Parser Error s XML
152 p_Comment = p_SourcedBegin $ P.string "<!--" *> p_Comment__
153 p_Comment_ :: P.Tokens s ~ TL.Text => Parser Error s XML
154 p_Comment_ = P.string "--" *> p_Comment__
155 p_Comment__:: P.Tokens s ~ TL.Text => Parser Error s XML
156 p_Comment__ = P.label "Comment" $ do
157 c <- p_until XC.isXmlChar ('-', "-")
158 void $ P.string "-->"
160 return $ TS.tree0 (cell $ NodeComment c)
163 p_CDSect :: P.Tokens s ~ TL.Text => Parser Error s XML
164 p_CDSect = p_SourcedBegin $ P.string "<![CDATA[" *> p_CDSect__
165 p_CDSect_ :: P.Tokens s ~ TL.Text => Parser Error s XML
166 p_CDSect_ = P.string "[CDATA[" *> p_CDSect__
167 p_CDSect__ :: P.Tokens s ~ TL.Text => Parser Error s XML
168 p_CDSect__ = P.label "CDSect" $ do
169 c <- p_until XC.isXmlChar (']', "]>")
170 void $ P.string "]]>"
172 return $ TS.tree0 $ cell $ NodeCDATA c
175 p_PI :: P.Tokens s ~ TL.Text => Parser Error s XML
176 p_PI = p_SourcedBegin $ P.string "<?" *> p_PI__
177 p_PI_ :: P.Tokens s ~ TL.Text => Parser Error s XML
178 p_PI_ = P.char '?' *> p_PI__
179 p_PI__ :: P.Tokens s ~ TL.Text => Parser Error s XML
180 p_PI__ = P.label "PI" $ do
182 v <- P.option "" $ P.try $ p_Spaces1 *> p_until XC.isXmlChar ('?', ">")
185 return $ TS.tree0 $ cell $ NodePI n v
186 p_PITarget :: P.Tokens s ~ TL.Text => Parser Error s PName
190 PName{pNameSpace=Just "", pNameLocal=NCName l}
191 | "xml" == TL.toLower l -> p_error $ Error_PI_reserved n
195 p_Element :: P.Tokens s ~ TL.Text => Parser Error s XML
196 p_Element = p_SourcedBegin $ (P.char '<' *> p_Element_)
197 p_Element_ :: P.Tokens s ~ TL.Text => Parser Error s XML
198 p_Element_ = P.label "Element" p_STag
201 p_STag :: P.Tokens s ~ TL.Text => Parser Error s XML
204 as <- P.many $ P.try $ p_Spaces1 *> p_Attribute
207 elemNS :: HM.HashMap NCName Namespace <-
208 (HM.fromList . List.concat <$>) $ forM as $ \case
209 Sourced _ (PName{..}, Sourced _ av)
210 | ns <- Namespace $ flatText av
211 , Nothing <- pNameSpace
212 , NCName "xmlns" <- pNameLocal ->
213 -- NOTE: default namespace declaration.
215 _ | ns == xmlns_xml -- DOC: it MUST NOT be declared as the default namespace
216 || ns == xmlns_xmlns -- DOC: it MUST NOT be declared as the default namespace
217 -> p_error $ Error_Namespace_reserved ns
218 _ -> return [(NCName "" , ns)]
219 | ns <- Namespace $ flatText av
220 , Just (NCName "xmlns") <- pNameSpace ->
221 -- NOTE: namespace prefix declaration.
222 case unNCName pNameLocal of
223 "xml" -- DOC: It MAY, but need not, be declared,
224 -- and MUST NOT be bound to any other namespace name.
225 | ns == xmlns_xml -> return []
226 | otherwise -> p_error $ Error_Namespace_reserved_prefix pNameLocal
227 "xmlns" -- DOC: It MUST NOT be declared
228 -> p_error $ Error_Namespace_reserved_prefix pNameLocal
229 local | "xml" <- TL.toLower $ TL.take 3 local -> return []
230 -- DOC: All other prefixes beginning with the three-letter
231 -- sequence x, m, l, in any case combination, are reserved.
232 -- This means that: processors MUST NOT treat them as fatal errors.
233 _ | ns == xmlns_xml -- DOC: Other prefixes MUST NOT be bound to this namespace name.
234 || ns == xmlns_xmlns -- DOC: Other prefixes MUST NOT be bound to this namespace name.
235 -> p_error $ Error_Namespace_reserved ns
236 _ -> return [(pNameLocal, ns)]
237 | otherwise -> return []
238 let scopeNS = elemNS <> reader_ns_scope ro
239 let defaultNS = HM.lookupDefault (reader_ns_default ro) (NCName "") scopeNS
240 let lookupNamePrefix prefix =
241 maybe (p_error $ Error_Namespace_prefix_unknown prefix) return $
242 HM.lookup prefix scopeNS
244 -- NOTE: expand element's QName.
246 Nothing -> return QName{qNameSpace=defaultNS, qNameLocal=pNameLocal n}
247 -- DOC: If there is a default namespace declaration in scope,
248 -- the expanded name corresponding to an unprefixed element name
249 -- has the URI of the default namespace as its namespace name.
251 | NCName "xmlns" <- prefix ->
252 -- DOC: Element names MUST NOT have the prefix xmlns.
253 p_error $ Error_Namespace_reserved_prefix prefix
255 ns <- lookupNamePrefix prefix
256 return QName{qNameSpace=ns, qNameLocal=pNameLocal n}
257 elemAttrs :: [Sourced FileSource (QName, Sourced FileSource Text)] <-
258 -- NOTE: expand attributes' PName into QName.
259 forM as $ \s@Sourced{unSourced=(an, av)} -> do
260 ns <- maybe (return "") lookupNamePrefix $ pNameSpace an
261 let qn = QName{qNameSpace=ns, qNameLocal=pNameLocal an}
262 return s{unSourced=(qn, av)}
263 -- NOTE: check for attribute collision.
264 let attrsByQName :: HM.HashMap QName [Sourced FileSource (QName, Sourced FileSource Text)] =
265 HM.fromListWith (<>) $ (<$> elemAttrs) $ \a@(Sourced _c (an, _av)) -> (an, [a])
266 case HM.toList $ HM.filter (\x -> length x > 1) attrsByQName of
267 (an, _):_ -> p_error $ Error_Attribute_collision an
269 elemAttrsXML :: XMLs <- (Seq.fromList <$>) $
270 forM elemAttrs $ \(Sourced sa (an, av)) -> do
271 return $ TS.Tree (Sourced sa $ NodeAttr an) $
272 pure $ TS.tree0 $ NodeText <$> av
274 elemAttrsXML <$ P.string "/>" <|>
277 { reader_ns_scope = scopeNS
278 , reader_ns_default = defaultNS
280 ((elemAttrsXML <>) <$ P.char '>' <*> p_content <* p_ETag elemName)
282 return $ Tree (cell $ NodeElem elemName) content
285 p_Attribute :: P.Tokens s ~ TL.Text => Parser Error s (Sourced FileSource (PName, Sourced FileSource Text))
286 p_Attribute = p_Sourced $ (,) <$> p_PName <* p_Eq <*> p_AttValue
288 p_AttValue :: P.Tokens s ~ TL.Text => Parser Error s (Sourced FileSource Text)
289 p_AttValue = P.label "AttValue" $ p_quoted p_AttValueText
291 p_AttValueText :: P.Tokens s ~ TL.Text => Char -> Parser Error s (Sourced FileSource Text)
292 p_AttValueText q = p_Sourced $
295 <|> TextLexemePlain <$> P.takeWhile1P Nothing (\c ->
297 c `List.notElem` (q:"<&'\">"))
298 <|> TextLexemeEntityRef entityRef_gt <$ P.char '>'
300 then TextLexemeEntityRef entityRef_quot <$ P.char '"'
301 else TextLexemeEntityRef entityRef_apos <$ P.char '\'')
305 p_content :: P.Tokens s ~ TL.Text => Parser Error s XMLs
307 (Seq.fromList <$>) $ P.many $
309 P.try $ P.char '<' *> P.notFollowedBy (P.char '/')
310 p_Element_ <|> p_PI_ <|> (P.char '!' *> (p_Comment_ <|> p_CDSect_))
314 p_ETag :: P.Tokens s ~ TL.Text => QName -> Parser Error s ()
316 got <- P.string "</" *> p_QName <* p_Spaces <* P.char '>'
317 unless (got == expected) $
318 p_error $ Error_Closing_tag_unexpected got expected
321 p_Name :: P.Tokens s ~ TL.Text => Parser Error s Name
322 p_Name = P.label "Name" $
324 <$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNameStartChar)
325 <*> P.takeWhile1P Nothing XC.isXmlNameChar
328 p_PName :: P.Tokens s ~ TL.Text => Parser e s PName
329 p_PName = P.label "PName" $ do
331 s <- P.optional $ P.try $ P.char ':' *> p_NCName
333 Nothing -> PName{pNameSpace=Nothing, pNameLocal=n}
334 Just l -> PName{pNameSpace=Just n , pNameLocal=l}
337 p_QName :: P.Tokens s ~ TL.Text => Parser Error s QName
338 p_QName = P.label "QName" $ do
340 s <- P.optional $ P.try $ P.char ':' *> p_NCName
343 Nothing -> return QName{qNameSpace=reader_ns_default, qNameLocal=n}
345 case HM.lookup n reader_ns_scope of
346 Nothing -> p_error $ Error_Namespace_prefix_unknown n
347 Just ns -> return QName{qNameSpace=ns, qNameLocal=l}
350 p_NCName :: P.Tokens s ~ TL.Text => Parser e s NCName
351 p_NCName = P.label "NCName" $
353 <$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNCNameStartChar)
354 <*> P.takeWhile1P Nothing XC.isXmlNCNameChar
357 p_Reference :: P.Tokens s ~ TL.Text => Parser Error s TextLexeme
359 TextLexemeCharRef <$> p_CharRef <|>
360 TextLexemeEntityRef <$> p_EntityRef
363 p_EntityRef :: P.Tokens s ~ TL.Text => Parser Error s EntityRef
364 p_EntityRef = P.label "EntityRef" $ do
365 ref <- P.char '&' *> p_NCName <* P.char ';'
366 EntityRef ref <$> lookupEntityRef ref
368 lookupEntityRef (NCName "lt" ) = pure "<"
369 lookupEntityRef (NCName "gt" ) = pure ">"
370 lookupEntityRef (NCName "amp" ) = pure "&"
371 lookupEntityRef (NCName "apos") = pure "'"
372 lookupEntityRef (NCName "quot") = pure "\""
373 lookupEntityRef n = p_error $ Error_EntityRef_unknown n
376 p_CharRef :: P.Tokens s ~ TL.Text => Parser Error s CharRef
377 p_CharRef = P.label "CharRef" $
379 ref <- readHexadecimal
381 <*> P.some P.hexDigitChar
387 <*> P.some P.digitChar
392 let c = toEnum (fromInteger i) in
393 if i <= toInteger (fromEnum (maxBound::Char))
395 then pure $ CharRef c
396 else p_error $ Error_CharRef_invalid i
398 readInt :: Integer -> String -> Integer
399 readInt base digits =
400 sign * List.foldl' acc 0 (List.concatMap digToInt digits1)
406 c:ds | c == '-' -> (-1, ds)
407 | c == '+' -> ( 1, ds)
408 | otherwise -> ( 1, digits)
409 ord = toInteger . Char.ord
411 | Char.isDigit c = [ord c - ord '0']
412 | Char.isAsciiLower c = [ord c - ord 'a' + 10]
413 | Char.isAsciiUpper c = [ord c - ord 'A' + 10]
416 readDecimal :: String -> Integer
417 readDecimal = readInt 10
419 readHexadecimal :: String -> Integer
420 readHexadecimal = readInt 16
423 p_Char :: P.Tokens s ~ TL.Text => Parser e s Char
424 p_Char = P.label "Char" $ P.satisfy XC.isXmlCharCR <|> p_CRLF
425 {-# INLINE p_Char #-}
428 -- | Map '\r' and '\r\n' to '\n'.
429 p_CRLF :: P.Tokens s ~ TL.Text => Parser e s Char
430 p_CRLF = P.label "CRLF" $
431 P.char '\r' *> P.option '\n' (P.char '\n')
433 p_Space :: P.Tokens s ~ TL.Text => Parser e s Char
434 p_Space = P.label "Space" $
435 P.satisfy XC.isXmlSpaceCharCR <|> p_CRLF
436 {-# INLINE p_Space #-}
438 p_Spaces :: P.Tokens s ~ TL.Text => Parser e s ()
439 p_Spaces = P.label "Spaces" $
440 void $ P.takeWhileP Nothing XC.isXmlSpaceChar
441 {-# INLINE p_Spaces #-}
443 p_S :: P.Tokens s ~ TL.Text => Parser Error s XML
444 p_S = P.label "Spaces" $
445 (\ts -> TS.tree0 (NodeText . pure . TextLexemePlain . TL.concat <$> ts))
446 <$> p_Sourced (P.some $
447 P.takeWhile1P Nothing XC.isXmlSpaceCharCR <|>
448 TL.singleton <$> p_CRLF)
450 p_Spaces1 :: P.Tokens s ~ TL.Text => Parser e s ()
451 p_Spaces1 = P.label "Spaces1" $
452 void $ P.takeWhile1P Nothing XC.isXmlSpaceChar
453 {-# INLINE p_Spaces1 #-}
456 p_separator :: P.Tokens s ~ TL.Text => Char -> Parser e s ()
457 p_separator c = P.try (() <$ p_Spaces <* P.char c) <* p_Spaces <?> [c]
459 p_Eq :: P.Tokens s ~ TL.Text => Parser e s ()
460 p_Eq = p_separator '=' <?> "Eq"