1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# OPTIONS_GHC -fno-warn-orphans #-}
8 module Language.Symantic.XML.Read
9 ( module Language.Symantic.XML.Read.Parser
10 , module Language.Symantic.XML.Read
13 import Control.Applicative (Applicative(..), Alternative(..))
14 import Control.Monad (Monad(..), void, unless, forM, join)
16 import Data.Char (Char)
17 import Data.Default.Class (Default(..))
18 import Data.Either (Either(..))
19 import Data.Eq (Eq(..))
20 import Data.Foldable (Foldable(..))
21 import Data.Function (($), (.), const)
22 import Data.Functor ((<$>), (<$))
23 import Data.Maybe (Maybe(..), maybe)
24 import Data.Ord (Ord(..))
25 import Data.Semigroup (Semigroup(..))
26 import Data.String (String)
27 import Data.TreeSeq.Strict (Tree(..))
28 import Data.Tuple (snd)
29 import Prelude (Num(..), Enum(..), Bounded(..), Integer, toInteger)
30 import System.IO (FilePath, IO)
31 import Text.Megaparsec ((<?>))
32 import qualified Control.Monad.Trans.Reader as R
33 import qualified Data.ByteString.Lazy as BSL
34 import qualified Data.Char as Char
35 import qualified Data.Char.Properties.XMLCharProps as XC
36 import qualified Data.HashMap.Strict as HM
37 import qualified Data.List as List
38 import qualified Data.Sequence as Seq
39 import qualified Data.Text.Lazy as TL
40 import qualified Data.Text.Lazy.Encoding as TL
41 import qualified Data.TreeSeq.Strict as TS
42 import qualified System.IO.Error as IO
43 import qualified Text.Megaparsec as P
44 import qualified Text.Megaparsec.Char as P
46 import Language.Symantic.XML.Document
47 import Language.Symantic.XML.Read.Parser
49 readXML :: FilePath -> TL.Text -> Either (P.ParseError (P.Token TL.Text) Error) XMLs
50 readXML filePath stateInput =
53 (R.runReaderT p_document def)
56 , P.statePos = pure $ P.initialPos filePath
57 , P.stateTabWidth = P.pos1 -- NOTE: do not expand tabs.
58 , P.stateTokensProcessed = 0
61 readFile :: FilePath -> IO (Either IO.IOError TL.Text)
63 (Right . TL.decodeUtf8 <$> BSL.readFile fp)
64 `IO.catchIOError` \e ->
65 if IO.isAlreadyInUseError e
66 || IO.isDoesNotExistError e
67 || IO.isPermissionError e
72 p_document :: P.Tokens s ~ TL.Text => Parser Error s XMLs
78 return (ps <> pure e <> join (Seq.fromList ms))
81 p_prolog :: P.Tokens s ~ TL.Text => Parser Error s XMLs
83 xmlDecl <- P.option Seq.empty $ pure <$> p_XMLDecl
85 return (xmlDecl <> join (Seq.fromList ms))
88 p_Misc :: P.Tokens s ~ TL.Text => Parser Error s XMLs
90 P.try (pure <$> p_Comment)
92 <|> Seq.empty <$ p_Spaces1
95 p_XMLDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
96 p_XMLDecl = P.label "XMLDecl" $ do
97 Sourced src as <- p_Sourced $ P.between (P.string "<?xml") (P.string "?>") $ do
98 vi <- pure <$> p_VersionInfo
99 ed <- P.option Seq.empty $ pure <$> p_EncodingDecl
100 sd <- P.option Seq.empty $ pure <$> p_SDDecl
102 return $ vi <> ed <> sd
103 return $ Tree (Sourced src $ NodePI "xml" "") as
105 p_VersionInfo :: P.Tokens s ~ TL.Text => Parser Error s XML
106 p_VersionInfo = P.label "VersionInfo" $ do
107 Sourced c v <- p_Sourced $ do
108 P.try (() <$ p_Spaces <* P.string "version")
110 p_quoted $ const $ p_Sourced $
113 <*> P.takeWhile1P Nothing Char.isDigit
114 return $ Tree (Sourced c $ NodeAttr "version") $ pure $
115 TS.tree0 $ NodeText <$> v
117 p_EncodingDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
118 p_EncodingDecl = P.label "EncodingDecl" $ do
119 Sourced c v <- p_Sourced $ do
120 P.try (() <$ p_Spaces <* P.string "encoding")
122 p_quoted $ const $ p_Sourced p_EncName
123 return $ Tree (Sourced c $ NodeAttr "encoding") $ pure $
124 TS.tree0 $ NodeText <$> v
126 p_EncName :: P.Tokens s ~ TL.Text => Parser Error s TL.Text
127 p_EncName = P.label "EncName" $ do
128 P.notFollowedBy (P.satisfy $ not . isAlpha)
129 P.takeWhile1P Nothing $ \c ->
130 isAlpha c || Char.isDigit c ||
131 c=='.' || c=='_' || c=='-'
132 where isAlpha c = Char.isAsciiLower c || Char.isAsciiUpper c
135 p_SDDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
136 p_SDDecl = P.label "SDDecl" $ do
138 Sourced ca () <- P.try (p_Sourced $ () <$ p_Spaces <* P.string "standalone")
140 v <- p_quoted $ const $ p_Sourced $ P.string "yes" <|> P.string "no"
141 return $ Tree (Sourced ca $ NodeAttr "standalone") $ pure $
142 TS.tree0 $ NodeText <$> v
145 p_CharData :: P.Tokens s ~ TL.Text => Parser e s XML
147 (\t -> TS.tree0 (NodeText <$> t))
148 <$> p_Sourced (p_until1 (\c -> XC.isXmlChar c && c/='<' && c/='&') (']',"]>"))
151 p_Comment :: P.Tokens s ~ TL.Text => Parser Error s XML
152 p_Comment = p_SourcedBegin $ P.string "<!--" *> p_Comment__
153 p_Comment_ :: P.Tokens s ~ TL.Text => Parser Error s XML
154 p_Comment_ = P.string "--" *> p_Comment__
155 p_Comment__:: P.Tokens s ~ TL.Text => Parser Error s XML
156 p_Comment__ = P.label "Comment" $ do
157 c <- p_until XC.isXmlChar ('-', "-")
158 void $ P.string "-->"
160 return $ TS.tree0 (cell $ NodeComment c)
163 p_CDSect :: P.Tokens s ~ TL.Text => Parser Error s XML
164 p_CDSect = p_SourcedBegin $ P.string "<![CDATA[" *> p_CDSect__
165 p_CDSect_ :: P.Tokens s ~ TL.Text => Parser Error s XML
166 p_CDSect_ = P.string "[CDATA[" *> p_CDSect__
167 p_CDSect__ :: P.Tokens s ~ TL.Text => Parser Error s XML
168 p_CDSect__ = P.label "CDSect" $ do
169 c <- p_until XC.isXmlChar (']', "]>")
170 void $ P.string "]]>"
172 return $ TS.tree0 $ cell $ NodeCDATA c
175 p_PI :: P.Tokens s ~ TL.Text => Parser Error s XML
176 p_PI = p_SourcedBegin $ P.string "<?" *> p_PI__
177 p_PI_ :: P.Tokens s ~ TL.Text => Parser Error s XML
178 p_PI_ = P.char '?' *> p_PI__
179 p_PI__ :: P.Tokens s ~ TL.Text => Parser Error s XML
180 p_PI__ = P.label "PI" $ do
182 v <- P.option "" $ P.try $ p_Spaces1 *> p_until XC.isXmlChar ('?', ">")
185 return $ TS.tree0 $ cell $ NodePI n v
186 p_PITarget :: P.Tokens s ~ TL.Text => Parser Error s PName
190 PName{pNameSpace=Just "", pNameLocal=NCName l}
191 | "xml" == TL.toLower l -> p_error $ Error_PI_reserved n
195 p_Element :: P.Tokens s ~ TL.Text => Parser Error s XML
196 p_Element = p_SourcedBegin $ (P.char '<' *> p_Element_)
197 p_Element_ :: P.Tokens s ~ TL.Text => Parser Error s XML
198 p_Element_ = P.label "Element" p_STag
201 p_STag :: P.Tokens s ~ TL.Text => Parser Error s XML
204 as <- P.many $ P.try $ p_Spaces1 *> p_Attribute
207 elemNS :: HM.HashMap NCName Namespace <-
208 (HM.fromList . List.concat <$>) $ forM as $ \case
209 Sourced _ (PName{..}, Sourced _ av)
211 , Nothing <- pNameSpace
212 , NCName "xmlns" <- pNameLocal ->
213 -- NOTE: default namespace declaration.
215 _ | ns == xmlns_xml -- DOC: it MUST NOT be declared as the default namespace
216 || ns == xmlns_xmlns -- DOC: it MUST NOT be declared as the default namespace
217 -> p_error $ Error_Namespace_reserved ns
218 _ -> return [(NCName "" , ns)]
220 , Just (NCName "xmlns") <- pNameSpace ->
221 -- NOTE: namespace prefix declaration.
222 case unNCName pNameLocal of
223 "xml" -- DOC: It MAY, but need not, be declared,
224 -- and MUST NOT be bound to any other namespace name.
225 | ns == xmlns_xml -> return []
226 | otherwise -> p_error $ Error_Namespace_reserved_prefix pNameLocal
227 "xmlns" -- DOC: It MUST NOT be declared
228 -> p_error $ Error_Namespace_reserved_prefix pNameLocal
229 local | "xml" <- TL.toLower $ TL.take 3 local -> return []
230 -- DOC: All other prefixes beginning with the three-letter
231 -- sequence x, m, l, in any case combination, are reserved.
232 -- This means that: processors MUST NOT treat them as fatal errors.
233 _ | ns == xmlns_xml -- DOC: Other prefixes MUST NOT be bound to this namespace name.
234 || ns == xmlns_xmlns -- DOC: Other prefixes MUST NOT be bound to this namespace name.
235 -> p_error $ Error_Namespace_reserved ns
236 _ -> return [(pNameLocal, ns)]
237 | otherwise -> return []
238 let scopeNS = elemNS <> reader_ns_scope ro
239 let defaultNS = HM.lookupDefault (reader_ns_default ro) (NCName "") scopeNS
240 let lookupNamePrefix prefix =
241 maybe (p_error $ Error_Namespace_prefix_unknown prefix) return $
242 HM.lookup prefix scopeNS
244 -- NOTE: expand element's QName.
246 Nothing -> return QName{qNameSpace=defaultNS, qNameLocal=pNameLocal n}
247 -- DOC: If there is a default namespace declaration in scope,
248 -- the expanded name corresponding to an unprefixed element name
249 -- has the URI of the default namespace as its namespace name.
251 | NCName "xmlns" <- prefix ->
252 -- DOC: Element names MUST NOT have the prefix xmlns.
253 p_error $ Error_Namespace_reserved_prefix prefix
255 ns <- lookupNamePrefix prefix
256 return QName{qNameSpace=ns, qNameLocal=pNameLocal n}
257 elemAttrs :: [Sourced FileSource (QName, Sourced FileSource Text)] <-
258 -- NOTE: expand attributes' PName into QName.
259 forM as $ \s@Sourced{unSourced=(an, av)} -> do
260 ns <- maybe (return "") lookupNamePrefix $ pNameSpace an
261 let qn = QName{qNameSpace=ns, qNameLocal=pNameLocal an}
262 return s{unSourced=(qn, av)}
263 -- Check for attribute collision.
264 let attrsByQName :: HM.HashMap QName [Sourced FileSource (QName, Sourced FileSource Text)] =
265 HM.fromListWith (<>) $ (<$> elemAttrs) $ \a@(Sourced _c (an, _av)) -> (an, [a])
266 case HM.toList $ HM.filter (\x -> length x > 1) attrsByQName of
267 (an, _):_ -> p_error $ Error_Attribute_collision an
269 elemAttrsXML :: XMLs <- (Seq.fromList <$>) $
270 forM elemAttrs $ \(Sourced sa (an, av)) -> do
271 return $ TS.Tree (Sourced sa $ NodeAttr an) $
272 pure $ TS.tree0 $ NodeText <$> av
274 elemAttrsXML <$ P.string "/>" <|>
277 { reader_ns_scope = scopeNS
278 , reader_ns_default = defaultNS
280 ((elemAttrsXML <>) <$ P.char '>' <*> p_content <* p_ETag elemName)
282 return $ Tree (cell $ NodeElem elemName) content
285 p_Attribute :: P.Tokens s ~ TL.Text => Parser Error s (Sourced FileSource (PName, Sourced FileSource TL.Text))
286 p_Attribute = p_Sourced $ (,) <$> p_PName <* p_Eq <*> p_AttValue
288 p_AttValue :: P.Tokens s ~ TL.Text => Parser Error s (Sourced FileSource TL.Text)
289 p_AttValue = P.label "AttValue" $ p_quoted p_AttValueText
291 p_AttValueText :: P.Tokens s ~ TL.Text => Char -> Parser Error s (Sourced FileSource TL.Text)
292 p_AttValueText q = p_Sourced $
295 <|> P.takeWhile1P Nothing (\c ->
297 c `List.notElem` (q:"<&"))
301 p_content :: P.Tokens s ~ TL.Text => Parser Error s XMLs
303 (Seq.fromList <$>) $ P.many $
305 P.try $ P.char '<' *> P.notFollowedBy (P.char '/')
306 p_Element_ <|> p_PI_ <|> (P.char '!' *> (p_Comment_ <|> p_CDSect_))
310 p_ETag :: P.Tokens s ~ TL.Text => QName -> Parser Error s ()
312 got <- P.string "</" *> p_QName <* p_Spaces <* P.char '>'
313 unless (got == expected) $
314 p_error $ Error_Closing_tag_unexpected got expected
317 p_Name :: P.Tokens s ~ TL.Text => Parser Error s Name
318 p_Name = P.label "Name" $
320 <$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNameStartChar)
321 <*> P.takeWhile1P Nothing XC.isXmlNameChar
324 p_PName :: P.Tokens s ~ TL.Text => Parser e s PName
325 p_PName = P.label "PName" $ do
327 s <- P.optional $ P.try $ P.char ':' *> p_NCName
329 Nothing -> PName{pNameSpace=Nothing, pNameLocal=n}
330 Just l -> PName{pNameSpace=Just n , pNameLocal=l}
333 p_QName :: P.Tokens s ~ TL.Text => Parser Error s QName
334 p_QName = P.label "QName" $ do
336 s <- P.optional $ P.try $ P.char ':' *> p_NCName
339 Nothing -> return QName{qNameSpace=reader_ns_default, qNameLocal=n}
341 case HM.lookup n reader_ns_scope of
342 Nothing -> p_error $ Error_Namespace_prefix_unknown n
343 Just ns -> return QName{qNameSpace=ns, qNameLocal=l}
346 p_NCName :: P.Tokens s ~ TL.Text => Parser e s NCName
347 p_NCName = P.label "NCName" $
349 <$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNCNameStartChar)
350 <*> P.takeWhile1P Nothing XC.isXmlNCNameChar
353 p_Reference :: P.Tokens s ~ TL.Text => Parser Error s TL.Text
354 p_Reference = TL.singleton <$> p_CharRef <|> p_EntityRef
357 p_EntityRef :: P.Tokens s ~ TL.Text => Parser Error s TL.Text
358 p_EntityRef = P.label "EntityRef" $ do
359 ref <- P.char '&' *> p_Name <* P.char ';'
362 lookupEntityRef (Name "lt" ) = pure "<"
363 lookupEntityRef (Name "gt" ) = pure ">"
364 lookupEntityRef (Name "amp" ) = pure "&"
365 lookupEntityRef (Name "apos") = pure "'"
366 lookupEntityRef (Name "quot") = pure "\""
367 lookupEntityRef n = p_error $ Error_EntityRef_unknown n
370 p_CharRef :: P.Tokens s ~ TL.Text => Parser Error s Char
371 p_CharRef = P.label "CharRef" $
373 ref <- readHexadecimal
375 <*> P.some P.hexDigitChar
381 <*> P.some P.digitChar
386 let c = toEnum (fromInteger i) in
387 if i <= toInteger (fromEnum (maxBound::Char))
390 else p_error $ Error_CharRef_invalid i
392 readInt :: Integer -> String -> Integer
393 readInt base digits =
394 sign * List.foldl' acc 0 (List.concatMap digToInt digits1)
400 c:ds | c == '-' -> (-1, ds)
401 | c == '+' -> ( 1, ds)
402 | otherwise -> ( 1, digits)
403 ord = toInteger . Char.ord
405 | Char.isDigit c = [ord c - ord '0']
406 | Char.isAsciiLower c = [ord c - ord 'a' + 10]
407 | Char.isAsciiUpper c = [ord c - ord 'A' + 10]
410 readDecimal :: String -> Integer
411 readDecimal = readInt 10
413 readHexadecimal :: String -> Integer
414 readHexadecimal = readInt 16
417 p_Char :: P.Tokens s ~ TL.Text => Parser e s Char
418 p_Char = P.label "Char" $ P.satisfy XC.isXmlCharCR <|> p_CRLF
419 {-# INLINE p_Char #-}
422 -- | Map '\r' and '\r\n' to '\n'.
423 p_CRLF :: P.Tokens s ~ TL.Text => Parser e s Char
424 p_CRLF = P.label "CRLF" $
425 P.char '\r' *> P.option '\n' (P.char '\n')
427 p_Space :: P.Tokens s ~ TL.Text => Parser e s Char
428 p_Space = P.label "Space" $
429 P.satisfy XC.isXmlSpaceCharCR <|> p_CRLF
430 {-# INLINE p_Space #-}
432 p_Spaces :: P.Tokens s ~ TL.Text => Parser e s ()
433 p_Spaces = P.label "Spaces" $
434 void $ P.takeWhileP Nothing XC.isXmlSpaceChar
435 {-# INLINE p_Spaces #-}
437 p_S :: P.Tokens s ~ TL.Text => Parser Error s XML
438 p_S = P.label "Spaces" $
439 (\ts -> TS.tree0 (NodeText . TL.concat <$> ts))
440 <$> p_Sourced (P.some $
441 P.takeWhile1P Nothing XC.isXmlSpaceCharCR <|>
442 TL.singleton <$> p_CRLF)
444 p_Spaces1 :: P.Tokens s ~ TL.Text => Parser e s ()
445 p_Spaces1 = P.label "Spaces1" $
446 void $ P.takeWhile1P Nothing XC.isXmlSpaceChar
447 {-# INLINE p_Spaces1 #-}
450 p_separator :: P.Tokens s ~ TL.Text => Char -> Parser e s ()
451 p_separator c = P.try (() <$ p_Spaces <* P.char c) <* p_Spaces <?> [c]
453 p_Eq :: P.Tokens s ~ TL.Text => Parser e s ()
454 p_Eq = p_separator '=' <?> "Eq"