1 {-# LANGUAGE FlexibleContexts #-}
 
   2 {-# LANGUAGE FlexibleInstances #-}
 
   3 {-# LANGUAGE OverloadedStrings #-}
 
   4 {-# LANGUAGE Rank2Types #-}
 
   5 {-# LANGUAGE ScopedTypeVariables #-}
 
   6 {-# LANGUAGE TypeFamilies #-}
 
   7 {-# OPTIONS_GHC -fno-warn-orphans #-}
 
   8 module Language.Symantic.XML.Read
 
   9  ( module Language.Symantic.XML.Read.Parser
 
  10  , module Language.Symantic.XML.Read
 
  13 import Control.Applicative (Applicative(..), Alternative(..))
 
  14 import Control.Monad (Monad(..), void, unless, forM, join)
 
  16 import Data.Char (Char)
 
  17 import Data.Default.Class (Default(..))
 
  18 import Data.Either (Either(..))
 
  19 import Data.Eq (Eq(..))
 
  20 import Data.Foldable (Foldable(..))
 
  21 import Data.Function (($), (.), const)
 
  22 import Data.Functor ((<$>), (<$))
 
  23 import Data.Maybe (Maybe(..), maybe)
 
  24 import Data.Ord (Ord(..))
 
  25 import Data.Semigroup (Semigroup(..))
 
  26 import Data.String (String)
 
  27 import Data.TreeSeq.Strict (Tree(..))
 
  28 import Data.Tuple (snd)
 
  29 import Prelude (Num(..), Enum(..), Bounded(..), Integer, toInteger)
 
  30 import System.IO (FilePath, IO)
 
  31 import Text.Megaparsec ((<?>))
 
  32 import qualified Control.Monad.Trans.Reader as R
 
  33 import qualified Data.ByteString.Lazy as BSL
 
  34 import qualified Data.Char as Char
 
  35 import qualified Data.Char.Properties.XMLCharProps as XC
 
  36 import qualified Data.HashMap.Strict as HM
 
  37 import qualified Data.List as List
 
  38 import qualified Data.Sequence as Seq
 
  39 import qualified Data.Text.Lazy as TL
 
  40 import qualified Data.Text.Lazy.Encoding as TL
 
  41 import qualified Data.TreeSeq.Strict as TS
 
  42 import qualified System.IO.Error as IO
 
  43 import qualified Text.Megaparsec as P
 
  44 import qualified Text.Megaparsec.Char as P
 
  46 import Language.Symantic.XML.Document
 
  47 import Language.Symantic.XML.Read.Parser
 
  49 readXML :: FilePath -> TL.Text -> Either (P.ParseError (P.Token TL.Text) Error) XMLs
 
  50 readXML filePath stateInput =
 
  53          (R.runReaderT p_document def)
 
  56          , P.statePos = pure $ P.initialPos filePath
 
  57          , P.stateTabWidth = P.pos1 -- NOTE: do not expand tabs.
 
  58          , P.stateTokensProcessed = 0
 
  61 readFile :: FilePath -> IO (Either IO.IOError TL.Text)
 
  63         (Right . TL.decodeUtf8 <$> BSL.readFile fp)
 
  64         `IO.catchIOError` \e ->
 
  65                 if IO.isAlreadyInUseError e
 
  66                 || IO.isDoesNotExistError e
 
  67                 || IO.isPermissionError   e
 
  72 p_document :: P.Tokens s ~ TL.Text => Parser Error s XMLs
 
  78         return (ps <> pure e <> join (Seq.fromList ms))
 
  81 p_prolog :: P.Tokens s ~ TL.Text => Parser Error s XMLs
 
  83         xmlDecl <- P.option Seq.empty $ pure <$> p_XMLDecl
 
  85         return (xmlDecl <> join (Seq.fromList ms))
 
  88 p_Misc :: P.Tokens s ~ TL.Text => Parser Error s XMLs
 
  90         P.try (pure <$> p_Comment)
 
  91          <|> P.try (pure <$> p_PI)
 
  95 p_XMLDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
 
  96 p_XMLDecl = P.label "XMLDecl" $ do
 
  97         Sourced src as <- p_Sourced $ P.between (P.string "<?xml") (P.string "?>") $ do
 
  98                 vi <- pure <$> p_VersionInfo
 
  99                 ed <- P.option Seq.empty $ pure <$> p_EncodingDecl
 
 100                 sd <- P.option Seq.empty $ pure <$> p_SDDecl
 
 102                 return $ vi <> ed <> sd
 
 103         return $ Tree (Sourced src $ NodePI "xml" "") as
 
 105 p_VersionInfo :: P.Tokens s ~ TL.Text => Parser Error s XML
 
 106 p_VersionInfo = P.label "VersionInfo" $ do
 
 107         Sourced c v <- p_Sourced $ do
 
 108                 P.try (() <$ p_Spaces <* P.string "version")
 
 110                 p_quoted $ const $ p_Sourced $
 
 113                          <*> P.takeWhile1P Nothing Char.isDigit
 
 114         return $ Tree (Sourced c $ NodeAttr "version") $ pure $
 
 115                 TS.tree0 $ NodeText . pure . TextLexemePlain <$> v
 
 117 p_EncodingDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
 
 118 p_EncodingDecl = P.label "EncodingDecl" $ do
 
 119         Sourced c v <- p_Sourced $ do
 
 120                 P.try (() <$ p_Spaces <* P.string "encoding")
 
 122                 p_quoted $ const $ p_Sourced p_EncName
 
 123         return $ Tree (Sourced c $ NodeAttr "encoding") $ pure $
 
 124                 TS.tree0 $ NodeText . pure . TextLexemePlain <$> v
 
 126 p_EncName :: P.Tokens s ~ TL.Text => Parser Error s TL.Text
 
 127 p_EncName = P.label "EncName" $ do
 
 128         P.notFollowedBy (P.satisfy $ not . isAlpha)
 
 129         P.takeWhile1P Nothing $ \c ->
 
 130                 isAlpha c || Char.isDigit c ||
 
 131                 c=='.' || c=='_' || c=='-'
 
 132         where isAlpha c = Char.isAsciiLower c || Char.isAsciiUpper c
 
 135 p_SDDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
 
 136 p_SDDecl = P.label "SDDecl" $ do
 
 138                 Sourced ca () <- P.try (p_Sourced $ () <$ p_Spaces <* P.string "standalone")
 
 140                 v <- p_quoted $ const $ p_Sourced $ P.string "yes" <|> P.string "no"
 
 141                 return $ Tree (Sourced ca $ NodeAttr "standalone") $ pure $
 
 142                         TS.tree0 $ NodeText . pure . TextLexemePlain <$> v
 
 145 p_CharData :: P.Tokens s ~ TL.Text => Parser e s XML
 
 147         (\t -> TS.tree0 (NodeText . escapeText <$> t))
 
 148          <$> p_Sourced (p_until1 (\c -> XC.isXmlChar c && c/='<' && c/='&') (']',"]>"))
 
 151 p_Comment :: P.Tokens s ~ TL.Text => Parser Error s XML
 
 152 p_Comment = p_SourcedBegin $ P.string "<!--" *> p_Comment__
 
 153 p_Comment_ :: P.Tokens s ~ TL.Text => Parser Error s XML
 
 154 p_Comment_ = P.string "--" *> p_Comment__
 
 155 p_Comment__:: P.Tokens s ~ TL.Text => Parser Error s XML
 
 156 p_Comment__ = P.label "Comment" $ do
 
 157         c <- p_until XC.isXmlChar ('-', "-")
 
 158         void $ P.string "-->"
 
 160         return $ TS.tree0 (cell $ NodeComment c)
 
 163 p_CDSect :: P.Tokens s ~ TL.Text => Parser Error s XML
 
 164 p_CDSect = p_SourcedBegin $ P.string "<![CDATA[" *> p_CDSect__
 
 165 p_CDSect_ :: P.Tokens s ~ TL.Text => Parser Error s XML
 
 166 p_CDSect_ = P.string "[CDATA[" *> p_CDSect__
 
 167 p_CDSect__ :: P.Tokens s ~ TL.Text => Parser Error s XML
 
 168 p_CDSect__ = P.label "CDSect" $ do
 
 169         c <- p_until XC.isXmlChar (']', "]>")
 
 170         void $ P.string "]]>"
 
 172         return $ TS.tree0 $ cell $ NodeCDATA c
 
 175 p_PI :: P.Tokens s ~ TL.Text => Parser Error s XML
 
 176 p_PI = p_SourcedBegin $ P.string "<?" *> p_PI__
 
 177 p_PI_ :: P.Tokens s ~ TL.Text => Parser Error s XML
 
 178 p_PI_ = P.char '?' *> p_PI__
 
 179 p_PI__ :: P.Tokens s ~ TL.Text => Parser Error s XML
 
 180 p_PI__ = P.label "PI" $ do
 
 182         v <- P.option "" $ P.try $ p_Spaces1 *> p_until XC.isXmlChar ('?', ">")
 
 185         return $ TS.tree0 $ cell $ NodePI n v
 
 186 p_PITarget :: P.Tokens s ~ TL.Text => Parser Error s PName
 
 190          PName{pNameSpace=Just "", pNameLocal=NCName l}
 
 191           | "xml" == TL.toLower l -> p_error $ Error_PI_reserved n
 
 195 p_Element :: P.Tokens s ~ TL.Text => Parser Error s XML
 
 196 p_Element = p_SourcedBegin $ (P.char '<' *> p_Element_)
 
 197 p_Element_ :: P.Tokens s ~ TL.Text => Parser Error s XML
 
 198 p_Element_ = P.label "Element" p_STag
 
 201 p_STag :: P.Tokens s ~ TL.Text => Parser Error s XML
 
 204         as <- P.many $ P.try $ p_Spaces1 *> p_Attribute
 
 207         elemNS :: HM.HashMap NCName Namespace <-
 
 208                 (HM.fromList . List.concat <$>) $ forM as $ \case
 
 209                  Sourced _ (PName{..}, Sourced _ av)
 
 210                   | ns <- Namespace $ flatText av
 
 211                   , Nothing        <- pNameSpace
 
 212                   , NCName "xmlns" <- pNameLocal ->
 
 213                         -- NOTE: default namespace declaration.
 
 215                          _ |  ns == xmlns_xml   -- DOC: it MUST NOT be declared as the default namespace
 
 216                            || ns == xmlns_xmlns -- DOC: it MUST NOT be declared as the default namespace
 
 217                            -> p_error $ Error_Namespace_reserved ns
 
 218                          _ -> return [(NCName "" , ns)]
 
 219                   | ns <- Namespace $ flatText av
 
 220                   , Just (NCName "xmlns") <- pNameSpace ->
 
 221                         -- NOTE: namespace prefix declaration.
 
 222                         case unNCName pNameLocal of
 
 223                          "xml" -- DOC: It MAY, but need not, be declared,
 
 224                                -- and MUST NOT be bound to any other namespace name.
 
 225                                | ns == xmlns_xml -> return []
 
 226                                | otherwise -> p_error $ Error_Namespace_reserved_prefix pNameLocal
 
 227                          "xmlns" -- DOC: It MUST NOT be declared
 
 228                                  -> p_error $ Error_Namespace_reserved_prefix pNameLocal
 
 229                          local | "xml" <- TL.toLower $ TL.take 3 local -> return []
 
 230                                -- DOC: All other prefixes beginning with the three-letter
 
 231                                -- sequence x, m, l, in any case combination, are reserved.
 
 232                                -- This means that: processors MUST NOT treat them as fatal errors.
 
 233                          _ |  ns == xmlns_xml   -- DOC: Other prefixes MUST NOT be bound to this namespace name.
 
 234                            || ns == xmlns_xmlns -- DOC: Other prefixes MUST NOT be bound to this namespace name.
 
 235                            -> p_error $ Error_Namespace_reserved ns
 
 236                          _ -> return [(pNameLocal, ns)]
 
 237                   | otherwise -> return []
 
 238         let scopeNS = elemNS <> reader_ns_scope ro
 
 239         let defaultNS = HM.lookupDefault (reader_ns_default ro) (NCName "") scopeNS
 
 240         let lookupNamePrefix prefix =
 
 241                 maybe (p_error $ Error_Namespace_prefix_unknown prefix) return $
 
 242                 HM.lookup prefix scopeNS
 
 244                 -- NOTE: expand element's QName.
 
 246                  Nothing -> return QName{qNameSpace=defaultNS, qNameLocal=pNameLocal n}
 
 247                   -- DOC: If there is a default namespace declaration in scope,
 
 248                   -- the expanded name corresponding to an unprefixed element name
 
 249                   -- has the URI of the default namespace as its namespace name.
 
 251                   | NCName "xmlns" <- prefix ->
 
 252                         -- DOC: Element names MUST NOT have the prefix xmlns.
 
 253                         p_error $ Error_Namespace_reserved_prefix prefix
 
 255                         ns <- lookupNamePrefix prefix
 
 256                         return QName{qNameSpace=ns, qNameLocal=pNameLocal n}
 
 257         elemAttrs :: [Sourced FileSource (QName, Sourced FileSource Text)] <-
 
 258                 -- NOTE: expand attributes' PName into QName.
 
 259                 forM as $ \s@Sourced{unSourced=(an, av)} -> do
 
 260                         ns <- maybe (return "") lookupNamePrefix $ pNameSpace an
 
 261                         let qn = QName{qNameSpace=ns, qNameLocal=pNameLocal an}
 
 262                         return s{unSourced=(qn, av)}
 
 263         -- NOTE: check for attribute collision.
 
 264         let attrsByQName :: HM.HashMap QName [Sourced FileSource (QName, Sourced FileSource Text)] =
 
 265                 HM.fromListWith (<>) $ (<$> elemAttrs) $ \a@(Sourced _c (an, _av)) -> (an, [a])
 
 266         case HM.toList $ HM.filter (\x -> length x > 1) attrsByQName of
 
 267          (an, _):_ -> p_error $ Error_Attribute_collision an
 
 269         elemAttrsXML :: XMLs <- (Seq.fromList <$>) $
 
 270                 forM elemAttrs $ \(Sourced sa (an, av)) -> do
 
 271                         return $ TS.Tree (Sourced sa $ NodeAttr an) $
 
 272                                 pure $ TS.tree0 $ NodeText <$> av
 
 274                 elemAttrsXML <$ P.string "/>" <|>
 
 277                          { reader_ns_scope   = scopeNS
 
 278                          , reader_ns_default = defaultNS
 
 280                  ((elemAttrsXML <>) <$ P.char '>' <*> p_content <* p_ETag elemName)
 
 282         return $ Tree (cell $ NodeElem elemName) content
 
 285 p_Attribute :: P.Tokens s ~ TL.Text => Parser Error s (Sourced FileSource (PName, Sourced FileSource Text))
 
 286 p_Attribute = p_Sourced $ (,) <$> p_PName <* p_Eq <*> p_AttValue
 
 288 p_AttValue :: P.Tokens s ~ TL.Text => Parser Error s (Sourced FileSource Text)
 
 289 p_AttValue = P.label "AttValue" $ p_quoted p_AttValueText
 
 291 p_AttValueText :: P.Tokens s ~ TL.Text => Char -> Parser Error s (Sourced FileSource Text)
 
 292 p_AttValueText q = p_Sourced $
 
 295          <|> TextLexemePlain <$> P.takeWhile1P Nothing (\c ->
 
 297                 c `List.notElem` (q:"<&'\">"))
 
 298          <|> TextLexemeEntityRef entityRef_gt <$ P.char '>'
 
 300                 then TextLexemeEntityRef entityRef_quot <$ P.char '"'
 
 301                 else TextLexemeEntityRef entityRef_apos <$ P.char '\'')
 
 305 p_content :: P.Tokens s ~ TL.Text => Parser Error s XMLs
 
 307         (Seq.fromList <$>) $ P.many $
 
 309                         P.try $ P.char '<' *> P.notFollowedBy (P.char '/')
 
 310                         p_Element_ <|> p_PI_ <|> (P.char '!' *> (p_Comment_ <|> p_CDSect_))
 
 314 p_ETag :: P.Tokens s ~ TL.Text => QName -> Parser Error s ()
 
 316         got <- P.string "</" *> p_QName <* p_Spaces <* P.char '>'
 
 317         unless (got == expected) $
 
 318                 p_error $ Error_Closing_tag_unexpected got expected
 
 321 p_Name :: P.Tokens s ~ TL.Text => Parser Error s Name
 
 322 p_Name = P.label "Name" $
 
 324          <$  P.notFollowedBy (P.satisfy $ not . XC.isXmlNameStartChar)
 
 325          <*> P.takeWhile1P Nothing XC.isXmlNameChar
 
 328 p_PName :: P.Tokens s ~ TL.Text => Parser e s PName
 
 329 p_PName = P.label "PName" $ do
 
 331         s <- P.optional $ P.try $ P.char ':' *> p_NCName
 
 333          Nothing -> PName{pNameSpace=Nothing, pNameLocal=n}
 
 334          Just l  -> PName{pNameSpace=Just n , pNameLocal=l}
 
 337 p_QName :: P.Tokens s ~ TL.Text => Parser Error s QName
 
 338 p_QName = P.label "QName" $ do
 
 340         s <- P.optional $ P.try $ P.char ':' *> p_NCName
 
 343          Nothing -> return QName{qNameSpace=reader_ns_default, qNameLocal=n}
 
 345                 case HM.lookup n reader_ns_scope of
 
 346                  Nothing -> p_error $ Error_Namespace_prefix_unknown n
 
 347                  Just ns -> return QName{qNameSpace=ns, qNameLocal=l}
 
 350 p_NCName :: P.Tokens s ~ TL.Text => Parser e s NCName
 
 351 p_NCName = P.label "NCName" $
 
 353          <$  P.notFollowedBy (P.satisfy $ not . XC.isXmlNCNameStartChar)
 
 354          <*> P.takeWhile1P Nothing XC.isXmlNCNameChar
 
 357 p_Reference :: P.Tokens s ~ TL.Text => Parser Error s TextLexeme
 
 359         TextLexemeCharRef <$> p_CharRef <|>
 
 360         TextLexemeEntityRef <$> p_EntityRef
 
 363 p_EntityRef :: P.Tokens s ~ TL.Text => Parser Error s EntityRef
 
 364 p_EntityRef = P.label "EntityRef" $ do
 
 365         ref <- P.char '&' *> p_NCName <* P.char ';'
 
 366         EntityRef ref <$> lookupEntityRef ref
 
 368         lookupEntityRef (NCName "lt"  ) = pure "<"
 
 369         lookupEntityRef (NCName "gt"  ) = pure ">"
 
 370         lookupEntityRef (NCName "amp" ) = pure "&"
 
 371         lookupEntityRef (NCName "apos") = pure "'"
 
 372         lookupEntityRef (NCName "quot") = pure "\""
 
 373         lookupEntityRef n = p_error $ Error_EntityRef_unknown n
 
 376 p_CharRef :: P.Tokens s ~ TL.Text => Parser Error s CharRef
 
 377 p_CharRef = P.label "CharRef" $
 
 379                 ref <- readHexadecimal
 
 381                  <*> P.some P.hexDigitChar
 
 387                  <*> P.some P.digitChar
 
 392                 let c = toEnum (fromInteger i) in
 
 393                 if i <= toInteger (fromEnum (maxBound::Char))
 
 395                 then pure $ CharRef c
 
 396                 else p_error $ Error_CharRef_invalid i
 
 398 readInt :: Integer -> String -> Integer
 
 399 readInt base digits =
 
 400         sign * List.foldl' acc 0 (List.concatMap digToInt digits1)
 
 406                  c:ds | c == '-'  -> (-1, ds)
 
 407                       | c == '+'  -> ( 1, ds)
 
 408                       | otherwise -> ( 1, digits)
 
 409         ord = toInteger . Char.ord
 
 411          | Char.isDigit c      = [ord c - ord '0']
 
 412          | Char.isAsciiLower c = [ord c - ord 'a' + 10]
 
 413          | Char.isAsciiUpper c = [ord c - ord 'A' + 10]
 
 416 readDecimal :: String -> Integer
 
 417 readDecimal = readInt 10
 
 419 readHexadecimal :: String -> Integer
 
 420 readHexadecimal = readInt 16
 
 423 p_Char :: P.Tokens s ~ TL.Text => Parser e s Char
 
 424 p_Char = P.label "Char" $ P.satisfy XC.isXmlCharCR <|> p_CRLF
 
 425 {-# INLINE p_Char #-}
 
 428 -- | Map '\r' and '\r\n' to '\n'.
 
 429 p_CRLF :: P.Tokens s ~ TL.Text => Parser e s Char
 
 430 p_CRLF = P.label "CRLF" $
 
 431         P.char '\r' *> P.option '\n' (P.char '\n')
 
 433 p_Space :: P.Tokens s ~ TL.Text => Parser e s Char
 
 434 p_Space = P.label "Space" $
 
 435         P.satisfy XC.isXmlSpaceCharCR <|> p_CRLF
 
 436 {-# INLINE p_Space #-}
 
 438 p_Spaces :: P.Tokens s ~ TL.Text => Parser e s ()
 
 439 p_Spaces = P.label "Spaces" $
 
 440         void $ P.takeWhileP Nothing XC.isXmlSpaceChar
 
 441 {-# INLINE p_Spaces #-}
 
 443 p_S :: P.Tokens s ~ TL.Text => Parser Error s XML
 
 444 p_S = P.label "Spaces" $
 
 445         (\ts -> TS.tree0 (NodeText . pure . TextLexemePlain . TL.concat <$> ts))
 
 446          <$> p_Sourced (P.some $
 
 447                 P.takeWhile1P Nothing XC.isXmlSpaceCharCR <|>
 
 448                 TL.singleton <$> p_CRLF)
 
 450 p_Spaces1 :: P.Tokens s ~ TL.Text => Parser e s ()
 
 451 p_Spaces1 = P.label "Spaces1" $
 
 452         void $ P.takeWhile1P Nothing XC.isXmlSpaceChar
 
 453 {-# INLINE p_Spaces1 #-}
 
 456 p_separator :: P.Tokens s ~ TL.Text => Char -> Parser e s ()
 
 457 p_separator c = P.try (() <$ p_Spaces <* P.char c) <* p_Spaces <?> [c]
 
 459 p_Eq :: P.Tokens s ~ TL.Text => Parser e s ()
 
 460 p_Eq = p_separator '=' <?> "Eq"