{-# LANGUAGE ConstraintKinds #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE Rank2Types #-} {-# LANGUAGE StrictData #-} {-# LANGUAGE TypeFamilies #-} module Language.Symantic.XML.Read.Parser where import Control.Applicative (Applicative(..), Alternative(..)) import Control.Monad (Monad(..)) import Data.Bool import Data.Char (Char) import Data.Default.Class (Default(..)) import Data.Either (Either(..)) import Data.Eq (Eq(..)) import Data.Function (($), (.)) import Data.Functor ((<$>)) import Data.Functor.Identity (Identity(..)) import Data.List.NonEmpty (NonEmpty(..), nonEmpty) import Data.Maybe (Maybe(..), fromMaybe) import Data.Ord (Ord(..)) import Data.Proxy (Proxy(..)) import Data.Semigroup (Semigroup(..)) import Data.String (IsString) import Data.Tuple (snd) import Prelude (Integer) import Text.Show (Show(..)) import qualified Control.Monad.Trans.Reader as R import qualified Data.HashMap.Strict as HM import qualified Data.Set as Set import qualified Data.Text.Lazy as TL import qualified Text.Megaparsec as P import qualified Text.Megaparsec.Char as P import Language.Symantic.XML.Document -- * Type 'Parser' -- | Convenient alias. type Parser e s a = Parsable e s a => R.ReaderT Reader (P.ParsecT e s Identity) a -- ** Type 'Parsable' type Parsable e s a = ( P.Stream s , P.Token s ~ Char , Ord e , IsString (P.Tokens s) , P.ShowErrorComponent e ) -- ** Type 'Reader' data Reader = Reader { reader_source :: FileSource , reader_ns_scope :: HM.HashMap NCName Namespace , reader_ns_default :: Namespace } deriving (Show) instance Default Reader where def = Reader { reader_source = pure def , reader_ns_scope = HM.fromList [ ("xml" , xmlns_xml) , ("xmlns", xmlns_xmlns) ] , reader_ns_default = "" } p_FilePos :: Parser e s FilePos p_FilePos = do pos :| _ <- P.statePos <$> P.getParserState return $ FilePos (P.unPos $ P.sourceLine pos) (P.unPos $ P.sourceColumn pos) p_Sourced :: Parser e s a -> Parser e s (Sourced FileSource a) p_Sourced pa = do Reader{reader_source} <- R.ask beginPos :| _ <- P.statePos <$> P.getParserState a <- pa fileRange_end <- p_FilePos let fileRange = FileRange { fileRange_file = P.sourceName beginPos , fileRange_begin = FilePos (P.unPos $ P.sourceLine beginPos) (P.unPos $ P.sourceColumn beginPos) , fileRange_end } return $ Sourced (setSource fileRange reader_source) a setSource :: FileRange -> FileSource -> FileSource setSource fileRange (_curr:|next) = fileRange :| next -- | Like 'p_Sourced' but uncoupled for more flexibility. p_SourcedBegin :: Parser e s a -> Parser e s a p_SourcedBegin pa = do currPos :| _ <- P.statePos <$> P.getParserState let fileRange_begin = FilePos (P.unPos $ P.sourceLine currPos) (P.unPos $ P.sourceColumn currPos) let fileRange = FileRange { fileRange_file = P.sourceName currPos , fileRange_begin , fileRange_end = fileRange_begin } (`R.local` pa) $ \ro@Reader{..} -> ro{ reader_source = setSource fileRange reader_source } -- | Only to be used within a 'p_SourcedBegin'. p_SourcedEnd :: Parser e s (a -> Sourced FileSource a) p_SourcedEnd = do fileRange_end <- p_FilePos Reader{..} <- R.ask return $ Sourced $ (\(curr:|path) -> curr{fileRange_end}:|path) reader_source -- ** Type 'StreamSourced' -- | Wrap 'TL.Text' to have a 'P.Stream' instance -- whose 'P.advance1' method abuses the tab width state -- to instead pass the line indent. -- This in order to report correct 'P.SourcePos' -- when parsing a 'Sourced' containing newlines. newtype StreamSourced = StreamSourced { unStreamSourced :: TL.Text } deriving (IsString,Eq,Ord) instance P.Stream StreamSourced where type Token StreamSourced = Char type Tokens StreamSourced = TL.Text take1_ (StreamSourced t) = (StreamSourced <$>) <$> P.take1_ t takeN_ n (StreamSourced t) = (StreamSourced <$>) <$> P.takeN_ n t takeWhile_ f (StreamSourced t) = StreamSourced <$> P.takeWhile_ f t tokensToChunk _s = P.tokensToChunk (Proxy::Proxy TL.Text) chunkToTokens _s = P.chunkToTokens (Proxy::Proxy TL.Text) chunkLength _s = P.chunkLength (Proxy::Proxy TL.Text) advance1 _s indent (P.SourcePos n line col) c = case c of '\n' -> P.SourcePos n (line <> P.pos1) indent _ -> P.SourcePos n line (col <> P.pos1) advanceN s indent = TL.foldl' (P.advance1 s indent) -- | Wrapper around |P.runParser'| -- to use given 'Sourced' as starting position. runParserOnSourced :: Parsable e StreamSourced a => Parser e StreamSourced a -> Sourced FileSource TL.Text -> Either (P.ParseError (P.Token StreamSourced) e) a runParserOnSourced p (Sourced (FileRange inp bp _ep :| path) s) = snd $ P.runParser' (R.runReaderT p ro <* P.eof) P.State { P.stateInput = StreamSourced s , P.statePos = pure $ P.SourcePos inp (P.mkPos $ filePos_line bp) indent , P.stateTabWidth = indent , P.stateTokensProcessed = 0 } where indent = P.mkPos $ filePos_column bp ro = def{ reader_source = fromMaybe (pure def) $ nonEmpty path } -- * Type 'Error' data Error = Error_CharRef_invalid Integer -- ^ Well-formedness constraint: Legal Character. -- -- Characters referred to using character references MUST match the production for Char. | Error_EntityRef_unknown NCName -- ^ Well-formedness constraint: Entity Declared -- -- In a document without any DTD, a document with only an internal DTD -- subset which contains no parameter entity references, or a document -- with " standalone='yes' ", for an entity reference that does not occur -- within the external subset or a parameter entity, the Name given in the -- entity reference MUST match that in an entity declaration that does not -- occur within the external subset or a parameter entity, except that -- well-formed documents need not declare any of the following entities: -- amp, lt, gt, apos, quot. The declaration of a general entity MUST -- precede any reference to it which appears in a default value in an -- attribute-list declaration. -- -- Note that non-validating processors are not obligated to read and -- process entity declarations occurring in parameter entities or in the -- external subset; for such documents, the rule that an entity must be -- declared is a well-formedness constraint only if standalone='yes'. | Error_Closing_tag_unexpected QName QName -- ^ Well-formedness constraint: Element Type Match. -- -- The Name in an element's end-tag MUST match the element type in the start-tag. | Error_Attribute_collision QName -- ^ Well-formedness constraint: Unique Att Spec. -- -- An attribute name MUST NOT appear more than once in the same start-tag or empty-element tag. | Error_PI_reserved PName -- ^ The target names " XML ", " xml ", and so on are reserved for standardization. | Error_Namespace_prefix_unknown NCName -- ^ Namespace constraint: Prefix Declared -- -- The namespace prefix, unless it is xml or xmlns, MUST have been declared in a namespace declaration attribute in either the start-tag of the element where the prefix is used or in an ancestor element (i.e., an element in whose content the prefixed markup occurs). | Error_Namespace_empty NCName -- ^ Namespace constraint: No Prefix Undeclaring -- -- In a namespace declaration for a prefix (i.e., where the NSAttName is a PrefixedAttName), the attribute value MUST NOT be empty. | Error_Namespace_reserved Namespace | Error_Namespace_reserved_prefix NCName -- ^ Namespace constraint: Reserved Prefixes and Namespace Names -- -- The prefix xml is by definition bound to the namespace name -- http://www.w3.org/XML/1998/namespace. It MAY, but need not, be -- declared, and MUST NOT be bound to any other namespace name. Other -- prefixes MUST NOT be bound to this namespace name, and it MUST NOT be -- declared as the default namespace. -- -- The prefix xmlns is used only to declare namespace bindings and is by -- definition bound to the namespace name http://www.w3.org/2000/xmlns/. -- It MUST NOT be declared . Other prefixes MUST NOT be bound to this -- namespace name, and it MUST NOT be declared as the default namespace. -- Element names MUST NOT have the prefix xmlns. -- -- All other prefixes beginning with the three-letter sequence x, m, l, in -- any case combination, are reserved. This means that: -- -- - users SHOULD NOT use them except as defined by later specifications -- - processors MUST NOT treat them as fatal errors. deriving (Eq,Ord,Show) instance P.ShowErrorComponent Error where showErrorComponent = show -- * Helpers p_error :: e -> Parser e s a p_error = P.fancyFailure . Set.singleton . P.ErrorCustom p_quoted :: P.Tokens s ~ TL.Text => (Char -> Parser e s a) -> Parser e s a p_quoted p = P.between (P.char '"') (P.char '"') (p '"') <|> P.between (P.char '\'') (P.char '\'') (p '\'') p_until :: P.Tokens s ~ TL.Text => (Char -> Bool) -> (Char, TL.Text) -> Parser e s TL.Text p_until content (end, end_) = (TL.concat <$>) $ P.many $ P.takeWhile1P Nothing (\c -> content c && c /= end) <|> P.try (TL.singleton <$> P.char end <* P.notFollowedBy (P.string end_)) p_until1 :: P.Tokens s ~ TL.Text => (Char -> Bool) -> (Char, TL.Text) -> Parser e s TL.Text p_until1 content (end, end_) = (TL.concat <$>) $ P.some $ P.takeWhile1P Nothing (\c -> content c && c /= end) <|> P.try (TL.singleton <$> P.char end <* P.notFollowedBy (P.string end_))