{-# LANGUAGE ConstraintKinds #-} {-# LANGUAGE InstanceSigs #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE Rank2Types #-} {-# LANGUAGE UndecidableInstances #-} {-# OPTIONS_GHC -fno-warn-orphans #-} module Symantic.XML.Read where import Control.Applicative as Alternative (Applicative(..), Alternative(..), optional) import Control.Monad (Monad(..)) import Control.Monad.Trans.Class (MonadTrans(..)) import Data.Bool import Data.Either (Either(..)) import Data.Eq (Eq(..)) import Data.Foldable (Foldable(..), all) import Data.Function (($), (.), const, id, flip) import Data.Functor (Functor(..), (<$>)) import Data.List.NonEmpty (NonEmpty(..)) import Data.Maybe (Maybe(..), maybe, isNothing, maybeToList) import Data.Monoid (Monoid(..)) import Data.Ord (Ord(..)) import Data.Proxy (Proxy(..)) import Data.Semigroup (Semigroup(..)) import Data.String (String, IsString(..)) import Data.Tuple (fst) import Data.Void (Void) import Numeric.Natural (Natural) import Prelude ((+), Integer, undefined) import System.IO (IO, FilePath) import Text.Show (Show(..)) import qualified Data.Char as Char import qualified Data.HashMap.Strict as HM import qualified Data.List as List import qualified Data.List.NonEmpty as NonEmpty import qualified Data.Sequence as Seq import qualified Data.Set as Set import qualified Data.Text as Text import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Builder as TLB import qualified Data.TreeSeq.Strict as TS import qualified Text.Megaparsec as P import qualified Text.Megaparsec.Char as P import qualified Text.Megaparsec.Char.Lexer as P import qualified Text.Megaparsec.Internal as P import Symantic.Base import Symantic.XML.Language import Symantic.XML.RelaxNG.Language import Symantic.XML.Tree -- | Main reading function. read :: Read FileSourced Void (x->x) a -> FilePath -> IO (Either String a) read rng path = readTree path >>= \case Left err -> return $ Left err Right xml -> return $ runRead rng xml -- | Like 'readWithRelaxNG' but on a 'FileSourcedTrees'. runRead :: Read FileSourced Void (x->x) a -> FileSourcedTrees -> Either String a runRead rng xml = case P.runParser (unRead rng) "" (mempty, xml) of Left err -> Left $ foldMap parseErrorTextPretty $ P.bundleErrors err Right a -> Right $ a id -- * Type 'ReadStream' type ReadStream src = ( HM.HashMap QName (src EscapedAttr) , Trees src ) -- | Take one 'Node' from the 'ReadStream', -- or fallback to an attribute, or 'Nothing'. -- -- Use 'pTokenAttr' to take only attributes. take1_ :: UnSource src => (Node (src EscapedAttr) -> Bool) -> ReadStream src -> Maybe ( P.Token (ReadStream src) , ReadStream src ) take1_ isIgnoredNode s@(attrs, trees) = go trees where go trs = case Seq.viewl trs of Seq.EmptyL | null attrs -> Nothing | otherwise -> Just (Left attrs, s) t Seq.:< ts -> case unSource (TS.unTree t) of n | isIgnoredNode n -> go ts | otherwise -> Just (Right t, (attrs, ts)) -- Note that having an ignored node -- can split a text into two 'NodeText's. -- Not sure if it would be better to unify them. -- ** Type 'ReadConstraints' -- | Convenient alias to be less verbose. type ReadConstraints src = ( Ord (src (Node (src EscapedAttr))) , Ord (src EscapedAttr) , UnSource src , NoSource src , SourceOf src , Show (Source src) , Show (src String) , Functor src ) instance ReadConstraints src => P.Stream (ReadStream src) where type Token (ReadStream src) = Either (HM.HashMap QName (src EscapedAttr)) (Tree src) type Tokens (ReadStream src) = ReadStream src take1_ = take1_ isIgnoredNode where isIgnoredNode = \case NodeComment{} -> True NodePI{} -> True _ -> False showTokens _s toks = orList $ mconcat $ toList $ showTree <$> toks where showSource :: src String -> String showSource sa = let src = sourceOf sa in if nullSource @src src then unSource sa else unSource sa<>" at "<>show src showTree = \case Left as -> (\(an, av) -> showSource $ ("(attribute "<>show an<>")") <$ av) <$> List.sortOn fst (HM.toList as) Right (TS.Tree nod ts) -> pure $ showSource . (<$ nod) $ case unSource nod of NodeElem n _as -> "(element "<>show n<>")" NodeText{} -> case Seq.viewl ts of TS.Tree tn _ Seq.:< _ | NodeText lit <- unSource tn -> -- Abuse the encoding to detect expected 'literal' -- using nested 'NodeText' "\""<>TL.unpack (unescapeText lit)<>"\"" _ -> "text" NodeComment _c -> "comment" NodePI n _t -> "(processing-instruction "<>show n<>")" NodeCDATA _t -> "cdata" -- Useless methods for validating an XML AST takeN_ = undefined tokensToChunk = undefined chunkToTokens = undefined chunkLength = undefined takeWhile_ = undefined reachOffset = undefined reachOffsetNoLine = undefined -- * Type 'Read' newtype Read src e f k = Read { unRead :: P.Parsec e (ReadStream src) (f->k) } instance ( Ord err , ReadConstraints src ) => Emptyable (Read src err) where empty = Read $ id <$ P.eof instance ( Ord err , ReadConstraints src ) => Unitable (Read src err) where unit = Read $ return ($ ()) instance ( Ord err , ReadConstraints src ) => Voidable (Read src err) where void _a (Read x) = Read $ (\a2b2k b -> a2b2k (\_a -> b)) <$> x instance ( Ord err , ReadConstraints src ) => Constant (Read src err) where constant a = Read $ return ($ a) instance ( Ord err , ReadConstraints src ) => Permutable (Read src err) where type Permutation (Read src err) = ReadPerm src err permutable (ReadPerm ma p) = Read $ do r <- Alternative.optional p unRead $ case r of Just perms -> permutable perms Nothing -> Read $ maybe -- Not 'empty' here so that 'P.TrivialError' -- has the unexpected token. (P.token (const Nothing) Set.empty) return ma noPerm = ReadPerm Nothing Alternative.empty perm (Read x) = ReadPerm Nothing $ (<$> x) $ \a -> ReadPerm (Just a) Alternative.empty permWithDefault d (Read x) = ReadPerm (Just ($ d)) $ (<$> x) $ \a -> ReadPerm (Just a) Alternative.empty instance ( Ord err , ReadConstraints src ) => Composable (Read src err) where Read x <.> Read y = Read $ x >>= \a2b -> (. a2b) <$> y instance ( Ord err , ReadConstraints src ) => Tupable (Read src err) where Read x <:> Read y = Read $ consCont (,) <$> x <*> y instance ( Ord err , ReadConstraints src ) => Eitherable (Read src err) where Read x <+> Read y = Read $ mapCont Left <$> P.try x <|> mapCont Right <$> y {- instance ( Ord err , ReadConstraints src ) => Routable (Read src err) where Read x Read y = Read $ (\a2k (a:!:_b) -> a2k a) <$> P.try x <|> (\b2k (_a:!:b) -> b2k b) <$> y -} instance ( Ord err , ReadConstraints src ) => Optionable (Read src err) where option (Read x) = Read $ P.try x <|> return id optional (Read x) = Read $ mapCont Just <$> P.try x <|> return ($ Nothing) instance ( Ord err , ReadConstraints src ) => Repeatable (Read src err) where many0 (Read x) = Read $ concatCont <$> many x many1 (Read x) = Read $ concatCont <$> some x instance ( Ord err , ReadConstraints src ) => Dimapable (Read src err) where dimap a2b _b2a (Read r) = Read $ (\k b2k -> k (b2k . a2b)) <$> r instance ( Ord err , ReadConstraints src ) => Dicurryable (Read src err) where dicurry (_::proxy args) constr _destr (Read x) = Read $ do f <- x return $ \r2k -> f (mapresultN @args r2k constr) instance ( Ord err , ReadConstraints src , Textable (Read src err) ) => XML (Read src err) where namespace _nm _ns = Read (return id) element n p = Read $ do s <- P.token check $ Set.singleton $ P.Tokens $ pure expected unRead $ readNested p s where expected = Right $ TS.tree0 $ noSource $ NodeElem n mempty check = \case Right (TS.Tree nod ts) | NodeElem e as <- unSource nod , e == n -> Just (removeXMLNS as, removeSpaces ts) _ -> Nothing attribute n p = Read $ do v <- pTokenAttr n $ Set.singleton $ P.Tokens $ pure expected unRead $ readNested p (mempty, pure (TS.tree0 (NodeText . EscapedText . unEscapedAttr <$> v))) -- Cast 'EscapedAttr' into 'EscapedText' -- because it will be read, not written, -- hence only given to 'unescapeText' -- which is the same than 'unescapeAttr'. where expected = Left $ HM.singleton n $ noSource "" literal lit = Read $ do P.token check $ Set.singleton $ P.Tokens $ pure expected where expected = Right $ TS.Tree (noSource $ NodeText "") (pure $ TS.tree0 (noSource $ NodeText $ escapeText lit)) check = \case Right (Tree0 nod) | NodeText t <- unSource nod , unescapeText t == lit -> Just id _ -> Nothing pi n = Read $ do v <- pTokenPI n $ Set.singleton $ P.Tokens $ pure expected return ($ v) where expected = Right $ TS.tree0 $ noSource $ NodePI n mempty cdata = Read $ P.token check $ Set.singleton $ P.Tokens $ pure expected where expected = Right $ TS.tree0 $ noSource $ NodeCDATA mempty check = \case Right (Tree0 nod) | NodeCDATA v <- unSource nod -> Just ($ v) _ -> Nothing comment = Read $ P.token check $ Set.singleton $ P.Tokens $ pure expected where expected = Right $ TS.tree0 $ noSource $ NodeComment mempty check = \case Right (Tree0 nod) | NodeComment v <- unSource nod -> Just ($ v) _ -> Nothing instance Ord err => Textable (Read FileSourced err) where type TextConstraint (Read FileSourced err) a = DecodeText a text :: forall a k repr. repr ~ Read FileSourced err => TextConstraint repr a => repr (a->k) k text = Read $ do Sourced (FileSource (src :| _)) txt <- P.token check $ Set.singleton $ P.Tokens $ pure expected case P.runParser @Void (decodeText @a <* P.eof) "" (unescapeText txt) of Right a -> return ($ a) Left errs -> P.fancyFailure $ Set.singleton $ P.ErrorFail $ (`foldMap` P.bundleErrors errs) $ \err -> fileRange_path src <> ":" <> show (fileRange_begin src <> Offset (P.errorOffset err)) <> "\n" <> P.parseErrorTextPretty err where expected = Right $ TS.tree0 $ noSource $ NodeText $ EscapedText mempty check = \case Right (Tree0 nod) | NodeText t <- unSource nod -> Just (t <$ nod) _ -> Nothing instance ( Ord err , ReadConstraints src , Textable (Read src err) , Definable (Read src err) ) => RelaxNG (Read src err) where elementMatch nc p = Read $ do (n,s) <- P.token check $ Set.singleton $ P.Tokens $ pure expected ((\a2k n2a -> a2k (n2a n)) <$>) $ unRead (readNested p s) where expected = Right $ TS.tree0 $ noSource $ NodeElem (qName (NCName (TLB.toLazyText (textify (mempty::Namespaces NCName,(infixN0,SideL),nc))))) mempty check = \case Right (TS.Tree nod ts) | NodeElem n as <- unSource nod , matchNameClass nc n -> Just (n, (removeXMLNS as, removeSpaces ts)) _ -> Nothing attributeMatch nc p = Read $ do (an,av) <- pTokenAttrNameClass nc $ Set.singleton $ P.Tokens $ pure expected ((\a2k n2a -> a2k (n2a an)) <$>) $ unRead $ readNested p (mempty, pure (TS.tree0 (NodeText . EscapedText . unEscapedAttr <$> av))) -- See comment in 'attribute' about the cast to 'EscapedText' here. where expected = Left $ HM.singleton (qName (NCName n)) $ noSource "" where n = TLB.toLazyText $ textify (mempty::Namespaces NCName,(infixN0,SideL),nc) instance Ord err => Definable (Read FileSourced err) where define n = Read . P.label n . unRead -- ** Type 'ReadPerm' data ReadPerm (src :: * -> *) err a k = ReadPerm { readPerm_result :: !(Maybe (a->k)) , readPerm_parser :: P.Parsec err (ReadStream src) (ReadPerm src err a k) } instance (Ord err, ReadConstraints src) => Dimapable (ReadPerm src err) where dimap a2b b2a (ReadPerm a ma) = ReadPerm (merge <$> a) (dimap a2b b2a `fmap` ma) where merge = \a2k2k b2k -> a2k2k $ b2k . a2b instance (Ord err, ReadConstraints src) => Dicurryable (ReadPerm src err) where dicurry :: forall args r k proxy. CurryN args => proxy args -> (args-..->r) -> -- construction (r->Tuples args) -> -- destruction ReadPerm src err (args-..->k) k -> ReadPerm src err (r->k) k dicurry px constr destr (ReadPerm a ma) = ReadPerm (merge <$> a) (dicurry px constr destr `fmap` ma) where merge args2k2k = \r2k -> args2k2k $ mapresultN @args r2k constr instance (Ord err, ReadConstraints src) => Composable (ReadPerm src err) where lhs@(ReadPerm da pa) <.> rhs@(ReadPerm db pb) = ReadPerm a $ lhsAlt <|> rhsAlt where lhsAlt = (<.> rhs) <$> pa rhsAlt = (lhs <.>) <$> pb a = flip (.) <$> da <*> db instance (Ord err, ReadConstraints src) => Tupable (ReadPerm src err) where lhs@(ReadPerm da pa) <:> rhs@(ReadPerm db pb) = ReadPerm a (lhsAlt <|> rhsAlt) where lhsAlt = (<:> rhs) <$> pa rhsAlt = (lhs <:>) <$> pb a = consCont (,) <$> da <*> db instance Definable (ReadPerm src err) where define _n = id -- * Utils concatCont :: [(a->k)->k] -> ([a]->k)->k concatCont = List.foldr (consCont (:)) ($ []) {-# INLINE concatCont #-} consCont :: (a->b->c) -> ((a->k)->k) -> ((b->k)->k) -> (c->k)->k consCont ab2c a2k2k b2k2k = \c2k -> a2k2k $ \a -> b2k2k $ \b -> c2k (ab2c a b) {-# INLINE consCont #-} mapCont :: (a->b) -> ((a->k)->k) -> ((b->k)->k) mapCont a2b a2k2k = \b2k -> a2k2k (b2k . a2b) {-# INLINE mapCont #-} -- | An adaptation of megaparsec's 'pToken', -- to handle 'attribute' properly. pTokenAttr :: forall e m src. ReadConstraints src => QName -> Set.Set (P.ErrorItem (P.Token (ReadStream src))) -> P.ParsecT e (ReadStream src) m (src EscapedAttr) pTokenAttr an ps = P.ParsecT $ \st@(P.State s@(attrs,trees) o pst de) cok _ _ eerr -> case HM.lookup an attrs of Just av -> cok av (P.State (HM.delete an attrs, trees) (o+1) pst de) mempty Nothing -> eerr (P.TrivialError o us ps) st where us = case P.take1_ s of Nothing -> pure P.EndOfInput Just (t,_ts) -> (Just . P.Tokens . pure) t {-# INLINE pTokenAttr #-} -- | An adaptation of megaparsec's 'pToken', -- to handle 'attributeMatch' properly. pTokenAttrNameClass :: forall e m src. ReadConstraints src => NameClass -> Set.Set (P.ErrorItem (P.Token (ReadStream src))) -> P.ParsecT e (ReadStream src) m (QName, src EscapedAttr) pTokenAttrNameClass nc ps = P.ParsecT $ \st@(P.State s@(attrs,trees) o pst de) cok _ _ eerr -> case HM.toList attrs of a@(an,_av):_ | matchNameClass nc an -> cok a (P.State (HM.delete an attrs, trees) (o+1) pst de) mempty _ -> eerr (P.TrivialError o us ps) st where us = case P.take1_ s of Nothing -> pure P.EndOfInput Just (t,_ts) -> (Just . P.Tokens . pure) t {-# INLINE pTokenAttrNameClass #-} -- | An adaptation of megaparsec's 'pToken', -- to handle 'pi' since 'NodePI' is ignored by 'P.take1_'. pTokenPI :: forall e m src. UnSource src => PName -> Set.Set (P.ErrorItem (P.Token (ReadStream src))) -> P.ParsecT e (ReadStream src) m TL.Text pTokenPI n ps = P.ParsecT $ \st@(P.State s o pst de) cok _ _ eerr -> case take1 s of Nothing -> eerr (P.TrivialError o us ps) st where us = pure P.EndOfInput Just (c, cs) | Right (TS.Tree nod _) <- c , NodePI pn pv <- unSource nod , pn == n -> cok pv (P.State cs (o+1) pst de) mempty | otherwise -> eerr (P.TrivialError o us ps) st where us = case take1 s of Nothing -> pure P.EndOfInput Just (t,_ts) -> (Just . P.Tokens . pure) t where take1 = take1_ isIgnoredNode where isIgnoredNode = \case NodeComment{} -> True _ -> False removeXMLNS :: HM.HashMap QName (src EscapedAttr) -> HM.HashMap QName (src EscapedAttr) removeXMLNS = HM.filterWithKey $ \an _av -> case an of QName "" "xmlns" -> False QName ns _l -> ns /= xmlns_xmlns removeSpaces :: UnSource src => Trees src -> Trees src removeSpaces xs = if (`all` xs) $ \case TS.Tree nod _ts | NodeText (EscapedText et) <- unSource nod -> all (\case EscapedPlain t -> TL.all Char.isSpace t _ -> False) et _ -> True then (`Seq.filter` xs) $ \case TS.Tree nod _ts | NodeText EscapedText{} <- unSource nod -> False _ -> True else xs -- | @readNested v xs@ returns a 'Read' parsing @xs@ entirely with @v@, -- updating 'P.stateOffset' and re-raising any exception. readNested :: Ord err => ReadConstraints src => Read src err f a -> ReadStream src -> Read src err f a readNested (Read p) stateInput = Read $ do st <- P.getParserState (st', res) <- lift $ P.runParserT' (p <* P.eof) st { P.stateInput , P.stateOffset = P.stateOffset st } P.updateParserState (\s -> s{P.stateOffset = P.stateOffset st'}) case res of Right a -> return a Left (P.ParseErrorBundle errs _) -> case NonEmpty.head errs of P.TrivialError _o us es -> P.failure us es P.FancyError _o es -> P.fancyFailure es -- * Class 'DecodeText' class DecodeText a where decodeText :: P.Parsec Void TL.Text a instance DecodeText String where decodeText = TL.unpack . fst <$> P.match (P.skipMany P.anySingle) instance DecodeText Text.Text where decodeText = TL.toStrict . fst <$> P.match (P.skipMany P.anySingle) instance DecodeText TL.Text where decodeText = fst <$> P.match (P.skipMany P.anySingle) instance DecodeText Bool where decodeText = False <$ (P.string "false" <|> P.string "0") <|> True <$ (P.string "true" <|> P.string "1") instance DecodeText Integer where decodeText = P.signed (return ()) P.decimal instance DecodeText Natural where decodeText = P.optional (P.char '+') *> P.decimal -- * Megaparsec adaptations -- | Rewrite 'P.parseErrorTextPretty' to keep 'Ord' of 'node's. parseErrorTextPretty :: P.ShowErrorComponent err => P.ParseError (ReadStream FileSourced) err -> String parseErrorTextPretty (P.TrivialError o us ps) = if isNothing us && Set.null ps then "unknown parse error\n" else (case us of Just P.Tokens{} -> "" _ -> -- FIXME: this is not informative enough, -- but P.EndOfInput can not carry a source location, -- and retraversing the XML tree cannot be done -- exactly as the parser did only knowing the Offset, -- because of attributes being permutable. "node #"<>show o<>"\n" ) <> messageItemsPretty "unexpected " (showErrorItem px <$> maybeToList us) <> messageItemsPretty "expecting " (showErrorItem px <$> Set.toAscList ps) where px = Proxy :: Proxy s parseErrorTextPretty err = P.parseErrorTextPretty err messageItemsPretty :: String -> [String] -> String messageItemsPretty prefix ts | null ts = "" | otherwise = prefix <> orList ts <> "\n" orList :: IsString s => Monoid s => [s] -> s orList [] = mempty orList [x] = x orList [x,y] = x <> " or " <> y orList xs = mconcat (List.intersperse ", " (List.init xs)) <> ", or " <> List.last xs showErrorItem :: (s ~ ReadStream (Sourced (FileSource Offset))) => Proxy s -> P.ErrorItem (P.Token s) -> String showErrorItem px = \case P.Tokens ts -> P.showTokens px ts P.Label label -> NonEmpty.toList label P.EndOfInput -> "end-of-node"