1 {-# LANGUAGE ConstraintKinds #-}
2 {-# LANGUAGE InstanceSigs #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE UndecidableInstances #-}
6 {-# OPTIONS_GHC -fno-warn-orphans #-}
7 module Symantic.XML.Read where
9 import Control.Applicative as Alternative (Applicative(..), Alternative(..), optional)
10 import Control.Monad (Monad(..))
11 import Control.Monad.Trans.Class (MonadTrans(..))
13 import Data.Either (Either(..))
14 import Data.Eq (Eq(..))
15 import Data.Foldable (Foldable(..), all)
16 import Data.Function (($), (.), const, id, flip)
17 import Data.Functor (Functor(..), (<$>))
18 import Data.List.NonEmpty (NonEmpty(..))
19 import Data.Maybe (Maybe(..), maybe, isNothing, maybeToList)
20 import Data.Monoid (Monoid(..))
21 import Data.Ord (Ord(..))
22 import Data.Proxy (Proxy(..))
23 import Data.Semigroup (Semigroup(..))
24 import Data.String (String, IsString(..))
25 import Data.Tuple (fst)
26 import Data.Void (Void)
27 import Numeric.Natural (Natural)
28 import Prelude ((+), Integer, undefined)
29 import System.IO (IO, FilePath)
30 import Text.Show (Show(..))
31 import qualified Data.Char as Char
32 import qualified Data.HashMap.Strict as HM
33 import qualified Data.List as List
34 import qualified Data.List.NonEmpty as NonEmpty
35 import qualified Data.Sequence as Seq
36 import qualified Data.Set as Set
37 import qualified Data.Text as Text
38 import qualified Data.Text.Lazy as TL
39 import qualified Data.Text.Lazy.Builder as TLB
40 import qualified Data.TreeSeq.Strict as TS
41 import qualified Text.Megaparsec as P
42 import qualified Text.Megaparsec.Char as P
43 import qualified Text.Megaparsec.Char.Lexer as P
44 import qualified Text.Megaparsec.Internal as P
47 import Symantic.XML.Language
48 import Symantic.XML.RelaxNG.Language
49 import Symantic.XML.Tree
51 -- | Main reading function.
53 Read FileSourced Void (x->x) a ->
57 readTree path >>= \case
58 Left err -> return $ Left err
59 Right xml -> return $ runRead rng xml
61 -- | Like 'readWithRelaxNG' but on a 'FileSourcedTrees'.
63 Read FileSourced Void (x->x) a ->
67 case P.runParser (unRead rng) "" (mempty, xml) of
68 Left err -> Left $ foldMap parseErrorTextPretty $ P.bundleErrors err
69 Right a -> Right $ a id
71 -- * Type 'ReadStream'
73 ( HM.HashMap QName (src EscapedAttr)
77 -- | Take one 'Node' from the 'ReadStream',
78 -- or fallback to an attribute, or 'Nothing'.
80 -- Use 'pTokenAttr' to take only attributes.
83 (Node (src EscapedAttr) -> Bool) ->
85 Maybe ( P.Token (ReadStream src)
87 take1_ isIgnoredNode s@(attrs, trees) =
93 | null attrs -> Nothing
94 | otherwise -> Just (Left attrs, s)
96 case unSource (TS.unTree t) of
97 n | isIgnoredNode n -> go ts
98 | otherwise -> Just (Right t, (attrs, ts))
99 -- Note that having an ignored node
100 -- can split a text into two 'NodeText's.
101 -- Not sure if it would be better to unify them.
103 -- ** Type 'ReadConstraints'
104 -- | Convenient alias to be less verbose.
105 type ReadConstraints src =
106 ( Ord (src (Node (src EscapedAttr)))
107 , Ord (src EscapedAttr)
116 instance ReadConstraints src => P.Stream (ReadStream src) where
117 type Token (ReadStream src) = Either
118 (HM.HashMap QName (src EscapedAttr))
120 type Tokens (ReadStream src) = ReadStream src
121 take1_ = take1_ isIgnoredNode
123 isIgnoredNode = \case
124 NodeComment{} -> True
130 toList $ showTree <$> toks
132 showSource :: src String -> String
134 let src = sourceOf sa in
135 if nullSource @src src
137 else unSource sa<>" at "<>show src
140 (\(an, av) -> showSource $ ("(attribute "<>show an<>")") <$ av)
141 <$> List.sortOn fst (HM.toList as)
142 Right (TS.Tree nod ts) ->
144 showSource . (<$ nod) $
146 NodeElem n _as -> "(element "<>show n<>")"
149 TS.Tree tn _ Seq.:< _
150 | NodeText lit <- unSource tn ->
151 -- Abuse the encoding to detect expected 'literal'
152 -- using nested 'NodeText'
153 "\""<>TL.unpack (unescapeText lit)<>"\""
155 NodeComment _c -> "comment"
156 NodePI n _t -> "(processing-instruction "<>show n<>")"
157 NodeCDATA _t -> "cdata"
158 -- Useless methods for validating an XML AST
160 tokensToChunk = undefined
161 chunkToTokens = undefined
162 chunkLength = undefined
163 takeWhile_ = undefined
164 reachOffset = undefined
165 reachOffsetNoLine = undefined
168 newtype Read src e f k
170 { unRead :: P.Parsec e (ReadStream src) (f->k) }
174 , ReadConstraints src
175 ) => Emptyable (Read src err) where
176 empty = Read $ id <$ P.eof
179 , ReadConstraints src
180 ) => Unitable (Read src err) where
181 unit = Read $ return ($ ())
184 , ReadConstraints src
185 ) => Voidable (Read src err) where
186 void _a (Read x) = Read $
187 (\a2b2k b -> a2b2k (\_a -> b)) <$> x
190 , ReadConstraints src
191 ) => Constant (Read src err) where
192 constant a = Read $ return ($ a)
195 , ReadConstraints src
196 ) => Permutable (Read src err) where
197 type Permutation (Read src err) =
199 permutable (ReadPerm ma p) = Read $ do
200 r <- Alternative.optional p
203 Just perms -> permutable perms
206 -- Not 'empty' here so that 'P.TrivialError'
207 -- has the unexpected token.
208 (P.token (const Nothing) Set.empty)
210 noPerm = ReadPerm Nothing Alternative.empty
212 ReadPerm Nothing $ (<$> x) $ \a ->
213 ReadPerm (Just a) Alternative.empty
214 permWithDefault d (Read x) =
215 ReadPerm (Just ($ d)) $ (<$> x) $ \a ->
216 ReadPerm (Just a) Alternative.empty
219 , ReadConstraints src
220 ) => Composable (Read src err) where
221 Read x <.> Read y = Read $
222 x >>= \a2b -> (. a2b) <$> y
225 , ReadConstraints src
226 ) => Tupable (Read src err) where
227 Read x <:> Read y = Read $
228 consCont (,) <$> x <*> y
231 , ReadConstraints src
232 ) => Eitherable (Read src err) where
233 Read x <+> Read y = Read $
234 mapCont Left <$> P.try x <|>
239 , ReadConstraints src
240 ) => Routable (Read src err) where
241 Read x <!> Read y = Read $
242 (\a2k (a:!:_b) -> a2k a) <$> P.try x <|>
243 (\b2k (_a:!:b) -> b2k b) <$> y
247 , ReadConstraints src
248 ) => Optionable (Read src err) where
249 option (Read x) = Read $
250 P.try x <|> return id
251 optional (Read x) = Read $
252 mapCont Just <$> P.try x <|>
256 , ReadConstraints src
257 ) => Repeatable (Read src err) where
258 many0 (Read x) = Read $ concatCont <$> many x
259 many1 (Read x) = Read $ concatCont <$> some x
262 , ReadConstraints src
263 ) => Dimapable (Read src err) where
264 dimap a2b _b2a (Read r) =
265 Read $ (\k b2k -> k (b2k . a2b)) <$> r
268 , ReadConstraints src
269 ) => Dicurryable (Read src err) where
270 dicurry (_::proxy args) constr _destr (Read x) = Read $ do
273 f (mapresultN @args r2k constr)
276 , ReadConstraints src
277 , Textable (Read src err)
278 ) => XML (Read src err) where
279 namespace _nm _ns = Read (return id)
280 element n p = Read $ do
281 s <- P.token check $ Set.singleton $
282 P.Tokens $ pure expected
283 unRead $ readNested p s
285 expected = Right $ TS.tree0 $ noSource $ NodeElem n mempty
287 Right (TS.Tree nod ts)
288 | NodeElem e as <- unSource nod
290 -> Just (removeXMLNS as, removeSpaces ts)
292 attribute n p = Read $ do
293 v <- pTokenAttr n $ Set.singleton $
294 P.Tokens $ pure expected
295 unRead $ readNested p
296 (mempty, pure (TS.tree0 (NodeText . EscapedText . unEscapedAttr <$> v)))
297 -- Cast 'EscapedAttr' into 'EscapedText'
298 -- because it will be read, not written,
299 -- hence only given to 'unescapeText'
300 -- which is the same than 'unescapeAttr'.
302 expected = Left $ HM.singleton n $ noSource ""
303 literal lit = Read $ do
304 P.token check $ Set.singleton $ P.Tokens $ pure expected
307 TS.Tree (noSource $ NodeText "")
308 (pure $ TS.tree0 (noSource $ NodeText $ escapeText lit))
311 | NodeText t <- unSource nod
312 , unescapeText t == lit
316 v <- pTokenPI n $ Set.singleton $
317 P.Tokens $ pure expected
320 expected = Right $ TS.tree0 $ noSource $ NodePI n mempty
322 P.token check $ Set.singleton $
323 P.Tokens $ pure expected
325 expected = Right $ TS.tree0 $ noSource $ NodeCDATA mempty
328 | NodeCDATA v <- unSource nod
332 P.token check $ Set.singleton $
333 P.Tokens $ pure expected
335 expected = Right $ TS.tree0 $ noSource $ NodeComment mempty
338 | NodeComment v <- unSource nod
341 instance Ord err => Textable (Read FileSourced err) where
342 type TextConstraint (Read FileSourced err) a =
344 text :: forall a k repr.
345 repr ~ Read FileSourced err =>
346 TextConstraint repr a => repr (a->k) k
348 Sourced (FileSource (src :| _)) txt <-
349 P.token check $ Set.singleton $ P.Tokens $ pure expected
350 case P.runParser @Void (decodeText @a <* P.eof) "" (unescapeText txt) of
351 Right a -> return ($ a)
352 Left errs -> P.fancyFailure $ Set.singleton $ P.ErrorFail $
353 (`foldMap` P.bundleErrors errs) $ \err ->
354 fileRange_path src <> ":" <>
355 show (fileRange_begin src <> Offset (P.errorOffset err)) <> "\n" <>
356 P.parseErrorTextPretty err
358 expected = Right $ TS.tree0 $ noSource $ NodeText $ EscapedText mempty
361 | NodeText t <- unSource nod
366 , ReadConstraints src
367 , Textable (Read src err)
368 , Definable (Read src err)
369 ) => RelaxNG (Read src err) where
370 elementMatch nc p = Read $ do
371 (n,s) <- P.token check $ Set.singleton $
372 P.Tokens $ pure expected
373 ((\a2k n2a -> a2k (n2a n)) <$>) $
374 unRead (readNested p s)
376 expected = Right $ TS.tree0 $ noSource $
377 NodeElem (qName (NCName (TLB.toLazyText
378 (textify (mempty::Namespaces NCName,(infixN0,SideL),nc)))))
381 Right (TS.Tree nod ts)
382 | NodeElem n as <- unSource nod
383 , matchNameClass nc n
384 -> Just (n, (removeXMLNS as, removeSpaces ts))
386 attributeMatch nc p = Read $ do
387 (an,av) <- pTokenAttrNameClass nc $ Set.singleton $
388 P.Tokens $ pure expected
389 ((\a2k n2a -> a2k (n2a an)) <$>) $
390 unRead $ readNested p
391 (mempty, pure (TS.tree0 (NodeText . EscapedText . unEscapedAttr <$> av)))
392 -- See comment in 'attribute' about the cast to 'EscapedText' here.
394 expected = Left $ HM.singleton (qName (NCName n)) $ noSource ""
395 where n = TLB.toLazyText $ textify (mempty::Namespaces NCName,(infixN0,SideL),nc)
396 instance Ord err => Definable (Read FileSourced err) where
397 define n = Read . P.label n . unRead
399 -- ** Type 'ReadPerm'
400 data ReadPerm (src :: * -> *) err a k
402 { readPerm_result :: !(Maybe (a->k))
403 , readPerm_parser :: P.Parsec err (ReadStream src) (ReadPerm src err a k)
407 (Ord err, ReadConstraints src) =>
408 Dimapable (ReadPerm src err) where
409 dimap a2b b2a (ReadPerm a ma) =
410 ReadPerm (merge <$> a)
411 (dimap a2b b2a `fmap` ma)
412 where merge = \a2k2k b2k -> a2k2k $ b2k . a2b
414 (Ord err, ReadConstraints src) =>
415 Dicurryable (ReadPerm src err) where
417 forall args r k proxy.
420 (args-..->r) -> -- construction
421 (r->Tuples args) -> -- destruction
422 ReadPerm src err (args-..->k) k ->
423 ReadPerm src err (r->k) k
424 dicurry px constr destr (ReadPerm a ma) =
425 ReadPerm (merge <$> a)
426 (dicurry px constr destr `fmap` ma)
428 merge args2k2k = \r2k ->
429 args2k2k $ mapresultN @args r2k constr
431 (Ord err, ReadConstraints src) =>
432 Composable (ReadPerm src err) where
433 lhs@(ReadPerm da pa) <.> rhs@(ReadPerm db pb) =
437 lhsAlt = (<.> rhs) <$> pa
438 rhsAlt = (lhs <.>) <$> pb
439 a = flip (.) <$> da <*> db
441 (Ord err, ReadConstraints src) =>
442 Tupable (ReadPerm src err) where
443 lhs@(ReadPerm da pa) <:> rhs@(ReadPerm db pb) =
444 ReadPerm a (lhsAlt <|> rhsAlt)
446 lhsAlt = (<:> rhs) <$> pa
447 rhsAlt = (lhs <:>) <$> pb
448 a = consCont (,) <$> da <*> db
449 instance Definable (ReadPerm src err) where
454 concatCont :: [(a->k)->k] -> ([a]->k)->k
455 concatCont = List.foldr (consCont (:)) ($ [])
456 {-# INLINE concatCont #-}
458 consCont :: (a->b->c) -> ((a->k)->k) -> ((b->k)->k) -> (c->k)->k
459 consCont ab2c a2k2k b2k2k = \c2k -> a2k2k $ \a -> b2k2k $ \b -> c2k (ab2c a b)
460 {-# INLINE consCont #-}
462 mapCont :: (a->b) -> ((a->k)->k) -> ((b->k)->k)
463 mapCont a2b a2k2k = \b2k -> a2k2k (b2k . a2b)
464 {-# INLINE mapCont #-}
466 -- | An adaptation of megaparsec's 'pToken',
467 -- to handle 'attribute' properly.
470 ReadConstraints src =>
472 Set.Set (P.ErrorItem (P.Token (ReadStream src))) ->
473 P.ParsecT e (ReadStream src) m (src EscapedAttr)
474 pTokenAttr an ps = P.ParsecT $ \st@(P.State s@(attrs,trees) o pst de) cok _ _ eerr ->
475 case HM.lookup an attrs of
476 Just av -> cok av (P.State (HM.delete an attrs, trees) (o+1) pst de) mempty
477 Nothing -> eerr (P.TrivialError o us ps) st
479 us = case P.take1_ s of
480 Nothing -> pure P.EndOfInput
481 Just (t,_ts) -> (Just . P.Tokens . pure) t
482 {-# INLINE pTokenAttr #-}
484 -- | An adaptation of megaparsec's 'pToken',
485 -- to handle 'attributeMatch' properly.
486 pTokenAttrNameClass ::
488 ReadConstraints src =>
490 Set.Set (P.ErrorItem (P.Token (ReadStream src))) ->
491 P.ParsecT e (ReadStream src) m (QName, src EscapedAttr)
492 pTokenAttrNameClass nc ps = P.ParsecT $ \st@(P.State s@(attrs,trees) o pst de) cok _ _ eerr ->
493 case HM.toList attrs of
494 a@(an,_av):_ | matchNameClass nc an -> cok a (P.State (HM.delete an attrs, trees) (o+1) pst de) mempty
495 _ -> eerr (P.TrivialError o us ps) st
497 us = case P.take1_ s of
498 Nothing -> pure P.EndOfInput
499 Just (t,_ts) -> (Just . P.Tokens . pure) t
500 {-# INLINE pTokenAttrNameClass #-}
502 -- | An adaptation of megaparsec's 'pToken',
503 -- to handle 'pi' since 'NodePI' is ignored by 'P.take1_'.
508 Set.Set (P.ErrorItem (P.Token (ReadStream src))) ->
509 P.ParsecT e (ReadStream src) m TL.Text
510 pTokenPI n ps = P.ParsecT $ \st@(P.State s o pst de) cok _ _ eerr ->
512 Nothing -> eerr (P.TrivialError o us ps) st
513 where us = pure P.EndOfInput
515 | Right (TS.Tree nod _) <- c
516 , NodePI pn pv <- unSource nod
517 , pn == n -> cok pv (P.State cs (o+1) pst de) mempty
518 | otherwise -> eerr (P.TrivialError o us ps) st
521 Nothing -> pure P.EndOfInput
522 Just (t,_ts) -> (Just . P.Tokens . pure) t
524 take1 = take1_ isIgnoredNode
526 isIgnoredNode = \case
527 NodeComment{} -> True
531 HM.HashMap QName (src EscapedAttr) ->
532 HM.HashMap QName (src EscapedAttr)
534 HM.filterWithKey $ \an _av ->
536 QName "" "xmlns" -> False
537 QName ns _l -> ns /= xmlns_xmlns
539 removeSpaces :: UnSource src => Trees src -> Trees src
541 if (`all` xs) $ \case
543 | NodeText (EscapedText et) <- unSource nod ->
545 EscapedPlain t -> TL.all Char.isSpace t
548 then (`Seq.filter` xs) $ \case
550 | NodeText EscapedText{} <- unSource nod -> False
554 -- | @readNested v xs@ returns a 'Read' parsing @xs@ entirely with @v@,
555 -- updating 'P.stateOffset' and re-raising any exception.
558 ReadConstraints src =>
562 readNested (Read p) stateInput = Read $ do
563 st <- P.getParserState
564 (st', res) <- lift $ P.runParserT' (p <* P.eof) st
566 , P.stateOffset = P.stateOffset st
568 P.updateParserState (\s -> s{P.stateOffset = P.stateOffset st'})
571 Left (P.ParseErrorBundle errs _) ->
572 case NonEmpty.head errs of
573 P.TrivialError _o us es -> P.failure us es
574 P.FancyError _o es -> P.fancyFailure es
576 -- * Class 'DecodeText'
577 class DecodeText a where
578 decodeText :: P.Parsec Void TL.Text a
579 instance DecodeText String where
580 decodeText = TL.unpack . fst <$>
581 P.match (P.skipMany P.anySingle)
582 instance DecodeText Text.Text where
583 decodeText = TL.toStrict . fst <$>
584 P.match (P.skipMany P.anySingle)
585 instance DecodeText TL.Text where
587 P.match (P.skipMany P.anySingle)
588 instance DecodeText Bool where
590 False <$ (P.string "false" <|> P.string "0") <|>
591 True <$ (P.string "true" <|> P.string "1")
592 instance DecodeText Integer where
593 decodeText = P.signed (return ()) P.decimal
594 instance DecodeText Natural where
595 decodeText = P.optional (P.char '+') *> P.decimal
597 -- * Megaparsec adaptations
598 -- | Rewrite 'P.parseErrorTextPretty' to keep 'Ord' of 'node's.
599 parseErrorTextPretty ::
600 P.ShowErrorComponent err =>
601 P.ParseError (ReadStream FileSourced) err -> String
602 parseErrorTextPretty (P.TrivialError o us ps) =
603 if isNothing us && Set.null ps
604 then "unknown parse error\n"
607 Just P.Tokens{} -> ""
609 -- FIXME: this is not informative enough,
610 -- but P.EndOfInput can not carry a source location,
611 -- and retraversing the XML tree cannot be done
612 -- exactly as the parser did only knowing the Offset,
613 -- because of attributes being permutable.
614 "node #"<>show o<>"\n"
616 messageItemsPretty "unexpected "
617 (showErrorItem px <$> maybeToList us) <>
618 messageItemsPretty "expecting "
619 (showErrorItem px <$> Set.toAscList ps)
620 where px = Proxy :: Proxy s
621 parseErrorTextPretty err = P.parseErrorTextPretty err
623 messageItemsPretty :: String -> [String] -> String
624 messageItemsPretty prefix ts
626 | otherwise = prefix <> orList ts <> "\n"
628 orList :: IsString s => Monoid s => [s] -> s
631 orList [x,y] = x <> " or " <> y
632 orList xs = mconcat (List.intersperse ", " (List.init xs)) <> ", or " <> List.last xs
635 (s ~ ReadStream (Sourced (FileSource Offset))) =>
636 Proxy s -> P.ErrorItem (P.Token s) -> String
637 showErrorItem px = \case
638 P.Tokens ts -> P.showTokens px ts
639 P.Label label -> NonEmpty.toList label
640 P.EndOfInput -> "end-of-node"