]> Git — Sourcephile - haskell/symantic-xml.git/blob - Language/Symantic/XML/Read/Parser.hs
Upgrade to megaparsec-7
[haskell/symantic-xml.git] / Language / Symantic / XML / Read / Parser.hs
1 {-# LANGUAGE ConstraintKinds #-}
2 {-# LANGUAGE FlexibleContexts #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE StrictData #-}
6 {-# LANGUAGE TypeFamilies #-}
7 module Language.Symantic.XML.Read.Parser where
8
9 import Control.Applicative (Applicative(..), Alternative(..))
10 import Control.Monad (Monad(..))
11 import Data.Bool
12 import Data.Char (Char)
13 import Data.Default.Class (Default(..))
14 import Data.Eq (Eq(..))
15 import Data.Function (($), (.))
16 import Data.Functor ((<$>))
17 import Data.List.NonEmpty (NonEmpty(..))
18 import Data.Maybe (Maybe(..))
19 import Data.Ord (Ord(..))
20 import Data.String (IsString)
21 import Prelude (Integer)
22 import Text.Show (Show(..))
23 import qualified Control.Monad.Trans.Reader as R
24 import qualified Data.HashMap.Strict as HM
25 import qualified Data.Set as Set
26 import qualified Data.Text.Lazy as TL
27 import qualified Text.Megaparsec as P
28 import qualified Text.Megaparsec.Char as P
29
30 import Language.Symantic.XML.Document
31
32 -- * Type 'Parser'
33 -- | Convenient alias.
34 type Parser e s a =
35 Parsable e s a =>
36 R.ReaderT Reader (P.Parsec e s) a
37
38 -- ** Type 'Parsable'
39 type Parsable e s a =
40 ( P.Stream s
41 , P.Token s ~ Char
42 , Ord e
43 , IsString (P.Tokens s)
44 , P.ShowErrorComponent e
45 )
46
47 -- ** Type 'Reader'
48 data Reader = Reader
49 { reader_source :: FileSource Offset
50 , reader_ns_scope :: HM.HashMap NCName Namespace
51 , reader_ns_default :: Namespace
52 } deriving (Show)
53 instance Default Reader where
54 def = Reader
55 { reader_source = pure def
56 , reader_ns_scope = HM.fromList
57 [ ("xml" , xmlns_xml)
58 , ("xmlns", xmlns_xmlns)
59 ]
60 , reader_ns_default = ""
61 }
62
63 p_Offset :: Parser e s Offset
64 p_Offset = Offset <$> P.getOffset
65 {-# INLINE p_Offset #-}
66
67 p_Sourced :: Parser e s a -> Parser e s (Sourced (FileSource Offset) a)
68 p_Sourced pa = do
69 Reader{reader_source} <- R.ask
70 b <- P.getParserState
71 let fileRange_file = P.sourceName $ P.pstateSourcePos $ P.statePosState b
72 let fileRange_begin = Offset $ P.stateOffset b
73 a <- pa
74 e <- P.getParserState
75 let fileRange_end = Offset $ P.stateOffset e
76 return $ Sourced (setSource FileRange{..} reader_source) a
77
78 setSource :: FileRange pos -> FileSource pos -> FileSource pos
79 setSource fileRange (_curr:|next) = fileRange :| next
80
81 -- | Like 'p_Sourced' but uncoupled (through the use of 'p_SourcedEnd') for more flexibility.
82 p_SourcedBegin :: Parser e s a -> Parser e s a
83 p_SourcedBegin pa = do
84 b <- P.getParserState
85 let fileRange_file = P.sourceName $ P.pstateSourcePos $ P.statePosState b
86 let fileRange_begin = Offset $ P.stateOffset b
87 let fileRange_end = fileRange_begin
88 (`R.local` pa) $ \ro@Reader{..} ->
89 ro{ reader_source = setSource FileRange{..} reader_source }
90
91 -- | WARNING: Only to be used within a 'p_SourcedBegin'.
92 p_SourcedEnd :: Parser e s (a -> Sourced (FileSource Offset) a)
93 p_SourcedEnd = do
94 Reader{..} <- R.ask
95 e <- P.getParserState
96 let fileRange_end = Offset $ P.stateOffset e
97 return $ Sourced $
98 (\(curr:|path) -> curr{fileRange_end}:|path)
99 reader_source
100
101 {-
102 -- ** Type 'StreamSourced'
103 -- | Wrap 'TL.Text' to have a 'P.Stream' instance
104 -- whose 'P.advance1' method abuses the tab width state
105 -- to instead pass the line indent.
106 -- This in order to report correct 'P.SourcePos'
107 -- when parsing a 'Sourced' containing newlines.
108 newtype StreamSourced = StreamSourced { unStreamSourced :: TL.Text }
109 deriving (IsString,Eq,Ord)
110 instance P.Stream StreamSourced where
111 type Token StreamSourced = Char
112 type Tokens StreamSourced = TL.Text
113 take1_ (StreamSourced t) = (StreamSourced <$>) <$> P.take1_ t
114 takeN_ n (StreamSourced t) = (StreamSourced <$>) <$> P.takeN_ n t
115 takeWhile_ f (StreamSourced t) = StreamSourced <$> P.takeWhile_ f t
116 tokensToChunk _s = P.tokensToChunk (Proxy::Proxy TL.Text)
117 chunkToTokens _s = P.chunkToTokens (Proxy::Proxy TL.Text)
118 chunkLength _s = P.chunkLength (Proxy::Proxy TL.Text)
119 {-
120 advance1 _s indent (P.SourcePos n line col) c =
121 case c of
122 '\n' -> P.SourcePos n (line <> P.pos1) indent
123 _ -> P.SourcePos n line (col <> P.pos1)
124 advanceN s indent = TL.foldl' (P.advance1 s indent)
125 -}
126
127 -- | Wrapper around |P.runParser'|
128 -- to use given 'Sourced' as starting position.
129 runParserOnSourced ::
130 Parsable e StreamSourced a =>
131 Parser e StreamSourced a ->
132 Sourced FileSource TL.Text ->
133 Either (P.ParseError (P.Token StreamSourced) e) a
134 runParserOnSourced p (Sourced (FileRange inp bp _ep :| path) s) =
135 snd $
136 P.runParser' (R.runReaderT p ro <* P.eof)
137 P.State
138 { P.stateInput = StreamSourced s
139 , P.statePos = pure $ P.SourcePos inp (P.mkPos $ filePos_line bp) indent
140 , P.stateTabWidth = indent
141 , P.stateTokensProcessed = 0
142 }
143 where
144 indent = P.mkPos $ filePos_column bp
145 ro = def{ reader_source = fromMaybe (pure def) $ nonEmpty path }
146 -}
147
148 -- * Type 'Error'
149 data Error
150 = Error_CharRef_invalid Integer
151 -- ^ Well-formedness constraint: Legal Character.
152 --
153 -- Characters referred to using character references MUST match the production for Char.
154 | Error_EntityRef_unknown NCName
155 -- ^ Well-formedness constraint: Entity Declared
156 --
157 -- In a document without any DTD, a document with only an internal DTD
158 -- subset which contains no parameter entity references, or a document
159 -- with " standalone='yes' ", for an entity reference that does not occur
160 -- within the external subset or a parameter entity, the Name given in the
161 -- entity reference MUST match that in an entity declaration that does not
162 -- occur within the external subset or a parameter entity, except that
163 -- well-formed documents need not declare any of the following entities:
164 -- amp, lt, gt, apos, quot. The declaration of a general entity MUST
165 -- precede any reference to it which appears in a default value in an
166 -- attribute-list declaration.
167 --
168 -- Note that non-validating processors are not obligated to read and
169 -- process entity declarations occurring in parameter entities or in the
170 -- external subset; for such documents, the rule that an entity must be
171 -- declared is a well-formedness constraint only if standalone='yes'.
172 | Error_Closing_tag_unexpected QName QName
173 -- ^ Well-formedness constraint: Element Type Match.
174 --
175 -- The Name in an element's end-tag MUST match the element type in the start-tag.
176 | Error_Attribute_collision QName
177 -- ^ Well-formedness constraint: Unique Att Spec.
178 --
179 -- An attribute name MUST NOT appear more than once in the same start-tag or empty-element tag.
180 | Error_PI_reserved PName
181 -- ^ The target names " XML ", " xml ", and so on are reserved for standardization.
182 | Error_Namespace_prefix_unknown NCName
183 -- ^ Namespace constraint: Prefix Declared
184 --
185 -- The namespace prefix, unless it is xml or xmlns, MUST have been declared in a namespace declaration attribute in either the start-tag of the element where the prefix is used or in an ancestor element (i.e., an element in whose content the prefixed markup occurs).
186 | Error_Namespace_empty NCName
187 -- ^ Namespace constraint: No Prefix Undeclaring
188 --
189 -- In a namespace declaration for a prefix (i.e., where the NSAttName is a PrefixedAttName), the attribute value MUST NOT be empty.
190 | Error_Namespace_reserved Namespace
191 | Error_Namespace_reserved_prefix NCName
192 -- ^ Namespace constraint: Reserved Prefixes and Namespace Names
193 --
194 -- The prefix xml is by definition bound to the namespace name
195 -- http://www.w3.org/XML/1998/namespace. It MAY, but need not, be
196 -- declared, and MUST NOT be bound to any other namespace name. Other
197 -- prefixes MUST NOT be bound to this namespace name, and it MUST NOT be
198 -- declared as the default namespace.
199 --
200 -- The prefix xmlns is used only to declare namespace bindings and is by
201 -- definition bound to the namespace name http://www.w3.org/2000/xmlns/.
202 -- It MUST NOT be declared . Other prefixes MUST NOT be bound to this
203 -- namespace name, and it MUST NOT be declared as the default namespace.
204 -- Element names MUST NOT have the prefix xmlns.
205 --
206 -- All other prefixes beginning with the three-letter sequence x, m, l, in
207 -- any case combination, are reserved. This means that:
208 --
209 -- - users SHOULD NOT use them except as defined by later specifications
210 -- - processors MUST NOT treat them as fatal errors.
211 deriving (Eq,Ord,Show)
212 instance P.ShowErrorComponent Error where
213 showErrorComponent = show
214
215 -- * Helpers
216 p_error :: e -> Parser e s a
217 p_error = P.fancyFailure . Set.singleton . P.ErrorCustom
218
219 p_quoted :: P.Tokens s ~ TL.Text => (Char -> Parser e s a) -> Parser e s a
220 p_quoted p =
221 P.between (P.char '"') (P.char '"') (p '"') <|>
222 P.between (P.char '\'') (P.char '\'') (p '\'')
223
224 p_until :: P.Tokens s ~ TL.Text => (Char -> Bool) -> (Char, TL.Text) -> Parser e s TL.Text
225 p_until content (end, end_) =
226 (TL.concat <$>) $ P.many $
227 P.takeWhile1P Nothing (\c -> content c && c /= end) <|>
228 P.try (TL.singleton <$> P.char end <* P.notFollowedBy (P.string end_))
229
230 p_until1 :: P.Tokens s ~ TL.Text => (Char -> Bool) -> (Char, TL.Text) -> Parser e s TL.Text
231 p_until1 content (end, end_) =
232 (TL.concat <$>) $ P.some $
233 P.takeWhile1P Nothing (\c -> content c && c /= end) <|>
234 P.try (TL.singleton <$> P.char end <* P.notFollowedBy (P.string end_))