]> Git — Sourcephile - haskell/symantic-xml.git/blob - Language/Symantic/XML/Read/Parser.hs
RNC: add some tests
[haskell/symantic-xml.git] / Language / Symantic / XML / Read / Parser.hs
1 {-# LANGUAGE ConstraintKinds #-}
2 {-# LANGUAGE FlexibleContexts #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE StrictData #-}
6 {-# LANGUAGE TypeFamilies #-}
7 module Language.Symantic.XML.Read.Parser where
8
9 import Control.Applicative (Applicative(..), Alternative(..))
10 import Control.Monad (Monad(..))
11 import Data.Bool
12 import Data.Char (Char)
13 import Data.Default.Class (Default(..))
14 import Data.Eq (Eq(..))
15 import Data.Function (($), (.))
16 import Data.Functor ((<$>))
17 import Data.List.NonEmpty (NonEmpty(..))
18 import Data.Maybe (Maybe(..))
19 import Data.Ord (Ord(..))
20 import Data.String (IsString)
21 import Prelude (Integer)
22 import Text.Show (Show(..))
23 import qualified Control.Monad.Trans.Reader as R
24 import qualified Data.HashMap.Strict as HM
25 import qualified Data.Set as Set
26 import qualified Data.Text.Lazy as TL
27 import qualified Text.Megaparsec as P
28 import qualified Text.Megaparsec.Char as P
29
30 import Language.Symantic.XML.Document hiding (XML, XMLs)
31 import qualified Language.Symantic.XML.Document as XML
32
33 -- | Specify |XML.XML|'s 'src' type parameter for parsing.
34 type XML = XML.XML (FileSource Offset)
35 -- | Specify |XML.XMLs|'s 'src' type parameter for parsing.
36 type XMLs = XML.XMLs (FileSource Offset)
37
38 -- * Type 'Parser'
39 -- | Convenient alias.
40 type Parser e s a =
41 Parsable e s a =>
42 R.ReaderT Reader (P.Parsec e s) a
43
44 -- ** Type 'Parsable'
45 type Parsable e s a =
46 ( P.Stream s
47 , P.Token s ~ Char
48 , Ord e
49 , IsString (P.Tokens s)
50 , P.ShowErrorComponent e
51 )
52
53 -- ** Type 'Reader'
54 data Reader = Reader
55 { reader_source :: FileSource Offset
56 , reader_ns_scope :: HM.HashMap NCName Namespace
57 , reader_ns_default :: Namespace
58 } deriving (Show)
59 instance Default Reader where
60 def = Reader
61 { reader_source = pure def
62 , reader_ns_scope = HM.fromList
63 [ ("xml" , xmlns_xml)
64 , ("xmlns", xmlns_xmlns)
65 ]
66 , reader_ns_default = ""
67 }
68
69 p_Offset :: Parser e s Offset
70 p_Offset = Offset <$> P.getOffset
71 {-# INLINE p_Offset #-}
72
73 p_Sourced :: Parser e s a -> Parser e s (Sourced (FileSource Offset) a)
74 p_Sourced pa = do
75 Reader{reader_source} <- R.ask
76 b <- P.getParserState
77 let fileRange_file = P.sourceName $ P.pstateSourcePos $ P.statePosState b
78 let fileRange_begin = Offset $ P.stateOffset b
79 a <- pa
80 e <- P.getParserState
81 let fileRange_end = Offset $ P.stateOffset e
82 return $ Sourced (setSource FileRange{..} reader_source) a
83
84 setSource :: FileRange pos -> FileSource pos -> FileSource pos
85 setSource fileRange (_curr:|next) = fileRange :| next
86
87 -- | Like 'p_Sourced' but uncoupled (through the use of 'p_SourcedEnd') for more flexibility.
88 p_SourcedBegin :: Parser e s a -> Parser e s a
89 p_SourcedBegin pa = do
90 b <- P.getParserState
91 let fileRange_file = P.sourceName $ P.pstateSourcePos $ P.statePosState b
92 let fileRange_begin = Offset $ P.stateOffset b
93 let fileRange_end = fileRange_begin
94 (`R.local` pa) $ \ro@Reader{..} ->
95 ro{ reader_source = setSource FileRange{..} reader_source }
96
97 -- | WARNING: Only to be used within a 'p_SourcedBegin'.
98 p_SourcedEnd :: Parser e s (a -> Sourced (FileSource Offset) a)
99 p_SourcedEnd = do
100 Reader{..} <- R.ask
101 e <- P.getParserState
102 let fileRange_end = Offset $ P.stateOffset e
103 return $ Sourced $
104 (\(curr:|path) -> curr{fileRange_end}:|path)
105 reader_source
106
107 {-
108 -- ** Type 'StreamSourced'
109 -- | Wrap 'TL.Text' to have a 'P.Stream' instance
110 -- whose 'P.advance1' method abuses the tab width state
111 -- to instead pass the line indent.
112 -- This in order to report correct 'P.SourcePos'
113 -- when parsing a 'Sourced' containing newlines.
114 newtype StreamSourced = StreamSourced { unStreamSourced :: TL.Text }
115 deriving (IsString,Eq,Ord)
116 instance P.Stream StreamSourced where
117 type Token StreamSourced = Char
118 type Tokens StreamSourced = TL.Text
119 take1_ (StreamSourced t) = (StreamSourced <$>) <$> P.take1_ t
120 takeN_ n (StreamSourced t) = (StreamSourced <$>) <$> P.takeN_ n t
121 takeWhile_ f (StreamSourced t) = StreamSourced <$> P.takeWhile_ f t
122 tokensToChunk _s = P.tokensToChunk (Proxy::Proxy TL.Text)
123 chunkToTokens _s = P.chunkToTokens (Proxy::Proxy TL.Text)
124 chunkLength _s = P.chunkLength (Proxy::Proxy TL.Text)
125 {-
126 advance1 _s indent (P.SourcePos n line col) c =
127 case c of
128 '\n' -> P.SourcePos n (line <> P.pos1) indent
129 _ -> P.SourcePos n line (col <> P.pos1)
130 advanceN s indent = TL.foldl' (P.advance1 s indent)
131 -}
132
133 -- | Wrapper around |P.runParser'|
134 -- to use given 'Sourced' as starting position.
135 runParserOnSourced ::
136 Parsable e StreamSourced a =>
137 Parser e StreamSourced a ->
138 Sourced FileSource TL.Text ->
139 Either (P.ParseError (P.Token StreamSourced) e) a
140 runParserOnSourced p (Sourced (FileRange inp bp _ep :| path) s) =
141 snd $
142 P.runParser' (R.runReaderT p ro <* P.eof)
143 P.State
144 { P.stateInput = StreamSourced s
145 , P.statePos = pure $ P.SourcePos inp (P.mkPos $ filePos_line bp) indent
146 , P.stateTabWidth = indent
147 , P.stateTokensProcessed = 0
148 }
149 where
150 indent = P.mkPos $ filePos_column bp
151 ro = def{ reader_source = fromMaybe (pure def) $ nonEmpty path }
152 -}
153
154 -- * Type 'Error'
155 data Error
156 = Error_CharRef_invalid Integer
157 -- ^ Well-formedness constraint: Legal Character.
158 --
159 -- Characters referred to using character references MUST match the production for Char.
160 | Error_EntityRef_unknown NCName
161 -- ^ Well-formedness constraint: Entity Declared
162 --
163 -- In a document without any DTD, a document with only an internal DTD
164 -- subset which contains no parameter entity references, or a document
165 -- with " standalone='yes' ", for an entity reference that does not occur
166 -- within the external subset or a parameter entity, the Name given in the
167 -- entity reference MUST match that in an entity declaration that does not
168 -- occur within the external subset or a parameter entity, except that
169 -- well-formed documents need not declare any of the following entities:
170 -- amp, lt, gt, apos, quot. The declaration of a general entity MUST
171 -- precede any reference to it which appears in a default value in an
172 -- attribute-list declaration.
173 --
174 -- Note that non-validating processors are not obligated to read and
175 -- process entity declarations occurring in parameter entities or in the
176 -- external subset; for such documents, the rule that an entity must be
177 -- declared is a well-formedness constraint only if standalone='yes'.
178 | Error_Closing_tag_unexpected QName QName
179 -- ^ Well-formedness constraint: Element Type Match.
180 --
181 -- The Name in an element's end-tag MUST match the element type in the start-tag.
182 | Error_Attribute_collision QName
183 -- ^ Well-formedness constraint: Unique Att Spec.
184 --
185 -- An attribute name MUST NOT appear more than once in the same start-tag or empty-element tag.
186 | Error_PI_reserved PName
187 -- ^ The target names " XML ", " xml ", and so on are reserved for standardization.
188 | Error_Namespace_prefix_unknown NCName
189 -- ^ Namespace constraint: Prefix Declared
190 --
191 -- The namespace prefix, unless it is xml or xmlns, MUST have been declared in a namespace declaration attribute in either the start-tag of the element where the prefix is used or in an ancestor element (i.e., an element in whose content the prefixed markup occurs).
192 | Error_Namespace_empty NCName
193 -- ^ Namespace constraint: No Prefix Undeclaring
194 --
195 -- In a namespace declaration for a prefix (i.e., where the NSAttName is a PrefixedAttName), the attribute value MUST NOT be empty.
196 | Error_Namespace_reserved Namespace
197 | Error_Namespace_reserved_prefix NCName
198 -- ^ Namespace constraint: Reserved Prefixes and Namespace Names
199 --
200 -- The prefix xml is by definition bound to the namespace name
201 -- http://www.w3.org/XML/1998/namespace. It MAY, but need not, be
202 -- declared, and MUST NOT be bound to any other namespace name. Other
203 -- prefixes MUST NOT be bound to this namespace name, and it MUST NOT be
204 -- declared as the default namespace.
205 --
206 -- The prefix xmlns is used only to declare namespace bindings and is by
207 -- definition bound to the namespace name http://www.w3.org/2000/xmlns/.
208 -- It MUST NOT be declared . Other prefixes MUST NOT be bound to this
209 -- namespace name, and it MUST NOT be declared as the default namespace.
210 -- Element names MUST NOT have the prefix xmlns.
211 --
212 -- All other prefixes beginning with the three-letter sequence x, m, l, in
213 -- any case combination, are reserved. This means that:
214 --
215 -- - users SHOULD NOT use them except as defined by later specifications
216 -- - processors MUST NOT treat them as fatal errors.
217 deriving (Eq,Ord,Show)
218 instance P.ShowErrorComponent Error where
219 showErrorComponent = show
220
221 -- * Helpers
222 p_error :: e -> Parser e s a
223 p_error = P.fancyFailure . Set.singleton . P.ErrorCustom
224
225 p_quoted :: P.Tokens s ~ TL.Text => (Char -> Parser e s a) -> Parser e s a
226 p_quoted p =
227 P.between (P.char '"') (P.char '"') (p '"') <|>
228 P.between (P.char '\'') (P.char '\'') (p '\'')
229
230 p_until :: P.Tokens s ~ TL.Text => (Char -> Bool) -> (Char, TL.Text) -> Parser e s TL.Text
231 p_until content (end, end_) =
232 (TL.concat <$>) $ P.many $
233 P.takeWhile1P Nothing (\c -> content c && c /= end) <|>
234 P.try (TL.singleton <$> P.char end <* P.notFollowedBy (P.string end_))
235
236 p_until1 :: P.Tokens s ~ TL.Text => (Char -> Bool) -> (Char, TL.Text) -> Parser e s TL.Text
237 p_until1 content (end, end_) =
238 (TL.concat <$>) $ P.some $
239 P.takeWhile1P Nothing (\c -> content c && c /= end) <|>
240 P.try (TL.singleton <$> P.char end <* P.notFollowedBy (P.string end_))