]> Git — Sourcephile - haskell/symantic-xml.git/blob - Language/Symantic/XML/Read/Parser.hs
Add more XML test files.
[haskell/symantic-xml.git] / Language / Symantic / XML / Read / Parser.hs
1 {-# LANGUAGE ConstraintKinds #-}
2 {-# LANGUAGE FlexibleContexts #-}
3 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
4 {-# LANGUAGE OverloadedStrings #-}
5 {-# LANGUAGE Rank2Types #-}
6 {-# LANGUAGE StrictData #-}
7 {-# LANGUAGE TypeFamilies #-}
8 module Language.Symantic.XML.Read.Parser where
9
10 import Control.Applicative (Applicative(..), Alternative(..))
11 import Control.Monad (Monad(..))
12 import Data.Bool
13 import Data.Char (Char)
14 import Data.Default.Class (Default(..))
15 import Data.Either (Either(..))
16 import Data.Eq (Eq(..))
17 import Data.Function (($), (.))
18 import Data.Functor ((<$>))
19 import Data.Functor.Identity (Identity(..))
20 import Data.List.NonEmpty (NonEmpty(..), nonEmpty)
21 import Data.Maybe (Maybe(..), fromMaybe)
22 import Data.Ord (Ord(..))
23 import Data.Proxy (Proxy(..))
24 import Data.Semigroup (Semigroup(..))
25 import Data.String (IsString)
26 import Data.Tuple (snd)
27 import Prelude (Integer)
28 import Text.Show (Show(..))
29 import qualified Control.Monad.Trans.Reader as R
30 import qualified Data.HashMap.Strict as HM
31 import qualified Data.Set as Set
32 import qualified Data.Text.Lazy as TL
33 import qualified Text.Megaparsec as P
34 import qualified Text.Megaparsec.Char as P
35
36 import Language.Symantic.XML.Document
37
38 -- * Type 'Parser'
39 -- | Convenient alias.
40 type Parser e s a =
41 Parsable e s a =>
42 R.ReaderT Reader (P.ParsecT e s Identity) a
43
44 -- ** Type 'Parsable'
45 type Parsable e s a =
46 ( P.Stream s
47 , P.Token s ~ Char
48 , Ord e
49 , IsString (P.Tokens s)
50 , P.ShowErrorComponent e
51 )
52
53 -- ** Type 'Reader'
54 data Reader = Reader
55 { reader_source :: FileSource
56 , reader_ns_scope :: HM.HashMap NCName Namespace
57 , reader_ns_default :: Namespace
58 } deriving (Show)
59 instance Default Reader where
60 def = Reader
61 { reader_source = pure def
62 , reader_ns_scope = HM.fromList
63 [ ("xml" , xmlns_xml)
64 , ("xmlns", xmlns_xmlns)
65 ]
66 , reader_ns_default = ""
67 }
68
69 p_FilePos :: Parser e s FilePos
70 p_FilePos = do
71 pos :| _ <- P.statePos <$> P.getParserState
72 return $ FilePos (P.unPos $ P.sourceLine pos) (P.unPos $ P.sourceColumn pos)
73
74 p_Sourced :: Parser e s a -> Parser e s (Sourced FileSource a)
75 p_Sourced pa = do
76 Reader{reader_source} <- R.ask
77 beginPos :| _ <- P.statePos <$> P.getParserState
78 a <- pa
79 fileRange_end <- p_FilePos
80 let fileRange = FileRange
81 { fileRange_file = P.sourceName beginPos
82 , fileRange_begin =
83 FilePos
84 (P.unPos $ P.sourceLine beginPos)
85 (P.unPos $ P.sourceColumn beginPos)
86 , fileRange_end
87 }
88 return $ Sourced (setSource fileRange reader_source) a
89
90 setSource :: FileRange -> FileSource -> FileSource
91 setSource fileRange (_curr:|next) = fileRange :| next
92
93 -- | Like 'p_Sourced' but uncoupled for more flexibility.
94 p_SourcedBegin :: Parser e s a -> Parser e s a
95 p_SourcedBegin pa = do
96 currPos :| _ <- P.statePos <$> P.getParserState
97 let fileRange_begin = FilePos
98 (P.unPos $ P.sourceLine currPos)
99 (P.unPos $ P.sourceColumn currPos)
100 let fileRange = FileRange
101 { fileRange_file = P.sourceName currPos
102 , fileRange_begin
103 , fileRange_end = fileRange_begin
104 }
105 (`R.local` pa) $ \ro@Reader{..} ->
106 ro{ reader_source = setSource fileRange reader_source }
107
108 -- | Only to be used within a 'p_SourcedBegin'.
109 p_SourcedEnd :: Parser e s (a -> Sourced FileSource a)
110 p_SourcedEnd = do
111 fileRange_end <- p_FilePos
112 Reader{..} <- R.ask
113 return $ Sourced $
114 (\(curr:|path) -> curr{fileRange_end}:|path)
115 reader_source
116
117 -- ** Type 'StreamSourced'
118 -- | Wrap 'TL.Text' to have a 'P.Stream' instance
119 -- whose 'P.advance1' method abuses the tab width state
120 -- to instead pass the line indent.
121 -- This in order to report correct 'P.SourcePos'
122 -- when parsing a 'Sourced' containing newlines.
123 newtype StreamSourced = StreamSourced { unStreamSourced :: TL.Text }
124 deriving (IsString,Eq,Ord)
125 instance P.Stream StreamSourced where
126 type Token StreamSourced = Char
127 type Tokens StreamSourced = TL.Text
128 take1_ (StreamSourced t) = (StreamSourced <$>) <$> P.take1_ t
129 takeN_ n (StreamSourced t) = (StreamSourced <$>) <$> P.takeN_ n t
130 takeWhile_ f (StreamSourced t) = StreamSourced <$> P.takeWhile_ f t
131 tokensToChunk _s = P.tokensToChunk (Proxy::Proxy TL.Text)
132 chunkToTokens _s = P.chunkToTokens (Proxy::Proxy TL.Text)
133 chunkLength _s = P.chunkLength (Proxy::Proxy TL.Text)
134 advance1 _s indent (P.SourcePos n line col) c =
135 case c of
136 '\n' -> P.SourcePos n (line <> P.pos1) indent
137 _ -> P.SourcePos n line (col <> P.pos1)
138 advanceN s indent = TL.foldl' (P.advance1 s indent)
139
140 -- | Wrapper around |P.runParser'|
141 -- to use given 'Sourced' as starting position.
142 runParserOnSourced ::
143 Parsable e StreamSourced a =>
144 Parser e StreamSourced a ->
145 Sourced FileSource TL.Text ->
146 Either (P.ParseError (P.Token StreamSourced) e) a
147 runParserOnSourced p (Sourced (FileRange inp bp _ep :| path) s) =
148 snd $
149 P.runParser' (R.runReaderT p ro <* P.eof)
150 P.State
151 { P.stateInput = StreamSourced s
152 , P.statePos = pure $ P.SourcePos inp (P.mkPos $ filePos_line bp) indent
153 , P.stateTabWidth = indent
154 , P.stateTokensProcessed = 0
155 }
156 where
157 indent = P.mkPos $ filePos_column bp
158 ro = def{ reader_source = fromMaybe (pure def) $ nonEmpty path }
159
160
161 -- * Type 'Error'
162 data Error
163 = Error_CharRef_invalid Integer
164 -- ^ Well-formedness constraint: Legal Character.
165 --
166 -- Characters referred to using character references MUST match the production for Char.
167 | Error_EntityRef_unknown NCName
168 -- ^ Well-formedness constraint: Entity Declared
169 --
170 -- In a document without any DTD, a document with only an internal DTD
171 -- subset which contains no parameter entity references, or a document
172 -- with " standalone='yes' ", for an entity reference that does not occur
173 -- within the external subset or a parameter entity, the Name given in the
174 -- entity reference MUST match that in an entity declaration that does not
175 -- occur within the external subset or a parameter entity, except that
176 -- well-formed documents need not declare any of the following entities:
177 -- amp, lt, gt, apos, quot. The declaration of a general entity MUST
178 -- precede any reference to it which appears in a default value in an
179 -- attribute-list declaration.
180 --
181 -- Note that non-validating processors are not obligated to read and
182 -- process entity declarations occurring in parameter entities or in the
183 -- external subset; for such documents, the rule that an entity must be
184 -- declared is a well-formedness constraint only if standalone='yes'.
185 | Error_Closing_tag_unexpected QName QName
186 -- ^ Well-formedness constraint: Element Type Match.
187 --
188 -- The Name in an element's end-tag MUST match the element type in the start-tag.
189 | Error_Attribute_collision QName
190 -- ^ Well-formedness constraint: Unique Att Spec.
191 --
192 -- An attribute name MUST NOT appear more than once in the same start-tag or empty-element tag.
193 | Error_PI_reserved PName
194 -- ^ The target names " XML ", " xml ", and so on are reserved for standardization.
195 | Error_Namespace_prefix_unknown NCName
196 -- ^ Namespace constraint: Prefix Declared
197 --
198 -- The namespace prefix, unless it is xml or xmlns, MUST have been declared in a namespace declaration attribute in either the start-tag of the element where the prefix is used or in an ancestor element (i.e., an element in whose content the prefixed markup occurs).
199 | Error_Namespace_empty NCName
200 -- ^ Namespace constraint: No Prefix Undeclaring
201 --
202 -- In a namespace declaration for a prefix (i.e., where the NSAttName is a PrefixedAttName), the attribute value MUST NOT be empty.
203 | Error_Namespace_reserved Namespace
204 | Error_Namespace_reserved_prefix NCName
205 -- ^ Namespace constraint: Reserved Prefixes and Namespace Names
206 --
207 -- The prefix xml is by definition bound to the namespace name
208 -- http://www.w3.org/XML/1998/namespace. It MAY, but need not, be
209 -- declared, and MUST NOT be bound to any other namespace name. Other
210 -- prefixes MUST NOT be bound to this namespace name, and it MUST NOT be
211 -- declared as the default namespace.
212 --
213 -- The prefix xmlns is used only to declare namespace bindings and is by
214 -- definition bound to the namespace name http://www.w3.org/2000/xmlns/.
215 -- It MUST NOT be declared . Other prefixes MUST NOT be bound to this
216 -- namespace name, and it MUST NOT be declared as the default namespace.
217 -- Element names MUST NOT have the prefix xmlns.
218 --
219 -- All other prefixes beginning with the three-letter sequence x, m, l, in
220 -- any case combination, are reserved. This means that:
221 --
222 -- - users SHOULD NOT use them except as defined by later specifications
223 -- - processors MUST NOT treat them as fatal errors.
224 deriving (Eq,Ord,Show)
225 instance P.ShowErrorComponent Error where
226 showErrorComponent = show
227
228 -- * Helpers
229 p_error :: e -> Parser e s a
230 p_error = P.fancyFailure . Set.singleton . P.ErrorCustom
231
232 p_quoted :: P.Tokens s ~ TL.Text => (Char -> Parser e s a) -> Parser e s a
233 p_quoted p =
234 P.between (P.char '"') (P.char '"') (p '"') <|>
235 P.between (P.char '\'') (P.char '\'') (p '\'')
236
237 p_until :: P.Tokens s ~ TL.Text => (Char -> Bool) -> (Char, TL.Text) -> Parser e s TL.Text
238 p_until content (end, end_) =
239 (TL.concat <$>) $ P.many $
240 P.takeWhile1P Nothing (\c -> content c && c /= end) <|>
241 P.try (TL.singleton <$> P.char end <* P.notFollowedBy (P.string end_))
242
243 p_until1 :: P.Tokens s ~ TL.Text => (Char -> Bool) -> (Char, TL.Text) -> Parser e s TL.Text
244 p_until1 content (end, end_) =
245 (TL.concat <$>) $ P.some $
246 P.takeWhile1P Nothing (\c -> content c && c /= end) <|>
247 P.try (TL.singleton <$> P.char end <* P.notFollowedBy (P.string end_))