]> Git — Sourcephile - haskell/symantic-xml.git/blob - Language/Symantic/XML/Read/Parser.hs
init
[haskell/symantic-xml.git] / Language / Symantic / XML / Read / Parser.hs
1 {-# LANGUAGE ConstraintKinds #-}
2 {-# LANGUAGE DeriveFunctor #-}
3 {-# LANGUAGE FlexibleContexts #-}
4 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
5 {-# LANGUAGE OverloadedStrings #-}
6 {-# LANGUAGE Rank2Types #-}
7 {-# LANGUAGE TypeFamilies #-}
8 module Language.Symantic.XML.Read.Parser where
9
10 import Control.Applicative (Applicative(..), Alternative(..))
11 import Control.Monad (Monad(..))
12 import Data.Bool
13 import Data.Char (Char)
14 import Data.Default.Class (Default(..))
15 import Data.Either (Either(..))
16 import Data.Eq (Eq(..))
17 import Data.Function (($), (.))
18 import Data.Functor (Functor, (<$>))
19 import Data.Functor.Identity (Identity(..))
20 import Data.List.NonEmpty (NonEmpty(..), nonEmpty)
21 import Data.Maybe (Maybe(..), fromMaybe)
22 import Data.Ord (Ord(..))
23 import Data.Proxy (Proxy(..))
24 import Data.Semigroup (Semigroup(..))
25 import Data.Sequence (Seq)
26 import Data.String (IsString)
27 import Data.Tuple (snd)
28 import Prelude (Int, Integer, Num(..), fromIntegral)
29 import System.FilePath (FilePath)
30 import Text.Show (Show(..), showChar, showString, showParen)
31 import qualified Control.Monad.Trans.Reader as R
32 import qualified Data.HashMap.Strict as HM
33 import qualified Data.Set as Set
34 import qualified Data.Text as Text
35 import qualified Data.Text.Lazy as TL
36 import qualified Data.TreeSeq.Strict as TS
37 import qualified Text.Megaparsec as P
38 import qualified Text.Megaparsec.Char as P
39
40 import Language.Symantic.XML.Document
41
42 -- * Type 'XML'
43 type XML = TS.Tree (Sourced FileSource Node)
44 type XMLs = Seq XML
45
46 -- * Type 'Parser'
47 -- | Convenient alias.
48 type Parser e s a =
49 Parsable e s a =>
50 R.ReaderT Reader (P.ParsecT e s Identity) a
51
52 -- ** Type 'Parsable'
53 type Parsable e s a =
54 ( P.Stream s
55 , P.Token s ~ Char
56 , Ord e
57 , IsString (P.Tokens s)
58 , P.ShowErrorComponent e
59 )
60
61 -- ** Type 'Reader'
62 data Reader = Reader
63 { reader_source :: !FileSource
64 , reader_ns_scope :: !(HM.HashMap NCName Namespace)
65 , reader_ns_default :: !Namespace
66 } deriving (Show)
67 instance Default Reader where
68 def = Reader
69 { reader_source = pure def
70 , reader_ns_scope = HM.fromList
71 [ ("xml" , xmlns_xml)
72 , ("xmlns", xmlns_xmlns)
73 ]
74 , reader_ns_default = ""
75 }
76
77 -- * Type 'Sourced'
78 data Sourced src a
79 = Sourced
80 { source :: !src
81 , unSourced :: !a
82 } deriving (Eq, Ord, Functor)
83 instance (Show src, Show a) => Show (Sourced src a) where
84 showsPrec p Sourced{..} =
85 showParen (p > 10) $
86 showsPrec 11 unSourced .
87 showString " @" . showsPrec 10 source
88 instance (FromPad a, Semigroup a) => Semigroup (Sourced FileSource a) where
89 Sourced (FileRange fx bx ex :| lx) x <> Sourced (FileRange _fy by ey :| _ly) y =
90 Sourced (FileRange fx bx ey :| lx) $
91 x<>fromPad (FilePos lines columns)<>y
92 where
93 lines = filePos_line by - filePos_line ex
94 columns = filePos_column by - filePos_column (if lines <= 0 then ex else bx)
95
96 p_Sourced :: Parser e s a -> Parser e s (Sourced FileSource a)
97 p_Sourced pa = do
98 Reader{reader_source} <- R.ask
99 beginPos :| _ <- P.statePos <$> P.getParserState
100 a <- pa
101 fileRange_end <- p_FilePos
102 let fileRange = FileRange
103 { fileRange_file = P.sourceName beginPos
104 , fileRange_begin =
105 FilePos
106 (P.unPos $ P.sourceLine beginPos)
107 (P.unPos $ P.sourceColumn beginPos)
108 , fileRange_end
109 }
110 return $ Sourced (setSource fileRange reader_source) a
111
112 setSource :: FileRange -> FileSource -> FileSource
113 setSource fileRange (_curr:|next) = fileRange :| next
114
115 -- | Like 'p_Sourced' but uncoupled for more flexibility.
116 p_SourcedBegin :: Parser e s a -> Parser e s a
117 p_SourcedBegin pa = do
118 currPos :| _ <- P.statePos <$> P.getParserState
119 let fileRange_begin = FilePos
120 (P.unPos $ P.sourceLine currPos)
121 (P.unPos $ P.sourceColumn currPos)
122 let fileRange = FileRange
123 { fileRange_file = P.sourceName currPos
124 , fileRange_begin
125 , fileRange_end = fileRange_begin
126 }
127 (`R.local` pa) $ \ro@Reader{..} ->
128 ro{ reader_source = setSource fileRange reader_source }
129
130 -- | Only to be used within a 'p_SourcedBegin'.
131 p_SourcedEnd :: Parser e s (a -> Sourced FileSource a)
132 p_SourcedEnd = do
133 fileRange_end <- p_FilePos
134 Reader{..} <- R.ask
135 return $ Sourced $
136 (\(curr:|path) -> curr{fileRange_end}:|path)
137 reader_source
138
139 -- | Wrapper around |P.runParser'|
140 -- to use given 'Sourced' as starting position.
141 runParserOnSourced ::
142 Parsable e StreamSourced a =>
143 Parser e StreamSourced a ->
144 Sourced FileSource TL.Text ->
145 Either (P.ParseError (P.Token StreamSourced) e) a
146 runParserOnSourced p (Sourced (FileRange inp bp _ep :| path) s) =
147 snd $
148 P.runParser' (R.runReaderT p ro <* P.eof)
149 P.State
150 { P.stateInput = StreamSourced s
151 , P.statePos = pure $ P.SourcePos inp (P.mkPos $ filePos_line bp) indent
152 , P.stateTabWidth = indent
153 , P.stateTokensProcessed = 0
154 }
155 where
156 indent = P.mkPos $ filePos_column bp
157 ro = def{ reader_source = fromMaybe (pure def) $ nonEmpty path }
158
159 -- ** Class 'NoSource'
160 class NoSource src where
161 noSource :: src
162 instance NoSource FileSource where
163 noSource = noSource :| []
164 instance NoSource FileRange where
165 noSource = FileRange "" filePos1 filePos1
166 {-
167 instance (FromPad a, Semigroup a, Monoid a) => Monoid (Sourced a) where
168 mempty = sourced0 mempty
169 mappend = (<>)
170 -}
171
172 notSourced :: NoSource src => a -> Sourced src a
173 notSourced = Sourced noSource
174
175 -- ** Type 'StreamSourced'
176 -- | Wrap 'TL.Text' to have a 'P.Stream' instance
177 -- whose 'P.advance1' method abuses the tab width state
178 -- to instead pass the line indent.
179 -- This in order to report correct 'P.SourcePos'
180 -- when parsing a 'Sourced' containing newlines.
181 newtype StreamSourced = StreamSourced { unStreamSourced :: TL.Text }
182 deriving (IsString,Eq,Ord)
183 instance P.Stream StreamSourced where
184 type Token StreamSourced = Char
185 type Tokens StreamSourced = TL.Text
186 take1_ (StreamSourced t) = (StreamSourced <$>) <$> P.take1_ t
187 takeN_ n (StreamSourced t) = (StreamSourced <$>) <$> P.takeN_ n t
188 takeWhile_ f (StreamSourced t) = StreamSourced <$> P.takeWhile_ f t
189 tokensToChunk _s = P.tokensToChunk (Proxy::Proxy TL.Text)
190 chunkToTokens _s = P.chunkToTokens (Proxy::Proxy TL.Text)
191 chunkLength _s = P.chunkLength (Proxy::Proxy TL.Text)
192 advance1 _s indent (P.SourcePos n line col) c =
193 case c of
194 '\n' -> P.SourcePos n (line <> P.pos1) indent
195 _ -> P.SourcePos n line (col <> P.pos1)
196 advanceN s indent = TL.foldl' (P.advance1 s indent)
197
198 -- * Type 'FileSource'
199 type FileSource = NonEmpty FileRange
200
201 -- ** Type 'FileRange'
202 data FileRange
203 = FileRange
204 { fileRange_file :: !FilePath
205 , fileRange_begin :: !FilePos
206 , fileRange_end :: !FilePos
207 } deriving (Eq, Ord)
208 instance Default FileRange where
209 def = FileRange "" filePos1 filePos1
210 instance Show FileRange where
211 showsPrec _p FileRange{..} =
212 showString fileRange_file .
213 showChar '#' . showsPrec 10 fileRange_begin .
214 showChar '-' . showsPrec 10 fileRange_end
215
216 -- *** Type 'FilePos'
217 -- | Absolute text file position.
218 data FilePos = FilePos
219 { filePos_line :: {-# UNPACK #-} !LineNum
220 , filePos_column :: {-# UNPACK #-} !ColNum
221 } deriving (Eq, Ord)
222 instance Default FilePos where
223 def = filePos1
224 instance Show FilePos where
225 showsPrec _p FilePos{..} =
226 showsPrec 11 filePos_line .
227 showChar ':' .
228 showsPrec 11 filePos_column
229
230 filePos1 :: FilePos
231 filePos1 = FilePos 1 1
232
233 p_FilePos :: Parser e s FilePos
234 p_FilePos = do
235 pos :| _ <- P.statePos <$> P.getParserState
236 return $ FilePos (P.unPos $ P.sourceLine pos) (P.unPos $ P.sourceColumn pos)
237
238 -- **** Type 'LineNum'
239 type LineNum = Int
240
241 -- **** Type 'ColNum'
242 type ColNum = Int
243
244 -- **** Class 'FromPad'
245 class FromPad a where
246 fromPad :: FilePos -> a
247 instance FromPad Text.Text where
248 fromPad FilePos{..} =
249 Text.replicate filePos_line "\n" <>
250 Text.replicate filePos_column " "
251 instance FromPad TL.Text where
252 fromPad FilePos{..} =
253 TL.replicate (fromIntegral filePos_line) "\n" <>
254 TL.replicate (fromIntegral filePos_column) " "
255
256 -- * Type 'Error'
257 data Error
258 = Error_CharRef_invalid Integer
259 -- ^ Well-formedness constraint: Legal Character.
260 --
261 -- Characters referred to using character references MUST match the production for Char.
262 | Error_EntityRef_unknown Name
263 -- ^ Well-formedness constraint: Entity Declared
264 --
265 -- In a document without any DTD, a document with only an internal DTD
266 -- subset which contains no parameter entity references, or a document
267 -- with " standalone='yes' ", for an entity reference that does not occur
268 -- within the external subset or a parameter entity, the Name given in the
269 -- entity reference MUST match that in an entity declaration that does not
270 -- occur within the external subset or a parameter entity, except that
271 -- well-formed documents need not declare any of the following entities:
272 -- amp, lt, gt, apos, quot. The declaration of a general entity MUST
273 -- precede any reference to it which appears in a default value in an
274 -- attribute-list declaration.
275 --
276 -- Note that non-validating processors are not obligated to read and
277 -- process entity declarations occurring in parameter entities or in the
278 -- external subset; for such documents, the rule that an entity must be
279 -- declared is a well-formedness constraint only if standalone='yes'.
280 | Error_Closing_tag_unexpected QName QName
281 -- ^ Well-formedness constraint: Element Type Match.
282 --
283 -- The Name in an element's end-tag MUST match the element type in the start-tag.
284 | Error_Attribute_collision QName
285 -- ^ Well-formedness constraint: Unique Att Spec.
286 --
287 -- An attribute name MUST NOT appear more than once in the same start-tag or empty-element tag.
288 | Error_PI_reserved PName
289 -- ^ The target names " XML ", " xml ", and so on are reserved for standardization.
290 | Error_Namespace_prefix_unknown NCName
291 -- ^ Namespace constraint: Prefix Declared
292 --
293 -- The namespace prefix, unless it is xml or xmlns, MUST have been declared in a namespace declaration attribute in either the start-tag of the element where the prefix is used or in an ancestor element (i.e., an element in whose content the prefixed markup occurs).
294 | Error_Namespace_empty NCName
295 -- ^ Namespace constraint: No Prefix Undeclaring
296 --
297 -- In a namespace declaration for a prefix (i.e., where the NSAttName is a PrefixedAttName), the attribute value MUST NOT be empty.
298 | Error_Namespace_reserved Namespace
299 | Error_Namespace_reserved_prefix NCName
300 -- ^ Namespace constraint: Reserved Prefixes and Namespace Names
301 --
302 -- The prefix xml is by definition bound to the namespace name
303 -- http://www.w3.org/XML/1998/namespace. It MAY, but need not, be
304 -- declared, and MUST NOT be bound to any other namespace name. Other
305 -- prefixes MUST NOT be bound to this namespace name, and it MUST NOT be
306 -- declared as the default namespace.
307 --
308 -- The prefix xmlns is used only to declare namespace bindings and is by
309 -- definition bound to the namespace name http://www.w3.org/2000/xmlns/.
310 -- It MUST NOT be declared . Other prefixes MUST NOT be bound to this
311 -- namespace name, and it MUST NOT be declared as the default namespace.
312 -- Element names MUST NOT have the prefix xmlns.
313 --
314 -- All other prefixes beginning with the three-letter sequence x, m, l, in
315 -- any case combination, are reserved. This means that:
316 --
317 -- - users SHOULD NOT use them except as defined by later specifications
318 -- - processors MUST NOT treat them as fatal errors.
319 deriving (Eq,Ord,Show)
320 instance P.ShowErrorComponent Error where
321 showErrorComponent = show
322
323 -- * Helpers
324 p_error :: e -> Parser e s a
325 p_error = P.fancyFailure . Set.singleton . P.ErrorCustom
326
327 p_quoted :: P.Tokens s ~ TL.Text => (Char -> Parser e s a) -> Parser e s a
328 p_quoted p =
329 P.between (P.char '"') (P.char '"') (p '"') <|>
330 P.between (P.char '\'') (P.char '\'') (p '\'')
331
332 p_until :: P.Tokens s ~ TL.Text => (Char -> Bool) -> (Char, TL.Text) -> Parser e s TL.Text
333 p_until content (end, end_) =
334 (TL.concat <$>) $ P.many $
335 P.takeWhile1P Nothing (\c -> content c && c /= end) <|>
336 P.try (TL.singleton <$> P.char end <* P.notFollowedBy (P.string end_))
337
338 p_until1 :: P.Tokens s ~ TL.Text => (Char -> Bool) -> (Char, TL.Text) -> Parser e s TL.Text
339 p_until1 content (end, end_) =
340 (TL.concat <$>) $ P.some $
341 P.takeWhile1P Nothing (\c -> content c && c /= end) <|>
342 P.try (TL.singleton <$> P.char end <* P.notFollowedBy (P.string end_))