]> Git — Sourcephile - haskell/symantic-xml.git/blob - Language/Symantic/XML/Read.hs
Add more XML test files.
[haskell/symantic-xml.git] / Language / Symantic / XML / Read.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# OPTIONS_GHC -fno-warn-orphans #-}
8 module Language.Symantic.XML.Read
9 ( module Language.Symantic.XML.Read.Parser
10 , module Language.Symantic.XML.Read
11 ) where
12
13 import Control.Arrow (left)
14 import Control.Applicative (Applicative(..), Alternative(..))
15 import Control.Monad (Monad(..), void, unless, forM, join)
16 import Data.Bool
17 import Data.Char (Char)
18 import Data.Default.Class (Default(..))
19 import Data.Either (Either(..))
20 import Data.Eq (Eq(..))
21 import Data.Foldable (Foldable(..))
22 import Data.Function (($), (.), const)
23 import Data.Functor ((<$>), (<$))
24 import Data.Maybe (Maybe(..), maybe)
25 import Data.Ord (Ord(..))
26 import Data.Semigroup (Semigroup(..))
27 import Data.String (String)
28 import Data.TreeSeq.Strict (Tree(..))
29 import Data.Tuple (snd)
30 import Prelude (Num(..), Enum(..), Bounded(..), Integer, toInteger)
31 import System.IO (FilePath, IO)
32 import Text.Megaparsec ((<?>))
33 import Text.Show (Show(..))
34 import qualified Control.Exception as Exn
35 import qualified Control.Monad.Trans.Reader as R
36 import qualified Data.ByteString.Lazy as BSL
37 import qualified Data.Char as Char
38 import qualified Data.Char.Properties.XMLCharProps as XC
39 import qualified Data.HashMap.Strict as HM
40 import qualified Data.List as List
41 import qualified Data.Sequence as Seq
42 import qualified Data.Text.Encoding.Error as TL
43 import qualified Data.Text.Lazy as TL
44 import qualified Data.Text.Lazy.Encoding as TL
45 import qualified Data.TreeSeq.Strict as TS
46 import qualified System.IO.Error as IO
47 import qualified Text.Megaparsec as P
48 import qualified Text.Megaparsec.Char as P
49
50 import Language.Symantic.XML.Document
51 import Language.Symantic.XML.Read.Parser
52
53 readXML :: FilePath -> TL.Text -> Either (P.ParseError (P.Token TL.Text) Error) XMLs
54 readXML filePath stateInput =
55 snd $
56 P.runParser'
57 (R.runReaderT p_document def)
58 P.State
59 { P.stateInput
60 , P.statePos = pure $ P.initialPos filePath
61 , P.stateTabWidth = P.pos1 -- NOTE: do not expand tabs.
62 , P.stateTokensProcessed = 0
63 }
64
65 readFile :: FilePath -> IO (Either ErrorRead TL.Text)
66 readFile fp =
67 (left ErrorRead_Unicode . TL.decodeUtf8' <$> BSL.readFile fp)
68 `Exn.catch` \e ->
69 if IO.isAlreadyInUseError e
70 || IO.isDoesNotExistError e
71 || IO.isPermissionError e
72 then return $ Left $ ErrorRead_IO e
73 else IO.ioError e
74
75 -- * Type 'ErrorRead'
76 data ErrorRead
77 = ErrorRead_IO IO.IOError
78 | ErrorRead_Unicode TL.UnicodeException
79 deriving (Show)
80
81 -- * Document
82 p_document :: P.Tokens s ~ TL.Text => Parser Error s XMLs
83 p_document = do
84 ps <- p_prolog
85 e <- p_Element
86 ms <- P.many p_Misc
87 P.eof
88 return (ps <> pure e <> join (Seq.fromList ms))
89
90 -- ** Prolog
91 p_prolog :: P.Tokens s ~ TL.Text => Parser Error s XMLs
92 p_prolog = do
93 xmlDecl <- P.option Seq.empty $ pure <$> p_XMLDecl
94 ms <- P.many p_Misc
95 return (xmlDecl <> join (Seq.fromList ms))
96
97 -- ** Misc
98 p_Misc :: P.Tokens s ~ TL.Text => Parser Error s XMLs
99 p_Misc =
100 P.try (pure <$> p_Comment)
101 <|> P.try (pure <$> p_PI)
102 <|> pure <$> p_S
103
104 -- ** XMLDecl
105 p_XMLDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
106 p_XMLDecl = P.label "XMLDecl" $ do
107 Sourced src as <- p_Sourced $ P.between (P.string "<?xml") (P.string "?>") $ do
108 vi <- pure <$> p_VersionInfo
109 ed <- P.option Seq.empty $ pure <$> p_EncodingDecl
110 sd <- P.option Seq.empty $ pure <$> p_SDDecl
111 p_Spaces
112 return $ vi <> ed <> sd
113 return $ Tree (Sourced src $ NodePI "xml" "") as
114
115 p_VersionInfo :: P.Tokens s ~ TL.Text => Parser Error s XML
116 p_VersionInfo = P.label "VersionInfo" $ do
117 Sourced c v <- p_Sourced $ do
118 P.try (() <$ p_Spaces1 <* P.string "version")
119 p_Eq
120 p_quoted $ const $ p_Sourced $
121 (<>)
122 <$> P.string "1."
123 <*> P.takeWhile1P Nothing Char.isDigit
124 return $ Tree (Sourced c $ NodeAttr "version") $ pure $
125 TS.tree0 $ NodeText . pure . TextLexemePlain <$> v
126
127 p_EncodingDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
128 p_EncodingDecl = P.label "EncodingDecl" $ do
129 Sourced c v <- p_Sourced $ do
130 P.try (() <$ p_Spaces1 <* P.string "encoding")
131 p_Eq
132 p_quoted $ const $ p_Sourced p_EncName
133 return $ Tree (Sourced c $ NodeAttr "encoding") $ pure $
134 TS.tree0 $ NodeText . pure . TextLexemePlain <$> v
135
136 p_EncName :: P.Tokens s ~ TL.Text => Parser Error s TL.Text
137 p_EncName = P.label "EncName" $ do
138 P.notFollowedBy (P.satisfy $ not . isAlpha)
139 P.takeWhile1P Nothing $ \c ->
140 isAlpha c || Char.isDigit c ||
141 c=='.' || c=='_' || c=='-'
142 where isAlpha c = Char.isAsciiLower c || Char.isAsciiUpper c
143
144 -- *** SDDecl
145 p_SDDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
146 p_SDDecl = P.label "SDDecl" $ do
147 p_SourcedBegin $ do
148 Sourced ca () <- P.try (p_Sourced $ () <$ p_Spaces1 <* P.string "standalone")
149 p_Eq
150 v <- p_quoted $ const $ p_Sourced $ P.string "yes" <|> P.string "no"
151 return $ Tree (Sourced ca $ NodeAttr "standalone") $ pure $
152 TS.tree0 $ NodeText . pure . TextLexemePlain <$> v
153
154 -- ** CharData
155 p_CharData :: P.Tokens s ~ TL.Text => Parser e s [TextLexeme]
156 p_CharData =
157 escapeText
158 <$> p_until1 (\c -> XC.isXmlChar c && c/='<' && c/='&') (']',"]>")
159
160 -- ** Comment
161 p_Comment :: P.Tokens s ~ TL.Text => Parser Error s XML
162 p_Comment = p_SourcedBegin $ P.string "<!--" *> p_Comment__
163 p_Comment_ :: P.Tokens s ~ TL.Text => Parser Error s XML
164 p_Comment_ = P.string "--" *> p_Comment__
165 p_Comment__:: P.Tokens s ~ TL.Text => Parser Error s XML
166 p_Comment__ = P.label "Comment" $ do
167 c <- p_until XC.isXmlChar ('-', "-")
168 void $ P.string "-->"
169 cell <- p_SourcedEnd
170 return $ TS.tree0 (cell $ NodeComment c)
171
172 -- ** CDATA
173 p_CDSect :: P.Tokens s ~ TL.Text => Parser Error s XML
174 p_CDSect = p_SourcedBegin $ P.string "<![CDATA[" *> p_CDSect__
175 p_CDSect_ :: P.Tokens s ~ TL.Text => Parser Error s XML
176 p_CDSect_ = P.string "[CDATA[" *> p_CDSect__
177 p_CDSect__ :: P.Tokens s ~ TL.Text => Parser Error s XML
178 p_CDSect__ = P.label "CDSect" $ do
179 c <- p_until XC.isXmlChar (']', "]>")
180 void $ P.string "]]>"
181 cell <- p_SourcedEnd
182 return $ TS.tree0 $ cell $ NodeCDATA c
183
184 -- ** PI
185 p_PI :: P.Tokens s ~ TL.Text => Parser Error s XML
186 p_PI = p_SourcedBegin $ P.string "<?" *> p_PI__
187 p_PI_ :: P.Tokens s ~ TL.Text => Parser Error s XML
188 p_PI_ = P.char '?' *> p_PI__
189 p_PI__ :: P.Tokens s ~ TL.Text => Parser Error s XML
190 p_PI__ = P.label "PI" $ do
191 n <- p_PITarget
192 v <- P.option "" $ P.try $ p_Spaces1 *> p_until XC.isXmlChar ('?', ">")
193 void $ P.string "?>"
194 cell <- p_SourcedEnd
195 return $ TS.tree0 $ cell $ NodePI n v
196 p_PITarget :: P.Tokens s ~ TL.Text => Parser Error s PName
197 p_PITarget = do
198 n <- p_PName
199 case n of
200 PName{pNameSpace=Nothing, pNameLocal=NCName l}
201 | "xml" == TL.toLower l -> p_error $ Error_PI_reserved n
202 _ -> return n
203
204 -- ** Element
205 p_Element :: P.Tokens s ~ TL.Text => Parser Error s XML
206 p_Element = p_SourcedBegin $ (P.char '<' *> p_Element_)
207 p_Element_ :: P.Tokens s ~ TL.Text => Parser Error s XML
208 p_Element_ = P.label "Element" p_STag
209
210 -- *** STag
211 p_STag :: P.Tokens s ~ TL.Text => Parser Error s XML
212 p_STag = do
213 n <- p_PName
214 as <- P.many $ P.try $ p_Spaces1 *> p_Attribute
215 p_Spaces
216 ro <- R.ask
217 elemNS :: HM.HashMap NCName Namespace <-
218 (HM.fromList . List.concat <$>) $ forM as $ \case
219 Sourced _ (PName{..}, Sourced _ av)
220 | ns <- Namespace $ flatText av
221 , Nothing <- pNameSpace
222 , NCName "xmlns" <- pNameLocal ->
223 -- NOTE: default namespace declaration.
224 case ns of
225 _ | ns == xmlns_xml -- DOC: it MUST NOT be declared as the default namespace
226 || ns == xmlns_xmlns -- DOC: it MUST NOT be declared as the default namespace
227 -> p_error $ Error_Namespace_reserved ns
228 _ -> return [(NCName "" , ns)]
229 | ns <- Namespace $ flatText av
230 , Just (NCName "xmlns") <- pNameSpace ->
231 -- NOTE: namespace prefix declaration.
232 case unNCName pNameLocal of
233 "xml" -- DOC: It MAY, but need not, be declared,
234 -- and MUST NOT be bound to any other namespace name.
235 | ns == xmlns_xml -> return []
236 | otherwise -> p_error $ Error_Namespace_reserved_prefix pNameLocal
237 "xmlns" -- DOC: It MUST NOT be declared
238 -> p_error $ Error_Namespace_reserved_prefix pNameLocal
239 local | "xml" <- TL.toLower $ TL.take 3 local -> return []
240 -- DOC: All other prefixes beginning with the three-letter
241 -- sequence x, m, l, in any case combination, are reserved.
242 -- This means that: processors MUST NOT treat them as fatal errors.
243 _ | ns == xmlns_xml -- DOC: Other prefixes MUST NOT be bound to this namespace name.
244 || ns == xmlns_xmlns -- DOC: Other prefixes MUST NOT be bound to this namespace name.
245 -> p_error $ Error_Namespace_reserved ns
246 _ -> return [(pNameLocal, ns)]
247 | otherwise -> return []
248 let scopeNS = elemNS <> reader_ns_scope ro
249 let defaultNS = HM.lookupDefault (reader_ns_default ro) (NCName "") scopeNS
250 let lookupNamePrefix prefix =
251 maybe (p_error $ Error_Namespace_prefix_unknown prefix) return $
252 HM.lookup prefix scopeNS
253 elemName :: QName <-
254 -- NOTE: expand element's QName.
255 case pNameSpace n of
256 Nothing -> return QName{qNameSpace=defaultNS, qNameLocal=pNameLocal n}
257 -- DOC: If there is a default namespace declaration in scope,
258 -- the expanded name corresponding to an unprefixed element name
259 -- has the URI of the default namespace as its namespace name.
260 Just prefix
261 | NCName "xmlns" <- prefix ->
262 -- DOC: Element names MUST NOT have the prefix xmlns.
263 p_error $ Error_Namespace_reserved_prefix prefix
264 | otherwise -> do
265 ns <- lookupNamePrefix prefix
266 return QName{qNameSpace=ns, qNameLocal=pNameLocal n}
267 elemAttrs :: [Sourced FileSource (QName, Sourced FileSource Text)] <-
268 -- NOTE: expand attributes' PName into QName.
269 forM as $ \s@Sourced{unSourced=(an, av)} -> do
270 ns <- maybe (return "") lookupNamePrefix $ pNameSpace an
271 let qn = QName{qNameSpace=ns, qNameLocal=pNameLocal an}
272 return s{unSourced=(qn, av)}
273 -- NOTE: check for attribute collision.
274 let attrsByQName :: HM.HashMap QName [Sourced FileSource (QName, Sourced FileSource Text)] =
275 HM.fromListWith (<>) $ (<$> elemAttrs) $ \a@(Sourced _c (an, _av)) -> (an, [a])
276 case HM.toList $ HM.filter (\x -> length x > 1) attrsByQName of
277 (an, _):_ -> p_error $ Error_Attribute_collision an
278 _ -> return ()
279 elemAttrsXML :: XMLs <- (Seq.fromList <$>) $
280 forM elemAttrs $ \(Sourced sa (an, av)) -> do
281 return $ TS.Tree (Sourced sa $ NodeAttr an) $
282 pure $ TS.tree0 $ NodeText <$> av
283 content :: XMLs <-
284 elemAttrsXML <$ P.string "/>" <|>
285 R.local
286 (const ro
287 { reader_ns_scope = scopeNS
288 , reader_ns_default = defaultNS
289 })
290 ((elemAttrsXML <>) <$ P.char '>' <*> p_content <* p_ETag elemName)
291 cell <- p_SourcedEnd
292 return $ Tree (cell $ NodeElem elemName) content
293
294 -- *** Attribute
295 p_Attribute :: P.Tokens s ~ TL.Text => Parser Error s (Sourced FileSource (PName, Sourced FileSource Text))
296 p_Attribute = p_Sourced $ (,) <$> p_PName <* p_Eq <*> p_AttValue
297
298 p_AttValue :: P.Tokens s ~ TL.Text => Parser Error s (Sourced FileSource Text)
299 p_AttValue = P.label "AttValue" $ p_quoted p_AttValueText
300
301 p_AttValueText :: P.Tokens s ~ TL.Text => Char -> Parser Error s (Sourced FileSource Text)
302 p_AttValueText q = p_Sourced $
303 P.many
304 ( p_Reference
305 <|> TextLexemePlain <$> P.takeWhile1P Nothing (\c ->
306 XC.isXmlChar c &&
307 c `List.notElem` (q:"<&'\">"))
308 <|> TextLexemeEntityRef entityRef_gt <$ P.char '>'
309 <|> (if q == '\''
310 then TextLexemeEntityRef entityRef_quot <$ P.char '"'
311 else TextLexemeEntityRef entityRef_apos <$ P.char '\'')
312 )
313
314 -- * content
315 p_content :: P.Tokens s ~ TL.Text => Parser Error s XMLs
316 p_content =
317 (Seq.fromList <$>) $ P.many $
318 (p_SourcedBegin $ do
319 P.try $ P.char '<' *> P.notFollowedBy (P.char '/')
320 p_Element_ <|> p_PI_ <|> (P.char '!' *> (p_Comment_ <|> p_CDSect_))
321 )
322 <|> ((tree0 <$>) $ p_Sourced $ NodeText . List.concat
323 <$> P.some (p_CharData <|> pure <$> p_Reference))
324
325 -- *** ETag
326 p_ETag :: P.Tokens s ~ TL.Text => QName -> Parser Error s ()
327 p_ETag expected = do
328 got <- P.string "</" *> p_QName <* p_Spaces <* P.char '>'
329 unless (got == expected) $
330 p_error $ Error_Closing_tag_unexpected got expected
331
332 -- * Name
333 p_Name :: P.Tokens s ~ TL.Text => Parser Error s Name
334 p_Name = P.label "Name" $
335 Name
336 <$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNameStartChar)
337 <*> P.takeWhile1P Nothing XC.isXmlNameChar
338
339 -- * PName
340 p_PName :: P.Tokens s ~ TL.Text => Parser e s PName
341 p_PName = P.label "PName" $ do
342 n <- p_NCName
343 s <- P.optional $ P.try $ P.char ':' *> p_NCName
344 return $ case s of
345 Nothing -> PName{pNameSpace=Nothing, pNameLocal=n}
346 Just l -> PName{pNameSpace=Just n , pNameLocal=l}
347
348 -- * QName
349 p_QName :: P.Tokens s ~ TL.Text => Parser Error s QName
350 p_QName = P.label "QName" $ do
351 n <- p_NCName
352 s <- P.optional $ P.try $ P.char ':' *> p_NCName
353 Reader{..} <- R.ask
354 case s of
355 Nothing -> return QName{qNameSpace=reader_ns_default, qNameLocal=n}
356 Just l ->
357 case HM.lookup n reader_ns_scope of
358 Nothing -> p_error $ Error_Namespace_prefix_unknown n
359 Just ns -> return QName{qNameSpace=ns, qNameLocal=l}
360
361 -- ** NCName
362 p_NCName :: P.Tokens s ~ TL.Text => Parser e s NCName
363 p_NCName = P.label "NCName" $
364 NCName
365 <$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNCNameStartChar)
366 <*> P.takeWhile1P Nothing XC.isXmlNCNameChar
367
368 -- * Reference
369 p_Reference :: P.Tokens s ~ TL.Text => Parser Error s TextLexeme
370 p_Reference =
371 TextLexemeCharRef <$> p_CharRef <|>
372 TextLexemeEntityRef <$> p_EntityRef
373
374 -- ** EntityRef
375 p_EntityRef :: P.Tokens s ~ TL.Text => Parser Error s EntityRef
376 p_EntityRef = P.label "EntityRef" $ do
377 ref <- P.char '&' *> p_NCName <* P.char ';'
378 EntityRef ref <$> lookupEntityRef ref
379 where
380 lookupEntityRef (NCName "lt" ) = pure "<"
381 lookupEntityRef (NCName "gt" ) = pure ">"
382 lookupEntityRef (NCName "amp" ) = pure "&"
383 lookupEntityRef (NCName "apos") = pure "'"
384 lookupEntityRef (NCName "quot") = pure "\""
385 lookupEntityRef n = p_error $ Error_EntityRef_unknown n
386
387 -- ** CharRef
388 p_CharRef :: P.Tokens s ~ TL.Text => Parser Error s CharRef
389 p_CharRef = P.label "CharRef" $
390 do
391 ref <- readHexadecimal
392 <$ P.string "&#x"
393 <*> P.some P.hexDigitChar
394 <* P.char ';'
395 check ref
396 <|> do
397 ref <- readDecimal
398 <$ P.string "&#"
399 <*> P.some P.digitChar
400 <* P.char ';'
401 check ref
402 where
403 check i =
404 let c = toEnum (fromInteger i) in
405 if i <= toInteger (fromEnum (maxBound::Char))
406 && XC.isXmlChar c
407 then pure $ CharRef c
408 else p_error $ Error_CharRef_invalid i
409
410 readInt :: Integer -> String -> Integer
411 readInt base digits =
412 sign * List.foldl' acc 0 (List.concatMap digToInt digits1)
413 where
414 acc q r = q*base + r
415 (sign, digits1) =
416 case digits of
417 [] -> (1, digits)
418 c:ds | c == '-' -> (-1, ds)
419 | c == '+' -> ( 1, ds)
420 | otherwise -> ( 1, digits)
421 ord = toInteger . Char.ord
422 digToInt c
423 | Char.isDigit c = [ord c - ord '0']
424 | Char.isAsciiLower c = [ord c - ord 'a' + 10]
425 | Char.isAsciiUpper c = [ord c - ord 'A' + 10]
426 | otherwise = []
427
428 readDecimal :: String -> Integer
429 readDecimal = readInt 10
430
431 readHexadecimal :: String -> Integer
432 readHexadecimal = readInt 16
433
434 -- * Char
435 p_Char :: P.Tokens s ~ TL.Text => Parser e s Char
436 p_Char = P.label "Char" $ P.satisfy XC.isXmlCharCR <|> p_CRLF
437 {-# INLINE p_Char #-}
438
439 -- ** Space
440 -- | Map '\r' and '\r\n' to '\n'.
441 p_CRLF :: P.Tokens s ~ TL.Text => Parser e s Char
442 p_CRLF = P.label "CRLF" $
443 P.char '\r' *> P.option '\n' (P.char '\n')
444
445 p_Space :: P.Tokens s ~ TL.Text => Parser e s Char
446 p_Space = P.label "Space" $
447 P.satisfy XC.isXmlSpaceCharCR <|> p_CRLF
448 {-# INLINE p_Space #-}
449
450 p_Spaces :: P.Tokens s ~ TL.Text => Parser e s ()
451 p_Spaces = P.label "Spaces" $
452 void $ P.takeWhileP Nothing XC.isXmlSpaceChar
453 {-# INLINE p_Spaces #-}
454
455 p_S :: P.Tokens s ~ TL.Text => Parser Error s XML
456 p_S = P.label "Spaces" $
457 (\ts -> TS.tree0 (NodeText . pure . TextLexemePlain . TL.concat <$> ts))
458 <$> p_Sourced (P.some $
459 P.takeWhile1P Nothing XC.isXmlSpaceCharCR <|>
460 TL.singleton <$> p_CRLF)
461
462 p_Spaces1 :: P.Tokens s ~ TL.Text => Parser e s ()
463 p_Spaces1 = P.label "Spaces1" $
464 void $ P.takeWhile1P Nothing XC.isXmlSpaceChar
465 {-# INLINE p_Spaces1 #-}
466
467 -- * Eq
468 p_separator :: P.Tokens s ~ TL.Text => Char -> Parser e s ()
469 p_separator c = P.try (() <$ p_Spaces <* P.char c) <* p_Spaces <?> [c]
470
471 p_Eq :: P.Tokens s ~ TL.Text => Parser e s ()
472 p_Eq = p_separator '=' <?> "Eq"