]> Git — Sourcephile - haskell/symantic-xml.git/blob - Language/Symantic/XML/Read.hs
RNC: add some tests
[haskell/symantic-xml.git] / Language / Symantic / XML / Read.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# OPTIONS_GHC -fno-warn-orphans #-}
8 module Language.Symantic.XML.Read
9 ( module Language.Symantic.XML.Read.Parser
10 , module Language.Symantic.XML.Read
11 ) where
12
13 import Control.Arrow (left)
14 import Control.Applicative (Applicative(..), Alternative(..))
15 import Control.Monad (Monad(..), void, unless, forM, join)
16 import Data.Bool
17 import Data.Char (Char)
18 import Data.Default.Class (Default(..))
19 import Data.Either (Either(..))
20 import Data.Eq (Eq(..))
21 import Data.Foldable (Foldable(..))
22 import Data.Function (($), (.), const)
23 import Data.Functor ((<$>), (<$))
24 import Data.Maybe (Maybe(..), maybe)
25 import Data.Monoid (Monoid(..))
26 import Data.Ord (Ord(..))
27 import Data.Semigroup (Semigroup(..))
28 import Data.String (String)
29 import Data.TreeSeq.Strict (Tree(..))
30 import Data.Tuple (snd)
31 import Prelude (Num(..), Enum(..), Bounded(..), Integer, toInteger)
32 import System.IO (FilePath, IO)
33 import Text.Megaparsec ((<?>))
34 import Text.Show (Show(..))
35 import qualified Control.Exception as Exn
36 import qualified Control.Monad.Trans.Reader as R
37 import qualified Data.ByteString.Lazy as BSL
38 import qualified Data.Char as Char
39 import qualified Data.Char.Properties.XMLCharProps as XC
40 import qualified Data.HashMap.Strict as HM
41 import qualified Data.List as List
42 import qualified Data.Sequence as Seq
43 import qualified Data.Text.Encoding.Error as TL
44 import qualified Data.Text.Lazy as TL
45 import qualified Data.Text.Lazy.Encoding as TL
46 import qualified Data.TreeSeq.Strict as TS
47 import qualified System.IO.Error as IO
48 import qualified Text.Megaparsec as P
49 import qualified Text.Megaparsec.Char as P
50
51 import Language.Symantic.XML.Document hiding (XML, XMLs)
52 import Language.Symantic.XML.Read.Parser
53
54 readXML :: FilePath -> TL.Text -> Either (P.ParseErrorBundle TL.Text Error) XMLs
55 readXML filePath stateInput =
56 snd $
57 P.runParser'
58 (R.runReaderT p_document def)
59 P.State
60 { P.stateInput
61 , P.stateOffset = 0
62 , P.statePosState = P.PosState
63 { P.pstateInput = stateInput
64 , P.pstateOffset = 0
65 , P.pstateSourcePos = P.initialPos filePath
66 , P.pstateTabWidth = P.pos1
67 , P.pstateLinePrefix = ""
68 }
69 }
70
71 readFile :: FilePath -> IO (Either ErrorRead TL.Text)
72 readFile fp =
73 (left ErrorRead_Unicode . TL.decodeUtf8' <$> BSL.readFile fp)
74 `Exn.catch` \e ->
75 if IO.isAlreadyInUseError e
76 || IO.isDoesNotExistError e
77 || IO.isPermissionError e
78 then return $ Left $ ErrorRead_IO e
79 else IO.ioError e
80
81 -- * Type 'ErrorRead'
82 data ErrorRead
83 = ErrorRead_IO IO.IOError
84 | ErrorRead_Unicode TL.UnicodeException
85 deriving (Show)
86
87 -- * Document
88 p_document :: P.Tokens s ~ TL.Text => Parser Error s XMLs
89 p_document = do
90 ps <- p_prolog
91 e <- p_Element
92 ms <- P.many p_Misc
93 P.eof
94 return (ps <> pure e <> join (Seq.fromList ms))
95
96 -- ** Prolog
97 p_prolog :: P.Tokens s ~ TL.Text => Parser Error s XMLs
98 p_prolog = do
99 xmlDecl <- P.option Seq.empty $ pure <$> p_XMLDecl
100 ms <- P.many p_Misc
101 return (xmlDecl <> join (Seq.fromList ms))
102
103 -- ** Misc
104 p_Misc :: P.Tokens s ~ TL.Text => Parser Error s XMLs
105 p_Misc =
106 P.try (pure <$> p_Comment)
107 <|> P.try (pure <$> p_PI)
108 <|> pure <$> p_S
109
110 -- ** XMLDecl
111 p_XMLDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
112 p_XMLDecl = P.label "XMLDecl" $ do
113 Sourced src as <- p_Sourced $ P.between (P.string "<?xml") (P.string "?>") $ do
114 vi <- pure <$> p_VersionInfo
115 ed <- P.option Seq.empty $ pure <$> p_EncodingDecl
116 sd <- P.option Seq.empty $ pure <$> p_SDDecl
117 p_Spaces
118 return $ vi <> ed <> sd
119 return $ Tree (Sourced src $ NodePI "xml" "") as
120
121 p_VersionInfo :: P.Tokens s ~ TL.Text => Parser Error s XML
122 p_VersionInfo = P.label "VersionInfo" $ do
123 Sourced c v <- p_Sourced $ do
124 P.try (() <$ p_Spaces1 <* P.string "version")
125 p_Eq
126 p_quoted $ const $ p_Sourced $
127 (<>)
128 <$> P.string "1."
129 <*> P.takeWhile1P Nothing Char.isDigit
130 return $ Tree (Sourced c $ NodeAttr "version") $ pure $
131 TS.tree0 $ NodeText . EscapedText . pure . EscapedPlain <$> v
132
133 p_EncodingDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
134 p_EncodingDecl = P.label "EncodingDecl" $ do
135 Sourced c v <- p_Sourced $ do
136 P.try (() <$ p_Spaces1 <* P.string "encoding")
137 p_Eq
138 p_quoted $ const $ p_Sourced p_EncName
139 return $ Tree (Sourced c $ NodeAttr "encoding") $ pure $
140 TS.tree0 $ NodeText . EscapedText . pure . EscapedPlain <$> v
141
142 p_EncName :: P.Tokens s ~ TL.Text => Parser Error s TL.Text
143 p_EncName = P.label "EncName" $ do
144 P.notFollowedBy (P.satisfy $ not . isAlpha)
145 P.takeWhile1P Nothing $ \c ->
146 isAlpha c || Char.isDigit c ||
147 c=='.' || c=='_' || c=='-'
148 where isAlpha c = Char.isAsciiLower c || Char.isAsciiUpper c
149
150 -- *** SDDecl
151 p_SDDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
152 p_SDDecl = P.label "SDDecl" $ do
153 p_SourcedBegin $ do
154 Sourced ca () <- P.try (p_Sourced $ () <$ p_Spaces1 <* P.string "standalone")
155 p_Eq
156 v <- p_quoted $ const $ p_Sourced $ P.string "yes" <|> P.string "no"
157 return $ Tree (Sourced ca $ NodeAttr "standalone") $ pure $
158 TS.tree0 $ NodeText . EscapedText . pure . EscapedPlain <$> v
159
160 -- ** CharData
161 p_CharData :: P.Tokens s ~ TL.Text => Parser e s EscapedText
162 p_CharData =
163 escapeText
164 <$> p_until1 (\c -> XC.isXmlChar c && c/='<' && c/='&') (']',"]>")
165
166 -- ** Comment
167 p_Comment :: P.Tokens s ~ TL.Text => Parser Error s XML
168 p_Comment = p_SourcedBegin $ P.string "<!--" *> p_Comment__
169 p_Comment_ :: P.Tokens s ~ TL.Text => Parser Error s XML
170 p_Comment_ = P.string "--" *> p_Comment__
171 p_Comment__:: P.Tokens s ~ TL.Text => Parser Error s XML
172 p_Comment__ = P.label "Comment" $ do
173 c <- p_until XC.isXmlChar ('-', "-")
174 void $ P.string "-->"
175 cell <- p_SourcedEnd
176 return $ TS.tree0 (cell $ NodeComment c)
177
178 -- ** CDATA
179 p_CDSect :: P.Tokens s ~ TL.Text => Parser Error s XML
180 p_CDSect = p_SourcedBegin $ P.string "<![CDATA[" *> p_CDSect__
181 p_CDSect_ :: P.Tokens s ~ TL.Text => Parser Error s XML
182 p_CDSect_ = P.string "[CDATA[" *> p_CDSect__
183 p_CDSect__ :: P.Tokens s ~ TL.Text => Parser Error s XML
184 p_CDSect__ = P.label "CDSect" $ do
185 c <- p_until XC.isXmlChar (']', "]>")
186 void $ P.string "]]>"
187 cell <- p_SourcedEnd
188 return $ TS.tree0 $ cell $ NodeCDATA c
189
190 -- ** PI
191 p_PI :: P.Tokens s ~ TL.Text => Parser Error s XML
192 p_PI = p_SourcedBegin $ P.string "<?" *> p_PI__
193 p_PI_ :: P.Tokens s ~ TL.Text => Parser Error s XML
194 p_PI_ = P.char '?' *> p_PI__
195 p_PI__ :: P.Tokens s ~ TL.Text => Parser Error s XML
196 p_PI__ = P.label "PI" $ do
197 n <- p_PITarget
198 v <- P.option "" $ P.try $ p_Spaces1 *> p_until XC.isXmlChar ('?', ">")
199 void $ P.string "?>"
200 cell <- p_SourcedEnd
201 return $ TS.tree0 $ cell $ NodePI n v
202 p_PITarget :: P.Tokens s ~ TL.Text => Parser Error s PName
203 p_PITarget = do
204 n <- p_PName
205 case n of
206 PName{pNameSpace=Nothing, pNameLocal=NCName l}
207 | "xml" == TL.toLower l -> p_error $ Error_PI_reserved n
208 _ -> return n
209
210 -- ** Element
211 p_Element :: P.Tokens s ~ TL.Text => Parser Error s XML
212 p_Element = p_SourcedBegin $ (P.char '<' *> p_Element_)
213 p_Element_ :: P.Tokens s ~ TL.Text => Parser Error s XML
214 p_Element_ = P.label "Element" p_STag
215
216 -- *** STag
217 p_STag :: P.Tokens s ~ TL.Text => Parser Error s XML
218 p_STag = do
219 n <- p_PName
220 as <- P.many $ P.try $ p_Spaces1 *> p_Attribute
221 p_Spaces
222 ro <- R.ask
223 elemNS :: HM.HashMap NCName Namespace <-
224 (HM.fromList . List.concat <$>) $ forM as $ \case
225 Sourced _ (PName{..}, Sourced _ av)
226 | ns <- Namespace $ unescapeText av
227 , Nothing <- pNameSpace
228 , NCName "xmlns" <- pNameLocal ->
229 -- NOTE: default namespace declaration.
230 case ns of
231 _ | ns == xmlns_xml -- DOC: it MUST NOT be declared as the default namespace
232 || ns == xmlns_xmlns -- DOC: it MUST NOT be declared as the default namespace
233 -> p_error $ Error_Namespace_reserved ns
234 _ -> return [(NCName "" , ns)]
235 | ns <- Namespace $ unescapeText av
236 , Just (NCName "xmlns") <- pNameSpace ->
237 -- NOTE: namespace prefix declaration.
238 case unNCName pNameLocal of
239 "xml" -- DOC: It MAY, but need not, be declared,
240 -- and MUST NOT be bound to any other namespace name.
241 | ns == xmlns_xml -> return []
242 | otherwise -> p_error $ Error_Namespace_reserved_prefix pNameLocal
243 "xmlns" -- DOC: It MUST NOT be declared
244 -> p_error $ Error_Namespace_reserved_prefix pNameLocal
245 local | "xml" <- TL.toLower $ TL.take 3 local -> return []
246 -- DOC: All other prefixes beginning with the three-letter
247 -- sequence x, m, l, in any case combination, are reserved.
248 -- This means that: processors MUST NOT treat them as fatal errors.
249 _ | ns == xmlns_xml -- DOC: Other prefixes MUST NOT be bound to this namespace name.
250 || ns == xmlns_xmlns -- DOC: Other prefixes MUST NOT be bound to this namespace name.
251 -> p_error $ Error_Namespace_reserved ns
252 _ -> return [(pNameLocal, ns)]
253 | otherwise -> return []
254 let scopeNS = elemNS <> reader_ns_scope ro
255 let defaultNS = HM.lookupDefault (reader_ns_default ro) (NCName "") scopeNS
256 let lookupNamePrefix prefix =
257 maybe (p_error $ Error_Namespace_prefix_unknown prefix) return $
258 HM.lookup prefix scopeNS
259 elemName :: QName <-
260 -- NOTE: expand element's QName.
261 case pNameSpace n of
262 Nothing -> return QName{qNameSpace=defaultNS, qNameLocal=pNameLocal n}
263 -- DOC: If there is a default namespace declaration in scope,
264 -- the expanded name corresponding to an unprefixed element name
265 -- has the URI of the default namespace as its namespace name.
266 Just prefix
267 | NCName "xmlns" <- prefix ->
268 -- DOC: Element names MUST NOT have the prefix xmlns.
269 p_error $ Error_Namespace_reserved_prefix prefix
270 | otherwise -> do
271 ns <- lookupNamePrefix prefix
272 return QName{qNameSpace=ns, qNameLocal=pNameLocal n}
273 elemAttrs :: [FileSourced (QName, FileSourced EscapedText)] <-
274 -- NOTE: expand attributes' PName into QName.
275 forM as $ \s@Sourced{unSourced=(an, av)} -> do
276 ns <- maybe (return "") lookupNamePrefix $ pNameSpace an
277 let qn = QName{qNameSpace=ns, qNameLocal=pNameLocal an}
278 return s{unSourced=(qn, av)}
279 -- NOTE: check for attribute collision.
280 let attrsByQName :: HM.HashMap QName [FileSourced (QName, FileSourced EscapedText)] =
281 HM.fromListWith (<>) $ (<$> elemAttrs) $ \a@(Sourced _c (an, _av)) -> (an, [a])
282 case HM.toList $ HM.filter (\x -> length x > 1) attrsByQName of
283 (an, _):_ -> p_error $ Error_Attribute_collision an
284 _ -> return ()
285 elemAttrsXML :: XMLs <- (Seq.fromList <$>) $
286 forM elemAttrs $ \(Sourced sa (an, av)) -> do
287 return $ TS.Tree (Sourced sa $ NodeAttr an) $
288 pure $ TS.tree0 $ NodeText <$> av
289 content :: XMLs <-
290 elemAttrsXML <$ P.string "/>" <|>
291 R.local
292 (const ro
293 { reader_ns_scope = scopeNS
294 , reader_ns_default = defaultNS
295 })
296 ((elemAttrsXML <>) <$ P.char '>' <*> p_content <* p_ETag elemName)
297 cell <- p_SourcedEnd
298 return $ Tree (cell $ NodeElem elemName) content
299
300 -- *** Attribute
301 p_Attribute :: P.Tokens s ~ TL.Text => Parser Error s (FileSourced (PName, FileSourced EscapedText))
302 p_Attribute = p_Sourced $ (,) <$> p_PName <* p_Eq <*> p_AttValue
303
304 p_AttValue :: P.Tokens s ~ TL.Text => Parser Error s (FileSourced EscapedText)
305 p_AttValue = P.label "AttValue" $ p_quoted p_AttValueText
306
307 p_AttValueText :: P.Tokens s ~ TL.Text => Char -> Parser Error s (FileSourced EscapedText)
308 p_AttValueText q = p_Sourced $
309 EscapedText . Seq.fromList <$> P.many
310 ( p_Reference
311 <|> EscapedPlain <$> P.takeWhile1P Nothing (\c ->
312 XC.isXmlChar c &&
313 c `List.notElem` (q:"<&'\">"))
314 <|> EscapedEntityRef entityRef_gt <$ P.char '>'
315 <|> (if q == '\''
316 then EscapedEntityRef entityRef_quot <$ P.char '"'
317 else EscapedEntityRef entityRef_apos <$ P.char '\'')
318 )
319
320 -- * content
321 p_content :: P.Tokens s ~ TL.Text => Parser Error s XMLs
322 p_content =
323 (Seq.fromList <$>) $ P.many $
324 (p_SourcedBegin $ do
325 P.try $ P.char '<' *> P.notFollowedBy (P.char '/')
326 p_Element_ <|> p_PI_ <|> (P.char '!' *> (p_Comment_ <|> p_CDSect_))
327 )
328 <|> ((tree0 <$>) $ p_Sourced $ NodeText . mconcat
329 <$> P.some (p_CharData <|> EscapedText . pure <$> p_Reference))
330
331 -- *** ETag
332 p_ETag :: P.Tokens s ~ TL.Text => QName -> Parser Error s ()
333 p_ETag expected = do
334 got <- P.string "</" *> p_QName <* p_Spaces <* P.char '>'
335 unless (got == expected) $
336 p_error $ Error_Closing_tag_unexpected got expected
337
338 -- * Name
339 p_Name :: P.Tokens s ~ TL.Text => Parser Error s Name
340 p_Name = P.label "Name" $
341 Name
342 <$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNameStartChar)
343 <*> P.takeWhile1P Nothing XC.isXmlNameChar
344
345 -- * PName
346 p_PName :: P.Tokens s ~ TL.Text => Parser e s PName
347 p_PName = P.label "PName" $ do
348 n <- p_NCName
349 s <- P.optional $ P.try $ P.char ':' *> p_NCName
350 return $ case s of
351 Nothing -> PName{pNameSpace=Nothing, pNameLocal=n}
352 Just l -> PName{pNameSpace=Just n , pNameLocal=l}
353
354 -- * QName
355 p_QName :: P.Tokens s ~ TL.Text => Parser Error s QName
356 p_QName = P.label "QName" $ do
357 n <- p_NCName
358 s <- P.optional $ P.try $ P.char ':' *> p_NCName
359 Reader{..} <- R.ask
360 case s of
361 Nothing -> return QName{qNameSpace=reader_ns_default, qNameLocal=n}
362 Just l ->
363 case HM.lookup n reader_ns_scope of
364 Nothing -> p_error $ Error_Namespace_prefix_unknown n
365 Just ns -> return QName{qNameSpace=ns, qNameLocal=l}
366
367 -- ** NCName
368 p_NCName :: P.Tokens s ~ TL.Text => Parser e s NCName
369 p_NCName = P.label "NCName" $
370 NCName
371 <$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNCNameStartChar)
372 <*> P.takeWhile1P Nothing XC.isXmlNCNameChar
373
374 -- * Reference
375 p_Reference :: P.Tokens s ~ TL.Text => Parser Error s Escaped
376 p_Reference =
377 EscapedCharRef <$> p_CharRef <|>
378 EscapedEntityRef <$> p_EntityRef
379
380 -- ** EntityRef
381 p_EntityRef :: P.Tokens s ~ TL.Text => Parser Error s EntityRef
382 p_EntityRef = P.label "EntityRef" $ do
383 ref <- P.char '&' *> p_NCName <* P.char ';'
384 EntityRef ref <$> lookupEntityRef ref
385 where
386 lookupEntityRef (NCName "lt" ) = pure "<"
387 lookupEntityRef (NCName "gt" ) = pure ">"
388 lookupEntityRef (NCName "amp" ) = pure "&"
389 lookupEntityRef (NCName "apos") = pure "'"
390 lookupEntityRef (NCName "quot") = pure "\""
391 lookupEntityRef n = p_error $ Error_EntityRef_unknown n
392
393 -- ** CharRef
394 p_CharRef :: P.Tokens s ~ TL.Text => Parser Error s CharRef
395 p_CharRef = P.label "CharRef" $
396 do
397 ref <- readHexadecimal
398 <$ P.string "&#x"
399 <*> P.some P.hexDigitChar
400 <* P.char ';'
401 check ref
402 <|> do
403 ref <- readDecimal
404 <$ P.string "&#"
405 <*> P.some P.digitChar
406 <* P.char ';'
407 check ref
408 where
409 check i =
410 let c = toEnum (fromInteger i) in
411 if i <= toInteger (fromEnum (maxBound::Char))
412 && XC.isXmlChar c
413 then pure $ CharRef c
414 else p_error $ Error_CharRef_invalid i
415
416 readInt :: Integer -> String -> Integer
417 readInt base digits =
418 sign * List.foldl' acc 0 (List.concatMap digToInt digits1)
419 where
420 acc q r = q*base + r
421 (sign, digits1) =
422 case digits of
423 [] -> (1, digits)
424 c:ds | c == '-' -> (-1, ds)
425 | c == '+' -> ( 1, ds)
426 | otherwise -> ( 1, digits)
427 ord = toInteger . Char.ord
428 digToInt c
429 | Char.isDigit c = [ord c - ord '0']
430 | Char.isAsciiLower c = [ord c - ord 'a' + 10]
431 | Char.isAsciiUpper c = [ord c - ord 'A' + 10]
432 | otherwise = []
433
434 readDecimal :: String -> Integer
435 readDecimal = readInt 10
436
437 readHexadecimal :: String -> Integer
438 readHexadecimal = readInt 16
439
440 -- * Char
441 p_Char :: P.Tokens s ~ TL.Text => Parser e s Char
442 p_Char = P.label "Char" $ P.satisfy XC.isXmlCharCR <|> p_CRLF
443 {-# INLINE p_Char #-}
444
445 -- ** Space
446 -- | Map '\r' and '\r\n' to '\n'.
447 p_CRLF :: P.Tokens s ~ TL.Text => Parser e s Char
448 p_CRLF = P.label "CRLF" $
449 P.char '\r' *> P.option '\n' (P.char '\n')
450
451 p_Space :: P.Tokens s ~ TL.Text => Parser e s Char
452 p_Space = P.label "Space" $
453 P.satisfy XC.isXmlSpaceCharCR <|> p_CRLF
454 {-# INLINE p_Space #-}
455
456 p_Spaces :: P.Tokens s ~ TL.Text => Parser e s ()
457 p_Spaces = P.label "Spaces" $
458 void $ P.takeWhileP Nothing XC.isXmlSpaceChar
459 {-# INLINE p_Spaces #-}
460
461 p_S :: P.Tokens s ~ TL.Text => Parser Error s XML
462 p_S = P.label "Spaces" $
463 (\ts -> TS.tree0 (NodeText . EscapedText . pure . EscapedPlain . TL.concat <$> ts))
464 <$> p_Sourced (P.some $
465 P.takeWhile1P Nothing XC.isXmlSpaceCharCR <|>
466 TL.singleton <$> p_CRLF)
467
468 p_Spaces1 :: P.Tokens s ~ TL.Text => Parser e s ()
469 p_Spaces1 = P.label "Spaces1" $
470 void $ P.takeWhile1P Nothing XC.isXmlSpaceChar
471 {-# INLINE p_Spaces1 #-}
472
473 -- * Eq
474 p_separator :: P.Tokens s ~ TL.Text => Char -> Parser e s ()
475 p_separator c = P.try (() <$ p_Spaces <* P.char c) <* p_Spaces <?> [c]
476
477 p_Eq :: P.Tokens s ~ TL.Text => Parser e s ()
478 p_Eq = p_separator '=' <?> "Eq"