]> Git — Sourcephile - haskell/symantic-xml.git/blob - Language/Symantic/XML/Read.hs
Add indentation support in XML.Write
[haskell/symantic-xml.git] / Language / Symantic / XML / Read.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# OPTIONS_GHC -fno-warn-orphans #-}
8 module Language.Symantic.XML.Read
9 ( module Language.Symantic.XML.Read.Parser
10 , module Language.Symantic.XML.Read
11 ) where
12
13 import Control.Applicative (Applicative(..), Alternative(..))
14 import Control.Monad (Monad(..), void, unless, forM, join)
15 import Data.Bool
16 import Data.Char (Char)
17 import Data.Default.Class (Default(..))
18 import Data.Either (Either(..))
19 import Data.Eq (Eq(..))
20 import Data.Foldable (Foldable(..))
21 import Data.Function (($), (.), const)
22 import Data.Functor ((<$>), (<$))
23 import Data.Maybe (Maybe(..), maybe)
24 import Data.Ord (Ord(..))
25 import Data.Semigroup (Semigroup(..))
26 import Data.String (String)
27 import Data.TreeSeq.Strict (Tree(..))
28 import Data.Tuple (snd)
29 import Prelude (Num(..), Enum(..), Bounded(..), Integer, toInteger)
30 import System.IO (FilePath, IO)
31 import Text.Megaparsec ((<?>))
32 import qualified Control.Monad.Trans.Reader as R
33 import qualified Data.ByteString.Lazy as BSL
34 import qualified Data.Char as Char
35 import qualified Data.Char.Properties.XMLCharProps as XC
36 import qualified Data.HashMap.Strict as HM
37 import qualified Data.List as List
38 import qualified Data.Sequence as Seq
39 import qualified Data.Text.Lazy as TL
40 import qualified Data.Text.Lazy.Encoding as TL
41 import qualified Data.TreeSeq.Strict as TS
42 import qualified System.IO.Error as IO
43 import qualified Text.Megaparsec as P
44 import qualified Text.Megaparsec.Char as P
45
46 import Language.Symantic.XML.Document
47 import Language.Symantic.XML.Read.Parser
48
49 readXML :: FilePath -> TL.Text -> Either (P.ParseError (P.Token TL.Text) Error) XMLs
50 readXML filePath stateInput =
51 snd $
52 P.runParser'
53 (R.runReaderT p_document def)
54 P.State
55 { P.stateInput
56 , P.statePos = pure $ P.initialPos filePath
57 , P.stateTabWidth = P.pos1 -- NOTE: do not expand tabs.
58 , P.stateTokensProcessed = 0
59 }
60
61 readFile :: FilePath -> IO (Either IO.IOError TL.Text)
62 readFile fp =
63 (Right . TL.decodeUtf8 <$> BSL.readFile fp)
64 `IO.catchIOError` \e ->
65 if IO.isAlreadyInUseError e
66 || IO.isDoesNotExistError e
67 || IO.isPermissionError e
68 then return $ Left e
69 else IO.ioError e
70
71 -- * Document
72 p_document :: P.Tokens s ~ TL.Text => Parser Error s XMLs
73 p_document = do
74 ps <- p_prolog
75 e <- p_Element
76 ms <- P.many p_Misc
77 P.eof
78 return (ps <> pure e <> join (Seq.fromList ms))
79
80 -- ** Prolog
81 p_prolog :: P.Tokens s ~ TL.Text => Parser Error s XMLs
82 p_prolog = do
83 xmlDecl <- P.option Seq.empty $ pure <$> p_XMLDecl
84 ms <- P.many p_Misc
85 return (xmlDecl <> join (Seq.fromList ms))
86
87 -- ** Misc
88 p_Misc :: P.Tokens s ~ TL.Text => Parser Error s XMLs
89 p_Misc =
90 P.try (pure <$> p_Comment)
91 <|> P.try (pure <$> p_PI)
92 <|> pure <$> p_S
93
94 -- ** XMLDecl
95 p_XMLDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
96 p_XMLDecl = P.label "XMLDecl" $ do
97 Sourced src as <- p_Sourced $ P.between (P.string "<?xml") (P.string "?>") $ do
98 vi <- pure <$> p_VersionInfo
99 ed <- P.option Seq.empty $ pure <$> p_EncodingDecl
100 sd <- P.option Seq.empty $ pure <$> p_SDDecl
101 p_Spaces
102 return $ vi <> ed <> sd
103 return $ Tree (Sourced src $ NodePI "xml" "") as
104
105 p_VersionInfo :: P.Tokens s ~ TL.Text => Parser Error s XML
106 p_VersionInfo = P.label "VersionInfo" $ do
107 Sourced c v <- p_Sourced $ do
108 P.try (() <$ p_Spaces <* P.string "version")
109 p_Eq
110 p_quoted $ const $ p_Sourced $
111 (<>)
112 <$> P.string "1."
113 <*> P.takeWhile1P Nothing Char.isDigit
114 return $ Tree (Sourced c $ NodeAttr "version") $ pure $
115 TS.tree0 $ NodeText . pure . TextLexemePlain <$> v
116
117 p_EncodingDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
118 p_EncodingDecl = P.label "EncodingDecl" $ do
119 Sourced c v <- p_Sourced $ do
120 P.try (() <$ p_Spaces <* P.string "encoding")
121 p_Eq
122 p_quoted $ const $ p_Sourced p_EncName
123 return $ Tree (Sourced c $ NodeAttr "encoding") $ pure $
124 TS.tree0 $ NodeText . pure . TextLexemePlain <$> v
125
126 p_EncName :: P.Tokens s ~ TL.Text => Parser Error s TL.Text
127 p_EncName = P.label "EncName" $ do
128 P.notFollowedBy (P.satisfy $ not . isAlpha)
129 P.takeWhile1P Nothing $ \c ->
130 isAlpha c || Char.isDigit c ||
131 c=='.' || c=='_' || c=='-'
132 where isAlpha c = Char.isAsciiLower c || Char.isAsciiUpper c
133
134 -- *** SDDecl
135 p_SDDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
136 p_SDDecl = P.label "SDDecl" $ do
137 p_SourcedBegin $ do
138 Sourced ca () <- P.try (p_Sourced $ () <$ p_Spaces <* P.string "standalone")
139 p_Eq
140 v <- p_quoted $ const $ p_Sourced $ P.string "yes" <|> P.string "no"
141 return $ Tree (Sourced ca $ NodeAttr "standalone") $ pure $
142 TS.tree0 $ NodeText . pure . TextLexemePlain <$> v
143
144 -- ** CharData
145 p_CharData :: P.Tokens s ~ TL.Text => Parser e s XML
146 p_CharData =
147 (\t -> TS.tree0 (NodeText . escapeText <$> t))
148 <$> p_Sourced (p_until1 (\c -> XC.isXmlChar c && c/='<' && c/='&') (']',"]>"))
149
150 -- ** Comment
151 p_Comment :: P.Tokens s ~ TL.Text => Parser Error s XML
152 p_Comment = p_SourcedBegin $ P.string "<!--" *> p_Comment__
153 p_Comment_ :: P.Tokens s ~ TL.Text => Parser Error s XML
154 p_Comment_ = P.string "--" *> p_Comment__
155 p_Comment__:: P.Tokens s ~ TL.Text => Parser Error s XML
156 p_Comment__ = P.label "Comment" $ do
157 c <- p_until XC.isXmlChar ('-', "-")
158 void $ P.string "-->"
159 cell <- p_SourcedEnd
160 return $ TS.tree0 (cell $ NodeComment c)
161
162 -- ** CDATA
163 p_CDSect :: P.Tokens s ~ TL.Text => Parser Error s XML
164 p_CDSect = p_SourcedBegin $ P.string "<![CDATA[" *> p_CDSect__
165 p_CDSect_ :: P.Tokens s ~ TL.Text => Parser Error s XML
166 p_CDSect_ = P.string "[CDATA[" *> p_CDSect__
167 p_CDSect__ :: P.Tokens s ~ TL.Text => Parser Error s XML
168 p_CDSect__ = P.label "CDSect" $ do
169 c <- p_until XC.isXmlChar (']', "]>")
170 void $ P.string "]]>"
171 cell <- p_SourcedEnd
172 return $ TS.tree0 $ cell $ NodeCDATA c
173
174 -- ** PI
175 p_PI :: P.Tokens s ~ TL.Text => Parser Error s XML
176 p_PI = p_SourcedBegin $ P.string "<?" *> p_PI__
177 p_PI_ :: P.Tokens s ~ TL.Text => Parser Error s XML
178 p_PI_ = P.char '?' *> p_PI__
179 p_PI__ :: P.Tokens s ~ TL.Text => Parser Error s XML
180 p_PI__ = P.label "PI" $ do
181 n <- p_PITarget
182 v <- P.option "" $ P.try $ p_Spaces1 *> p_until XC.isXmlChar ('?', ">")
183 void $ P.string "?>"
184 cell <- p_SourcedEnd
185 return $ TS.tree0 $ cell $ NodePI n v
186 p_PITarget :: P.Tokens s ~ TL.Text => Parser Error s PName
187 p_PITarget = do
188 n <- p_PName
189 case n of
190 PName{pNameSpace=Just "", pNameLocal=NCName l}
191 | "xml" == TL.toLower l -> p_error $ Error_PI_reserved n
192 _ -> return n
193
194 -- ** Element
195 p_Element :: P.Tokens s ~ TL.Text => Parser Error s XML
196 p_Element = p_SourcedBegin $ (P.char '<' *> p_Element_)
197 p_Element_ :: P.Tokens s ~ TL.Text => Parser Error s XML
198 p_Element_ = P.label "Element" p_STag
199
200 -- *** STag
201 p_STag :: P.Tokens s ~ TL.Text => Parser Error s XML
202 p_STag = do
203 n <- p_PName
204 as <- P.many $ P.try $ p_Spaces1 *> p_Attribute
205 p_Spaces
206 ro <- R.ask
207 elemNS :: HM.HashMap NCName Namespace <-
208 (HM.fromList . List.concat <$>) $ forM as $ \case
209 Sourced _ (PName{..}, Sourced _ av)
210 | ns <- Namespace $ flatText av
211 , Nothing <- pNameSpace
212 , NCName "xmlns" <- pNameLocal ->
213 -- NOTE: default namespace declaration.
214 case ns of
215 _ | ns == xmlns_xml -- DOC: it MUST NOT be declared as the default namespace
216 || ns == xmlns_xmlns -- DOC: it MUST NOT be declared as the default namespace
217 -> p_error $ Error_Namespace_reserved ns
218 _ -> return [(NCName "" , ns)]
219 | ns <- Namespace $ flatText av
220 , Just (NCName "xmlns") <- pNameSpace ->
221 -- NOTE: namespace prefix declaration.
222 case unNCName pNameLocal of
223 "xml" -- DOC: It MAY, but need not, be declared,
224 -- and MUST NOT be bound to any other namespace name.
225 | ns == xmlns_xml -> return []
226 | otherwise -> p_error $ Error_Namespace_reserved_prefix pNameLocal
227 "xmlns" -- DOC: It MUST NOT be declared
228 -> p_error $ Error_Namespace_reserved_prefix pNameLocal
229 local | "xml" <- TL.toLower $ TL.take 3 local -> return []
230 -- DOC: All other prefixes beginning with the three-letter
231 -- sequence x, m, l, in any case combination, are reserved.
232 -- This means that: processors MUST NOT treat them as fatal errors.
233 _ | ns == xmlns_xml -- DOC: Other prefixes MUST NOT be bound to this namespace name.
234 || ns == xmlns_xmlns -- DOC: Other prefixes MUST NOT be bound to this namespace name.
235 -> p_error $ Error_Namespace_reserved ns
236 _ -> return [(pNameLocal, ns)]
237 | otherwise -> return []
238 let scopeNS = elemNS <> reader_ns_scope ro
239 let defaultNS = HM.lookupDefault (reader_ns_default ro) (NCName "") scopeNS
240 let lookupNamePrefix prefix =
241 maybe (p_error $ Error_Namespace_prefix_unknown prefix) return $
242 HM.lookup prefix scopeNS
243 elemName :: QName <-
244 -- NOTE: expand element's QName.
245 case pNameSpace n of
246 Nothing -> return QName{qNameSpace=defaultNS, qNameLocal=pNameLocal n}
247 -- DOC: If there is a default namespace declaration in scope,
248 -- the expanded name corresponding to an unprefixed element name
249 -- has the URI of the default namespace as its namespace name.
250 Just prefix
251 | NCName "xmlns" <- prefix ->
252 -- DOC: Element names MUST NOT have the prefix xmlns.
253 p_error $ Error_Namespace_reserved_prefix prefix
254 | otherwise -> do
255 ns <- lookupNamePrefix prefix
256 return QName{qNameSpace=ns, qNameLocal=pNameLocal n}
257 elemAttrs :: [Sourced FileSource (QName, Sourced FileSource Text)] <-
258 -- NOTE: expand attributes' PName into QName.
259 forM as $ \s@Sourced{unSourced=(an, av)} -> do
260 ns <- maybe (return "") lookupNamePrefix $ pNameSpace an
261 let qn = QName{qNameSpace=ns, qNameLocal=pNameLocal an}
262 return s{unSourced=(qn, av)}
263 -- NOTE: check for attribute collision.
264 let attrsByQName :: HM.HashMap QName [Sourced FileSource (QName, Sourced FileSource Text)] =
265 HM.fromListWith (<>) $ (<$> elemAttrs) $ \a@(Sourced _c (an, _av)) -> (an, [a])
266 case HM.toList $ HM.filter (\x -> length x > 1) attrsByQName of
267 (an, _):_ -> p_error $ Error_Attribute_collision an
268 _ -> return ()
269 elemAttrsXML :: XMLs <- (Seq.fromList <$>) $
270 forM elemAttrs $ \(Sourced sa (an, av)) -> do
271 return $ TS.Tree (Sourced sa $ NodeAttr an) $
272 pure $ TS.tree0 $ NodeText <$> av
273 content :: XMLs <-
274 elemAttrsXML <$ P.string "/>" <|>
275 R.local
276 (const ro
277 { reader_ns_scope = scopeNS
278 , reader_ns_default = defaultNS
279 })
280 ((elemAttrsXML <>) <$ P.char '>' <*> p_content <* p_ETag elemName)
281 cell <- p_SourcedEnd
282 return $ Tree (cell $ NodeElem elemName) content
283
284 -- *** Attribute
285 p_Attribute :: P.Tokens s ~ TL.Text => Parser Error s (Sourced FileSource (PName, Sourced FileSource Text))
286 p_Attribute = p_Sourced $ (,) <$> p_PName <* p_Eq <*> p_AttValue
287
288 p_AttValue :: P.Tokens s ~ TL.Text => Parser Error s (Sourced FileSource Text)
289 p_AttValue = P.label "AttValue" $ p_quoted p_AttValueText
290
291 p_AttValueText :: P.Tokens s ~ TL.Text => Char -> Parser Error s (Sourced FileSource Text)
292 p_AttValueText q = p_Sourced $
293 P.many
294 ( p_Reference
295 <|> TextLexemePlain <$> P.takeWhile1P Nothing (\c ->
296 XC.isXmlChar c &&
297 c `List.notElem` (q:"<&'\">"))
298 <|> TextLexemeEntityRef entityRef_gt <$ P.char '>'
299 <|> (if q == '\''
300 then TextLexemeEntityRef entityRef_quot <$ P.char '"'
301 else TextLexemeEntityRef entityRef_apos <$ P.char '\'')
302 )
303
304 -- * content
305 p_content :: P.Tokens s ~ TL.Text => Parser Error s XMLs
306 p_content =
307 (Seq.fromList <$>) $ P.many $
308 (p_SourcedBegin $ do
309 P.try $ P.char '<' *> P.notFollowedBy (P.char '/')
310 p_Element_ <|> p_PI_ <|> (P.char '!' *> (p_Comment_ <|> p_CDSect_))
311 ) <|> p_CharData
312
313 -- *** ETag
314 p_ETag :: P.Tokens s ~ TL.Text => QName -> Parser Error s ()
315 p_ETag expected = do
316 got <- P.string "</" *> p_QName <* p_Spaces <* P.char '>'
317 unless (got == expected) $
318 p_error $ Error_Closing_tag_unexpected got expected
319
320 -- * Name
321 p_Name :: P.Tokens s ~ TL.Text => Parser Error s Name
322 p_Name = P.label "Name" $
323 Name
324 <$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNameStartChar)
325 <*> P.takeWhile1P Nothing XC.isXmlNameChar
326
327 -- * PName
328 p_PName :: P.Tokens s ~ TL.Text => Parser e s PName
329 p_PName = P.label "PName" $ do
330 n <- p_NCName
331 s <- P.optional $ P.try $ P.char ':' *> p_NCName
332 return $ case s of
333 Nothing -> PName{pNameSpace=Nothing, pNameLocal=n}
334 Just l -> PName{pNameSpace=Just n , pNameLocal=l}
335
336 -- * QName
337 p_QName :: P.Tokens s ~ TL.Text => Parser Error s QName
338 p_QName = P.label "QName" $ do
339 n <- p_NCName
340 s <- P.optional $ P.try $ P.char ':' *> p_NCName
341 Reader{..} <- R.ask
342 case s of
343 Nothing -> return QName{qNameSpace=reader_ns_default, qNameLocal=n}
344 Just l ->
345 case HM.lookup n reader_ns_scope of
346 Nothing -> p_error $ Error_Namespace_prefix_unknown n
347 Just ns -> return QName{qNameSpace=ns, qNameLocal=l}
348
349 -- ** NCName
350 p_NCName :: P.Tokens s ~ TL.Text => Parser e s NCName
351 p_NCName = P.label "NCName" $
352 NCName
353 <$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNCNameStartChar)
354 <*> P.takeWhile1P Nothing XC.isXmlNCNameChar
355
356 -- * Reference
357 p_Reference :: P.Tokens s ~ TL.Text => Parser Error s TextLexeme
358 p_Reference =
359 TextLexemeCharRef <$> p_CharRef <|>
360 TextLexemeEntityRef <$> p_EntityRef
361
362 -- ** EntityRef
363 p_EntityRef :: P.Tokens s ~ TL.Text => Parser Error s EntityRef
364 p_EntityRef = P.label "EntityRef" $ do
365 ref <- P.char '&' *> p_NCName <* P.char ';'
366 EntityRef ref <$> lookupEntityRef ref
367 where
368 lookupEntityRef (NCName "lt" ) = pure "<"
369 lookupEntityRef (NCName "gt" ) = pure ">"
370 lookupEntityRef (NCName "amp" ) = pure "&"
371 lookupEntityRef (NCName "apos") = pure "'"
372 lookupEntityRef (NCName "quot") = pure "\""
373 lookupEntityRef n = p_error $ Error_EntityRef_unknown n
374
375 -- ** CharRef
376 p_CharRef :: P.Tokens s ~ TL.Text => Parser Error s CharRef
377 p_CharRef = P.label "CharRef" $
378 do
379 ref <- readHexadecimal
380 <$ P.string "&#x"
381 <*> P.some P.hexDigitChar
382 <* P.char ';'
383 check ref
384 <|> do
385 ref <- readDecimal
386 <$ P.string "&#"
387 <*> P.some P.digitChar
388 <* P.char ';'
389 check ref
390 where
391 check i =
392 let c = toEnum (fromInteger i) in
393 if i <= toInteger (fromEnum (maxBound::Char))
394 && XC.isXmlChar c
395 then pure $ CharRef c
396 else p_error $ Error_CharRef_invalid i
397
398 readInt :: Integer -> String -> Integer
399 readInt base digits =
400 sign * List.foldl' acc 0 (List.concatMap digToInt digits1)
401 where
402 acc q r = q*base + r
403 (sign, digits1) =
404 case digits of
405 [] -> (1, digits)
406 c:ds | c == '-' -> (-1, ds)
407 | c == '+' -> ( 1, ds)
408 | otherwise -> ( 1, digits)
409 ord = toInteger . Char.ord
410 digToInt c
411 | Char.isDigit c = [ord c - ord '0']
412 | Char.isAsciiLower c = [ord c - ord 'a' + 10]
413 | Char.isAsciiUpper c = [ord c - ord 'A' + 10]
414 | otherwise = []
415
416 readDecimal :: String -> Integer
417 readDecimal = readInt 10
418
419 readHexadecimal :: String -> Integer
420 readHexadecimal = readInt 16
421
422 -- * Char
423 p_Char :: P.Tokens s ~ TL.Text => Parser e s Char
424 p_Char = P.label "Char" $ P.satisfy XC.isXmlCharCR <|> p_CRLF
425 {-# INLINE p_Char #-}
426
427 -- ** Space
428 -- | Map '\r' and '\r\n' to '\n'.
429 p_CRLF :: P.Tokens s ~ TL.Text => Parser e s Char
430 p_CRLF = P.label "CRLF" $
431 P.char '\r' *> P.option '\n' (P.char '\n')
432
433 p_Space :: P.Tokens s ~ TL.Text => Parser e s Char
434 p_Space = P.label "Space" $
435 P.satisfy XC.isXmlSpaceCharCR <|> p_CRLF
436 {-# INLINE p_Space #-}
437
438 p_Spaces :: P.Tokens s ~ TL.Text => Parser e s ()
439 p_Spaces = P.label "Spaces" $
440 void $ P.takeWhileP Nothing XC.isXmlSpaceChar
441 {-# INLINE p_Spaces #-}
442
443 p_S :: P.Tokens s ~ TL.Text => Parser Error s XML
444 p_S = P.label "Spaces" $
445 (\ts -> TS.tree0 (NodeText . pure . TextLexemePlain . TL.concat <$> ts))
446 <$> p_Sourced (P.some $
447 P.takeWhile1P Nothing XC.isXmlSpaceCharCR <|>
448 TL.singleton <$> p_CRLF)
449
450 p_Spaces1 :: P.Tokens s ~ TL.Text => Parser e s ()
451 p_Spaces1 = P.label "Spaces1" $
452 void $ P.takeWhile1P Nothing XC.isXmlSpaceChar
453 {-# INLINE p_Spaces1 #-}
454
455 -- * Eq
456 p_separator :: P.Tokens s ~ TL.Text => Char -> Parser e s ()
457 p_separator c = P.try (() <$ p_Spaces <* P.char c) <* p_Spaces <?> [c]
458
459 p_Eq :: P.Tokens s ~ TL.Text => Parser e s ()
460 p_Eq = p_separator '=' <?> "Eq"