]> Git — Sourcephile - haskell/symantic-xml.git/blob - Language/Symantic/XML/Read.hs
init
[haskell/symantic-xml.git] / Language / Symantic / XML / Read.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# OPTIONS_GHC -fno-warn-orphans #-}
8 module Language.Symantic.XML.Read
9 ( module Language.Symantic.XML.Read.Parser
10 , module Language.Symantic.XML.Read
11 ) where
12
13 import Control.Applicative (Applicative(..), Alternative(..))
14 import Control.Monad (Monad(..), void, unless, forM, join)
15 import Data.Bool
16 import Data.Char (Char)
17 import Data.Default.Class (Default(..))
18 import Data.Either (Either(..))
19 import Data.Eq (Eq(..))
20 import Data.Foldable (Foldable(..))
21 import Data.Function (($), (.), const)
22 import Data.Functor ((<$>), (<$))
23 import Data.Maybe (Maybe(..), maybe)
24 import Data.Ord (Ord(..))
25 import Data.Semigroup (Semigroup(..))
26 import Data.String (String)
27 import Data.TreeSeq.Strict (Tree(..))
28 import Data.Tuple (snd)
29 import Prelude (Num(..), Enum(..), Bounded(..), Integer, toInteger)
30 import System.IO (FilePath, IO)
31 import Text.Megaparsec ((<?>))
32 import qualified Control.Monad.Trans.Reader as R
33 import qualified Data.ByteString.Lazy as BSL
34 import qualified Data.Char as Char
35 import qualified Data.Char.Properties.XMLCharProps as XC
36 import qualified Data.HashMap.Strict as HM
37 import qualified Data.List as List
38 import qualified Data.Sequence as Seq
39 import qualified Data.Text.Lazy as TL
40 import qualified Data.Text.Lazy.Encoding as TL
41 import qualified Data.TreeSeq.Strict as TS
42 import qualified System.IO.Error as IO
43 import qualified Text.Megaparsec as P
44 import qualified Text.Megaparsec.Char as P
45
46 import Language.Symantic.XML.Document
47 import Language.Symantic.XML.Read.Parser
48
49 readXML :: FilePath -> TL.Text -> Either (P.ParseError (P.Token TL.Text) Error) XMLs
50 readXML filePath stateInput =
51 snd $
52 P.runParser'
53 (R.runReaderT p_document def)
54 P.State
55 { P.stateInput
56 , P.statePos = pure $ P.initialPos filePath
57 , P.stateTabWidth = P.pos1 -- NOTE: do not expand tabs.
58 , P.stateTokensProcessed = 0
59 }
60
61 readFile :: FilePath -> IO (Either IO.IOError TL.Text)
62 readFile fp =
63 (Right . TL.decodeUtf8 <$> BSL.readFile fp)
64 `IO.catchIOError` \e ->
65 if IO.isAlreadyInUseError e
66 || IO.isDoesNotExistError e
67 || IO.isPermissionError e
68 then return $ Left e
69 else IO.ioError e
70
71 -- * Document
72 p_document :: P.Tokens s ~ TL.Text => Parser Error s XMLs
73 p_document = do
74 ps <- p_prolog
75 e <- p_Element
76 ms <- P.many p_Misc
77 P.eof
78 return (ps <> pure e <> join (Seq.fromList ms))
79
80 -- ** Prolog
81 p_prolog :: P.Tokens s ~ TL.Text => Parser Error s XMLs
82 p_prolog = do
83 xmlDecl <- P.option Seq.empty $ pure <$> p_XMLDecl
84 ms <- P.many p_Misc
85 return (xmlDecl <> join (Seq.fromList ms))
86
87 -- ** Misc
88 p_Misc :: P.Tokens s ~ TL.Text => Parser Error s XMLs
89 p_Misc =
90 P.try (pure <$> p_Comment)
91 <|> pure <$> p_PI
92 <|> Seq.empty <$ p_Spaces1
93
94 -- ** XMLDecl
95 p_XMLDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
96 p_XMLDecl = P.label "XMLDecl" $ do
97 Sourced src as <- p_Sourced $ P.between (P.string "<?xml") (P.string "?>") $ do
98 vi <- pure <$> p_VersionInfo
99 ed <- P.option Seq.empty $ pure <$> p_EncodingDecl
100 sd <- P.option Seq.empty $ pure <$> p_SDDecl
101 p_Spaces
102 return $ vi <> ed <> sd
103 return $ Tree (Sourced src $ NodePI "xml" "") as
104
105 p_VersionInfo :: P.Tokens s ~ TL.Text => Parser Error s XML
106 p_VersionInfo = P.label "VersionInfo" $ do
107 Sourced c v <- p_Sourced $ do
108 P.try (() <$ p_Spaces <* P.string "version")
109 p_Eq
110 p_quoted $ const $ p_Sourced $
111 (<>)
112 <$> P.string "1."
113 <*> P.takeWhile1P Nothing Char.isDigit
114 return $ Tree (Sourced c $ NodeAttr "version") $ pure $
115 TS.tree0 $ NodeText <$> v
116
117 p_EncodingDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
118 p_EncodingDecl = P.label "EncodingDecl" $ do
119 Sourced c v <- p_Sourced $ do
120 P.try (() <$ p_Spaces <* P.string "encoding")
121 p_Eq
122 p_quoted $ const $ p_Sourced p_EncName
123 return $ Tree (Sourced c $ NodeAttr "encoding") $ pure $
124 TS.tree0 $ NodeText <$> v
125
126 p_EncName :: P.Tokens s ~ TL.Text => Parser Error s TL.Text
127 p_EncName = P.label "EncName" $ do
128 P.notFollowedBy (P.satisfy $ not . isAlpha)
129 P.takeWhile1P Nothing $ \c ->
130 isAlpha c || Char.isDigit c ||
131 c=='.' || c=='_' || c=='-'
132 where isAlpha c = Char.isAsciiLower c || Char.isAsciiUpper c
133
134 -- *** SDDecl
135 p_SDDecl :: P.Tokens s ~ TL.Text => Parser Error s XML
136 p_SDDecl = P.label "SDDecl" $ do
137 p_SourcedBegin $ do
138 Sourced ca () <- P.try (p_Sourced $ () <$ p_Spaces <* P.string "standalone")
139 p_Eq
140 v <- p_quoted $ const $ p_Sourced $ P.string "yes" <|> P.string "no"
141 return $ Tree (Sourced ca $ NodeAttr "standalone") $ pure $
142 TS.tree0 $ NodeText <$> v
143
144 -- ** CharData
145 p_CharData :: P.Tokens s ~ TL.Text => Parser e s XML
146 p_CharData =
147 (\t -> TS.tree0 (NodeText <$> t))
148 <$> p_Sourced (p_until1 (\c -> XC.isXmlChar c && c/='<' && c/='&') (']',"]>"))
149
150 -- ** Comment
151 p_Comment :: P.Tokens s ~ TL.Text => Parser Error s XML
152 p_Comment = p_SourcedBegin $ P.string "<!--" *> p_Comment__
153 p_Comment_ :: P.Tokens s ~ TL.Text => Parser Error s XML
154 p_Comment_ = P.string "--" *> p_Comment__
155 p_Comment__:: P.Tokens s ~ TL.Text => Parser Error s XML
156 p_Comment__ = P.label "Comment" $ do
157 c <- p_until XC.isXmlChar ('-', "-")
158 void $ P.string "-->"
159 cell <- p_SourcedEnd
160 return $ TS.tree0 (cell $ NodeComment c)
161
162 -- ** CDATA
163 p_CDSect :: P.Tokens s ~ TL.Text => Parser Error s XML
164 p_CDSect = p_SourcedBegin $ P.string "<![CDATA[" *> p_CDSect__
165 p_CDSect_ :: P.Tokens s ~ TL.Text => Parser Error s XML
166 p_CDSect_ = P.string "[CDATA[" *> p_CDSect__
167 p_CDSect__ :: P.Tokens s ~ TL.Text => Parser Error s XML
168 p_CDSect__ = P.label "CDSect" $ do
169 c <- p_until XC.isXmlChar (']', "]>")
170 void $ P.string "]]>"
171 cell <- p_SourcedEnd
172 return $ TS.tree0 $ cell $ NodeCDATA c
173
174 -- ** PI
175 p_PI :: P.Tokens s ~ TL.Text => Parser Error s XML
176 p_PI = p_SourcedBegin $ P.string "<?" *> p_PI__
177 p_PI_ :: P.Tokens s ~ TL.Text => Parser Error s XML
178 p_PI_ = P.char '?' *> p_PI__
179 p_PI__ :: P.Tokens s ~ TL.Text => Parser Error s XML
180 p_PI__ = P.label "PI" $ do
181 n <- p_PITarget
182 v <- P.option "" $ P.try $ p_Spaces1 *> p_until XC.isXmlChar ('?', ">")
183 void $ P.string "?>"
184 cell <- p_SourcedEnd
185 return $ TS.tree0 $ cell $ NodePI n v
186 p_PITarget :: P.Tokens s ~ TL.Text => Parser Error s PName
187 p_PITarget = do
188 n <- p_PName
189 case n of
190 PName{pNameSpace=Just "", pNameLocal=NCName l}
191 | "xml" == TL.toLower l -> p_error $ Error_PI_reserved n
192 _ -> return n
193
194 -- ** Element
195 p_Element :: P.Tokens s ~ TL.Text => Parser Error s XML
196 p_Element = p_SourcedBegin $ (P.char '<' *> p_Element_)
197 p_Element_ :: P.Tokens s ~ TL.Text => Parser Error s XML
198 p_Element_ = P.label "Element" p_STag
199
200 -- *** STag
201 p_STag :: P.Tokens s ~ TL.Text => Parser Error s XML
202 p_STag = do
203 n <- p_PName
204 as <- P.many $ P.try $ p_Spaces1 *> p_Attribute
205 p_Spaces
206 ro <- R.ask
207 elemNS :: HM.HashMap NCName Namespace <-
208 (HM.fromList . List.concat <$>) $ forM as $ \case
209 Sourced _ (PName{..}, Sourced _ av)
210 | ns <- Namespace av
211 , Nothing <- pNameSpace
212 , NCName "xmlns" <- pNameLocal ->
213 -- NOTE: default namespace declaration.
214 case ns of
215 _ | ns == xmlns_xml -- DOC: it MUST NOT be declared as the default namespace
216 || ns == xmlns_xmlns -- DOC: it MUST NOT be declared as the default namespace
217 -> p_error $ Error_Namespace_reserved ns
218 _ -> return [(NCName "" , ns)]
219 | ns <- Namespace av
220 , Just (NCName "xmlns") <- pNameSpace ->
221 -- NOTE: namespace prefix declaration.
222 case unNCName pNameLocal of
223 "xml" -- DOC: It MAY, but need not, be declared,
224 -- and MUST NOT be bound to any other namespace name.
225 | ns == xmlns_xml -> return []
226 | otherwise -> p_error $ Error_Namespace_reserved_prefix pNameLocal
227 "xmlns" -- DOC: It MUST NOT be declared
228 -> p_error $ Error_Namespace_reserved_prefix pNameLocal
229 local | "xml" <- TL.toLower $ TL.take 3 local -> return []
230 -- DOC: All other prefixes beginning with the three-letter
231 -- sequence x, m, l, in any case combination, are reserved.
232 -- This means that: processors MUST NOT treat them as fatal errors.
233 _ | ns == xmlns_xml -- DOC: Other prefixes MUST NOT be bound to this namespace name.
234 || ns == xmlns_xmlns -- DOC: Other prefixes MUST NOT be bound to this namespace name.
235 -> p_error $ Error_Namespace_reserved ns
236 _ -> return [(pNameLocal, ns)]
237 | otherwise -> return []
238 let scopeNS = elemNS <> reader_ns_scope ro
239 let defaultNS = HM.lookupDefault (reader_ns_default ro) (NCName "") scopeNS
240 let lookupNamePrefix prefix =
241 maybe (p_error $ Error_Namespace_prefix_unknown prefix) return $
242 HM.lookup prefix scopeNS
243 elemName :: QName <-
244 -- NOTE: expand element's QName.
245 case pNameSpace n of
246 Nothing -> return QName{qNameSpace=defaultNS, qNameLocal=pNameLocal n}
247 -- DOC: If there is a default namespace declaration in scope,
248 -- the expanded name corresponding to an unprefixed element name
249 -- has the URI of the default namespace as its namespace name.
250 Just prefix
251 | NCName "xmlns" <- prefix ->
252 -- DOC: Element names MUST NOT have the prefix xmlns.
253 p_error $ Error_Namespace_reserved_prefix prefix
254 | otherwise -> do
255 ns <- lookupNamePrefix prefix
256 return QName{qNameSpace=ns, qNameLocal=pNameLocal n}
257 elemAttrs :: [Sourced FileSource (QName, Sourced FileSource Text)] <-
258 -- NOTE: expand attributes' PName into QName.
259 forM as $ \s@Sourced{unSourced=(an, av)} -> do
260 ns <- maybe (return "") lookupNamePrefix $ pNameSpace an
261 let qn = QName{qNameSpace=ns, qNameLocal=pNameLocal an}
262 return s{unSourced=(qn, av)}
263 -- Check for attribute collision.
264 let attrsByQName :: HM.HashMap QName [Sourced FileSource (QName, Sourced FileSource Text)] =
265 HM.fromListWith (<>) $ (<$> elemAttrs) $ \a@(Sourced _c (an, _av)) -> (an, [a])
266 case HM.toList $ HM.filter (\x -> length x > 1) attrsByQName of
267 (an, _):_ -> p_error $ Error_Attribute_collision an
268 _ -> return ()
269 elemAttrsXML :: XMLs <- (Seq.fromList <$>) $
270 forM elemAttrs $ \(Sourced sa (an, av)) -> do
271 return $ TS.Tree (Sourced sa $ NodeAttr an) $
272 pure $ TS.tree0 $ NodeText <$> av
273 content :: XMLs <-
274 elemAttrsXML <$ P.string "/>" <|>
275 R.local
276 (const ro
277 { reader_ns_scope = scopeNS
278 , reader_ns_default = defaultNS
279 })
280 ((elemAttrsXML <>) <$ P.char '>' <*> p_content <* p_ETag elemName)
281 cell <- p_SourcedEnd
282 return $ Tree (cell $ NodeElem elemName) content
283
284 -- *** Attribute
285 p_Attribute :: P.Tokens s ~ TL.Text => Parser Error s (Sourced FileSource (PName, Sourced FileSource TL.Text))
286 p_Attribute = p_Sourced $ (,) <$> p_PName <* p_Eq <*> p_AttValue
287
288 p_AttValue :: P.Tokens s ~ TL.Text => Parser Error s (Sourced FileSource TL.Text)
289 p_AttValue = P.label "AttValue" $ p_quoted p_AttValueText
290
291 p_AttValueText :: P.Tokens s ~ TL.Text => Char -> Parser Error s (Sourced FileSource TL.Text)
292 p_AttValueText q = p_Sourced $
293 TL.concat <$> P.many
294 ( p_Reference
295 <|> P.takeWhile1P Nothing (\c ->
296 XC.isXmlChar c &&
297 c `List.notElem` (q:"<&"))
298 )
299
300 -- * content
301 p_content :: P.Tokens s ~ TL.Text => Parser Error s XMLs
302 p_content =
303 (Seq.fromList <$>) $ P.many $
304 (p_SourcedBegin $ do
305 P.try $ P.char '<' *> P.notFollowedBy (P.char '/')
306 p_Element_ <|> p_PI_ <|> (P.char '!' *> (p_Comment_ <|> p_CDSect_))
307 ) <|> p_CharData
308
309 -- *** ETag
310 p_ETag :: P.Tokens s ~ TL.Text => QName -> Parser Error s ()
311 p_ETag expected = do
312 got <- P.string "</" *> p_QName <* p_Spaces <* P.char '>'
313 unless (got == expected) $
314 p_error $ Error_Closing_tag_unexpected got expected
315
316 -- * Name
317 p_Name :: P.Tokens s ~ TL.Text => Parser Error s Name
318 p_Name = P.label "Name" $
319 Name
320 <$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNameStartChar)
321 <*> P.takeWhile1P Nothing XC.isXmlNameChar
322
323 -- * PName
324 p_PName :: P.Tokens s ~ TL.Text => Parser e s PName
325 p_PName = P.label "PName" $ do
326 n <- p_NCName
327 s <- P.optional $ P.try $ P.char ':' *> p_NCName
328 return $ case s of
329 Nothing -> PName{pNameSpace=Nothing, pNameLocal=n}
330 Just l -> PName{pNameSpace=Just n , pNameLocal=l}
331
332 -- * QName
333 p_QName :: P.Tokens s ~ TL.Text => Parser Error s QName
334 p_QName = P.label "QName" $ do
335 n <- p_NCName
336 s <- P.optional $ P.try $ P.char ':' *> p_NCName
337 Reader{..} <- R.ask
338 case s of
339 Nothing -> return QName{qNameSpace=reader_ns_default, qNameLocal=n}
340 Just l ->
341 case HM.lookup n reader_ns_scope of
342 Nothing -> p_error $ Error_Namespace_prefix_unknown n
343 Just ns -> return QName{qNameSpace=ns, qNameLocal=l}
344
345 -- ** NCName
346 p_NCName :: P.Tokens s ~ TL.Text => Parser e s NCName
347 p_NCName = P.label "NCName" $
348 NCName
349 <$ P.notFollowedBy (P.satisfy $ not . XC.isXmlNCNameStartChar)
350 <*> P.takeWhile1P Nothing XC.isXmlNCNameChar
351
352 -- * Reference
353 p_Reference :: P.Tokens s ~ TL.Text => Parser Error s TL.Text
354 p_Reference = TL.singleton <$> p_CharRef <|> p_EntityRef
355
356 -- ** EntityRef
357 p_EntityRef :: P.Tokens s ~ TL.Text => Parser Error s TL.Text
358 p_EntityRef = P.label "EntityRef" $ do
359 ref <- P.char '&' *> p_Name <* P.char ';'
360 lookupEntityRef ref
361 where
362 lookupEntityRef (Name "lt" ) = pure "<"
363 lookupEntityRef (Name "gt" ) = pure ">"
364 lookupEntityRef (Name "amp" ) = pure "&"
365 lookupEntityRef (Name "apos") = pure "'"
366 lookupEntityRef (Name "quot") = pure "\""
367 lookupEntityRef n = p_error $ Error_EntityRef_unknown n
368
369 -- ** CharRef
370 p_CharRef :: P.Tokens s ~ TL.Text => Parser Error s Char
371 p_CharRef = P.label "CharRef" $
372 do
373 ref <- readHexadecimal
374 <$ P.string "&#x"
375 <*> P.some P.hexDigitChar
376 <* P.char ';'
377 check ref
378 <|> do
379 ref <- readDecimal
380 <$ P.string "&#"
381 <*> P.some P.digitChar
382 <* P.char ';'
383 check ref
384 where
385 check i =
386 let c = toEnum (fromInteger i) in
387 if i <= toInteger (fromEnum (maxBound::Char))
388 && XC.isXmlChar c
389 then pure c
390 else p_error $ Error_CharRef_invalid i
391
392 readInt :: Integer -> String -> Integer
393 readInt base digits =
394 sign * List.foldl' acc 0 (List.concatMap digToInt digits1)
395 where
396 acc q r = q*base + r
397 (sign, digits1) =
398 case digits of
399 [] -> (1, digits)
400 c:ds | c == '-' -> (-1, ds)
401 | c == '+' -> ( 1, ds)
402 | otherwise -> ( 1, digits)
403 ord = toInteger . Char.ord
404 digToInt c
405 | Char.isDigit c = [ord c - ord '0']
406 | Char.isAsciiLower c = [ord c - ord 'a' + 10]
407 | Char.isAsciiUpper c = [ord c - ord 'A' + 10]
408 | otherwise = []
409
410 readDecimal :: String -> Integer
411 readDecimal = readInt 10
412
413 readHexadecimal :: String -> Integer
414 readHexadecimal = readInt 16
415
416 -- * Char
417 p_Char :: P.Tokens s ~ TL.Text => Parser e s Char
418 p_Char = P.label "Char" $ P.satisfy XC.isXmlCharCR <|> p_CRLF
419 {-# INLINE p_Char #-}
420
421 -- ** Space
422 -- | Map '\r' and '\r\n' to '\n'.
423 p_CRLF :: P.Tokens s ~ TL.Text => Parser e s Char
424 p_CRLF = P.label "CRLF" $
425 P.char '\r' *> P.option '\n' (P.char '\n')
426
427 p_Space :: P.Tokens s ~ TL.Text => Parser e s Char
428 p_Space = P.label "Space" $
429 P.satisfy XC.isXmlSpaceCharCR <|> p_CRLF
430 {-# INLINE p_Space #-}
431
432 p_Spaces :: P.Tokens s ~ TL.Text => Parser e s ()
433 p_Spaces = P.label "Spaces" $
434 void $ P.takeWhileP Nothing XC.isXmlSpaceChar
435 {-# INLINE p_Spaces #-}
436
437 p_S :: P.Tokens s ~ TL.Text => Parser Error s XML
438 p_S = P.label "Spaces" $
439 (\ts -> TS.tree0 (NodeText . TL.concat <$> ts))
440 <$> p_Sourced (P.some $
441 P.takeWhile1P Nothing XC.isXmlSpaceCharCR <|>
442 TL.singleton <$> p_CRLF)
443
444 p_Spaces1 :: P.Tokens s ~ TL.Text => Parser e s ()
445 p_Spaces1 = P.label "Spaces1" $
446 void $ P.takeWhile1P Nothing XC.isXmlSpaceChar
447 {-# INLINE p_Spaces1 #-}
448
449 -- * Eq
450 p_separator :: P.Tokens s ~ TL.Text => Char -> Parser e s ()
451 p_separator c = P.try (() <$ p_Spaces <* P.char c) <* p_Spaces <?> [c]
452
453 p_Eq :: P.Tokens s ~ TL.Text => Parser e s ()
454 p_Eq = p_separator '=' <?> "Eq"