{-# LANGUAGE ConstraintKinds #-} {-# LANGUAGE DataKinds #-} {-# LANGUAGE DefaultSignatures #-} {-# LANGUAGE DeriveFunctor #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE GADTs #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE NoMonomorphismRestriction #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE PolyKinds #-} {-# LANGUAGE Rank2Types #-} {-# LANGUAGE StandaloneDeriving #-} {-# LANGUAGE TypeFamilies #-} {-# OPTIONS_GHC -fno-warn-tabs #-} -- | This module defines symantics -- for regular or context-free grammars. module Language.Symantic.Parsing.Grammar where import Control.Applicative (Applicative(..), Alternative(..)) import Control.Monad import Data.Char as Char import Data.Foldable hiding (any) import qualified Data.List as List import Data.Semigroup hiding (option) import Data.String (IsString(..)) import Data.Text (Text) import qualified Data.Text as Text import Prelude hiding (any) -- * Class 'Gram_Rule' type Id a = a -> a class Gram_Rule p where rule :: Text -> Id (p a) rule _n = id rule1 :: Text -> Id (p a -> p b) rule1 _n p = p rule2 :: Text -> Id (p a -> p b -> p c) rule2 _n p = p rule3 :: Text -> Id (p a -> p b -> p c -> p d) rule3 _n p = p rule4 :: Text -> Id (p a -> p b -> p c -> p d -> p e) rule4 _n p = p -- * Type 'Term' -- | Terminal grammar. newtype Term p a = Term { unTerm :: p a } deriving (Functor, Gram_Term) -- ** Class 'Gram_Term' -- | Symantics for terminal grammars. class Gram_Term p where any :: p Char eof :: p () char :: Char -> p Char string :: String -> p String unicat :: Unicat -> p Char range :: (Char, Char) -> p Char -- string = foldr (\c -> (<*>) ((:) <$> char c)) (pure "") -- string [] = pure [] -- string (c:cs) = (:) <$> char c <*> string cs -- *** Type 'Unicat' -- | Unicode category. data Unicat = Unicat_Letter | Unicat_Mark | Unicat_Number | Unicat_Punctuation | Unicat_Symbol | Unicat Char.GeneralCategory deriving (Eq, Show) unicode_categories :: Unicat -> [Char.GeneralCategory] unicode_categories c = case c of Unicat_Letter -> [ UppercaseLetter , LowercaseLetter , TitlecaseLetter , ModifierLetter , OtherLetter ] Unicat_Mark -> [ NonSpacingMark , SpacingCombiningMark , EnclosingMark ] Unicat_Number -> [ DecimalNumber , LetterNumber , OtherNumber ] Unicat_Punctuation -> [ ConnectorPunctuation , DashPunctuation , OpenPunctuation , ClosePunctuation , OtherPunctuation ] Unicat_Symbol -> [ MathSymbol , CurrencySymbol , ModifierSymbol , OtherSymbol ] Unicat cat -> [cat] -- * Type 'Reg' -- | Left or right regular grammar. newtype Reg (lr::LR) p a = Reg { unReg :: p a } deriving (IsString, Functor, Gram_Term, Alter) deriving instance Gram_Rule p => Gram_Rule (Reg lr p) deriving instance (Functor p, Alter p, Gram_RegL p) => Gram_RegL (RegL p) deriving instance (Functor p, Alter p, Gram_RegR p) => Gram_RegR (RegR p) -- ** Type 'LR' data LR = L -- ^ Left | R -- ^ Right deriving (Eq, Show) type RegL = Reg 'L type RegR = Reg 'R -- ** Class 'Alter' -- | Like 'Alternative' but without the 'Applicative' super-class, -- because a regular grammar is not closed under 'Applicative'. class Alter p where nil :: p a (<+>) :: p a -> p a -> p a choice :: [p a] -> p a default nil :: Alternative p => p a default (<+>) :: Alternative p => p a -> p a -> p a default choice :: Alternative p => [p a] -> p a nil = empty (<+>) = (<|>) choice = foldr (<+>) empty infixl 3 <+> -- ** Class 'Gram_RegR' -- | Symantics for right regular grammars. class (Functor p, Alter p) => Gram_RegR p where (.*>) :: Term p (a -> b) -> RegR p a -> RegR p b manyR :: Term p a -> RegR p [a] manyR p = (:) <$> p .*> manyR p <+> nil someR :: Term p a -> RegR p [a] someR p = (:) <$> p .*> manyR p infixl 4 .*> -- ** Class 'Gram_RegL' -- | Symantics for left regular grammars. class (Functor p, Alter p) => Gram_RegL p where (<*.) :: RegL p (a -> b) -> Term p a -> RegL p b manyL :: Term p a -> RegL p [a] manyL p' = reverse <$> go p' where go p = flip (:) <$> go p <*. p <+> nil someL :: Term p a -> RegL p [a] someL p = (\cs c -> cs ++ [c]) <$> manyL p <*. p infixl 4 <*. -- * Type 'CF' -- | Context-free grammar. newtype CF p a = CF { unCF :: p a } deriving (IsString, Functor, Gram_Term, Applicative, App, Alternative, Alter, Alt) deriving instance Gram_Rule p => Gram_Rule (CF p) deriving instance Gram_CF p => Gram_CF (CF p) cf_of_reg :: Reg lr p a -> CF p a cf_of_reg (Reg p) = CF p -- ** Class 'Gram_CF' -- | Symantics for context-free grammars. class Gram_CF p where -- | NOTE: CFL ∩ RL is a CFL. -- See ISBN 81-7808-347-7, Theorem 7.27, p.286 (<&) :: CF p (a -> b) -> Reg lr p a -> CF p b (&>) :: Reg lr p (a -> b) -> CF p a -> CF p b -- | NOTE: CFL - RL is a CFL. -- See ISBN 81-7808-347-7, Theorem 7.29, p.289 but :: CF p a -> Reg lr p b -> CF p a infixl 4 <& infixl 4 &> -- ** Class 'App' class Applicative p => App p where between :: p open -> p close -> p a -> p a between open close p = open *> p <* close -- ** Class 'Alt' class Alternative p => Alt p where option :: a -> p a -> p a option x p = p <|> pure x skipMany :: p a -> p () skipMany = void . many --manyTill :: p a -> p end -> p [a] --manyTill p end = go where go = ([] <$ end) <|> ((:) <$> p <*> go) -- * Type 'EBNF' -- | Extended Bachus-Norm Form, following the -- -- notations, augmented with the following notations: -- -- * @("U+", code_point)@: for (aka. Unicode). -- * @(rule, "&", rule)@: for the intersection. -- * @(rule, "-", rule)@: for the difference. -- * @(rule, " ", rule)@: for rule application. data EBNF a = EBNF { unEBNF :: RuleMode -> (Op, LR) -> Text } runEBNF :: EBNF a -> Text runEBNF (EBNF p) = p RuleMode_Body (nop, L) -- | Get textual rendition of given EBNF rule. renderEBNF :: RuleDef a -> Text renderEBNF = runEBNF . unRuleDef ebnf_const :: Text -> EBNF a ebnf_const t = EBNF $ \_rm _op -> t -- ** Type 'RuleDef' newtype RuleDef a = RuleDef { unRuleDef :: EBNF a } deriving (Functor, Gram_Term, Applicative, App , Alternative, Alter, Alt, Gram_RegL, Gram_RegR, Gram_CF) deriving instance Gram_RuleDef RuleDef deriving instance Gram_RuleDef p => Gram_RuleDef (RegR p) deriving instance Gram_RuleDef p => Gram_RuleDef (RegL p) deriving instance Gram_RuleDef p => Gram_RuleDef (CF p) instance Gram_Rule RuleDef where rule n = rule_def (ebnf_const n) rule1 n p a = rule_def (ebnf_const n `ebnf_arg` unRuleDef a) (p a) rule2 n p a b = rule_def (ebnf_const n `ebnf_arg` unRuleDef a `ebnf_arg` unRuleDef b) (p a b) rule3 n p a b c = rule_def (ebnf_const n `ebnf_arg` unRuleDef a `ebnf_arg` unRuleDef b `ebnf_arg` unRuleDef c) (p a b c) rule4 n p a b c d = rule_def (ebnf_const n `ebnf_arg` unRuleDef a `ebnf_arg` unRuleDef b `ebnf_arg` unRuleDef c `ebnf_arg` unRuleDef d) (p a b c d) -- *** Class 'Gram_RuleDef' class Gram_RuleDef p where rule_def :: EBNF () -> p a -> RuleDef a rule_arg :: Text -> p a -- | Helper for 'Gram_Rule' 'EBNF'. ebnf_arg :: EBNF a -> EBNF b -> EBNF () ebnf_arg (EBNF a) (EBNF b) = EBNF $ \bo po -> op_paren po op $ a bo (op, L) <> " " <> b bo (op, R) where op = Op " " 11 AssocL infixl 5 `ebnf_arg` instance Gram_RuleDef EBNF where rule_arg = ebnf_const rule_def call body = RuleDef $ EBNF $ \mo po -> case mo of RuleMode_Ref -> unEBNF call mo po RuleMode_Body -> Text.intercalate " " $ concat $ [ [unEBNF call RuleMode_Ref (nop, L)] , ["="] , [unEBNF body RuleMode_Ref (nop, R)] , [";"] ] instance IsString (EBNF String) where fromString = string instance Show (EBNF a) where show = Text.unpack . runEBNF instance Gram_Rule EBNF where rule n p = EBNF $ \rm po -> case rm of RuleMode_Body -> unEBNF p RuleMode_Ref po RuleMode_Ref -> n rule1 n p a = EBNF $ \rm po -> case rm of RuleMode_Body -> unEBNF (p a) RuleMode_Ref po RuleMode_Ref -> unEBNF (ebnf_const n `ebnf_arg` a) RuleMode_Ref po rule2 n p a b = EBNF $ \rm po -> case rm of RuleMode_Body -> unEBNF (p a b) RuleMode_Ref po RuleMode_Ref -> unEBNF (ebnf_const n `ebnf_arg` a `ebnf_arg` b) RuleMode_Ref po rule3 n p a b c = EBNF $ \rm po -> case rm of RuleMode_Body -> unEBNF (p a b c) RuleMode_Ref po RuleMode_Ref -> unEBNF (ebnf_const n `ebnf_arg` a `ebnf_arg` b `ebnf_arg` c) RuleMode_Ref po rule4 n p a b c d = EBNF $ \rm po -> case rm of RuleMode_Body -> unEBNF (p a b c d) RuleMode_Ref po RuleMode_Ref -> unEBNF (ebnf_const n `ebnf_arg` a `ebnf_arg` b `ebnf_arg` c `ebnf_arg` d) RuleMode_Ref po instance Functor EBNF where fmap _f (EBNF x) = EBNF x instance Applicative EBNF where pure _ = empty EBNF f <*> EBNF x = EBNF $ \bo po -> op_paren po op $ f bo (op, L) <> ", " <> x bo (op, R) where op = Op "," 10 AssocB instance App EBNF instance Alternative EBNF where empty = ebnf_const $ "\"\"" EBNF x <|> EBNF y = EBNF $ \bo po -> op_paren po op $ x bo (op, L) <> " | " <> y bo (op, R) where op = Op "|" 2 AssocB many (EBNF x) = EBNF $ \rm _po -> "{ " <> x rm (op, L) <> " }" where op = nop some (EBNF x) = EBNF $ \rm _po -> "{ " <> x rm (op, L) <> " }-" where op = nop instance Alter EBNF where choice [] = empty choice [p] = p choice l@(_:_) = EBNF $ \bo po -> op_paren po op $ Text.intercalate " | " $ (unEBNF <$> l) <*> pure bo <*> pure (op, L) where op = Op "|" 2 AssocB instance Alt EBNF instance Gram_Term EBNF where any = ebnf_const "_" eof = ebnf_const "EOF" char = ebnf_const . escape where escape c | Char.isPrint c && c /= '"' = Text.concat $ ["\"", Text.singleton c, "\""] escape c = Text.concat ["U+", Text.pack $ show $ ord c] string s = case List.break (\c -> not (Char.isPrint c) || c == '"') s of (ps, "") -> raw ps ("", [c]) -> "" <$ char c (ps, [c]) -> "" <$ raw ps <* char c ("", c:rs) -> "" <$ char c <* string rs (ps, c:rs) -> "" <$ raw ps <* char c <* string rs where raw cs = ebnf_const $ Text.concat $ ["\"", Text.pack cs, "\""] unicat = ebnf_const . Text.pack . show range (l, h) = ebnf_const $ Text.concat [ runEBNF $ char l , "…" , runEBNF $ char h ] instance Gram_RegR EBNF where Term f .*> Reg x = Reg $ f <*> x manyR = Reg . many . unTerm someR = Reg . some . unTerm instance Gram_RegL EBNF where Reg f <*. Term x = Reg $ f <*> x manyL = Reg . many . unTerm someL = Reg . some . unTerm instance Gram_CF EBNF where CF (EBNF f) <& Reg (EBNF p) = CF $ EBNF $ \bo po -> op_paren po op $ f bo (op, L) <> " & " <> p bo (op, R) where op = Op "&" 4 AssocL Reg (EBNF f) &> CF (EBNF p) = CF $ EBNF $ \bo po -> op_paren po op $ f bo (op, L) <> " & " <> p bo (op, R) where op = Op "&" 4 AssocL CF (EBNF f) `but` Reg (EBNF p) = CF $ EBNF $ \bo po -> op_paren po op $ f bo (op, L) <> " - " <> p bo (op, R) where op = Op "-" 6 AssocL -- ** Type 'RuleMode' data RuleMode = RuleMode_Body -- ^ Generate the body of the rule. | RuleMode_Ref -- ^ Generate a ref to the rule. deriving (Eq, Show) -- ** Type 'Op' data Op = Op { op_ident :: Text , op_prece :: Precedence , op_assoc :: Associativity } deriving (Eq, Show) nop :: Op nop = Op "" 0 AssocN -- *** Type 'Precedence' type Precedence = Int -- *** Type 'Associativity' data Associativity = AssocL | AssocR | AssocN | AssocB deriving (Eq, Show) op_paren :: (Semigroup s, IsString s) => (Op, LR) -> Op -> s -> s op_paren (po, lr) op s = if op_prece op <= op_prece po && not associate then fromString "(" <> s <> fromString ")" else s where associate = op_ident po == op_ident op && case (lr, op_assoc po) of (_, AssocB) -> True (L, AssocL) -> True (R, AssocR) -> True _ -> False -- * Class 'Gram_Context' -- | A monadic backdoor, but limited by 'Context'. -- In 'CF', the context must obviously not be used to change the parser, -- but it can be used to change the parsed value, -- for instance by recording source positions into it. class Gram_Context p where type Context p type Context p = () default context :: (Context p ~ ()) => (Context p -> p a) -> p a context :: (Context p -> p a) -> p a context f = f () instance Gram_Context p => Gram_Context (CF p) where type Context (CF p) = Context p context f = CF $ context (unCF . f) instance Gram_Context EBNF instance Gram_Context RuleDef -- * Class 'Gram_Lexer' class ( Alt p , Alter p , Alternative p , App p , Gram_CF p , Gram_Rule p , Gram_Term p ) => Gram_Lexer p where commentable :: p () -> p () -> p () -> p () commentable = rule3 "commentable" $ \p line block -> skipMany $ choice [p, line, block] comment_line :: CF p String -> CF p String comment_line prefix = rule "comment_line" $ prefix *> many (any `but` (void (char '\n') <+> eof)) comment_block :: CF p String -> Reg lr p String -> CF p String comment_block start end = rule "comment_block" $ start *> many (any `but` void end) lexeme :: CF p a -> CF p a lexeme = rule1 "lexeme" $ \p -> p <* commentable (void $ char ' ') (void $ comment_line (string "--")) (void $ comment_block (string "{-") (string "-}")) parens :: CF p a -> CF p a parens = rule1 "parens" $ between (lexeme $ string "(") (lexeme $ string ")") infixrP :: (a -> a -> a) -> CF p a -> CF p sep -> CF p a -> CF p a infixrP f = rule3 "infixrP" $ \next sep root -> (\a -> \case Just b -> f a b; Nothing -> a) <$> next <*> option Nothing (Just <$ sep <*> root) inside :: (a -> b) -> CF p begin -> CF p a -> CF p end -> CF p b -> CF p b inside f = rule4 "inside" $ \begin i end n -> (f <$ begin <*> i <* end) <+> n symbol :: String -> CF p String symbol = lexeme . string deriving instance Gram_Lexer p => Gram_Lexer (CF p) instance Gram_Lexer EBNF instance Gram_Lexer RuleDef gram_lexer :: forall p . (Gram_Lexer p, Gram_RuleDef p) => [CF p ()] gram_lexer = [ () <$ commentable (void $ rule_arg "space") (void $ rule_arg "line") (void $ rule_arg "block") , () <$ comment_line (rule_arg "prefix") , () <$ comment_block (rule_arg "start") (rule_arg "end" :: Reg 'L p String) , () <$ lexeme (rule_arg "p") , () <$ parens (rule_arg "p") , () <$ inside id (rule_arg "begin") (rule_arg "i") (rule_arg "end") (rule_arg "next") , () <$ infixrP const (rule_arg "next") (rule_arg "sep") (rule_arg "root") ]