]> Git — Sourcephile - haskell/symantic.git/blob - Language/Symantic/Parsing/Grammar.hs
Add Parsing.Grammar.
[haskell/symantic.git] / Language / Symantic / Parsing / Grammar.hs
1 {-# LANGUAGE ConstraintKinds #-}
2 {-# LANGUAGE DataKinds #-}
3 {-# LANGUAGE DefaultSignatures #-}
4 {-# LANGUAGE DeriveFunctor #-}
5 {-# LANGUAGE FlexibleContexts #-}
6 {-# LANGUAGE FlexibleInstances #-}
7 {-# LANGUAGE GADTs #-}
8 {-# LANGUAGE GeneralizedNewtypeDeriving #-}
9 {-# LANGUAGE LambdaCase #-}
10 {-# LANGUAGE NamedFieldPuns #-}
11 {-# LANGUAGE NoMonomorphismRestriction #-}
12 {-# LANGUAGE OverloadedStrings #-}
13 {-# LANGUAGE PolyKinds #-}
14 {-# LANGUAGE Rank2Types #-}
15 {-# LANGUAGE StandaloneDeriving #-}
16 {-# LANGUAGE TypeFamilies #-}
17 {-# OPTIONS_GHC -fno-warn-tabs #-}
18 -- | This module defines symantics
19 -- for regular or context-free grammars.
20 module Language.Symantic.Parsing.Grammar where
21
22 import Control.Applicative (Applicative(..), Alternative(..))
23 import Control.Monad
24 import Data.Char as Char
25 import Data.Foldable hiding (any)
26 import qualified Data.List as List
27 import Data.Semigroup hiding (option)
28 import Data.String (IsString(..))
29 import Data.Text (Text)
30 import qualified Data.Text as Text
31 import Prelude hiding (any)
32
33 -- * Class 'Gram_Rule'
34 type Id a = a -> a
35 class Gram_Rule p where
36 rule :: Text -> Id (p a)
37 rule _n = id
38 rule1 :: Text -> Id (p a -> p b)
39 rule1 _n p = p
40 rule2 :: Text -> Id (p a -> p b -> p c)
41 rule2 _n p = p
42 rule3 :: Text -> Id (p a -> p b -> p c -> p d)
43 rule3 _n p = p
44 rule4 :: Text -> Id (p a -> p b -> p c -> p d -> p e)
45 rule4 _n p = p
46
47 -- * Type 'Term'
48 -- | Terminal grammar.
49 newtype Term p a
50 = Term { unTerm :: p a }
51 deriving (Functor, Gram_Term)
52
53 -- ** Class 'Gram_Term'
54 -- | Symantics for terminal grammars.
55 class Gram_Term p where
56 any :: p Char
57 eof :: p ()
58 char :: Char -> p Char
59 string :: String -> p String
60 unicat :: Unicat -> p Char
61 range :: (Char, Char) -> p Char
62 -- string = foldr (\c -> (<*>) ((:) <$> char c)) (pure "")
63 -- string [] = pure []
64 -- string (c:cs) = (:) <$> char c <*> string cs
65
66 -- *** Type 'Unicat'
67 -- | Unicode category.
68 data Unicat
69 = Unicat_Letter
70 | Unicat_Mark
71 | Unicat_Number
72 | Unicat_Punctuation
73 | Unicat_Symbol
74 | Unicat Char.GeneralCategory
75 deriving (Eq, Show)
76
77 unicode_categories :: Unicat -> [Char.GeneralCategory]
78 unicode_categories c =
79 case c of
80 Unicat_Letter ->
81 [ UppercaseLetter
82 , LowercaseLetter
83 , TitlecaseLetter
84 , ModifierLetter
85 , OtherLetter
86 ]
87 Unicat_Mark ->
88 [ NonSpacingMark
89 , SpacingCombiningMark
90 , EnclosingMark
91 ]
92 Unicat_Number ->
93 [ DecimalNumber
94 , LetterNumber
95 , OtherNumber
96 ]
97 Unicat_Punctuation ->
98 [ ConnectorPunctuation
99 , DashPunctuation
100 , OpenPunctuation
101 , ClosePunctuation
102 , OtherPunctuation
103 ]
104 Unicat_Symbol ->
105 [ MathSymbol
106 , CurrencySymbol
107 , ModifierSymbol
108 , OtherSymbol
109 ]
110 Unicat cat -> [cat]
111
112 -- * Type 'Reg'
113 -- | Left or right regular grammar.
114 newtype Reg (lr::LR) p a = Reg { unReg :: p a }
115 deriving (IsString, Functor, Gram_Term, Alter)
116 deriving instance Gram_Rule p => Gram_Rule (Reg lr p)
117 deriving instance (Functor p, Alter p, Gram_RegL p) => Gram_RegL (RegL p)
118 deriving instance (Functor p, Alter p, Gram_RegR p) => Gram_RegR (RegR p)
119
120 -- ** Type 'LR'
121 data LR
122 = L -- ^ Left
123 | R -- ^ Right
124 deriving (Eq, Show)
125 type RegL = Reg 'L
126 type RegR = Reg 'R
127
128 -- ** Class 'Alter'
129 -- | Like 'Alternative' but without the 'Applicative' super-class,
130 -- because a regular grammar is not closed under 'Applicative'.
131 class Alter p where
132 nil :: p a
133 (<+>) :: p a -> p a -> p a
134 choice :: [p a] -> p a
135 default nil :: Alternative p => p a
136 default (<+>) :: Alternative p => p a -> p a -> p a
137 default choice :: Alternative p => [p a] -> p a
138 nil = empty
139 (<+>) = (<|>)
140 choice = foldr (<+>) empty
141 infixl 3 <+>
142
143 -- ** Class 'Gram_RegR'
144 -- | Symantics for right regular grammars.
145 class (Functor p, Alter p) => Gram_RegR p where
146 (.*>) :: Term p (a -> b) -> RegR p a -> RegR p b
147 manyR :: Term p a -> RegR p [a]
148 manyR p = (:) <$> p .*> manyR p <+> nil
149 someR :: Term p a -> RegR p [a]
150 someR p = (:) <$> p .*> manyR p
151 infixl 4 .*>
152
153 -- ** Class 'Gram_RegL'
154 -- | Symantics for left regular grammars.
155 class (Functor p, Alter p) => Gram_RegL p where
156 (<*.) :: RegL p (a -> b) -> Term p a -> RegL p b
157 manyL :: Term p a -> RegL p [a]
158 manyL p' = reverse <$> go p'
159 where go p = flip (:) <$> go p <*. p <+> nil
160 someL :: Term p a -> RegL p [a]
161 someL p = (\cs c -> cs ++ [c]) <$> manyL p <*. p
162 infixl 4 <*.
163
164 -- * Type 'CF'
165 -- | Context-free grammar.
166 newtype CF p a = CF { unCF :: p a }
167 deriving (IsString, Functor, Gram_Term, Applicative, App, Alternative, Alter, Alt)
168 deriving instance Gram_Rule p => Gram_Rule (CF p)
169 deriving instance Gram_CF p => Gram_CF (CF p)
170
171 cf_of_reg :: Reg lr p a -> CF p a
172 cf_of_reg (Reg p) = CF p
173
174 -- ** Class 'Gram_CF'
175 -- | Symantics for context-free grammars.
176 class Gram_CF p where
177 -- | NOTE: CFL ∩ RL is a CFL.
178 -- See ISBN 81-7808-347-7, Theorem 7.27, p.286
179 (<&) :: CF p (a -> b) -> Reg lr p a -> CF p b
180 (&>) :: Reg lr p (a -> b) -> CF p a -> CF p b
181 -- | NOTE: CFL - RL is a CFL.
182 -- See ISBN 81-7808-347-7, Theorem 7.29, p.289
183 but :: CF p a -> Reg lr p b -> CF p a
184 infixl 4 <&
185 infixl 4 &>
186
187 -- ** Class 'App'
188 class Applicative p => App p where
189 between :: p open -> p close -> p a -> p a
190 between open close p = open *> p <* close
191
192 -- ** Class 'Alt'
193 class Alternative p => Alt p where
194 option :: a -> p a -> p a
195 option x p = p <|> pure x
196 skipMany :: p a -> p ()
197 skipMany = void . many
198 --manyTill :: p a -> p end -> p [a]
199 --manyTill p end = go where go = ([] <$ end) <|> ((:) <$> p <*> go)
200
201 -- * Type 'EBNF'
202 -- | Extended Bachus-Norm Form, following the
203 -- <http://standards.iso.org/ittf/PubliclyAvailableStandards/s026153_ISO_IEC_14977_1996(E).zip ISO-IEC-14977>
204 -- notations, augmented with the following notations:
205 --
206 -- * @("U+", code_point)@: for <http://unicode.org/versions/Unicode8.0.0/ ISO-IEC-10646> (aka. Unicode).
207 -- * @(rule, "&", rule)@: for the intersection.
208 -- * @(rule, "-", rule)@: for the difference.
209 -- * @(rule, " ", rule)@: for rule application.
210 data EBNF a = EBNF { unEBNF :: RuleMode -> (Op, LR) -> Text }
211
212 runEBNF :: EBNF a -> Text
213 runEBNF (EBNF p) = p RuleMode_Body (nop, L)
214
215 -- | Get textual rendition of given EBNF rule.
216 renderEBNF :: RuleDef a -> Text
217 renderEBNF = runEBNF . unRuleDef
218
219 ebnf_const :: Text -> EBNF a
220 ebnf_const t = EBNF $ \_rm _op -> t
221
222 -- ** Type 'RuleDef'
223 newtype RuleDef a = RuleDef { unRuleDef :: EBNF a }
224 deriving (Functor, Gram_Term, Applicative, App
225 , Alternative, Alter, Alt, Gram_RegL, Gram_RegR, Gram_CF)
226 deriving instance Gram_RuleDef RuleDef
227 deriving instance Gram_RuleDef p => Gram_RuleDef (RegR p)
228 deriving instance Gram_RuleDef p => Gram_RuleDef (RegL p)
229 deriving instance Gram_RuleDef p => Gram_RuleDef (CF p)
230
231 instance Gram_Rule RuleDef where
232 rule n = rule_def (ebnf_const n)
233 rule1 n p a = rule_def (ebnf_const n `ebnf_arg` unRuleDef a) (p a)
234 rule2 n p a b = rule_def (ebnf_const n `ebnf_arg` unRuleDef a `ebnf_arg` unRuleDef b) (p a b)
235 rule3 n p a b c = rule_def (ebnf_const n `ebnf_arg` unRuleDef a `ebnf_arg` unRuleDef b `ebnf_arg` unRuleDef c) (p a b c)
236 rule4 n p a b c d = rule_def (ebnf_const n `ebnf_arg` unRuleDef a `ebnf_arg` unRuleDef b `ebnf_arg` unRuleDef c `ebnf_arg` unRuleDef d) (p a b c d)
237
238 -- *** Class 'Gram_RuleDef'
239 class Gram_RuleDef p where
240 rule_def :: EBNF () -> p a -> RuleDef a
241 rule_arg :: Text -> p a
242
243 -- | Helper for 'Gram_Rule' 'EBNF'.
244 ebnf_arg :: EBNF a -> EBNF b -> EBNF ()
245 ebnf_arg (EBNF a) (EBNF b) = EBNF $ \bo po -> op_paren po op $
246 a bo (op, L) <> " " <> b bo (op, R)
247 where op = Op " " 11 AssocL
248 infixl 5 `ebnf_arg`
249
250 instance Gram_RuleDef EBNF where
251 rule_arg = ebnf_const
252 rule_def call body =
253 RuleDef $ EBNF $ \mo po ->
254 case mo of
255 RuleMode_Ref -> unEBNF call mo po
256 RuleMode_Body ->
257 Text.intercalate " " $ concat $
258 [ [unEBNF call RuleMode_Ref (nop, L)]
259 , ["="]
260 , [unEBNF body RuleMode_Ref (nop, R)]
261 , [";"]
262 ]
263 instance IsString (EBNF String) where
264 fromString = string
265 instance Show (EBNF a) where
266 show = Text.unpack . runEBNF
267 instance Gram_Rule EBNF where
268 rule n p = EBNF $ \rm po ->
269 case rm of
270 RuleMode_Body -> unEBNF p RuleMode_Ref po
271 RuleMode_Ref -> n
272 rule1 n p a = EBNF $ \rm po ->
273 case rm of
274 RuleMode_Body -> unEBNF (p a) RuleMode_Ref po
275 RuleMode_Ref -> unEBNF (ebnf_const n `ebnf_arg` a) RuleMode_Ref po
276 rule2 n p a b = EBNF $ \rm po ->
277 case rm of
278 RuleMode_Body -> unEBNF (p a b) RuleMode_Ref po
279 RuleMode_Ref -> unEBNF (ebnf_const n `ebnf_arg` a `ebnf_arg` b) RuleMode_Ref po
280 rule3 n p a b c = EBNF $ \rm po ->
281 case rm of
282 RuleMode_Body -> unEBNF (p a b c) RuleMode_Ref po
283 RuleMode_Ref -> unEBNF (ebnf_const n `ebnf_arg` a `ebnf_arg` b `ebnf_arg` c) RuleMode_Ref po
284 rule4 n p a b c d = EBNF $ \rm po ->
285 case rm of
286 RuleMode_Body -> unEBNF (p a b c d) RuleMode_Ref po
287 RuleMode_Ref -> unEBNF (ebnf_const n `ebnf_arg` a `ebnf_arg` b `ebnf_arg` c `ebnf_arg` d) RuleMode_Ref po
288 instance Functor EBNF where
289 fmap _f (EBNF x) = EBNF x
290 instance Applicative EBNF where
291 pure _ = empty
292 EBNF f <*> EBNF x = EBNF $ \bo po -> op_paren po op $
293 f bo (op, L) <> ", " <> x bo (op, R)
294 where op = Op "," 10 AssocB
295 instance App EBNF
296 instance Alternative EBNF where
297 empty = ebnf_const $ "\"\""
298 EBNF x <|> EBNF y = EBNF $ \bo po -> op_paren po op $
299 x bo (op, L) <> " | " <> y bo (op, R)
300 where op = Op "|" 2 AssocB
301 many (EBNF x) = EBNF $ \rm _po -> "{ " <> x rm (op, L) <> " }" where op = nop
302 some (EBNF x) = EBNF $ \rm _po -> "{ " <> x rm (op, L) <> " }-" where op = nop
303 instance Alter EBNF where
304 choice [] = empty
305 choice [p] = p
306 choice l@(_:_) = EBNF $ \bo po -> op_paren po op $
307 Text.intercalate " | " $
308 (unEBNF <$> l) <*> pure bo <*> pure (op, L)
309 where op = Op "|" 2 AssocB
310 instance Alt EBNF
311 instance Gram_Term EBNF where
312 any = ebnf_const "_"
313 eof = ebnf_const "EOF"
314 char = ebnf_const . escape
315 where
316 escape c | Char.isPrint c && c /= '"' = Text.concat $ ["\"", Text.singleton c, "\""]
317 escape c = Text.concat ["U+", Text.pack $ show $ ord c]
318 string s =
319 case List.break (\c -> not (Char.isPrint c) || c == '"') s of
320 (ps, "") -> raw ps
321 ("", [c]) -> "" <$ char c
322 (ps, [c]) -> "" <$ raw ps <* char c
323 ("", c:rs) -> "" <$ char c <* string rs
324 (ps, c:rs) -> "" <$ raw ps <* char c <* string rs
325 where
326 raw cs = ebnf_const $ Text.concat $ ["\"", Text.pack cs, "\""]
327 unicat = ebnf_const . Text.pack . show
328 range (l, h) = ebnf_const $ Text.concat
329 [ runEBNF $ char l
330 , "…"
331 , runEBNF $ char h
332 ]
333 instance Gram_RegR EBNF where
334 Term f .*> Reg x = Reg $ f <*> x
335 manyR = Reg . many . unTerm
336 someR = Reg . some . unTerm
337 instance Gram_RegL EBNF where
338 Reg f <*. Term x = Reg $ f <*> x
339 manyL = Reg . many . unTerm
340 someL = Reg . some . unTerm
341 instance Gram_CF EBNF where
342 CF (EBNF f) <& Reg (EBNF p) = CF $ EBNF $ \bo po -> op_paren po op $
343 f bo (op, L) <> " & " <> p bo (op, R)
344 where op = Op "&" 4 AssocL
345 Reg (EBNF f) &> CF (EBNF p) = CF $ EBNF $ \bo po -> op_paren po op $
346 f bo (op, L) <> " & " <> p bo (op, R)
347 where op = Op "&" 4 AssocL
348 CF (EBNF f) `but` Reg (EBNF p) = CF $ EBNF $ \bo po -> op_paren po op $
349 f bo (op, L) <> " - " <> p bo (op, R)
350 where op = Op "-" 6 AssocL
351
352 -- ** Type 'RuleMode'
353 data RuleMode
354 = RuleMode_Body -- ^ Generate the body of the rule.
355 | RuleMode_Ref -- ^ Generate a ref to the rule.
356 deriving (Eq, Show)
357
358 -- ** Type 'Op'
359 data Op = Op
360 { op_ident :: Text
361 , op_prece :: Precedence
362 , op_assoc :: Associativity
363 } deriving (Eq, Show)
364
365 nop :: Op
366 nop = Op "" 0 AssocN
367
368 -- *** Type 'Precedence'
369 type Precedence = Int
370
371 -- *** Type 'Associativity'
372 data Associativity
373 = AssocL | AssocR | AssocN | AssocB
374 deriving (Eq, Show)
375
376 op_paren
377 :: (Semigroup s, IsString s)
378 => (Op, LR) -> Op -> s -> s
379 op_paren (po, lr) op s =
380 if op_prece op <= op_prece po && not associate
381 then fromString "(" <> s <> fromString ")"
382 else s
383 where
384 associate =
385 op_ident po == op_ident op &&
386 case (lr, op_assoc po) of
387 (_, AssocB) -> True
388 (L, AssocL) -> True
389 (R, AssocR) -> True
390 _ -> False
391
392 -- * Class 'Gram_Context'
393 -- | A monadic backdoor, but limited by 'Context'.
394 -- In 'CF', the context must obviously not be used to change the parser,
395 -- but it can be used to change the parsed value,
396 -- for instance by recording source positions into it.
397 class Gram_Context p where
398 type Context p
399 type Context p = ()
400 default context :: (Context p ~ ()) => (Context p -> p a) -> p a
401 context :: (Context p -> p a) -> p a
402 context f = f ()
403 instance Gram_Context p => Gram_Context (CF p) where
404 type Context (CF p) = Context p
405 context f = CF $ context (unCF . f)
406 instance Gram_Context EBNF
407 instance Gram_Context RuleDef
408
409 -- * Class 'Gram_Lexer'
410 class
411 ( Alt p
412 , Alter p
413 , Alternative p
414 , App p
415 , Gram_CF p
416 , Gram_Rule p
417 , Gram_Term p
418 ) => Gram_Lexer p where
419 commentable :: p () -> p () -> p () -> p ()
420 commentable = rule3 "commentable" $ \p line block ->
421 skipMany $ choice [p, line, block]
422 comment_line :: CF p String -> CF p String
423 comment_line prefix = rule "comment_line" $
424 prefix *> many (any `but` (void (char '\n') <+> eof))
425 comment_block :: CF p String -> Reg lr p String -> CF p String
426 comment_block start end = rule "comment_block" $
427 start *> many (any `but` void end)
428 lexeme :: CF p a -> CF p a
429 lexeme = rule1 "lexeme" $ \p -> p
430 <* commentable
431 (void $ char ' ')
432 (void $ comment_line (string "--"))
433 (void $ comment_block (string "{-") (string "-}"))
434 parens :: CF p a -> CF p a
435 parens = rule1 "parens" $
436 between
437 (lexeme $ string "(")
438 (lexeme $ string ")")
439 infixrP :: (a -> a -> a) -> CF p a -> CF p sep -> CF p a -> CF p a
440 infixrP f =
441 rule3 "infixrP" $ \next sep root ->
442 (\a -> \case Just b -> f a b; Nothing -> a)
443 <$> next <*> option Nothing (Just <$ sep <*> root)
444 inside :: (a -> b) -> CF p begin -> CF p a -> CF p end -> CF p b -> CF p b
445 inside f = rule4 "inside" $ \begin i end n ->
446 (f <$ begin <*> i <* end) <+> n
447 symbol :: String -> CF p String
448 symbol = lexeme . string
449
450 deriving instance Gram_Lexer p => Gram_Lexer (CF p)
451 instance Gram_Lexer EBNF
452 instance Gram_Lexer RuleDef
453
454 gram_lexer :: forall p . (Gram_Lexer p, Gram_RuleDef p) => [CF p ()]
455 gram_lexer =
456 [ () <$ commentable (void $ rule_arg "space") (void $ rule_arg "line") (void $ rule_arg "block")
457 , () <$ comment_line (rule_arg "prefix")
458 , () <$ comment_block (rule_arg "start") (rule_arg "end" :: Reg 'L p String)
459 , () <$ lexeme (rule_arg "p")
460 , () <$ parens (rule_arg "p")
461 , () <$ inside id (rule_arg "begin") (rule_arg "i") (rule_arg "end") (rule_arg "next")
462 , () <$ infixrP const (rule_arg "next") (rule_arg "sep") (rule_arg "root")
463 ]