]> Git — Sourcephile - doclang.git/blob - Language/TCT/Read/Token.hs
Fix Group parsing.
[doclang.git] / Language / TCT / Read / Token.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE OverloadedStrings #-}
3 {-# LANGUAGE Rank2Types #-}
4 {-# LANGUAGE TypeFamilies #-}
5 module Language.TCT.Read.Token where
6
7 import Control.Applicative (Applicative(..), Alternative(..))
8 import Control.Monad (Monad(..))
9 import Data.Bool
10 import Data.Char (Char)
11 import Data.Eq (Eq(..))
12 import Data.Foldable (Foldable(..))
13 import Data.Function (($), (.), flip)
14 import Data.Functor ((<$>), ($>), (<$))
15 import Data.Maybe (Maybe(..), fromMaybe)
16 import Data.Monoid (Monoid(..))
17 import Data.Semigroup (Semigroup(..))
18 import Data.Sequence (ViewL(..), (<|))
19 import Data.Text (Text)
20 import Data.Text.Buildable (Buildable(..))
21 import Data.Tuple (fst,snd)
22 import Text.Show (Show(..))
23 import qualified Data.Char as Char
24 import qualified Data.Sequence as Seq
25 import qualified Data.Text as Text
26 import qualified Data.Text.Lazy as TL
27 import qualified Data.Text.Lazy.Builder as Builder
28 import qualified Text.Megaparsec as P
29
30 import Language.TCT.Token
31 import Language.TCT.Elem
32 import Language.TCT.Read.Elem -- hiding (pdbg)
33
34 -- pdbg m p = P.dbg m p
35
36 textOf :: Buildable a => a -> Text
37 textOf = TL.toStrict . Builder.toLazyText . build
38
39 -- * Type 'Groups'
40 type Groups = (Token,[(Group,Token)])
41
42 openGroup :: Group -> Groups -> Groups
43 openGroup g (t,ms) = (t,(g,mempty):ms)
44
45 insertToken :: Token -> Groups -> Groups
46 insertToken tok (t,[]) = (t<>tok,[])
47 insertToken tok (t,(g0,t0):gs) = (t,(g0,t0<>tok):gs)
48
49 -- | Close a 'Group' when there is a matching 'LexemeGroupClose'.
50 closeGroup :: Group -> Groups -> Groups
51 closeGroup g (t,[]) = dbg "closeGroup" $ (t<>TokenPlain (snd $ groupBorders g mempty),[])
52 closeGroup g (t,(g1,m1):ms) = dbg "closeGroup" $
53 case (g,g1) of
54 (GroupElem x ax, GroupElem y ay) | x == y ->
55 insertToken (TokenGroup (GroupElem x (ax<>ay)) m1) (t,ms)
56 (x,y) | x == y -> insertToken (TokenGroup g1 m1) (t,ms)
57 _ ->
58 closeGroup g $
59 insertToken
60 (closelessGroup mempty (g1,m1))
61 (t,ms)
62
63 -- | Close a 'Group' when there is not a matching 'LexemeGroupClose'.
64 closelessGroup :: Token -> (Group,Token) -> Token
65 closelessGroup acc (g,t) = dbg "closelessGroup" $
66 case g of
67 -- NOTE: try to close 'GroupHash' as 'TokenTag' instead of 'TokenPlain'.
68 GroupHash | TokenPlain p :< toks <- Seq.viewl $ unTokens $ t <> acc ->
69 case Text.findIndex (not . isTagChar) p of
70 Just 0 -> TokenPlain (fst $ groupBorders g mempty) <> t <> acc
71 Just i -> Tokens $ TokenTag tag <| TokenPlain p' <| toks
72 where (tag,p') = Text.splitAt i p
73 Nothing -> Tokens $ TokenTag p <| toks
74 _ -> TokenPlain (fst $ groupBorders g mempty) <> t <> acc
75 where
76 isTagChar c =
77 Char.isAlphaNum c ||
78 case Char.generalCategory c of
79 Char.DashPunctuation -> True
80 Char.ConnectorPunctuation -> True
81 _ -> False
82
83 -- | Close remaining 'Group's at end of parsing.
84 closeGroups :: Groups -> Token
85 closeGroups (t0,gs) = dbg "closeGroups" $
86 t0 <> foldl' closelessGroup mempty gs
87
88 -- * Type 'Lexeme'
89 data Lexeme
90 = LexemeGroupOpen Group
91 | LexemeGroupClose Group
92 | LexemePunctOrSym Char
93 | LexemeWhite Text
94 | LexemeWord Text
95 | LexemeToken Token
96 | LexemeEscape Char
97 | LexemeLink Text
98 deriving (Show, Eq)
99
100 appendLexeme :: Lexeme -> Groups -> Groups
101 appendLexeme lex gs =
102 dbg "appendLexeme" $
103 case dbg "appendLexeme" lex of
104 LexemeGroupOpen g -> openGroup g gs
105 LexemeGroupClose g -> closeGroup g gs
106 LexemePunctOrSym c -> insertToken (TokenPlain (Text.singleton c)) gs
107 LexemeWhite wh -> insertToken (TokenPlain wh) gs
108 LexemeWord wo -> insertToken (TokenPlain wo) gs
109 LexemeToken tok -> insertToken tok gs
110 LexemeEscape c -> insertToken (TokenEscape c) gs
111 LexemeLink lnk -> insertToken (TokenLink lnk) gs
112
113 appendLexemes :: Groups -> [Lexeme] -> Groups
114 appendLexemes = foldl' (flip appendLexeme)
115
116 -- * Parsers
117
118 p_Token :: Parser e s Token
119 p_Token = closeGroups <$> p_Groups (mempty,[])
120
121 p_Groups :: Groups -> Parser e s Groups
122 p_Groups gs = pdbg "Groups" $
123 (p_Lexemes (mempty == gs) >>= p_Groups . appendLexemes gs) <|>
124 (P.eof $> gs)
125
126 p_Lexemes :: Bool -> Parser e s [Lexeme]
127 p_Lexemes isBOF = pdbg "Lexemes" $
128 P.choice
129 [ P.try $ p_GroupCloseWhite
130 , P.try $ p_GroupWhiteOpen isBOF
131 , P.try $ p_GroupCloseBorder
132 , P.try $ p_GroupBorderOpen
133 , P.try $ p_GroupClose
134 , P.try $ p_GroupOpen
135 , P.try $ pure . LexemePunctOrSym <$> p_PunctOrSym
136 , P.try $ pure <$> p_White
137 , pure . LexemeWord <$> p_Word
138 ]
139
140 p_White :: Parser e s Lexeme
141 p_White = pdbg "White" $ LexemeWhite <$> p_Spaces
142
143 p_PunctOrSym :: Parser e s Char
144 p_PunctOrSym = P.satisfy $ \c ->
145 Char.isPunctuation c ||
146 Char.isSymbol c
147
148 p_GroupCloseWhite :: Parser e s [Lexeme]
149 p_GroupCloseWhite = pdbg "GroupCloseWhite" $
150 (\c b -> mconcat c <> b)
151 <$> P.some (P.try p_GroupClose <|> pure . LexemePunctOrSym <$> p_PunctOrSym)
152 <*> ((pure <$> p_White) <|> P.eof $> [])
153
154 p_GroupWhiteOpen :: Bool -> Parser e s [Lexeme]
155 p_GroupWhiteOpen isBOF = pdbg "GroupWhiteOpen" $
156 (\b o -> b <> mconcat o)
157 <$> (if isBOF then return [] else pure <$> p_White)
158 <*> P.some (P.try p_GroupOpen <|> pure . LexemePunctOrSym <$> p_PunctOrSym)
159
160 p_GroupCloseBorder :: Parser e s [Lexeme]
161 p_GroupCloseBorder = pdbg "GroupCloseBorder" $
162 (\c b -> mconcat $ c <> b)
163 <$> P.some (P.try p_GroupClose)
164 <*> P.some (P.try $ P.choice
165 [ P.try p_ElemOpen
166 , P.try p_ElemClose
167 , do
168 c <- p_PunctOrSym
169 case l_GroupClose c of
170 Just l -> return [l]
171 Nothing ->
172 case l_GroupOpenOrClose LexemeGroupOpen c <|> l_GroupOpen c of
173 Nothing -> return [LexemePunctOrSym c]
174 _ -> fail ""
175 ])
176
177 p_GroupBorderOpen :: Parser e s [Lexeme]
178 p_GroupBorderOpen = pdbg "GroupBorderOpen" $
179 (\b o -> mconcat $ b <> o)
180 <$> P.some (P.try $ P.choice
181 [ P.try p_ElemOpen
182 , P.try p_ElemClose
183 , do
184 c <- p_PunctOrSym
185 case l_GroupOpen c of
186 Just l -> return [l]
187 Nothing ->
188 case l_GroupOpenOrClose LexemeGroupClose c <|> l_GroupClose c of
189 Nothing -> return [LexemePunctOrSym c]
190 _ -> fail ""
191 ])
192 <*> P.some (P.try p_GroupOpen)
193
194 p_GroupOpen :: Parser e s [Lexeme]
195 p_GroupOpen = pdbg "GroupOpen" $ do
196 P.choice
197 [ P.try p_ElemOpen
198 , P.try (pure <$> p_Escape)
199 , P.try (pure <$> p_Link)
200 , do
201 c <- p_PunctOrSym
202 case l_GroupOpenAndClose LexemeGroupOpen c of
203 Just l -> return [l]
204 _ -> fail ""
205 ]
206
207 p_GroupClose :: Parser e s [Lexeme]
208 p_GroupClose = pdbg "GroupClose" $ do
209 P.choice
210 [ P.try p_ElemClose
211 , P.try p_ElemSingle
212 , P.try (pure <$> p_Escape)
213 , P.try (pure <$> p_Link)
214 , do
215 c <- p_PunctOrSym
216 case l_GroupOpenAndClose LexemeGroupClose c of
217 Just l -> return [l]
218 _ -> fail ""
219 ]
220
221 p_GroupPlain :: (Char -> Maybe Lexeme) -> Parser e s [Lexeme]
222 p_GroupPlain grp = pdbg "GroupPlain" $ do
223 (<$> p_PunctOrSym) $ \c ->
224 pure $
225 LexemePunctOrSym c `fromMaybe`
226 grp c
227
228 l_GroupOpenOrClose :: (Group -> Lexeme) -> Char -> Maybe Lexeme
229 l_GroupOpenOrClose lxm c =
230 case c of
231 '/' -> Just $ lxm GroupSlash
232 '-' -> Just $ lxm GroupDash
233 '"' -> Just $ lxm GroupDoublequote
234 '\'' -> Just $ lxm GroupSinglequote
235 '`' -> Just $ lxm GroupBackquote
236 '_' -> Just $ lxm GroupUnderscore
237 '*' -> Just $ lxm GroupStar
238 '#' -> Just $ lxm GroupHash
239 _ -> Nothing
240
241 l_GroupOpenAndClose :: (Group -> Lexeme) -> Char -> Maybe Lexeme
242 l_GroupOpenAndClose lxm c =
243 l_GroupOpenOrClose lxm c <|>
244 l_GroupOpen c <|>
245 l_GroupClose c
246
247 l_GroupOpen :: Char -> Maybe Lexeme
248 l_GroupOpen c =
249 case c of
250 '(' -> Just $ LexemeGroupOpen GroupParen
251 '[' -> Just $ LexemeGroupOpen GroupBracket
252 '{' -> Just $ LexemeGroupOpen GroupBrace
253 '«' -> Just $ LexemeGroupOpen GroupFrenchquote
254 _ -> Nothing
255
256 l_GroupClose :: Char -> Maybe Lexeme
257 l_GroupClose c =
258 case c of
259 ')' -> Just $ LexemeGroupClose GroupParen
260 ']' -> Just $ LexemeGroupClose GroupBracket
261 '}' -> Just $ LexemeGroupClose GroupBrace
262 '»' -> Just $ LexemeGroupClose GroupFrenchquote
263 _ -> Nothing
264
265 p_Link :: Parser e s Lexeme
266 p_Link =
267 (\scheme ss addr -> LexemeLink $ Text.pack $ scheme <> ss <> addr)
268 <$> P.option "" (P.try p_scheme)
269 <*> P.string "//"
270 <*> p_addr
271 where
272 p_scheme =
273 (<>)
274 <$> P.some (P.satisfy $ \c ->
275 Char.isAlphaNum c
276 || c=='_'
277 || c=='-'
278 || c=='+')
279 <*> P.string ":"
280 p_addr =
281 P.many $
282 P.satisfy $ \c ->
283 Char.isAlphaNum c
284 || c=='%'
285 || c=='/'
286 || c=='('
287 || c==')'
288 || c=='-'
289 || c=='_'
290 || c=='.'
291
292 p_Escape :: Parser e s Lexeme
293 p_Escape =
294 LexemeEscape
295 <$ P.char '\\'
296 <*> P.satisfy Char.isPrint
297
298 p_ElemSingle :: Parser e s [Lexeme]
299 p_ElemSingle = pdbg "ElemOpen" $
300 (\e as ->
301 [ LexemeGroupOpen $ GroupElem e as
302 , LexemeToken $ Tokens mempty
303 -- NOTE: encode that it's the same Elem for open and close
304 , LexemeGroupClose $ GroupElem e [] ])
305 <$ P.char '<'
306 <*> p_Word
307 <*> p_Attrs
308 <* P.string "/>"
309
310 p_ElemOpen :: Parser e s [Lexeme]
311 p_ElemOpen = pdbg "ElemOpen" $
312 (\e as oc ->
313 case oc of
314 True -> [ LexemeGroupOpen $ GroupElem e as
315 , LexemeToken $ Tokens mempty
316 , LexemeGroupClose $ GroupElem e [] ]
317 False -> [LexemeGroupOpen $ GroupElem e as])
318 <$ P.char '<'
319 <*> p_Word
320 <*> p_Attrs
321 <*> P.option False (True <$ P.char '/')
322 <* P.char '>'
323
324 p_ElemClose :: Parser e s [Lexeme]
325 p_ElemClose = pdbg "ElemClose" $
326 (\e -> [LexemeGroupClose $ GroupElem e []])
327 <$ P.string "</"
328 <*> p_Word
329 <* P.char '>'