]> Git — Sourcephile - doclang.git/blob - Language/TCT/Read/Token.hs
Fix Group parsing.
[doclang.git] / Language / TCT / Read / Token.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE OverloadedStrings #-}
3 {-# LANGUAGE Rank2Types #-}
4 {-# LANGUAGE TypeFamilies #-}
5 module Language.TCT.Read.Token where
6
7 import Control.Applicative (Applicative(..), Alternative(..))
8 import Control.Monad (Monad(..))
9 import Data.Bool
10 import Data.Char (Char)
11 import Data.Eq (Eq(..))
12 import Data.Foldable (Foldable(..))
13 import Data.Function (($), (.), flip)
14 import Data.Functor ((<$>), ($>), (<$))
15 import Data.Maybe (Maybe(..), fromMaybe)
16 import Data.Monoid (Monoid(..))
17 import Data.Semigroup (Semigroup(..))
18 import Data.Sequence (ViewL(..), (<|))
19 import Data.Text (Text)
20 import Data.Text.Buildable (Buildable(..))
21 import Data.Tuple (fst,snd)
22 import Text.Show (Show(..))
23 import qualified Data.Char as Char
24 import qualified Data.Sequence as Seq
25 import qualified Data.Text as Text
26 import qualified Data.Text.Lazy as TL
27 import qualified Data.Text.Lazy.Builder as Builder
28 import qualified Text.Megaparsec as P
29
30 import Language.TCT.Token
31 import Language.TCT.Elem
32 import Language.TCT.Read.Elem -- hiding (pdbg)
33
34 -- pdbg m p = P.dbg m p
35
36 textOf :: Buildable a => a -> Text
37 textOf = TL.toStrict . Builder.toLazyText . build
38
39 -- * Type 'Groups'
40 type Groups = (Token,[(Group,Token)])
41
42 openGroup :: Group -> Groups -> Groups
43 openGroup g (t,ms) = (t,(g,mempty):ms)
44
45 insertToken :: Token -> Groups -> Groups
46 insertToken tok (t,[]) = (t<>tok,[])
47 insertToken tok (t,(g0,t0):gs) = (t,(g0,t0<>tok):gs)
48
49 -- | Close a 'Group' when there is a matching 'LexemeGroupClose'.
50 closeGroup :: Group -> Groups -> Groups
51 closeGroup g (t,[]) = dbg "closeGroup" $ (t<>TokenPlain (snd $ groupBorders g mempty),[])
52 closeGroup g (t,(g1,m1):ms) = dbg "closeGroup" $
53 case (g,g1) of
54 (GroupElem x ax, GroupElem y ay) | x == y ->
55 insertToken (TokenGroup (GroupElem x (ax<>ay)) m1) (t,ms)
56 (x,y) | x == y -> insertToken (TokenGroup g1 m1) (t,ms)
57 _ ->
58 closeGroup g $
59 insertToken
60 (closelessGroup mempty (g1,m1))
61 (t,ms)
62
63 -- | Close a 'Group' when there is not a matching 'LexemeGroupClose'.
64 closelessGroup :: Token -> (Group,Token) -> Token
65 closelessGroup acc (g,t) = dbg "closelessGroup" $
66 case g of
67 -- NOTE: try to close 'GroupHash' as 'TokenTag' instead of 'TokenPlain'.
68 GroupHash | TokenPlain p :< toks <- Seq.viewl $ unTokens $ t <> acc ->
69 case Text.findIndex (not . isTagChar) p of
70 Just 0 -> TokenPlain (fst $ groupBorders g mempty) <> t <> acc
71 Just i -> Tokens $ TokenTag tag <| TokenPlain p' <| toks
72 where (tag,p') = Text.splitAt i p
73 Nothing -> Tokens $ TokenTag p <| toks
74 _ -> TokenPlain (fst $ groupBorders g mempty) <> t <> acc
75 where
76 isTagChar c =
77 Char.isAlphaNum c ||
78 c=='·' ||
79 case Char.generalCategory c of
80 Char.DashPunctuation -> True
81 Char.ConnectorPunctuation -> True
82 _ -> False
83
84 -- | Close remaining 'Group's at end of parsing.
85 closeGroups :: Groups -> Token
86 closeGroups (t0,gs) = dbg "closeGroups" $
87 t0 <> foldl' closelessGroup mempty gs
88
89 -- * Type 'Lexeme'
90 data Lexeme
91 = LexemeGroupOpen Group
92 | LexemeGroupClose Group
93 | LexemePunctOrSym Char
94 | LexemeWhite Text
95 | LexemeWord Text
96 | LexemeToken Token
97 | LexemeEscape Char
98 | LexemeLink Text
99 deriving (Show, Eq)
100
101 appendLexeme :: Lexeme -> Groups -> Groups
102 appendLexeme lex gs =
103 dbg "appendLexeme" $
104 case dbg "appendLexeme" lex of
105 LexemeGroupOpen g -> openGroup g gs
106 LexemeGroupClose g -> closeGroup g gs
107 LexemePunctOrSym c -> insertToken (TokenPlain (Text.singleton c)) gs
108 LexemeWhite wh -> insertToken (TokenPlain wh) gs
109 LexemeWord wo -> insertToken (TokenPlain wo) gs
110 LexemeToken tok -> insertToken tok gs
111 LexemeEscape c -> insertToken (TokenEscape c) gs
112 LexemeLink lnk -> insertToken (TokenLink lnk) gs
113
114 appendLexemes :: Groups -> [Lexeme] -> Groups
115 appendLexemes = foldl' (flip appendLexeme)
116
117 -- * Parsers
118
119 p_Token :: Parser e s Token
120 p_Token = closeGroups <$> p_Groups (mempty,[])
121
122 p_Groups :: Groups -> Parser e s Groups
123 p_Groups gs = pdbg "Groups" $
124 (p_Lexemes (mempty == gs) >>= p_Groups . appendLexemes gs) <|>
125 (P.eof $> gs)
126
127 p_Lexemes :: Bool -> Parser e s [Lexeme]
128 p_Lexemes isBOF = pdbg "Lexemes" $
129 P.choice
130 [ P.try $ p_GroupCloseWhite
131 , P.try $ p_GroupWhiteOpen isBOF
132 , P.try $ p_GroupCloseBorder
133 , P.try $ p_GroupBorderOpen
134 , P.try $ p_GroupClose
135 , P.try $ p_GroupOpen
136 , P.try $ pure . LexemePunctOrSym <$> p_PunctOrSym
137 , P.try $ pure <$> p_White
138 , pure . LexemeWord <$> p_Word
139 ]
140
141 p_White :: Parser e s Lexeme
142 p_White = pdbg "White" $ LexemeWhite <$> p_Spaces
143
144 p_PunctOrSym :: Parser e s Char
145 p_PunctOrSym = P.satisfy $ \c ->
146 Char.isPunctuation c ||
147 Char.isSymbol c
148
149 p_GroupCloseWhite :: Parser e s [Lexeme]
150 p_GroupCloseWhite = pdbg "GroupCloseWhite" $
151 (\c b -> mconcat c <> b)
152 <$> P.some (P.try p_GroupClose <|> pure . LexemePunctOrSym <$> p_PunctOrSym)
153 <*> ((pure <$> p_White) <|> P.eof $> [])
154
155 p_GroupWhiteOpen :: Bool -> Parser e s [Lexeme]
156 p_GroupWhiteOpen isBOF = pdbg "GroupWhiteOpen" $
157 (\b o -> b <> mconcat o)
158 <$> (if isBOF then return [] else pure <$> p_White)
159 <*> P.some (P.try p_GroupOpen <|> pure . LexemePunctOrSym <$> p_PunctOrSym)
160
161 p_GroupCloseBorder :: Parser e s [Lexeme]
162 p_GroupCloseBorder = pdbg "GroupCloseBorder" $
163 P.try p0 <|> p1
164 where
165 p0 =
166 (\c b -> mconcat $ c <> b)
167 <$> P.some (P.try p_GroupClose)
168 <*> P.some (P.try $ P.choice
169 [ P.try p_ElemOpen
170 , P.try p_ElemClose
171 , do
172 c <- p_PunctOrSym
173 case l_GroupClose c of
174 Just l -> return [l]
175 Nothing ->
176 case l_GroupOpenAndClose LexemeGroupOpen c <|> l_GroupOpen c of
177 Nothing -> return [LexemePunctOrSym c]
178 _ -> fail ""
179 ])
180 p1 =
181 (\c b -> mconcat c <> [LexemePunctOrSym b])
182 <$> P.some (P.try p_GroupClose)
183 <*> p_PunctOrSym
184
185 p_GroupBorderOpen :: Parser e s [Lexeme]
186 p_GroupBorderOpen = pdbg "GroupBorderOpen" $
187 P.try p0 <|> p1
188 where
189 p0 =
190 (\b o -> mconcat $ b <> o)
191 <$> P.some (P.try $ P.choice
192 [ P.try p_ElemOpen
193 , P.try p_ElemClose
194 , do
195 c <- p_PunctOrSym
196 case l_GroupOpen c <|> l_GroupClose c of
197 Just l -> return [l]
198 Nothing -> fail ""
199 ])
200 <*> P.some (P.try p_GroupOpen)
201 p1 =
202 (\b o -> LexemePunctOrSym b : mconcat o)
203 <$> p_PunctOrSym
204 <*> P.some (P.try p_GroupOpen)
205
206 p_GroupOpen :: Parser e s [Lexeme]
207 p_GroupOpen = pdbg "GroupOpen" $ do
208 P.choice
209 [ P.try p_ElemOpen
210 , P.try (pure <$> p_Escape)
211 , P.try (pure <$> p_Link)
212 , do
213 c <- p_PunctOrSym
214 case l_GroupOpenOrClose LexemeGroupOpen c of
215 Just l -> return [l]
216 _ -> fail ""
217 ]
218
219 p_GroupClose :: Parser e s [Lexeme]
220 p_GroupClose = pdbg "GroupClose" $ do
221 P.choice
222 [ P.try p_ElemClose
223 , P.try p_ElemSingle
224 , P.try (pure <$> p_Escape)
225 , P.try (pure <$> p_Link)
226 , do
227 c <- p_PunctOrSym
228 case l_GroupOpenOrClose LexemeGroupClose c of
229 Just l -> return [l]
230 _ -> fail ""
231 ]
232
233 p_GroupPlain :: (Char -> Maybe Lexeme) -> Parser e s [Lexeme]
234 p_GroupPlain grp = pdbg "GroupPlain" $ do
235 (<$> p_PunctOrSym) $ \c ->
236 pure $
237 LexemePunctOrSym c `fromMaybe`
238 grp c
239
240 l_GroupOpenAndClose :: (Group -> Lexeme) -> Char -> Maybe Lexeme
241 l_GroupOpenAndClose lxm c =
242 case c of
243 '/' -> Just $ lxm GroupSlash
244 '-' -> Just $ lxm GroupDash
245 '"' -> Just $ lxm GroupDoublequote
246 '\'' -> Just $ lxm GroupSinglequote
247 '`' -> Just $ lxm GroupBackquote
248 '_' -> Just $ lxm GroupUnderscore
249 '*' -> Just $ lxm GroupStar
250 '#' -> Just $ lxm GroupHash
251 _ -> Nothing
252
253 l_GroupOpenOrClose :: (Group -> Lexeme) -> Char -> Maybe Lexeme
254 l_GroupOpenOrClose lxm c =
255 l_GroupOpenAndClose lxm c <|>
256 l_GroupOpen c <|>
257 l_GroupClose c
258
259 l_GroupOpen :: Char -> Maybe Lexeme
260 l_GroupOpen c =
261 case c of
262 '(' -> Just $ LexemeGroupOpen GroupParen
263 '[' -> Just $ LexemeGroupOpen GroupBracket
264 '{' -> Just $ LexemeGroupOpen GroupBrace
265 '«' -> Just $ LexemeGroupOpen GroupFrenchquote
266 _ -> Nothing
267
268 l_GroupClose :: Char -> Maybe Lexeme
269 l_GroupClose c =
270 case c of
271 ')' -> Just $ LexemeGroupClose GroupParen
272 ']' -> Just $ LexemeGroupClose GroupBracket
273 '}' -> Just $ LexemeGroupClose GroupBrace
274 '»' -> Just $ LexemeGroupClose GroupFrenchquote
275 _ -> Nothing
276
277 p_Link :: Parser e s Lexeme
278 p_Link =
279 (\scheme ss addr -> LexemeLink $ Text.pack $ scheme <> ss <> addr)
280 <$> P.option "" (P.try p_scheme)
281 <*> P.string "//"
282 <*> p_addr
283 where
284 p_scheme =
285 (<>)
286 <$> P.some (P.satisfy $ \c ->
287 Char.isAlphaNum c
288 || c=='_'
289 || c=='-'
290 || c=='+')
291 <*> P.string ":"
292 p_addr =
293 P.many $
294 P.satisfy $ \c ->
295 Char.isAlphaNum c
296 || c=='%'
297 || c=='/'
298 || c=='('
299 || c==')'
300 || c=='-'
301 || c=='_'
302 || c=='.'
303
304 p_Escape :: Parser e s Lexeme
305 p_Escape =
306 LexemeEscape
307 <$ P.char '\\'
308 <*> P.satisfy Char.isPrint
309
310 p_ElemSingle :: Parser e s [Lexeme]
311 p_ElemSingle = pdbg "ElemOpen" $
312 (\e as ->
313 [ LexemeGroupOpen $ GroupElem e as
314 , LexemeToken $ Tokens mempty
315 -- NOTE: encode that it's the same Elem for open and close
316 , LexemeGroupClose $ GroupElem e [] ])
317 <$ P.char '<'
318 <*> p_Word
319 <*> p_Attrs
320 <* P.string "/>"
321
322 p_ElemOpen :: Parser e s [Lexeme]
323 p_ElemOpen = pdbg "ElemOpen" $
324 (\e as oc ->
325 case oc of
326 True -> [ LexemeGroupOpen $ GroupElem e as
327 , LexemeToken $ Tokens mempty
328 , LexemeGroupClose $ GroupElem e [] ]
329 False -> [LexemeGroupOpen $ GroupElem e as])
330 <$ P.char '<'
331 <*> p_Word
332 <*> p_Attrs
333 <*> P.option False (True <$ P.char '/')
334 <* P.char '>'
335
336 p_ElemClose :: Parser e s [Lexeme]
337 p_ElemClose = pdbg "ElemClose" $
338 (\e -> [LexemeGroupClose $ GroupElem e []])
339 <$ P.string "</"
340 <*> p_Word
341 <* P.char '>'