]> Git — Sourcephile - doclang.git/blob - Language/TCT/Read/Token.hs
Fix Token reading.
[doclang.git] / Language / TCT / Read / Token.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE OverloadedStrings #-}
3 {-# LANGUAGE Rank2Types #-}
4 {-# LANGUAGE TypeFamilies #-}
5 module Language.TCT.Read.Token where
6
7 import Control.Applicative (Applicative(..), Alternative(..))
8 import Control.Monad (Monad(..))
9 import Data.Bool
10 import Data.Char (Char)
11 import Data.Eq (Eq(..))
12 import Data.Foldable (Foldable(..))
13 import Data.Function (($), (.), flip)
14 import Data.Functor ((<$>), ($>), (<$))
15 import Data.Maybe (Maybe(..))
16 import Data.Monoid (Monoid(..))
17 import Data.Semigroup (Semigroup(..))
18 import Data.Sequence (ViewL(..), (<|))
19 import Data.Text (Text)
20 import Data.Text.Buildable (Buildable(..))
21 import Data.Tuple (fst,snd)
22 import Text.Show (Show(..))
23 import qualified Data.Char as Char
24 import qualified Data.Sequence as Seq
25 import qualified Data.Text as Text
26 import qualified Data.Text.Lazy as TL
27 import qualified Data.Text.Lazy.Builder as Builder
28 import qualified Text.Megaparsec as P
29
30 import Language.TCT.Token
31 import Language.TCT.Elem -- hiding (dbg)
32 import Language.TCT.Read.Elem -- hiding (pdbg)
33
34 {-
35 import Debug.Trace (trace)
36 dbg m x = trace (m <> ": " <> show x) x
37 pdbg m p = P.dbg m p
38 -}
39
40 textOf :: Buildable a => a -> Text
41 textOf = TL.toStrict . Builder.toLazyText . build
42
43 -- * Type 'Pairs'
44 type Pairs = (Tokens,[(Pair,Tokens)])
45
46 openPair :: Pair -> Pairs -> Pairs
47 openPair g (t,ms) = (t,(g,mempty):ms)
48
49 insertToken :: Token -> Pairs -> Pairs
50 insertToken tok (t,[]) = (t<>tokens [tok],[])
51 insertToken tok (t,(p0,t0):ps) = (t,(p0,t0<>tokens [tok]):ps)
52
53 insertTokens :: Tokens -> Pairs -> Pairs
54 insertTokens toks (t,[]) = (t<>toks,[])
55 insertTokens toks (t,(p0,t0):ps) = (t,(p0,t0<>toks):ps)
56
57 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
58 closePair :: Pair -> Pairs -> Pairs
59 closePair p (t,[]) = dbg "closePair" $
60 (t<>tokens [TokenPlain $ snd $ pairBorders p mempty],[])
61 closePair p (t,(p1,t1):ts) = dbg "closePair" $
62 case (p,p1) of
63 (PairElem x ax, PairElem y ay) | x == y ->
64 insertToken (TokenPair (PairElem x (ax<>ay)) t1) (t,ts)
65 (x,y) | x == y -> insertToken (TokenPair p1 t1) (t,ts)
66 _ ->
67 closePair p $
68 insertTokens
69 (closeUnpaired mempty (p1,t1))
70 (t,ts)
71
72 -- | Close a 'Pair' when there is not a matching 'LexemePairClose'.
73 closeUnpaired :: Tokens -> (Pair,Tokens) -> Tokens
74 closeUnpaired acc (p,tn) = dbg "closeUnpaired" $
75 case p of
76 -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'.
77 PairHash | TokenPlain t :< ts <- Seq.viewl $ unTokens $ tn <> acc ->
78 case Text.findIndex (not . isTagChar) t of
79 Just 0 -> tokens [TokenPlain $ fst $ pairBorders p mempty] <> tn <> acc
80 Just i -> Tokens $ TokenTag tag <| TokenPlain t' <| ts
81 where (tag,t') = Text.splitAt i t
82 Nothing -> Tokens $ TokenTag t <| ts
83 _ -> tokens [TokenPlain $ fst $ pairBorders p $ tokens [TokenPlain ""]] <> tn <> acc
84 where
85 isTagChar c =
86 Char.isAlphaNum c ||
87 c=='·' ||
88 case Char.generalCategory c of
89 Char.DashPunctuation -> True
90 Char.ConnectorPunctuation -> True
91 _ -> False
92
93 -- | Close remaining 'Pair's at end of parsing.
94 closePairs :: Pairs -> Tokens
95 closePairs (t0,ps) = dbg "closePairs" $
96 t0 <> foldl' closeUnpaired mempty ps
97
98 -- * Type 'Lexeme'
99 data Lexeme
100 = LexemePairOpen Pair
101 | LexemePairClose Pair
102 | LexemePunctOrSym Char
103 | LexemeWhite Text
104 | LexemeWord Text
105 | LexemeToken Tokens
106 | LexemeEscape Char
107 | LexemeLink Text
108 deriving (Show, Eq)
109
110 appendLexeme :: Lexeme -> Pairs -> Pairs
111 appendLexeme lex ps =
112 dbg "appendLexeme" $
113 case dbg "appendLexeme" lex of
114 LexemePairOpen p -> openPair p ps
115 LexemePairClose p -> closePair p ps
116 LexemePunctOrSym c -> insertToken (TokenPlain (Text.singleton c)) ps
117 LexemeWhite wh -> insertToken (TokenPlain wh) ps
118 LexemeWord wo -> insertToken (TokenPlain wo) ps
119 LexemeToken ts -> insertTokens ts ps
120 LexemeEscape c -> insertToken (TokenEscape c) ps
121 LexemeLink lnk -> insertToken (TokenLink lnk) ps
122
123 appendLexemes :: Pairs -> [Lexeme] -> Pairs
124 appendLexemes = foldl' (flip appendLexeme)
125
126 -- * Parsers
127
128 p_Tokens :: Parser e s Tokens
129 p_Tokens = closePairs <$> p_Pairs (mempty,[])
130
131 p_Pairs :: Pairs -> Parser e s Pairs
132 p_Pairs gs = pdbg "Pairs" $
133 (p_Lexemes (mempty == gs) >>= p_Pairs . appendLexemes gs) <|>
134 (P.eof $> gs)
135
136 p_Lexemes :: Bool -> Parser e s [Lexeme]
137 p_Lexemes isBOF = pdbg "Lexemes" $
138 P.choice
139 [ P.try $ p_PairCloseWhite
140 , P.try $ p_PairWhiteOpen isBOF
141 , P.try $ p_PairCloseBorder
142 , P.try $ p_PairBorderOpen
143 , P.try $ p_PairClose
144 , P.try $ p_PairOpen
145 , P.try $ pure . LexemePunctOrSym <$> p_PunctOrSym
146 , P.try $ pure <$> p_White
147 , pure . LexemeWord <$> p_Word
148 ]
149
150 p_White :: Parser e s Lexeme
151 p_White = pdbg "White" $ LexemeWhite <$> p_Spaces
152
153 p_PunctOrSym :: Parser e s Char
154 p_PunctOrSym =
155 P.satisfy $ \c ->
156 Char.isPunctuation c ||
157 Char.isSymbol c
158
159 p_PairCloseWhite :: Parser e s [Lexeme]
160 p_PairCloseWhite = pdbg "PairCloseWhite" $
161 (\c b o -> mconcat c <> b <> mconcat o)
162 <$> P.some (P.try $
163 P.try p_ElemOpen <|>
164 P.try p_ElemClose <|>
165 P.try p_PairClose <|>
166 pure . LexemePunctOrSym <$> p_PunctOrSym)
167 <*> (pure <$> p_White <|> P.eof $> [])
168 <*> P.many (P.try $
169 P.try p_ElemOpen <|>
170 P.try p_ElemClose <|>
171 P.try p_PairOpen <|>
172 pure . LexemePunctOrSym <$> p_PunctOrSym)
173
174 p_PairWhiteOpen :: Bool -> Parser e s [Lexeme]
175 p_PairWhiteOpen isBOF = pdbg "PairWhiteOpen" $
176 (\b o -> b <> mconcat o)
177 <$> (if isBOF then return [] else pure <$> p_White)
178 <*> P.some (P.try $
179 P.try p_ElemOpen <|>
180 P.try p_ElemClose <|>
181 P.try p_PairOpen <|>
182 pure . LexemePunctOrSym <$> p_PunctOrSym
183 )
184
185 p_PairCloseBorder :: Parser e s [Lexeme]
186 p_PairCloseBorder = pdbg "PairCloseBorder" $
187 P.try p0 <|> p1
188 where
189 p0 = pdbg "PairCloseBorder/p0" $
190 (\c b -> mconcat $ c <> b)
191 <$> P.some (P.try p_PairClose)
192 <*> P.some (P.try $
193 P.choice
194 [ P.try p_ElemOpen
195 , P.try p_ElemClose
196 , do
197 c <- p_PunctOrSym
198 case l_PairOpen c <|> l_PairClose c of
199 Just l -> return [l]
200 Nothing -> fail "PairCloseBorder"
201 ])
202 p1 = pdbg "PairCloseBorder/p1" $
203 (\c b -> mconcat c <> [LexemePunctOrSym b])
204 <$> P.some (P.try p_PairClose)
205 <*> p_PunctOrSym
206
207 p_PairBorderOpen :: Parser e s [Lexeme]
208 p_PairBorderOpen = pdbg "PairBorderOpen" $
209 P.try p0 <|> p1
210 where
211 p0 = pdbg "PairBorderOpen/p0" $
212 (\b o -> mconcat $ b <> o)
213 <$> P.some (P.try $
214 P.choice
215 [ P.try p_ElemOpen
216 , P.try p_ElemClose
217 , do
218 c <- p_PunctOrSym
219 case l_PairOpen c <|> l_PairClose c of
220 Just l -> return [l]
221 Nothing -> fail "PairBorderOpen"
222 ])
223 <*> P.some (P.try p_PairOpen)
224 p1 = pdbg "PairBorderOpen/p1" $
225 (\b o -> LexemePunctOrSym b : mconcat o)
226 <$> p_PunctOrSym
227 <*> P.some (P.try p_PairOpen)
228
229 p_PairOpen :: Parser e s [Lexeme]
230 p_PairOpen = pdbg "PairOpen" $ do
231 P.choice
232 [ P.try p_ElemOpen
233 , P.try (pure <$> p_Escape)
234 , P.try (pure <$> p_Link)
235 , do
236 c <- p_PunctOrSym
237 case l_PairOpenOrClose LexemePairOpen c of
238 Just l -> return [l]
239 _ -> fail "PairOpen"
240 ]
241
242 p_PairClose :: Parser e s [Lexeme]
243 p_PairClose = pdbg "PairClose" $ do
244 P.choice
245 [ P.try p_ElemClose
246 , P.try p_ElemSingle
247 , P.try (pure <$> p_Escape)
248 , P.try (pure <$> p_Link)
249 , do
250 c <- p_PunctOrSym
251 case l_PairOpenOrClose LexemePairClose c of
252 Just l -> return [l]
253 _ -> fail "PairClose"
254 ]
255
256 l_PairOpenOrClose :: (Pair -> Lexeme) -> Char -> Maybe Lexeme
257 l_PairOpenOrClose lxm c =
258 l_PairOpenAndClose lxm c <|>
259 l_PairOpen c <|>
260 l_PairClose c
261
262 l_PairOpenAndClose :: (Pair -> Lexeme) -> Char -> Maybe Lexeme
263 l_PairOpenAndClose lxm c =
264 case c of
265 '/' -> Just $ lxm PairSlash
266 '-' -> Just $ lxm PairDash
267 '"' -> Just $ lxm PairDoublequote
268 '\'' -> Just $ lxm PairSinglequote
269 '`' -> Just $ lxm PairBackquote
270 '_' -> Just $ lxm PairUnderscore
271 '*' -> Just $ lxm PairStar
272 '#' -> Just $ lxm PairHash
273 _ -> Nothing
274
275 l_PairOpen :: Char -> Maybe Lexeme
276 l_PairOpen c =
277 case c of
278 '(' -> Just $ LexemePairOpen PairParen
279 '[' -> Just $ LexemePairOpen PairBracket
280 '{' -> Just $ LexemePairOpen PairBrace
281 '«' -> Just $ LexemePairOpen PairFrenchquote
282 _ -> Nothing
283
284 l_PairClose :: Char -> Maybe Lexeme
285 l_PairClose c =
286 case c of
287 ')' -> Just $ LexemePairClose PairParen
288 ']' -> Just $ LexemePairClose PairBracket
289 '}' -> Just $ LexemePairClose PairBrace
290 '»' -> Just $ LexemePairClose PairFrenchquote
291 _ -> Nothing
292
293 p_Link :: Parser e s Lexeme
294 p_Link =
295 (\scheme ss addr -> LexemeLink $ Text.pack $ scheme <> ss <> addr)
296 <$> P.option "" (P.try p_scheme)
297 <*> P.string "//"
298 <*> p_addr
299 where
300 p_scheme =
301 (<>)
302 <$> P.some (P.try $ P.satisfy $ \c ->
303 Char.isAlphaNum c
304 || c=='_'
305 || c=='-'
306 || c=='+')
307 <*> P.string ":"
308 p_addr =
309 P.many $ P.try $
310 P.satisfy $ \c ->
311 Char.isAlphaNum c
312 || c=='%'
313 || c=='/'
314 || c=='('
315 || c==')'
316 || c=='-'
317 || c=='_'
318 || c=='.'
319
320 p_Escape :: Parser e s Lexeme
321 p_Escape =
322 LexemeEscape
323 <$ P.char '\\'
324 <*> P.satisfy Char.isPrint
325
326 p_ElemSingle :: Parser e s [Lexeme]
327 p_ElemSingle = pdbg "ElemSingle" $
328 (\e as ->
329 [ LexemePairOpen $ PairElem e as
330 , LexemeToken $ mempty
331 , LexemePairClose $ PairElem e [] ])
332 <$ P.char '<'
333 <*> p_Word
334 <*> p_Attrs
335 <* P.string "/>"
336
337 p_ElemOpen :: Parser e s [Lexeme]
338 p_ElemOpen = pdbg "ElemOpen" $
339 (\e as oc ->
340 case oc of
341 True -> [ LexemePairOpen $ PairElem e as
342 , LexemeToken $ mempty
343 , LexemePairClose $ PairElem e [] ]
344 False -> [ LexemePairOpen $ PairElem e as
345 , LexemeToken $ tokens [TokenPlain ""]
346 ])
347 <$ P.char '<'
348 <*> p_Word
349 <*> p_Attrs
350 <*> P.option False (True <$ P.char '/')
351 <* P.char '>'
352
353 p_ElemClose :: Parser e s [Lexeme]
354 p_ElemClose = pdbg "ElemClose" $
355 (\e -> [LexemePairClose $ PairElem e []])
356 <$ P.string "</"
357 <*> p_Word
358 <* P.char '>'