]> Git — Sourcephile - doclang.git/blob - Language/TCT/Read/Token.hs
Fix Token reading.
[doclang.git] / Language / TCT / Read / Token.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE OverloadedStrings #-}
3 {-# LANGUAGE Rank2Types #-}
4 {-# LANGUAGE TypeFamilies #-}
5 module Language.TCT.Read.Token where
6
7 import Control.Applicative (Applicative(..), Alternative(..))
8 import Control.Monad (Monad(..))
9 import Data.Bool
10 import Data.Char (Char)
11 import Data.Eq (Eq(..))
12 import Data.Foldable (Foldable(..))
13 import Data.Function (($), (.), flip)
14 import Data.Functor ((<$>), ($>), (<$))
15 import Data.Maybe (Maybe(..), fromMaybe)
16 import Data.Monoid (Monoid(..))
17 import Data.Semigroup (Semigroup(..))
18 import Data.Sequence (ViewL(..), (<|))
19 import Data.Text (Text)
20 import Data.Text.Buildable (Buildable(..))
21 import Data.Tuple (fst,snd)
22 import Text.Show (Show(..))
23 import qualified Data.Char as Char
24 import qualified Data.Sequence as Seq
25 import qualified Data.Text as Text
26 import qualified Data.Text.Lazy as TL
27 import qualified Data.Text.Lazy.Builder as Builder
28 import qualified Text.Megaparsec as P
29
30 import Language.TCT.Token
31 import Language.TCT.Elem -- hiding (dbg)
32 import Language.TCT.Read.Elem -- hiding (pdbg)
33
34 {-
35 import Debug.Trace (trace)
36 dbg m x = trace (m <> ": " <> show x) x
37 pdbg m p = P.dbg m p
38 -}
39
40 textOf :: Buildable a => a -> Text
41 textOf = TL.toStrict . Builder.toLazyText . build
42
43 -- * Type 'Pairs'
44 type Pairs = (Tokens,[(Pair,Tokens)])
45
46 openPair :: Pair -> Pairs -> Pairs
47 openPair g (t,ms) = (t,(g,mempty):ms)
48
49 insertToken :: Token -> Pairs -> Pairs
50 insertToken tok (t,[]) = (t<>tokens [tok],[])
51 insertToken tok (t,(p0,t0):ps) = (t,(p0,t0<>tokens [tok]):ps)
52
53 insertTokens :: Tokens -> Pairs -> Pairs
54 insertTokens toks (t,[]) = (t<>toks,[])
55 insertTokens toks (t,(p0,t0):ps) = (t,(p0,t0<>toks):ps)
56
57 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
58 closePair :: Pair -> Pairs -> Pairs
59 closePair p (t,[]) = dbg "closePair" $ (t<>tokens [TokenPlain (snd $ pairBorders p mempty)],[])
60 closePair p (t,(p1,t1):ts) = dbg "closePair" $
61 case (p,p1) of
62 (PairElem x ax, PairElem y ay) | x == y ->
63 insertToken (TokenPair (PairElem x (ax<>ay)) t1) (t,ts)
64 (x,y) | x == y -> insertToken (TokenPair p1 t1) (t,ts)
65 _ ->
66 closePair p $
67 insertTokens
68 (closeUnpaired mempty (p1,t1))
69 (t,ts)
70
71 -- | Close a 'Pair' when there is not a matching 'LexemePairClose'.
72 closeUnpaired :: Tokens -> (Pair,Tokens) -> Tokens
73 closeUnpaired acc (p,tn) = dbg "closeUnpaired" $
74 case p of
75 -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'.
76 PairHash | TokenPlain t :< ts <- Seq.viewl $ unTokens $ tn <> acc ->
77 case Text.findIndex (not . isTagChar) t of
78 Just 0 -> tokens [TokenPlain $ fst $ pairBorders p mempty] <> tn <> acc
79 Just i -> Tokens $ TokenTag tag <| TokenPlain t' <| ts
80 where (tag,t') = Text.splitAt i t
81 Nothing -> Tokens $ TokenTag t <| ts
82 _ -> tokens [TokenPlain $ fst $ pairBorders p $ tokens [TokenPlain ""]] <> tn <> acc
83 where
84 isTagChar c =
85 Char.isAlphaNum c ||
86 c=='·' ||
87 case Char.generalCategory c of
88 Char.DashPunctuation -> True
89 Char.ConnectorPunctuation -> True
90 _ -> False
91
92 -- | Close remaining 'Pair's at end of parsing.
93 closePairs :: Pairs -> Tokens
94 closePairs (t0,ps) = dbg "closePairs" $
95 t0 <> foldl' closeUnpaired mempty ps
96
97 -- * Type 'Lexeme'
98 data Lexeme
99 = LexemePairOpen Pair
100 | LexemePairClose Pair
101 | LexemePunctOrSym Char
102 | LexemeWhite Text
103 | LexemeWord Text
104 | LexemeToken Tokens
105 | LexemeEscape Char
106 | LexemeLink Text
107 deriving (Show, Eq)
108
109 appendLexeme :: Lexeme -> Pairs -> Pairs
110 appendLexeme lex ps =
111 dbg "appendLexeme" $
112 case dbg "appendLexeme" lex of
113 LexemePairOpen p -> openPair p ps
114 LexemePairClose p -> closePair p ps
115 LexemePunctOrSym c -> insertToken (TokenPlain (Text.singleton c)) ps
116 LexemeWhite wh -> insertToken (TokenPlain wh) ps
117 LexemeWord wo -> insertToken (TokenPlain wo) ps
118 LexemeToken ts -> insertTokens ts ps
119 LexemeEscape c -> insertToken (TokenEscape c) ps
120 LexemeLink lnk -> insertToken (TokenLink lnk) ps
121
122 appendLexemes :: Pairs -> [Lexeme] -> Pairs
123 appendLexemes = foldl' (flip appendLexeme)
124
125 -- * Parsers
126
127 p_Tokens :: Parser e s Tokens
128 p_Tokens = closePairs <$> p_Pairs (mempty,[])
129
130 p_Pairs :: Pairs -> Parser e s Pairs
131 p_Pairs gs = pdbg "Pairs" $
132 (p_Lexemes (mempty == gs) >>= p_Pairs . appendLexemes gs) <|>
133 (P.eof $> gs)
134
135 p_Lexemes :: Bool -> Parser e s [Lexeme]
136 p_Lexemes isBOF = pdbg "Lexemes" $
137 P.choice
138 [ P.try $ p_PairCloseWhite
139 , P.try $ p_PairWhiteOpen isBOF
140 , P.try $ p_PairCloseBorder
141 , P.try $ p_PairBorderOpen
142 , P.try $ p_PairClose
143 , P.try $ p_PairOpen
144 , P.try $ pure . LexemePunctOrSym <$> p_PunctOrSym
145 , P.try $ pure <$> p_White
146 , pure . LexemeWord <$> p_Word
147 ]
148
149 p_White :: Parser e s Lexeme
150 p_White = pdbg "White" $ LexemeWhite <$> p_Spaces
151
152 p_PunctOrSym :: Parser e s Char
153 p_PunctOrSym = P.satisfy $ \c ->
154 Char.isPunctuation c ||
155 Char.isSymbol c
156
157 p_PairCloseWhite :: Parser e s [Lexeme]
158 p_PairCloseWhite = pdbg "PairCloseWhite" $
159 (\c b -> mconcat c <> b)
160 <$> P.some (
161 P.try p_ElemOpen <|>
162 P.try p_ElemClose <|>
163 P.try p_PairClose <|>
164 pure . LexemePunctOrSym <$> p_PunctOrSym
165 )
166 <*> ((pure <$> p_White) <|> P.eof $> [])
167
168 p_PairWhiteOpen :: Bool -> Parser e s [Lexeme]
169 p_PairWhiteOpen isBOF = pdbg "PairWhiteOpen" $
170 (\b o -> b <> mconcat o)
171 <$> (if isBOF then return [] else pure <$> p_White)
172 <*> P.some (
173 P.try p_ElemOpen <|>
174 P.try p_ElemClose <|>
175 P.try p_PairOpen <|>
176 pure . LexemePunctOrSym <$> p_PunctOrSym
177 )
178
179 p_PairCloseBorder :: Parser e s [Lexeme]
180 p_PairCloseBorder = pdbg "PairCloseBorder" $
181 P.try p0 <|> p1
182 where
183 p0 =
184 (\c b -> mconcat $ c <> b)
185 <$> P.some (P.try p_PairClose)
186 <*> P.some (P.try $ P.choice
187 [ P.try p_ElemOpen
188 , P.try p_ElemClose
189 , do
190 c <- p_PunctOrSym
191 case l_PairOpen c <|> l_PairClose c of
192 Just l -> return [l]
193 Nothing -> fail ""
194 ])
195 p1 =
196 (\c b -> mconcat c <> [LexemePunctOrSym b])
197 <$> P.some (P.try p_PairClose)
198 <*> p_PunctOrSym
199
200 p_PairBorderOpen :: Parser e s [Lexeme]
201 p_PairBorderOpen = pdbg "PairBorderOpen" $
202 P.try p0 <|> p1
203 where
204 p0 =
205 (\b o -> mconcat $ b <> o)
206 <$> P.some (P.try $ P.choice
207 [ P.try p_ElemOpen
208 , P.try p_ElemClose
209 , do
210 c <- p_PunctOrSym
211 case l_PairOpen c <|> l_PairClose c of
212 Just l -> return [l]
213 Nothing -> fail ""
214 ])
215 <*> P.some (P.try p_PairOpen)
216 p1 =
217 (\b o -> LexemePunctOrSym b : mconcat o)
218 <$> p_PunctOrSym
219 <*> P.some (P.try p_PairOpen)
220
221 p_PairOpen :: Parser e s [Lexeme]
222 p_PairOpen = pdbg "PairOpen" $ do
223 P.choice
224 [ P.try p_ElemOpen
225 , P.try (pure <$> p_Escape)
226 , P.try (pure <$> p_Link)
227 , do
228 c <- p_PunctOrSym
229 case l_PairOpenOrClose LexemePairOpen c of
230 Just l -> return [l]
231 _ -> fail ""
232 ]
233
234 p_PairClose :: Parser e s [Lexeme]
235 p_PairClose = pdbg "PairClose" $ do
236 P.choice
237 [ P.try p_ElemClose
238 , P.try p_ElemSingle
239 , P.try (pure <$> p_Escape)
240 , P.try (pure <$> p_Link)
241 , do
242 c <- p_PunctOrSym
243 case l_PairOpenOrClose LexemePairClose c of
244 Just l -> return [l]
245 _ -> fail ""
246 ]
247
248 p_PairPlain :: (Char -> Maybe Lexeme) -> Parser e s [Lexeme]
249 p_PairPlain pair = pdbg "PairPlain" $ do
250 (<$> p_PunctOrSym) $ \c ->
251 pure $
252 LexemePunctOrSym c `fromMaybe`
253 pair c
254
255 l_PairOpenAndClose :: (Pair -> Lexeme) -> Char -> Maybe Lexeme
256 l_PairOpenAndClose lxm c =
257 case c of
258 '/' -> Just $ lxm PairSlash
259 '-' -> Just $ lxm PairDash
260 '"' -> Just $ lxm PairDoublequote
261 '\'' -> Just $ lxm PairSinglequote
262 '`' -> Just $ lxm PairBackquote
263 '_' -> Just $ lxm PairUnderscore
264 '*' -> Just $ lxm PairStar
265 '#' -> Just $ lxm PairHash
266 _ -> Nothing
267
268 l_PairOpenOrClose :: (Pair -> Lexeme) -> Char -> Maybe Lexeme
269 l_PairOpenOrClose lxm c =
270 l_PairOpenAndClose lxm c <|>
271 l_PairOpen c <|>
272 l_PairClose c
273
274 l_PairOpen :: Char -> Maybe Lexeme
275 l_PairOpen c =
276 case c of
277 '(' -> Just $ LexemePairOpen PairParen
278 '[' -> Just $ LexemePairOpen PairBracket
279 '{' -> Just $ LexemePairOpen PairBrace
280 '«' -> Just $ LexemePairOpen PairFrenchquote
281 _ -> Nothing
282
283 l_PairClose :: Char -> Maybe Lexeme
284 l_PairClose c =
285 case c of
286 ')' -> Just $ LexemePairClose PairParen
287 ']' -> Just $ LexemePairClose PairBracket
288 '}' -> Just $ LexemePairClose PairBrace
289 '»' -> Just $ LexemePairClose PairFrenchquote
290 _ -> Nothing
291
292 p_Link :: Parser e s Lexeme
293 p_Link =
294 (\scheme ss addr -> LexemeLink $ Text.pack $ scheme <> ss <> addr)
295 <$> P.option "" (P.try p_scheme)
296 <*> P.string "//"
297 <*> p_addr
298 where
299 p_scheme =
300 (<>)
301 <$> P.some (P.satisfy $ \c ->
302 Char.isAlphaNum c
303 || c=='_'
304 || c=='-'
305 || c=='+')
306 <*> P.string ":"
307 p_addr =
308 P.many $
309 P.satisfy $ \c ->
310 Char.isAlphaNum c
311 || c=='%'
312 || c=='/'
313 || c=='('
314 || c==')'
315 || c=='-'
316 || c=='_'
317 || c=='.'
318
319 p_Escape :: Parser e s Lexeme
320 p_Escape =
321 LexemeEscape
322 <$ P.char '\\'
323 <*> P.satisfy Char.isPrint
324
325 p_ElemSingle :: Parser e s [Lexeme]
326 p_ElemSingle = pdbg "ElemSingle" $
327 (\e as ->
328 [ LexemePairOpen $ PairElem e as
329 , LexemeToken $ mempty
330 , LexemePairClose $ PairElem e [] ])
331 <$ P.char '<'
332 <*> p_Word
333 <*> p_Attrs
334 <* P.string "/>"
335
336 p_ElemOpen :: Parser e s [Lexeme]
337 p_ElemOpen = pdbg "ElemOpen" $
338 (\e as oc ->
339 case oc of
340 True -> [ LexemePairOpen $ PairElem e as
341 , LexemeToken $ mempty
342 , LexemePairClose $ PairElem e [] ]
343 False -> [ LexemePairOpen $ PairElem e as
344 , LexemeToken $ tokens [TokenPlain ""]
345 ])
346 <$ P.char '<'
347 <*> p_Word
348 <*> p_Attrs
349 <*> P.option False (True <$ P.char '/')
350 <* P.char '>'
351
352 p_ElemClose :: Parser e s [Lexeme]
353 p_ElemClose = pdbg "ElemClose" $
354 (\e -> [LexemePairClose $ PairElem e []])
355 <$ P.string "</"
356 <*> p_Word
357 <* P.char '>'