]> Git — Sourcephile - doclang.git/blob - Language/TCT/Read/Token.hs
Fix LexemeLink parsing.
[doclang.git] / Language / TCT / Read / Token.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE OverloadedStrings #-}
3 {-# LANGUAGE Rank2Types #-}
4 {-# LANGUAGE TypeFamilies #-}
5 module Language.TCT.Read.Token where
6
7 import Control.Applicative (Applicative(..), Alternative(..))
8 import Control.Monad (Monad(..))
9 import Data.Bool
10 import Data.Char (Char)
11 import Data.Eq (Eq(..))
12 import Data.Foldable (Foldable(..))
13 import Data.Function (($), (.), flip)
14 import Data.Functor ((<$>), ($>), (<$))
15 import Data.Maybe (Maybe(..))
16 import Data.Monoid (Monoid(..))
17 import Data.Semigroup (Semigroup(..))
18 import Data.Sequence (ViewL(..), (<|))
19 import Data.Text (Text)
20 import Data.Text.Buildable (Buildable(..))
21 import Data.Tuple (fst,snd)
22 import Text.Show (Show(..))
23 import qualified Data.Char as Char
24 import qualified Data.Sequence as Seq
25 import qualified Data.Text as Text
26 import qualified Data.Text.Lazy as TL
27 import qualified Data.Text.Lazy.Builder as Builder
28 import qualified Text.Megaparsec as P
29
30 import Language.TCT.Token
31 import Language.TCT.Elem -- hiding (dbg)
32 import Language.TCT.Read.Elem -- hiding (pdbg)
33
34 {-
35 import Debug.Trace (trace)
36 dbg m x = trace (m <> ": " <> show x) x
37 pdbg m p = P.dbg m p
38 -}
39
40 textOf :: Buildable a => a -> Text
41 textOf = TL.toStrict . Builder.toLazyText . build
42
43 -- * Type 'Pairs'
44 type Pairs = (Tokens,[(Pair,Tokens)])
45
46 openPair :: Pair -> Pairs -> Pairs
47 openPair g (t,ms) = (t,(g,mempty):ms)
48
49 insertToken :: Token -> Pairs -> Pairs
50 insertToken tok (t,[]) = (t<>tokens [tok],[])
51 insertToken tok (t,(p0,t0):ps) = (t,(p0,t0<>tokens [tok]):ps)
52
53 insertTokens :: Tokens -> Pairs -> Pairs
54 insertTokens toks (t,[]) = (t<>toks,[])
55 insertTokens toks (t,(p0,t0):ps) = (t,(p0,t0<>toks):ps)
56
57 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
58 closePair :: Pair -> Pairs -> Pairs
59 closePair p (t,[]) = dbg "closePair" $
60 (t<>tokens [TokenPlain $ snd $ pairBorders p $ tokens [TokenPlain ""]],[])
61 closePair p (t,(p1,t1):ts) = dbg "closePair" $
62 case (p,p1) of
63 (PairElem x ax, PairElem y ay) | x == y ->
64 insertToken (TokenPair (PairElem x (ax<>ay)) t1) (t,ts)
65 (x,y) | x == y -> insertToken (TokenPair p1 t1) (t,ts)
66 _ ->
67 closePair p $
68 insertTokens
69 (closeUnpaired mempty (p1,t1))
70 (t,ts)
71
72 -- | Close a 'Pair' when there is not a matching 'LexemePairClose'.
73 closeUnpaired :: Tokens -> (Pair,Tokens) -> Tokens
74 closeUnpaired acc (p,tn) = dbg "closeUnpaired" $
75 case p of
76 -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'.
77 PairHash | TokenPlain t :< ts <- Seq.viewl $ unTokens $ tn <> acc ->
78 case Text.findIndex (not . isTagChar) t of
79 Just 0 -> tokens [TokenPlain $ fst $ pairBorders p mempty] <> tn <> acc
80 Just i -> Tokens $ TokenTag tag <| TokenPlain t' <| ts
81 where (tag,t') = Text.splitAt i t
82 Nothing -> Tokens $ TokenTag t <| ts
83 _ -> tokens [TokenPlain $ fst $ pairBorders p $ tokens [TokenPlain ""]] <> tn <> acc
84 where
85 isTagChar c =
86 Char.isAlphaNum c ||
87 c=='·' ||
88 case Char.generalCategory c of
89 Char.DashPunctuation -> True
90 Char.ConnectorPunctuation -> True
91 _ -> False
92
93 -- | Close remaining 'Pair's at end of parsing.
94 closePairs :: Pairs -> Tokens
95 closePairs (t0,ps) = dbg "closePairs" $
96 t0 <> foldl' closeUnpaired mempty ps
97
98 -- * Type 'Lexeme'
99 data Lexeme
100 = LexemePairOpen !Pair
101 | LexemePairClose !Pair
102 | LexemePunctOrSym !Char
103 | LexemeWhite !Text
104 | LexemeWord !Text
105 | LexemeToken !Tokens
106 | LexemeEscape !Char
107 | LexemeLink !Text
108 deriving (Show, Eq)
109
110 appendLexeme :: Lexeme -> Pairs -> Pairs
111 appendLexeme lex ps =
112 dbg "appendLexeme" $
113 case dbg "appendLexeme" lex of
114 LexemePairOpen p -> openPair p ps
115 LexemePairClose p -> closePair p ps
116 LexemePunctOrSym c -> insertToken (TokenPlain (Text.singleton c)) ps
117 LexemeWhite wh -> insertToken (TokenPlain wh) ps
118 LexemeWord wo -> insertToken (TokenPlain wo) ps
119 LexemeToken ts -> insertTokens ts ps
120 LexemeEscape c -> insertToken (TokenEscape c) ps
121 LexemeLink lnk -> insertToken (TokenLink lnk) ps
122
123 appendLexemes :: Pairs -> [Lexeme] -> Pairs
124 appendLexemes = foldl' (flip appendLexeme)
125
126 -- * Parsers
127
128 p_Tokens :: Parser e s Tokens
129 p_Tokens = closePairs <$> p_Pairs (mempty,[])
130
131 p_Pairs :: Pairs -> Parser e s Pairs
132 p_Pairs gs = pdbg "Pairs" $
133 (p_Lexemes (mempty == gs) >>= p_Pairs . appendLexemes gs) <|>
134 (P.eof $> gs)
135
136 p_Lexemes :: Bool -> Parser e s [Lexeme]
137 p_Lexemes isBOF = pdbg "Lexemes" $
138 P.choice
139 [ P.try $ p_PairCloseWhite
140 , P.try $ p_PairWhiteOpen isBOF
141 , P.try $ p_PairCloseBorder
142 , P.try $ p_PairBorderOpen
143 , P.try $ p_PairClose
144 , P.try $ p_PairOpen
145 , P.try $ pure . LexemePunctOrSym <$> p_PunctOrSym
146 , P.try $ pure <$> p_White
147 , pure . LexemeWord <$> p_Word
148 ]
149
150 p_White :: Parser e s Lexeme
151 p_White = pdbg "White" $ LexemeWhite <$> p_Spaces
152
153 p_PunctOrSym :: Parser e s Char
154 p_PunctOrSym =
155 P.satisfy $ \c ->
156 Char.isPunctuation c ||
157 Char.isSymbol c
158
159 p_PairCloseWhite :: Parser e s [Lexeme]
160 p_PairCloseWhite = pdbg "PairCloseWhite" $
161 (\c b o -> mconcat c <> b <> mconcat o)
162 <$> P.some (P.try $
163 P.try p_ElemOpen <|>
164 P.try p_ElemClose <|>
165 P.try p_PairClose <|>
166 pure . LexemePunctOrSym <$> p_PunctOrSym)
167 <*> (pure <$> p_White <|> P.eof $> [])
168 <*> P.many (P.try $
169 P.try p_ElemOpen <|>
170 P.try p_ElemClose <|>
171 P.try p_PairOpen <|>
172 pure . LexemePunctOrSym <$> p_PunctOrSym)
173
174 p_PairWhiteOpen :: Bool -> Parser e s [Lexeme]
175 p_PairWhiteOpen isBOF = pdbg "PairWhiteOpen" $
176 (\b o -> b <> mconcat o)
177 <$> (if isBOF then return [] else pure <$> p_White)
178 <*> P.some (P.try $
179 P.try p_ElemOpen <|>
180 P.try p_ElemClose <|>
181 P.try p_PairOpen <|>
182 pure . LexemePunctOrSym <$> p_PunctOrSym)
183
184 p_PairCloseBorder :: Parser e s [Lexeme]
185 p_PairCloseBorder = pdbg "PairCloseBorder" $
186 P.try p0 <|> p1
187 where
188 p0 =
189 (\c b -> mconcat $ c <> b)
190 <$> P.some (P.try p_PairClose)
191 <*> P.some (P.try $
192 P.choice
193 [ P.try p_ElemOpen
194 , P.try p_ElemClose
195 , do
196 c <- p_PunctOrSym
197 case l_PairOpen c <|> l_PairClose c of
198 Just l -> return [l]
199 Nothing -> fail "PairCloseBorder"
200 ])
201 p1 =
202 (\c b -> mconcat c <> [LexemePunctOrSym b])
203 <$> P.some (P.try p_PairClose)
204 <*> p_PunctOrSym
205
206 p_PairBorderOpen :: Parser e s [Lexeme]
207 p_PairBorderOpen = pdbg "PairBorderOpen" $
208 P.try p0 <|> p1
209 where
210 p0 =
211 (\b o -> mconcat $ b <> o)
212 <$> P.some (P.try $
213 P.choice
214 [ P.try p_ElemOpen
215 , P.try p_ElemClose
216 , do
217 c <- p_PunctOrSym
218 case l_PairOpen c <|> l_PairClose c of
219 Just l -> return [l]
220 Nothing -> fail "PairBorderOpen"
221 ])
222 <*> P.some (P.try p_PairOpen)
223 p1 =
224 (\b o -> LexemePunctOrSym b : mconcat o)
225 <$> p_PunctOrSym
226 <*> P.some (P.try p_PairOpen)
227
228 p_PairOpen :: Parser e s [Lexeme]
229 p_PairOpen = pdbg "PairOpen" $ do
230 P.choice
231 [ P.try p_ElemOpen
232 , P.try (pure <$> p_Escape)
233 , P.try (pure <$> p_Link)
234 , do
235 c <- p_PunctOrSym
236 case l_PairOpenOrClose LexemePairOpen c of
237 Just l -> return [l]
238 _ -> fail "PairOpen"
239 ]
240
241 p_PairClose :: Parser e s [Lexeme]
242 p_PairClose = pdbg "PairClose" $ do
243 P.choice
244 [ P.try p_ElemClose
245 , P.try p_ElemSingle
246 , P.try (pure <$> p_Escape)
247 , P.try (pure <$> p_Link)
248 , do
249 c <- p_PunctOrSym
250 case l_PairOpenOrClose LexemePairClose c of
251 Just l -> return [l]
252 _ -> fail "PairClose"
253 ]
254
255 l_PairOpenOrClose :: (Pair -> Lexeme) -> Char -> Maybe Lexeme
256 l_PairOpenOrClose lxm c =
257 l_PairOpenAndClose lxm c <|>
258 l_PairOpen c <|>
259 l_PairClose c
260
261 l_PairOpenAndClose :: (Pair -> Lexeme) -> Char -> Maybe Lexeme
262 l_PairOpenAndClose lxm c =
263 case c of
264 '/' -> Just $ lxm PairSlash
265 '-' -> Just $ lxm PairDash
266 '"' -> Just $ lxm PairDoublequote
267 '\'' -> Just $ lxm PairSinglequote
268 '`' -> Just $ lxm PairBackquote
269 '_' -> Just $ lxm PairUnderscore
270 '*' -> Just $ lxm PairStar
271 '#' -> Just $ lxm PairHash
272 _ -> Nothing
273
274 l_PairOpen :: Char -> Maybe Lexeme
275 l_PairOpen c =
276 case c of
277 '(' -> Just $ LexemePairOpen PairParen
278 '[' -> Just $ LexemePairOpen PairBracket
279 '{' -> Just $ LexemePairOpen PairBrace
280 '«' -> Just $ LexemePairOpen PairFrenchquote
281 _ -> Nothing
282
283 l_PairClose :: Char -> Maybe Lexeme
284 l_PairClose c =
285 case c of
286 ')' -> Just $ LexemePairClose PairParen
287 ']' -> Just $ LexemePairClose PairBracket
288 '}' -> Just $ LexemePairClose PairBrace
289 '»' -> Just $ LexemePairClose PairFrenchquote
290 _ -> Nothing
291
292 p_Link :: Parser e s Lexeme
293 p_Link =
294 (\scheme ss addr -> LexemeLink $ Text.pack $ scheme <> ss <> addr)
295 <$> P.option "" (P.try p_scheme)
296 <*> P.string "//"
297 <*> p_addr
298 where
299 p_scheme =
300 (<>)
301 <$> P.some (P.satisfy $ \c ->
302 Char.isAlphaNum c
303 || c=='_'
304 || c=='-'
305 || c=='+')
306 <*> P.string ":"
307 p_addr =
308 P.many $
309 P.satisfy $ \c ->
310 Char.isAlphaNum c
311 || c=='%'
312 || c=='/'
313 || c=='('
314 || c==')'
315 || c=='-'
316 || c=='_'
317 || c=='.'
318 || c=='#'
319 || c=='?'
320 || c=='='
321
322 p_Escape :: Parser e s Lexeme
323 p_Escape =
324 LexemeEscape
325 <$ P.char '\\'
326 <*> P.satisfy Char.isPrint
327
328 p_ElemSingle :: Parser e s [Lexeme]
329 p_ElemSingle = pdbg "ElemSingle" $
330 (\e as ->
331 [ LexemePairOpen $ PairElem e as
332 , LexemeToken $ mempty
333 , LexemePairClose $ PairElem e [] ])
334 <$ P.char '<'
335 <*> p_Word
336 <*> p_Attrs
337 <* P.string "/>"
338
339 p_ElemOpen :: Parser e s [Lexeme]
340 p_ElemOpen = pdbg "ElemOpen" $
341 (\e as oc ->
342 case oc of
343 True -> [ LexemePairOpen $ PairElem e as
344 , LexemeToken $ mempty
345 , LexemePairClose $ PairElem e [] ]
346 False -> [ LexemePairOpen $ PairElem e as
347 , LexemeToken $ tokens [TokenPlain ""]
348 ])
349 <$ P.char '<'
350 <*> p_Word
351 <*> p_Attrs
352 <*> P.option False (True <$ P.char '/')
353 <* P.char '>'
354
355 p_ElemClose :: Parser e s [Lexeme]
356 p_ElemClose = pdbg "ElemClose" $
357 (\e -> [LexemePairClose $ PairElem e []])
358 <$ P.string "</"
359 <*> p_Word
360 <* P.char '>'