]> Git — Sourcephile - doclang.git/blob - Language/TCT/Read/Token.hs
Use Text.Lazy to speedup Token parsing.
[doclang.git] / Language / TCT / Read / Token.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE OverloadedStrings #-}
3 {-# LANGUAGE Rank2Types #-}
4 {-# LANGUAGE TypeFamilies #-}
5 module Language.TCT.Read.Token where
6
7 import Control.Applicative (Applicative(..), Alternative(..))
8 import Control.Monad (Monad(..))
9 import Data.Bool
10 import Data.Char (Char)
11 import Data.Eq (Eq(..))
12 import Data.Foldable (Foldable(..))
13 import Data.Function (($), (.), flip)
14 import Data.Functor ((<$>), ($>), (<$))
15 import Data.Maybe (Maybe(..))
16 import Data.Monoid (Monoid(..))
17 import Data.Semigroup (Semigroup(..))
18 import Data.Sequence (ViewL(..), (<|))
19 import Data.Text (Text)
20 import Data.Text.Buildable (Buildable(..))
21 import Data.Tuple (fst,snd)
22 import Text.Show (Show(..))
23 import qualified Data.Char as Char
24 import qualified Data.Sequence as Seq
25 import qualified Data.Text as Text
26 import qualified Data.Text.Lazy as TL
27 import qualified Data.Text.Lazy.Builder as Builder
28 import qualified Text.Megaparsec as P
29
30 import Language.TCT.Tree (int64)
31 import Language.TCT.Token
32 import Language.TCT.Elem -- hiding (dbg)
33 import Language.TCT.Read.Elem -- hiding (pdbg)
34
35 {-
36 import Debug.Trace (trace)
37 dbg m x = trace (m <> ": " <> show x) x
38 pdbg m p = P.dbg m p
39 -}
40
41 textOf :: Buildable a => a -> Text
42 textOf = TL.toStrict . Builder.toLazyText . build
43
44 -- * Type 'Pairs'
45 type Pairs = (Tokens,[(Pair,Tokens)])
46
47 openPair :: Pair -> Pairs -> Pairs
48 openPair g (t,ms) = (t,(g,mempty):ms)
49
50 insertToken :: Token -> Pairs -> Pairs
51 insertToken tok (t,[]) = (t<>tokens [tok],[])
52 insertToken tok (t,(p0,t0):ps) = (t,(p0,t0<>tokens [tok]):ps)
53
54 insertTokens :: Tokens -> Pairs -> Pairs
55 insertTokens toks (t,[]) = (t<>toks,[])
56 insertTokens toks (t,(p0,t0):ps) = (t,(p0,t0<>toks):ps)
57
58 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
59 closePair :: Pair -> Pairs -> Pairs
60 closePair p (t,[]) = dbg "closePair" $
61 (t<>tokens [TokenPlain $ snd $ pairBorders p $ tokens [TokenPlain ""]],[])
62 closePair p (t,(p1,t1):ts) = dbg "closePair" $
63 case (p,p1) of
64 (PairElem x ax, PairElem y ay) | x == y ->
65 insertToken (TokenPair (PairElem x (ax<>ay)) t1) (t,ts)
66 (x,y) | x == y -> insertToken (TokenPair p1 t1) (t,ts)
67 _ ->
68 closePair p $
69 insertTokens
70 (closeUnpaired mempty (p1,t1))
71 (t,ts)
72
73 -- | Close a 'Pair' when there is not a matching 'LexemePairClose'.
74 closeUnpaired :: Tokens -> (Pair,Tokens) -> Tokens
75 closeUnpaired acc (p,tn) = dbg "closeUnpaired" $
76 case p of
77 -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'.
78 PairHash | TokenPlain t :< ts <- Seq.viewl $ unTokens $ tn <> acc ->
79 case Text.findIndex (not . isTagChar) (TL.toStrict t) of
80 Just 0 -> tokens [TokenPlain $ fst $ pairBorders p mempty] <> tn <> acc
81 Just i -> Tokens $ TokenTag (TL.toStrict tag) <| TokenPlain t' <| ts
82 where (tag,t') = TL.splitAt (int64 i) t
83 Nothing -> Tokens $ TokenTag (TL.toStrict t) <| ts
84 _ -> tokens [TokenPlain $ fst $ pairBorders p $ tokens [TokenPlain ""]] <> tn <> acc
85 where
86 isTagChar c =
87 Char.isAlphaNum c ||
88 c=='·' ||
89 case Char.generalCategory c of
90 Char.DashPunctuation -> True
91 Char.ConnectorPunctuation -> True
92 _ -> False
93
94 -- | Close remaining 'Pair's at end of parsing.
95 closePairs :: Pairs -> Tokens
96 closePairs (t0,ps) = dbg "closePairs" $
97 t0 <> foldl' closeUnpaired mempty ps
98
99 -- * Type 'Lexeme'
100 data Lexeme
101 = LexemePairOpen !Pair
102 | LexemePairClose !Pair
103 | LexemePunctOrSym !Char
104 | LexemeWhite !Text
105 | LexemeWord !TL.Text
106 | LexemeToken !Tokens
107 | LexemeEscape !Char
108 | LexemeLink !Text
109 deriving (Show, Eq)
110
111 appendLexeme :: Lexeme -> Pairs -> Pairs
112 appendLexeme lex ps =
113 dbg "appendLexeme" $
114 case dbg "appendLexeme" lex of
115 LexemePairOpen p -> openPair p ps
116 LexemePairClose p -> closePair p ps
117 LexemePunctOrSym c -> insertToken (TokenPlain (TL.singleton c)) ps
118 LexemeWhite wh -> insertToken (TokenPlain (TL.fromStrict wh)) ps
119 LexemeWord wo -> insertToken (TokenPlain wo) ps
120 LexemeToken ts -> insertTokens ts ps
121 LexemeEscape c -> insertToken (TokenEscape c) ps
122 LexemeLink lnk -> insertToken (TokenLink lnk) ps
123
124 appendLexemes :: Pairs -> [Lexeme] -> Pairs
125 appendLexemes = foldl' (flip appendLexeme)
126
127 -- * Parsers
128
129 p_Tokens :: Parser e s Tokens
130 p_Tokens = closePairs <$> p_Pairs (mempty,[])
131
132 p_Pairs :: Pairs -> Parser e s Pairs
133 p_Pairs gs = pdbg "Pairs" $
134 (p_Lexemes (mempty == gs) >>= p_Pairs . appendLexemes gs) <|>
135 (P.eof $> gs)
136
137 p_Lexemes :: Bool -> Parser e s [Lexeme]
138 p_Lexemes isBOF = pdbg "Lexemes" $
139 P.choice
140 [ P.try $ p_PairCloseWhite
141 , P.try $ p_PairWhiteOpen isBOF
142 , P.try $ p_PairCloseBorder
143 , P.try $ p_PairBorderOpen
144 , P.try $ p_PairClose
145 , P.try $ p_PairOpen
146 , P.try $ pure . LexemePunctOrSym <$> p_PunctOrSym
147 , P.try $ pure <$> p_White
148 , pure . LexemeWord <$> p_Word
149 ]
150
151 p_White :: Parser e s Lexeme
152 p_White = pdbg "White" $ LexemeWhite <$> p_Spaces
153
154 p_PunctOrSym :: Parser e s Char
155 p_PunctOrSym =
156 P.satisfy $ \c ->
157 Char.isPunctuation c ||
158 Char.isSymbol c
159
160 p_PairCloseWhite :: Parser e s [Lexeme]
161 p_PairCloseWhite = pdbg "PairCloseWhite" $
162 (\c b o -> mconcat c <> b <> mconcat o)
163 <$> P.some (P.try $
164 P.try p_ElemOpen <|>
165 P.try p_ElemClose <|>
166 P.try p_PairClose <|>
167 pure . LexemePunctOrSym <$> p_PunctOrSym)
168 <*> (pure <$> p_White <|> P.eof $> [])
169 <*> P.many (P.try $
170 P.try p_ElemOpen <|>
171 P.try p_ElemClose <|>
172 P.try p_PairOpen <|>
173 pure . LexemePunctOrSym <$> p_PunctOrSym)
174
175 p_PairWhiteOpen :: Bool -> Parser e s [Lexeme]
176 p_PairWhiteOpen isBOF = pdbg "PairWhiteOpen" $
177 (\b o -> b <> mconcat o)
178 <$> (if isBOF then return [] else pure <$> p_White)
179 <*> P.some (P.try $
180 P.try p_ElemOpen <|>
181 P.try p_ElemClose <|>
182 P.try p_PairOpen <|>
183 pure . LexemePunctOrSym <$> p_PunctOrSym)
184
185 p_PairCloseBorder :: Parser e s [Lexeme]
186 p_PairCloseBorder = pdbg "PairCloseBorder" $
187 P.try p0 <|> p1
188 where
189 p0 =
190 (\c b -> mconcat $ c <> b)
191 <$> P.some (P.try p_PairClose)
192 <*> P.some (P.try $
193 P.choice
194 [ P.try p_ElemOpen
195 , P.try p_ElemClose
196 , do
197 c <- p_PunctOrSym
198 case l_PairOpen c <|> l_PairClose c of
199 Just l -> return [l]
200 Nothing -> fail "PairCloseBorder"
201 ])
202 p1 =
203 (\c b -> mconcat c <> [LexemePunctOrSym b])
204 <$> P.some (P.try p_PairClose)
205 <*> p_PunctOrSym
206
207 p_PairBorderOpen :: Parser e s [Lexeme]
208 p_PairBorderOpen = pdbg "PairBorderOpen" $
209 P.try p0 <|> p1
210 where
211 p0 =
212 (\b o -> mconcat $ b <> o)
213 <$> P.some (P.try $
214 P.choice
215 [ P.try p_ElemOpen
216 , P.try p_ElemClose
217 , do
218 c <- p_PunctOrSym
219 case l_PairOpen c <|> l_PairClose c of
220 Just l -> return [l]
221 Nothing -> fail "PairBorderOpen"
222 ])
223 <*> P.some (P.try p_PairOpen)
224 p1 =
225 (\b o -> LexemePunctOrSym b : mconcat o)
226 <$> p_PunctOrSym
227 <*> P.some (P.try p_PairOpen)
228
229 p_PairOpen :: Parser e s [Lexeme]
230 p_PairOpen = pdbg "PairOpen" $ do
231 P.choice
232 [ P.try p_ElemOpen
233 , P.try (pure <$> p_Escape)
234 , P.try (pure <$> p_Link)
235 , do
236 c <- p_PunctOrSym
237 case l_PairOpenOrClose LexemePairOpen c of
238 Just l -> return [l]
239 _ -> fail "PairOpen"
240 ]
241
242 p_PairClose :: Parser e s [Lexeme]
243 p_PairClose = pdbg "PairClose" $ do
244 P.choice
245 [ P.try p_ElemClose
246 , P.try p_ElemSingle
247 , P.try (pure <$> p_Escape)
248 , P.try (pure <$> p_Link)
249 , do
250 c <- p_PunctOrSym
251 case l_PairOpenOrClose LexemePairClose c of
252 Just l -> return [l]
253 _ -> fail "PairClose"
254 ]
255
256 l_PairOpenOrClose :: (Pair -> Lexeme) -> Char -> Maybe Lexeme
257 l_PairOpenOrClose lxm c =
258 l_PairOpenAndClose lxm c <|>
259 l_PairOpen c <|>
260 l_PairClose c
261
262 l_PairOpenAndClose :: (Pair -> Lexeme) -> Char -> Maybe Lexeme
263 l_PairOpenAndClose lxm c =
264 case c of
265 '/' -> Just $ lxm PairSlash
266 '-' -> Just $ lxm PairDash
267 '"' -> Just $ lxm PairDoublequote
268 '\'' -> Just $ lxm PairSinglequote
269 '`' -> Just $ lxm PairBackquote
270 '_' -> Just $ lxm PairUnderscore
271 '*' -> Just $ lxm PairStar
272 '#' -> Just $ lxm PairHash
273 _ -> Nothing
274
275 l_PairOpen :: Char -> Maybe Lexeme
276 l_PairOpen c =
277 case c of
278 '(' -> Just $ LexemePairOpen PairParen
279 '[' -> Just $ LexemePairOpen PairBracket
280 '{' -> Just $ LexemePairOpen PairBrace
281 '«' -> Just $ LexemePairOpen PairFrenchquote
282 _ -> Nothing
283
284 l_PairClose :: Char -> Maybe Lexeme
285 l_PairClose c =
286 case c of
287 ')' -> Just $ LexemePairClose PairParen
288 ']' -> Just $ LexemePairClose PairBracket
289 '}' -> Just $ LexemePairClose PairBrace
290 '»' -> Just $ LexemePairClose PairFrenchquote
291 _ -> Nothing
292
293 p_Link :: Parser e s Lexeme
294 p_Link =
295 (\scheme ss addr -> LexemeLink $ Text.pack $ scheme <> ss <> addr)
296 <$> P.option "" (P.try p_scheme)
297 <*> P.string "//"
298 <*> p_addr
299 where
300 p_scheme =
301 (<>)
302 <$> P.some (P.satisfy $ \c ->
303 Char.isAlphaNum c
304 || c=='_'
305 || c=='-'
306 || c=='+')
307 <*> P.string ":"
308 p_addr =
309 P.many $
310 P.satisfy $ \c ->
311 Char.isAlphaNum c
312 || c=='%'
313 || c=='/'
314 || c=='('
315 || c==')'
316 || c=='-'
317 || c=='_'
318 || c=='.'
319 || c=='#'
320 || c=='?'
321 || c=='='
322
323 p_Escape :: Parser e s Lexeme
324 p_Escape =
325 LexemeEscape
326 <$ P.char '\\'
327 <*> P.satisfy Char.isPrint
328
329 p_ElemSingle :: Parser e s [Lexeme]
330 p_ElemSingle = pdbg "ElemSingle" $
331 (\(TL.toStrict -> e) as ->
332 [ LexemePairOpen $ PairElem e as
333 , LexemeToken $ mempty
334 , LexemePairClose $ PairElem e [] ])
335 <$ P.char '<'
336 <*> p_Word
337 <*> p_Attrs
338 <* P.string "/>"
339
340 p_ElemOpen :: Parser e s [Lexeme]
341 p_ElemOpen = pdbg "ElemOpen" $
342 (\(TL.toStrict -> e) as oc ->
343 case oc of
344 True -> [ LexemePairOpen $ PairElem e as
345 , LexemeToken $ mempty
346 , LexemePairClose $ PairElem e [] ]
347 False -> [ LexemePairOpen $ PairElem e as
348 , LexemeToken $ tokens [TokenPlain ""]
349 ])
350 <$ P.char '<'
351 <*> p_Word
352 <*> p_Attrs
353 <*> P.option False (True <$ P.char '/')
354 <* P.char '>'
355
356 p_ElemClose :: Parser e s [Lexeme]
357 p_ElemClose = pdbg "ElemClose" $
358 (\(TL.toStrict -> e) -> [LexemePairClose $ PairElem e []])
359 <$ P.string "</"
360 <*> p_Word
361 <* P.char '>'