]> Git — Sourcephile - doclang.git/blob - Language/TCT/Read/Token.hs
Split Token/Tokens types.
[doclang.git] / Language / TCT / Read / Token.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE OverloadedStrings #-}
3 {-# LANGUAGE Rank2Types #-}
4 {-# LANGUAGE TypeFamilies #-}
5 module Language.TCT.Read.Token where
6
7 import Control.Applicative (Applicative(..), Alternative(..))
8 import Control.Monad (Monad(..))
9 import Data.Bool
10 import Data.Char (Char)
11 import Data.Eq (Eq(..))
12 import Data.Foldable (Foldable(..))
13 import Data.Function (($), (.), flip)
14 import Data.Functor ((<$>), ($>), (<$))
15 import Data.Maybe (Maybe(..), fromMaybe)
16 import Data.Monoid (Monoid(..))
17 import Data.Semigroup (Semigroup(..))
18 import Data.Sequence (ViewL(..), (<|))
19 import Data.Text (Text)
20 import Data.Text.Buildable (Buildable(..))
21 import Data.Tuple (fst,snd)
22 import Text.Show (Show(..))
23 import qualified Data.Char as Char
24 import qualified Data.Sequence as Seq
25 import qualified Data.Text as Text
26 import qualified Data.Text.Lazy as TL
27 import qualified Data.Text.Lazy.Builder as Builder
28 import qualified Text.Megaparsec as P
29
30 import Language.TCT.Token
31 import Language.TCT.Elem
32 import Language.TCT.Read.Elem -- hiding (pdbg)
33
34 -- pdbg m p = P.dbg m p
35
36 textOf :: Buildable a => a -> Text
37 textOf = TL.toStrict . Builder.toLazyText . build
38
39 -- * Type 'Pairs'
40 type Pairs = (Tokens,[(Pair,Tokens)])
41
42 openPair :: Pair -> Pairs -> Pairs
43 openPair g (t,ms) = (t,(g,mempty):ms)
44
45 insertToken :: Token -> Pairs -> Pairs
46 insertToken tok (t,[]) = (t<>tokens [tok],[])
47 insertToken tok (t,(p0,t0):ps) = (t,(p0,t0<>tokens [tok]):ps)
48
49 insertTokens :: Tokens -> Pairs -> Pairs
50 insertTokens toks (t,[]) = (t<>toks,[])
51 insertTokens toks (t,(p0,t0):ps) = (t,(p0,t0<>toks):ps)
52
53 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
54 closePair :: Pair -> Pairs -> Pairs
55 closePair p (t,[]) = dbg "closePair" $ (t<>tokens [TokenPlain (snd $ pairBorders p mempty)],[])
56 closePair p (t,(p1,t1):ts) = dbg "closePair" $
57 case (p,p1) of
58 (PairElem x ax, PairElem y ay) | x == y ->
59 insertToken (TokenPair (PairElem x (ax<>ay)) t1) (t,ts)
60 (x,y) | x == y -> insertToken (TokenPair p1 t1) (t,ts)
61 _ ->
62 closePair p $
63 insertTokens
64 (closeUnpaired mempty (p1,t1))
65 (t,ts)
66
67 -- | Close a 'Pair' when there is not a matching 'LexemePairClose'.
68 closeUnpaired :: Tokens -> (Pair,Tokens) -> Tokens
69 closeUnpaired acc (p,tn) = dbg "closeUnpaired" $
70 case p of
71 -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'.
72 PairHash | TokenPlain t :< ts <- Seq.viewl $ unTokens $ tn <> acc ->
73 case Text.findIndex (not . isTagChar) t of
74 Just 0 -> tokens [TokenPlain $ fst $ pairBorders p mempty] <> tn <> acc
75 Just i -> Tokens $ TokenTag tag <| TokenPlain t' <| ts
76 where (tag,t') = Text.splitAt i t
77 Nothing -> Tokens $ TokenTag t <| ts
78 _ -> tokens [TokenPlain $ fst $ pairBorders p mempty] <> tn <> acc
79 where
80 isTagChar c =
81 Char.isAlphaNum c ||
82 c=='·' ||
83 case Char.generalCategory c of
84 Char.DashPunctuation -> True
85 Char.ConnectorPunctuation -> True
86 _ -> False
87
88 -- | Close remaining 'Pair's at end of parsing.
89 closePairs :: Pairs -> Tokens
90 closePairs (t0,ps) = dbg "closePairs" $
91 t0 <> foldl' closeUnpaired mempty ps
92
93 -- * Type 'Lexeme'
94 data Lexeme
95 = LexemePairOpen Pair
96 | LexemePairClose Pair
97 | LexemePunctOrSym Char
98 | LexemeWhite Text
99 | LexemeWord Text
100 | LexemeToken Tokens
101 | LexemeEscape Char
102 | LexemeLink Text
103 deriving (Show, Eq)
104
105 appendLexeme :: Lexeme -> Pairs -> Pairs
106 appendLexeme lex ps =
107 dbg "appendLexeme" $
108 case dbg "appendLexeme" lex of
109 LexemePairOpen p -> openPair p ps
110 LexemePairClose p -> closePair p ps
111 LexemePunctOrSym c -> insertToken (TokenPlain (Text.singleton c)) ps
112 LexemeWhite wh -> insertToken (TokenPlain wh) ps
113 LexemeWord wo -> insertToken (TokenPlain wo) ps
114 LexemeToken ts -> insertTokens ts ps
115 LexemeEscape c -> insertToken (TokenEscape c) ps
116 LexemeLink lnk -> insertToken (TokenLink lnk) ps
117
118 appendLexemes :: Pairs -> [Lexeme] -> Pairs
119 appendLexemes = foldl' (flip appendLexeme)
120
121 -- * Parsers
122
123 p_Tokens :: Parser e s Tokens
124 p_Tokens = closePairs <$> p_Pairs (mempty,[])
125
126 p_Pairs :: Pairs -> Parser e s Pairs
127 p_Pairs gs = pdbg "Pairs" $
128 (p_Lexemes (mempty == gs) >>= p_Pairs . appendLexemes gs) <|>
129 (P.eof $> gs)
130
131 p_Lexemes :: Bool -> Parser e s [Lexeme]
132 p_Lexemes isBOF = pdbg "Lexemes" $
133 P.choice
134 [ P.try $ p_PairCloseWhite
135 , P.try $ p_PairWhiteOpen isBOF
136 , P.try $ p_PairCloseBorder
137 , P.try $ p_PairBorderOpen
138 , P.try $ p_PairClose
139 , P.try $ p_PairOpen
140 , P.try $ pure . LexemePunctOrSym <$> p_PunctOrSym
141 , P.try $ pure <$> p_White
142 , pure . LexemeWord <$> p_Word
143 ]
144
145 p_White :: Parser e s Lexeme
146 p_White = pdbg "White" $ LexemeWhite <$> p_Spaces
147
148 p_PunctOrSym :: Parser e s Char
149 p_PunctOrSym = P.satisfy $ \c ->
150 Char.isPunctuation c ||
151 Char.isSymbol c
152
153 p_PairCloseWhite :: Parser e s [Lexeme]
154 p_PairCloseWhite = pdbg "PairCloseWhite" $
155 (\c b -> mconcat c <> b)
156 <$> P.some (P.try p_PairClose <|> pure . LexemePunctOrSym <$> p_PunctOrSym)
157 <*> ((pure <$> p_White) <|> P.eof $> [])
158
159 p_PairWhiteOpen :: Bool -> Parser e s [Lexeme]
160 p_PairWhiteOpen isBOF = pdbg "PairWhiteOpen" $
161 (\b o -> b <> mconcat o)
162 <$> (if isBOF then return [] else pure <$> p_White)
163 <*> P.some (P.try p_PairOpen <|> pure . LexemePunctOrSym <$> p_PunctOrSym)
164
165 p_PairCloseBorder :: Parser e s [Lexeme]
166 p_PairCloseBorder = pdbg "PairCloseBorder" $
167 P.try p0 <|> p1
168 where
169 p0 =
170 (\c b -> mconcat $ c <> b)
171 <$> P.some (P.try p_PairClose)
172 <*> P.some (P.try $ P.choice
173 [ P.try p_ElemOpen
174 , P.try p_ElemClose
175 , do
176 c <- p_PunctOrSym
177 case l_PairClose c of
178 Just l -> return [l]
179 Nothing ->
180 case l_PairOpenAndClose LexemePairOpen c <|> l_PairOpen c of
181 Nothing -> return [LexemePunctOrSym c]
182 _ -> fail ""
183 ])
184 p1 =
185 (\c b -> mconcat c <> [LexemePunctOrSym b])
186 <$> P.some (P.try p_PairClose)
187 <*> p_PunctOrSym
188
189 p_PairBorderOpen :: Parser e s [Lexeme]
190 p_PairBorderOpen = pdbg "PairBorderOpen" $
191 P.try p0 <|> p1
192 where
193 p0 =
194 (\b o -> mconcat $ b <> o)
195 <$> P.some (P.try $ P.choice
196 [ P.try p_ElemOpen
197 , P.try p_ElemClose
198 , do
199 c <- p_PunctOrSym
200 case l_PairOpen c <|> l_PairClose c of
201 Just l -> return [l]
202 Nothing -> fail ""
203 ])
204 <*> P.some (P.try p_PairOpen)
205 p1 =
206 (\b o -> LexemePunctOrSym b : mconcat o)
207 <$> p_PunctOrSym
208 <*> P.some (P.try p_PairOpen)
209
210 p_PairOpen :: Parser e s [Lexeme]
211 p_PairOpen = pdbg "PairOpen" $ do
212 P.choice
213 [ P.try p_ElemOpen
214 , P.try (pure <$> p_Escape)
215 , P.try (pure <$> p_Link)
216 , do
217 c <- p_PunctOrSym
218 case l_PairOpenOrClose LexemePairOpen c of
219 Just l -> return [l]
220 _ -> fail ""
221 ]
222
223 p_PairClose :: Parser e s [Lexeme]
224 p_PairClose = pdbg "PairClose" $ do
225 P.choice
226 [ P.try p_ElemClose
227 , P.try p_ElemSingle
228 , P.try (pure <$> p_Escape)
229 , P.try (pure <$> p_Link)
230 , do
231 c <- p_PunctOrSym
232 case l_PairOpenOrClose LexemePairClose c of
233 Just l -> return [l]
234 _ -> fail ""
235 ]
236
237 p_PairPlain :: (Char -> Maybe Lexeme) -> Parser e s [Lexeme]
238 p_PairPlain pair = pdbg "PairPlain" $ do
239 (<$> p_PunctOrSym) $ \c ->
240 pure $
241 LexemePunctOrSym c `fromMaybe`
242 pair c
243
244 l_PairOpenAndClose :: (Pair -> Lexeme) -> Char -> Maybe Lexeme
245 l_PairOpenAndClose lxm c =
246 case c of
247 '/' -> Just $ lxm PairSlash
248 '-' -> Just $ lxm PairDash
249 '"' -> Just $ lxm PairDoublequote
250 '\'' -> Just $ lxm PairSinglequote
251 '`' -> Just $ lxm PairBackquote
252 '_' -> Just $ lxm PairUnderscore
253 '*' -> Just $ lxm PairStar
254 '#' -> Just $ lxm PairHash
255 _ -> Nothing
256
257 l_PairOpenOrClose :: (Pair -> Lexeme) -> Char -> Maybe Lexeme
258 l_PairOpenOrClose lxm c =
259 l_PairOpenAndClose lxm c <|>
260 l_PairOpen c <|>
261 l_PairClose c
262
263 l_PairOpen :: Char -> Maybe Lexeme
264 l_PairOpen c =
265 case c of
266 '(' -> Just $ LexemePairOpen PairParen
267 '[' -> Just $ LexemePairOpen PairBracket
268 '{' -> Just $ LexemePairOpen PairBrace
269 '«' -> Just $ LexemePairOpen PairFrenchquote
270 _ -> Nothing
271
272 l_PairClose :: Char -> Maybe Lexeme
273 l_PairClose c =
274 case c of
275 ')' -> Just $ LexemePairClose PairParen
276 ']' -> Just $ LexemePairClose PairBracket
277 '}' -> Just $ LexemePairClose PairBrace
278 '»' -> Just $ LexemePairClose PairFrenchquote
279 _ -> Nothing
280
281 p_Link :: Parser e s Lexeme
282 p_Link =
283 (\scheme ss addr -> LexemeLink $ Text.pack $ scheme <> ss <> addr)
284 <$> P.option "" (P.try p_scheme)
285 <*> P.string "//"
286 <*> p_addr
287 where
288 p_scheme =
289 (<>)
290 <$> P.some (P.satisfy $ \c ->
291 Char.isAlphaNum c
292 || c=='_'
293 || c=='-'
294 || c=='+')
295 <*> P.string ":"
296 p_addr =
297 P.many $
298 P.satisfy $ \c ->
299 Char.isAlphaNum c
300 || c=='%'
301 || c=='/'
302 || c=='('
303 || c==')'
304 || c=='-'
305 || c=='_'
306 || c=='.'
307
308 p_Escape :: Parser e s Lexeme
309 p_Escape =
310 LexemeEscape
311 <$ P.char '\\'
312 <*> P.satisfy Char.isPrint
313
314 p_ElemSingle :: Parser e s [Lexeme]
315 p_ElemSingle = pdbg "ElemOpen" $
316 (\e as ->
317 [ LexemePairOpen $ PairElem e as
318 , LexemeToken $ Tokens mempty
319 -- NOTE: encode that it's the same Elem for open and close
320 , LexemePairClose $ PairElem e [] ])
321 <$ P.char '<'
322 <*> p_Word
323 <*> p_Attrs
324 <* P.string "/>"
325
326 p_ElemOpen :: Parser e s [Lexeme]
327 p_ElemOpen = pdbg "ElemOpen" $
328 (\e as oc ->
329 case oc of
330 True -> [ LexemePairOpen $ PairElem e as
331 , LexemeToken $ Tokens mempty
332 , LexemePairClose $ PairElem e [] ]
333 False -> [LexemePairOpen $ PairElem e as])
334 <$ P.char '<'
335 <*> p_Word
336 <*> p_Attrs
337 <*> P.option False (True <$ P.char '/')
338 <* P.char '>'
339
340 p_ElemClose :: Parser e s [Lexeme]
341 p_ElemClose = pdbg "ElemClose" $
342 (\e -> [LexemePairClose $ PairElem e []])
343 <$ P.string "</"
344 <*> p_Word
345 <* P.char '>'