]> Git — Sourcephile - doclang.git/blob - Language/TCT/Read/Token.hs
Simplify Token parsing.
[doclang.git] / Language / TCT / Read / Token.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE OverloadedStrings #-}
3 {-# LANGUAGE Rank2Types #-}
4 {-# LANGUAGE TypeFamilies #-}
5 module Language.TCT.Read.Token where
6
7 import Control.Applicative (Applicative(..), Alternative(..))
8 import Control.Monad (Monad(..))
9 import Data.Bool
10 import Data.Char (Char)
11 import Data.Either (Either(..))
12 import Data.Eq (Eq(..))
13 import Data.Foldable (Foldable(..))
14 import Data.Function (($), (.))
15 import Data.Functor ((<$>), ($>), (<$))
16 import Data.List.NonEmpty (NonEmpty(..))
17 import Data.Maybe (Maybe(..))
18 import Data.Monoid (Monoid(..))
19 import Data.Semigroup (Semigroup(..))
20 import Data.Sequence (ViewL(..), (<|))
21 import Data.Text (Text)
22 import Data.Text.Buildable (Buildable(..))
23 import Data.Tuple (fst,snd)
24 import Text.Show (Show(..))
25 import qualified Data.Char as Char
26 import qualified Data.Sequence as Seq
27 import qualified Data.Set as Set
28 import qualified Data.Text as Text
29 import qualified Data.Text.Lazy as TL
30 import qualified Data.Text.Lazy.Builder as Builder
31 import qualified Text.Megaparsec as P
32 import qualified Text.Megaparsec.Prim as P
33
34 import Language.TCT.Token
35 import Language.TCT.Elem -- hiding (dbg)
36 import Language.TCT.Read.Elem -- hiding (pdbg)
37
38 {-
39 import Debug.Trace (trace)
40 dbg m x = trace (m <> ": " <> show x) x
41 pdbg m p = P.dbg m p
42 -}
43
44 textOf :: Buildable a => a -> Text
45 textOf = TL.toStrict . Builder.toLazyText . build
46
47 -- * Type 'Pairs'
48 type Pairs = (Tokens,[(Pair,Tokens)])
49
50 appendToken :: Pairs -> Token -> Pairs
51 appendToken ps = appendTokens ps . Tokens . Seq.singleton
52
53 appendTokens :: Pairs -> Tokens -> Pairs
54 appendTokens (t,[]) toks = (t<>toks,[])
55 appendTokens (t,(p0,t0):ps) toks = (t,(p0,t0<>toks):ps)
56
57 openPair :: Pairs -> Pair -> Pairs
58 openPair (t,ms) p = (t,(p,mempty):ms)
59
60 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
61 closePair :: Pairs -> Pair -> Pairs
62 closePair (t,[]) p = dbg "closePair" $
63 (t<>tokens1 (TokenPlain $ snd $ pairBorders p tokensPlainEmpty),[])
64 closePair (t,(p1,t1):ts) p = dbg "closePair" $
65 case (p,p1) of
66 (PairElem x ax, PairElem y ay) | x == y ->
67 appendToken (t,ts) $ TokenPair (PairElem x (ax<>ay)) t1
68 (x,y) | x == y -> appendToken (t,ts) $ TokenPair p1 t1
69 _ ->
70 (`closePair` p) $
71 appendTokens
72 (t,ts)
73 (closeUnpaired mempty (p1,t1))
74
75 -- | Close a 'Pair' when there is not a matching 'LexemePairClose'.
76 closeUnpaired :: Tokens -> (Pair,Tokens) -> Tokens
77 closeUnpaired acc (p,toks) = dbg "closeUnpaired" $
78 case p of
79 -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'.
80 PairHash | TokenPlain t :< ts <- Seq.viewl $ unTokens $ toks <> acc ->
81 case Text.findIndex (not . isTagChar) t of
82 Just 0 -> toksHash <> toks <> acc
83 Just i -> Tokens $ TokenTag tag <| TokenPlain t' <| ts
84 where (tag,t') = Text.splitAt i t
85 Nothing | Text.null t -> toksHash <> toks <> acc
86 Nothing -> Tokens $ TokenTag t <| ts
87 where toksHash = tokens1 $ TokenPlain $ fst $ pairBorders p mempty
88 _ -> tokens1 (TokenPlain $ fst $ pairBorders p tokensPlainEmpty) <> toks <> acc
89 where
90 isTagChar c =
91 Char.isAlphaNum c ||
92 c=='·' ||
93 case Char.generalCategory c of
94 Char.DashPunctuation -> True
95 Char.ConnectorPunctuation -> True
96 _ -> False
97
98 -- | Close remaining 'Pair's at end of parsing.
99 closePairs :: Pairs -> Tokens
100 closePairs (t0,ps) = dbg "closePairs" $
101 t0 <> foldl' closeUnpaired mempty ps
102
103 appendLexeme :: Lexeme -> Pairs -> Pairs
104 appendLexeme lex acc =
105 dbg "appendLexeme" $
106 case dbg "appendLexeme" lex of
107 LexemePairOpen ps -> foldl' open acc ps
108 where
109 open a p@PairElem{} = openPair a p `appendToken` TokenPlain ""
110 open a p = openPair a p
111 LexemePairClose ps -> foldl' closePair acc ps
112 LexemePairAny ps -> appendTokens acc $ tokens $ TokenPlain . fst . (`pairBorders` mempty) <$> ps
113 LexemePairBoth ps -> appendTokens acc $ tokens $ (`TokenPair`mempty) <$> ps
114 LexemeEscape c -> appendToken acc $ TokenEscape c
115 LexemeLink t -> appendToken acc $ TokenLink t
116 LexemeWhite cs -> appendToken acc $ TokenPlain cs
117 LexemeAlphaNum cs -> appendToken acc $ TokenPlain $ Text.pack cs
118 LexemeChar c -> appendToken acc $ TokenPlain $ Text.singleton c
119 LexemeToken ts -> appendTokens acc ts
120
121 -- * Type 'Lexeme'
122 data Lexeme
123 = LexemePairOpen ![Pair]
124 | LexemePairClose ![Pair]
125 | LexemePairAny ![Pair]
126 | LexemePairBoth ![Pair]
127 | LexemeEscape !Char
128 | LexemeLink !Text
129 | LexemeWhite !White
130 | LexemeAlphaNum ![Char]
131 | LexemeChar !Char
132 | LexemeToken !Tokens
133 deriving (Eq, Show)
134
135 p_satisfyMaybe :: (P.MonadParsec e s m, P.Token s ~ Char) => (Char -> Maybe a) -> m a
136 p_satisfyMaybe f = P.token testChar Nothing
137 where
138 testChar c =
139 case f c of
140 Just a -> Right a
141 Nothing -> Left (Set.singleton $ P.Tokens $ c:|[], Set.empty, Set.empty)
142
143 p_Tokens :: Parser e s Tokens
144 p_Tokens = pdbg "Tokens" $
145 closePairs .
146 foldr appendLexeme mempty .
147 dbg "Lexemes" .
148 mangleLexemes .
149 (LexemeWhite "" :) <$>
150 go [LexemeWhite ""]
151 where
152 go :: [Lexeme] -> Parser e s [Lexeme]
153 go acc =
154 (P.eof $> acc) <|>
155 (p_Lexeme >>= \next -> go $ mangleLexemes $ next:acc)
156
157 mangleLexemes = \case
158 w@LexemeWhite{} :p@LexemePairAny{}:acc -> w:any2close p:acc
159 p@LexemePairAny{}:w@LexemeWhite{} :acc -> any2open p:w:acc
160
161 l@LexemeAlphaNum{}:c@LexemeChar{} :p@LexemePairAny{}:acc -> l:c:any2close p:acc
162 l@LexemeAlphaNum{}:p@LexemePairAny{}:c@LexemeChar{}:acc -> l:any2open p:c:acc
163
164 acc -> acc
165
166 any2close,any2open :: Lexeme -> Lexeme
167 any2close (LexemePairAny ps) = LexemePairClose ps
168 any2close c = c
169 any2open (LexemePairAny ps) = LexemePairOpen ps
170 any2open c = c
171
172 pairAny :: Char -> Maybe Pair
173 pairAny = \case
174 '-' -> Just PairDash
175 '/' -> Just PairSlash
176 '"' -> Just PairDoublequote
177 '\'' -> Just PairSinglequote
178 '`' -> Just PairBackquote
179 '_' -> Just PairUnderscore
180 '*' -> Just PairStar
181 '#' -> Just PairHash
182 _ -> Nothing
183
184 pairOpen :: Char -> Maybe Pair
185 pairOpen = \case
186 '(' -> Just PairParen
187 '[' -> Just PairBracket
188 '{' -> Just PairBrace
189 '«' -> Just PairFrenchquote
190 _ -> Nothing
191
192 pairClose :: Char -> Maybe Pair
193 pairClose = \case
194 ')' -> Just PairParen
195 ']' -> Just PairBracket
196 '}' -> Just PairBrace
197 '»' -> Just PairFrenchquote
198 _ -> Nothing
199
200 p_Lexeme :: Parser e s Lexeme
201 p_Lexeme = pdbg "Lexeme" $
202 P.choice
203 [ P.try $ LexemeWhite <$> p_Spaces
204 , P.try $ LexemePairAny <$> P.some (p_satisfyMaybe pairAny)
205 , P.try $ LexemePairBoth <$> P.some (P.try p_ElemSingle)
206 , P.try $ LexemePairOpen <$> P.some (p_satisfyMaybe pairOpen <|> P.try p_ElemOpen)
207 , P.try $ LexemePairClose <$> P.some (p_satisfyMaybe pairClose <|> P.try p_ElemClose)
208 , P.try $ LexemeEscape <$> p_Escape
209 , P.try $ LexemeLink <$> p_Link
210 , P.try $ LexemeAlphaNum <$> P.some p_AlphaNum
211 , LexemeChar <$> P.anyChar
212 ]
213
214 p_AlphaNum :: Parser e s Char
215 p_AlphaNum = P.satisfy Char.isAlphaNum
216
217 p_Escape :: Parser e s Char
218 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
219
220 p_Link :: Parser e s Text
221 p_Link =
222 (\scheme ss addr -> Text.pack $ scheme <> ss <> addr)
223 <$> P.option "" (P.try p_scheme)
224 <*> P.string "//"
225 <*> p_addr
226 where
227 p_scheme =
228 (<>)
229 <$> P.some (P.satisfy $ \c ->
230 Char.isAlphaNum c
231 || c=='_'
232 || c=='-'
233 || c=='+')
234 <*> P.string ":"
235 p_addr =
236 P.many $
237 P.satisfy $ \c ->
238 Char.isAlphaNum c
239 || c=='%'
240 || c=='/'
241 || c=='('
242 || c==')'
243 || c=='-'
244 || c=='_'
245 || c=='.'
246 || c=='#'
247 || c=='?'
248 || c=='='
249
250 p_ElemSingle :: Parser e s Pair
251 p_ElemSingle = pdbg "ElemSingle" $
252 PairElem
253 <$ P.char '<'
254 <*> p_Word
255 <*> p_Attrs
256 <* P.string "/>"
257
258 p_ElemOpen :: Parser e s Pair
259 p_ElemOpen = pdbg "ElemOpen" $
260 PairElem
261 <$ P.char '<'
262 <*> p_Word
263 <*> p_Attrs
264 <* P.char '>'
265
266 p_ElemClose :: Parser e s Pair
267 p_ElemClose = pdbg "ElemClose" $
268 (`PairElem` [])
269 <$ P.string "</"
270 <*> p_Word
271 <* P.char '>'
272
273 {-
274 p_ElemOpenOrSingle :: Parser e s Pair
275 p_ElemOpenOrSingle =
276 p_ElemOpen >>= \p ->
277 P.char '>' $> LexemePairOpen p <|>
278 P.string "/>" $> LexemePairAny p
279 -}