Add RNC schema generation.
[doclang.git] / Language / TCT / Read / Token.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE OverloadedStrings #-}
3 {-# LANGUAGE Rank2Types #-}
4 {-# LANGUAGE TypeFamilies #-}
5 module Language.TCT.Read.Token where
6
7 import Control.Applicative (Applicative(..), Alternative(..))
8 import Control.Monad (Monad(..))
9 import Data.Bool
10 import Data.Char (Char)
11 import Data.Eq (Eq(..))
12 import Data.Foldable (Foldable(..))
13 import Data.Function (($), (.))
14 import Data.Functor ((<$>), ($>), (<$))
15 import Data.Maybe (Maybe(..))
16 import Data.Monoid (Monoid(..))
17 import Data.Semigroup (Semigroup(..))
18 import Data.Sequence (ViewL(..), (<|))
19 import Data.Text (Text)
20 import Data.Text.Buildable (Buildable(..))
21 import Data.Tuple (fst,snd)
22 import Prelude (Num(..))
23 import Text.Show (Show(..))
24 import qualified Data.Char as Char
25 import qualified Data.Sequence as Seq
26 import qualified Data.Text as Text
27 import qualified Data.Text.Lazy as TL
28 import qualified Data.Text.Lazy.Builder as Builder
29 import qualified Text.Megaparsec as P
30 import qualified Text.Megaparsec.Char as P
31
32 import Language.TCT.Token
33 import Language.TCT.Cell
34 import Language.TCT.Elem
35 import Language.TCT.Read.Elem
36 import Language.TCT.Read.Cell
37
38 {-
39 import Debug.Trace (trace)
40 dbg m x = trace (m <> ": " <> show x) x
41 pdbg m p = P.dbg m p
42 -}
43
44 textOf :: Buildable a => a -> Text
45 textOf = TL.toStrict . Builder.toLazyText . build
46
47 -- * Type 'Pairs'
48 type Pairs = (Tokens,[(Cell Pair,Tokens)])
49
50 appendToken :: Pairs -> Cell Token -> Pairs
51 appendToken ps = appendTokens ps . Seq.singleton
52
53 appendTokens :: Pairs -> Tokens -> Pairs
54 appendTokens (t,[]) toks = (t<>toks,[])
55 appendTokens (t,(p0,t0):ps) toks = (t,(p0,t0<>toks):ps)
56
57 openPair :: Pairs -> Cell Pair -> Pairs
58 openPair (t,ms) p = (t,(p,mempty):ms)
59
60 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
61 closePair :: Pairs -> Cell Pair -> Pairs
62 closePair ps@(_,[]) (Cell pos posEnd p) = dbg "closePair" $
63 appendToken ps $
64 Cell pos posEnd $
65 TokenPlain $ snd $ pairBorders p tokensPlainEmpty
66 closePair (t,(p1,t1):ts) p = dbg "closePair" $
67 case (p,p1) of
68 (Cell bx _ex (PairElem x ax), Cell _by ey (PairElem y ay)) | x == y ->
69 appendToken (t,ts) $
70 Cell bx ey $
71 TokenPair (PairElem x (ax<>ay)) t1
72 (Cell bx _ex x, Cell _by ey y) | x == y ->
73 appendToken (t,ts) $
74 Cell bx ey $
75 TokenPair x t1
76 _ ->
77 (`closePair` p) $
78 appendTokens
79 (t,ts)
80 (closeUnpaired mempty (p1,t1))
81
82 -- | Close a 'Pair' when there is not a matching 'LexemePairClose'.
83 closeUnpaired :: Tokens -> (Cell Pair,Tokens) -> Tokens
84 closeUnpaired acc (Cell bp ep p,toks) = dbg "closeUnpaired" $
85 case p of
86 -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'.
87 PairHash | (Cell bt et (TokenPlain t)) :< ts <- Seq.viewl $ toks <> acc ->
88 case Text.findIndex (not . isTagChar) t of
89 -- Just 0 -> toksHash mempty <> toks <> acc
90 Just i ->
91 Cell bp bt{columnPos = columnPos bt + i} (TokenTag tag)
92 <| Cell bt{columnPos = columnPos bt + i + 1} et (TokenPlain t')
93 <| ts
94 where (tag,t') = Text.splitAt i t
95 Nothing | Text.null t -> toksHash mempty <> toks <> acc
96 Nothing -> Cell bp et (TokenTag t) <| ts
97 _ -> toksHash tokensPlainEmpty <> toks <> acc
98 where
99 toksHash = tokens1 . Cell bp ep . TokenPlain . fst . pairBorders p
100 isTagChar c =
101 Char.isAlphaNum c ||
102 c=='·' ||
103 case Char.generalCategory c of
104 Char.DashPunctuation -> True
105 Char.ConnectorPunctuation -> True
106 _ -> False
107
108 -- | Close remaining 'Pair's at end of parsing.
109 closePairs :: Pairs -> Tokens
110 closePairs (t0,ps) = dbg "closePairs" $
111 t0 <> foldl' closeUnpaired mempty ps
112
113 appendLexeme :: Lexeme -> Pairs -> Pairs
114 appendLexeme lex acc =
115 dbg "appendLexeme" $
116 case dbg "appendLexeme" lex of
117 LexemePairOpen ps -> foldl' open acc ps
118 where
119 open a p@(Cell _bp ep (PairElem{})) = openPair a p `appendToken` (Cell ep ep $ TokenPlain "")
120 open a p = openPair a p
121 LexemePairClose ps -> foldl' closePair acc ps
122 LexemePairAny ps -> appendTokens acc $ tokens $ ((\p -> TokenPlain $ fst $ pairBorders p mempty) <$>) <$> ps
123 LexemePairBoth ps -> appendTokens acc $ tokens $ ((`TokenPair`mempty) <$>) <$> ps
124 LexemeEscape c -> appendToken acc $ TokenEscape <$> c
125 LexemeLink t -> appendToken acc $ TokenLink <$> t
126 LexemeWhite (unCell -> "") -> acc
127 LexemeWhite cs -> appendToken acc $ TokenPlain <$> cs
128 LexemeAlphaNum cs -> appendToken acc $ TokenPlain . Text.pack <$> cs
129 LexemeChar c -> appendToken acc $ TokenPlain . Text.singleton <$> c
130 LexemeToken ts -> appendTokens acc ts
131
132 -- * Type 'Lexeme'
133 data Lexeme
134 = LexemePairOpen ![Cell Pair]
135 | LexemePairClose ![Cell Pair]
136 | LexemePairAny ![Cell Pair]
137 | LexemePairBoth ![Cell Pair]
138 | LexemeEscape !(Cell Char)
139 | LexemeLink !(Cell Text)
140 | LexemeWhite !(Cell White)
141 | LexemeAlphaNum !(Cell [Char])
142 | LexemeChar !(Cell Char)
143 | LexemeToken !Tokens
144 deriving (Eq, Show)
145
146 p_Tokens :: Parser e s Tokens
147 p_Tokens = pdbg "Tokens" $
148 closePairs .
149 foldr appendLexeme mempty .
150 dbg "Lexemes" .
151 mangleLexemes .
152 (LexemeWhite (cell0 "") :) <$>
153 go [LexemeWhite (cell0 "")]
154 where
155 go :: [Lexeme] -> Parser e s [Lexeme]
156 go acc =
157 (P.eof $> acc) <|>
158 (p_Lexeme >>= \next -> go $ mangleLexemes $ next:acc)
159
160 mangleLexemes = \case
161 w@LexemeWhite{} :p@LexemePairAny{}:acc -> w:any2close p:acc
162 p@LexemePairAny{}:w@LexemeWhite{} :acc -> any2open p:w:acc
163
164 l@LexemeAlphaNum{}:c@LexemeChar{} :p@LexemePairAny{}:acc -> l:c:any2close p:acc
165 l@LexemeAlphaNum{}:p@LexemePairAny{}:c@LexemeChar{}:acc -> l:any2open p:c:acc
166
167 acc -> acc
168
169 any2close,any2open :: Lexeme -> Lexeme
170 any2close (LexemePairAny ps) = LexemePairClose ps
171 any2close c = c
172 any2open (LexemePairAny ps) = LexemePairOpen ps
173 any2open c = c
174
175 pairAny :: Char -> Maybe Pair
176 pairAny = \case
177 '-' -> Just PairDash
178 '/' -> Just PairSlash
179 '"' -> Just PairDoublequote
180 '\'' -> Just PairSinglequote
181 '`' -> Just PairBackquote
182 '_' -> Just PairUnderscore
183 '*' -> Just PairStar
184 '#' -> Just PairHash
185 _ -> Nothing
186
187 pairOpen :: Char -> Maybe Pair
188 pairOpen = \case
189 '(' -> Just PairParen
190 '[' -> Just PairBracket
191 '{' -> Just PairBrace
192 '«' -> Just PairFrenchquote
193 _ -> Nothing
194
195 pairClose :: Char -> Maybe Pair
196 pairClose = \case
197 ')' -> Just PairParen
198 ']' -> Just PairBracket
199 '}' -> Just PairBrace
200 '»' -> Just PairFrenchquote
201 _ -> Nothing
202
203 p_Cell :: Parser e s a -> Parser e s (Cell a)
204 p_Cell pa = do
205 bp <- p_Position
206 a <- pa
207 ep <- p_Position
208 return $ Cell bp ep a
209
210 p_Lexeme :: Parser e s Lexeme
211 p_Lexeme = pdbg "Lexeme" $
212 P.choice
213 [ P.try $ LexemeWhite <$> p_Cell p_Spaces
214 , P.try $ LexemePairAny <$> P.some (p_Cell $ p_satisfyMaybe pairAny)
215 , P.try $ LexemePairBoth <$> P.some (P.try $ p_Cell p_ElemSingle)
216 , P.try $ LexemePairOpen <$> P.some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen)
217 , P.try $ LexemePairClose <$> P.some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
218 , P.try $ LexemeEscape <$> p_Cell p_Escape
219 , P.try $ LexemeLink <$> p_Cell p_Link
220 , P.try $ LexemeAlphaNum <$> p_Cell (P.some p_AlphaNum)
221 , LexemeChar <$> p_Cell P.anyChar
222 ]
223
224 p_AlphaNum :: Parser e s Char
225 p_AlphaNum = P.satisfy Char.isAlphaNum
226
227 p_Escape :: Parser e s Char
228 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
229
230 p_Link :: Parser e s Text
231 p_Link =
232 (\scheme addr -> Text.pack $ scheme <> "//" <> addr)
233 <$> P.option "" (P.try p_scheme)
234 <* P.string "//"
235 <*> p_addr
236 where
237 p_scheme =
238 (<> ":")
239 <$> P.some (P.satisfy $ \c ->
240 Char.isAlphaNum c
241 || c=='_'
242 || c=='-'
243 || c=='+')
244 <* P.char ':'
245 p_addr =
246 P.many $
247 P.satisfy $ \c ->
248 Char.isAlphaNum c
249 || c=='%'
250 || c=='/'
251 || c=='('
252 || c==')'
253 || c=='-'
254 || c=='_'
255 || c=='.'
256 || c=='#'
257 || c=='?'
258 || c=='='
259
260 p_ElemSingle :: Parser e s Pair
261 p_ElemSingle = pdbg "ElemSingle" $
262 PairElem
263 <$ P.char '<'
264 <*> p_Word
265 <*> p_Attrs
266 <* P.string "/>"
267
268 p_ElemOpen :: Parser e s Pair
269 p_ElemOpen = pdbg "ElemOpen" $
270 PairElem
271 <$ P.char '<'
272 <*> p_Word
273 <*> p_Attrs
274 <* P.char '>'
275
276 p_ElemClose :: Parser e s Pair
277 p_ElemClose = pdbg "ElemClose" $
278 (`PairElem` [])
279 <$ P.string "</"
280 <*> p_Word
281 <* P.char '>'
282
283 {-
284 p_ElemOpenOrSingle :: Parser e s Pair
285 p_ElemOpenOrSingle =
286 p_ElemOpen >>= \p ->
287 P.char '>' $> LexemePairOpen p <|>
288 P.string "/>" $> LexemePairAny p
289 -}