]> Git — Sourcephile - doclang.git/blob - Language/TCT/Read/Token.hs
Maintain Plain and HTML5 rendering of TCT.
[doclang.git] / Language / TCT / Read / Token.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# LANGUAGE ViewPatterns #-}
8 {-# OPTIONS_GHC -fno-warn-orphans #-}
9 module Language.TCT.Read.Token where
10
11 import Control.Applicative (Applicative(..), Alternative(..))
12 import Control.Monad (Monad(..))
13 import Data.Bool
14 import Data.Char (Char)
15 import Data.Either (Either(..))
16 import Data.Eq (Eq(..))
17 import Data.Foldable (Foldable(..))
18 import Data.Function (($), (.))
19 import Data.Functor ((<$>), ($>))
20 import Data.List.NonEmpty (NonEmpty(..))
21 import Data.Maybe (Maybe(..))
22 import Data.Monoid (Monoid(..))
23 import Data.Semigroup (Semigroup(..))
24 import Data.Sequence (ViewL(..), ViewR(..), (<|), (|>))
25 import Data.String (String)
26 import Data.TreeSeq.Strict (Tree(..), Trees)
27 import Data.Tuple (fst,snd)
28 import Data.Void (Void)
29 import Prelude (Num(..))
30 import Text.Show (Show(..))
31 import qualified Data.Char as Char
32 import qualified Data.List.NonEmpty as NonEmpty
33 import qualified Data.Sequence as Seq
34 import qualified Data.Text as Text
35 import qualified Data.Text.Lazy as TL
36 import qualified Text.Megaparsec as P
37 import qualified Text.Megaparsec.Char as P
38
39 import Language.TCT.Debug
40 import Language.TCT.Cell
41 import Language.TCT.Elem
42 import Language.TCT.Tree
43 import Language.TCT.Read.Elem
44 import Language.TCT.Read.Cell
45
46 -- * Type 'Pairs'
47 -- | Right-only Dyck language,
48 -- to keep track of opened 'Pair's.
49 type Pairs = (Tokens,[Opening])
50 type Tokens = Trees (Cell Node)
51
52 -- ** Type 'Opening'
53 -- | An opened 'Pair' and its content so far.
54 type Opening = (Cell Pair,Tokens)
55
56 appendToken :: Pairs -> Tree (Cell Node) -> Pairs
57 appendToken (ts,[]) tok = (ts|>tok,[])
58 appendToken (ts,(p0,t0):ps) tok = (ts,(p0,t0|>tok):ps)
59
60 appendTokens :: Pairs -> Tokens -> Pairs
61 appendTokens (ts,[]) toks = (ts<>toks,[])
62 appendTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0<>toks):ps)
63
64 -- | Appending 'TL.Text' is a special case
65 -- to append at the 'TokenText' level is possible,
66 -- instead of the higher 'NodeToken' level.
67 appendText :: Pairs -> Cell TL.Text -> Pairs
68 appendText ps tok =
69 case ps of
70 (ts,[]) -> (appendTokenText ts tok,[])
71 (ts,(p0,ts0):pss) -> (ts,(p0,appendTokenText ts0 tok):pss)
72
73 appendTokenText :: Tokens -> Cell TL.Text -> Tokens
74 appendTokenText ts (Cell bn en n) =
75 {-
76 | TL.null n = ts
77 | otherwise = -}
78 case Seq.viewr ts of
79 EmptyR -> pure $ Tree0 $ Cell bn en $ NodeToken $ TokenText n
80 is :> Tree (Cell bo _eo nod) st ->
81 case nod of
82 NodeToken (TokenText o) -> is |> i
83 where i = Tree (Cell bo en (NodeToken $ TokenText (o <> n))) st
84 _ -> ts |> Tree0 (Cell bn en $ NodeToken $ TokenText n)
85
86 prependTokenText :: Tokens -> Cell TL.Text -> Tokens
87 prependTokenText ts (Cell bn en n)
88 {-
89 | TL.null n = ts
90 | otherwise-} =
91 case Seq.viewl ts of
92 EmptyL -> pure $ Tree0 $ Cell bn en $ NodeToken $ TokenText n
93 Tree (Cell _bo eo nod) st :< is ->
94 case nod of
95 NodeToken (TokenText o) -> i <| is
96 where i = Tree (Cell bn eo (NodeToken $ TokenText (n <> o))) st
97 _ -> Tree0 (Cell bn en $ NodeToken $ TokenText n) <| ts
98
99 openPair :: Pairs -> Cell Pair -> Pairs
100 openPair (t,ps) p = (t,(p,mempty):ps)
101
102 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
103 closePair :: Pairs -> Cell Pair -> Pairs
104 closePair ps@(_,[]) (Cell bp ep p) = -- debug0 "closePair" $
105 appendText ps $ Cell bp ep $ snd $ pairBorders p
106 closePair (t,(p1,t1):ts) p = -- debug0 "closePair" $
107 case (p1,p) of
108 (Cell bx _ex (PairElem nx ax), Cell _by ey (PairElem ny ay)) | nx == ny ->
109 appendToken (t,ts) $
110 Tree (Cell bx ey $ NodePair $ PairElem nx as) t1
111 where as | null ay = ax
112 | otherwise = ax<>ay
113 (Cell bx _ex x, Cell _by ey y) | x == y ->
114 appendToken (t,ts) $
115 Tree (Cell bx ey $ NodePair x) t1
116 _ ->
117 (`closePair` p) $
118 appendTokens
119 (t,ts)
120 (closeImpaired mempty (p1,t1))
121
122 -- | Close a 'Pair' when there is no matching 'LexemePairClose'.
123 closeImpaired :: Tokens -> (Cell Pair,Tokens) -> Tokens
124 closeImpaired acc (Cell bp ep p,toks) = -- debug0 "closeImpaired" $
125 case p of
126 -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'.
127 PairHash | Just (Cell _bt et t, ts) <- tagFrom $ toks <> acc ->
128 Tree0 (Cell bp et $ NodeToken $ TokenTag t) <| ts
129 {-
130 PairHash | (Tree0 (Cell bt et (TokenPlain t))) :< ts <- Seq.viewl $ toks <> acc ->
131 case Text.span isTagChar t of
132 ("",_) | Text.null t -> toksHash mempty <> toks <> acc
133 | otherwise -> Tree0 (Cell bp et (TokenTag t)) <| ts
134 (tag,t') ->
135 let len = Text.length tag in
136 Tree0 (Cell bp bt{columnPos = columnPos bt + len} (TokenTag tag)) <|
137 Tree0 (Cell bt{columnPos = columnPos bt + len + 1} et (TokenPlain t')) <|
138 ts
139 -}
140 _ -> prependTokenText (toks <> acc) toksHash
141 where
142 toksHash :: Cell TL.Text
143 toksHash = Cell bp ep $ fst $ pairBorders p
144
145 -- | Close remaining 'Pair's at end of parsing.
146 closePairs :: Pairs -> Tokens
147 closePairs (t0,ps) = -- debug0 "closePairs" $
148 t0 <> foldl' closeImpaired mempty ps
149
150 appendLexeme :: Lexeme -> Pairs -> Pairs
151 appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc ->
152 case lex of
153 LexemePairOpen ps -> foldl' open acc ps
154 where
155 -- NOTE: insert an empty node to encode <elem></elem>, not <elem/>
156 open a p@(Cell _bp ep (PairElem{})) =
157 openPair a p `appendToken`
158 (Tree0 $ Cell ep ep $ NodeToken $ TokenText "")
159 open a p = openPair a p
160 LexemePairClose ps -> foldl' closePair acc ps
161 LexemePairAny ps -> foldl' openPair acc ps
162 {-
163 LexemePairAny ps ->
164 appendText acc $ sconcat $
165 ((fst . pairBordersWithoutContent) <$>) <$> ps
166 -}
167 LexemePairBoth ps -> appendTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps
168 LexemeEscape c -> appendToken acc $ Tree0 $ NodeToken . TokenEscape <$> c
169 LexemeLink t -> appendToken acc $ Tree0 $ NodeToken . TokenLink <$> t
170 {-LexemeWhite (unCell -> "") -> acc-}
171 -- LexemeWhite (unCell -> Text.all (==' ') -> True) -> acc
172 LexemeWhite t -> appendText acc t
173 LexemeAlphaNum t -> appendText acc t
174 LexemeOther t -> appendText acc t
175 LexemeTree t -> appendToken acc t
176 LexemeEnd -> acc
177
178 appendLexemes :: Pairs -> [Lexeme] -> Pairs
179 appendLexemes = foldr appendLexeme
180
181 -- * Type 'Lexeme'
182 -- | 'Lexeme's cut the input in the longest chunks of common semantic,
183 -- this enables 'orientLexemePairAny' to work with a more meaningful context.
184 data Lexeme
185 = LexemePairOpen !(NonEmpty (Cell Pair))
186 | LexemePairClose !(NonEmpty (Cell Pair))
187 | LexemePairAny !(NonEmpty (Cell Pair))
188 -- ^ orientation depending on the surrounding 'Lexeme's,
189 -- see 'orientLexemePairAny'
190 | LexemePairBoth !(NonEmpty (Cell Pair))
191 | LexemeEscape !(Cell Char)
192 | LexemeLink !(Cell TL.Text)
193 | LexemeWhite !(Cell TL.Text)
194 | LexemeAlphaNum !(Cell TL.Text)
195 | LexemeOther !(Cell TL.Text)
196 | LexemeTree !(Tree (Cell Node))
197 | LexemeEnd
198 deriving (Eq, Show)
199 instance Pretty Lexeme
200
201 parseTokens :: [Lexeme] -> Tokens
202 parseTokens ps =
203 closePairs $
204 appendLexemes mempty $
205 -- debug0 "Lexemes (post orient)" $
206 orientLexemePairAny $ LexemeEnd :
207 ps
208
209 parseLexemes ::
210 String ->
211 Cell TL.Text ->
212 Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme]
213 parseLexemes inp = runParserOnCell inp (p_Lexemes <* P.eof)
214
215 -- | Parse 'Lexeme's, returning them in reverse order
216 -- to apply 'orientLexemePairAny'.
217 p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme]
218 p_Lexemes = debugParser "Lexemes" $ go []
219 where
220 go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme]
221 go acc =
222 (P.eof $> acc) <|>
223 (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
224
225 -- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme',
226 -- so that it can try to orient nearby 'LexemePairAny'
227 -- to 'LexemePairOpen' or 'LexemePairClose'.
228 orientLexemePairAny :: [Lexeme] -> [Lexeme]
229 orientLexemePairAny = \case
230 -- LexemeOther (Cell _bx ex x):LexemeOther (Cell by _ey y):acc -> LexemeOther (Cell by ex (x<>y)):acc
231
232 -- "   
233 t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc
234 w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
235 LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc
236 --    "
237 LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc
238 LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
239 LexemePairAny p:[] -> LexemePairOpen p:[]
240
241 --    ,,,"
242 LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
243 LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[]
244 -- ",,,   
245 w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
246 LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc
247
248 -- ",,,AAA
249 an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
250 -- ,,,"AAA
251 an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
252
253 -- ")
254 c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
255 -- ("
256 LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
257
258 -- "(
259 o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
260 -- )"
261 LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairClose p:c:acc
262
263 acc -> acc
264
265 p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme
266 p_Lexeme = debugParser "Lexeme" $
267 P.choice
268 [ P.try $ LexemeWhite <$> p_Cell p_Spaces1
269 , P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny)
270 , P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle)
271 , P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen)
272 , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
273 , P.try $ LexemeEscape <$> p_Cell p_Escape
274 , P.try $ LexemeLink <$> p_Cell p_Link
275 , P.try $ LexemeAlphaNum <$> p_Cell (P.takeWhile1P (Just "AlphaNum") Char.isAlphaNum)
276 , LexemeOther <$> p_Cell (TL.singleton <$> P.anyChar)
277 ]
278
279 p_some :: Parser e s a -> Parser e s (NonEmpty a)
280 p_some p = NonEmpty.fromList <$> P.some p
281
282 pairAny :: Char -> Maybe Pair
283 pairAny = \case
284 '-' -> Just PairDash
285 '/' -> Just PairSlash
286 '"' -> Just PairDoublequote
287 '\'' -> Just PairSinglequote
288 '`' -> Just PairBackquote
289 '_' -> Just PairUnderscore
290 '*' -> Just PairStar
291 '#' -> Just PairHash
292 _ -> Nothing
293
294 pairOpen :: Char -> Maybe Pair
295 pairOpen = \case
296 '(' -> Just PairParen
297 '[' -> Just PairBracket
298 '{' -> Just PairBrace
299 '«' -> Just PairFrenchquote
300 _ -> Nothing
301
302 pairClose :: Char -> Maybe Pair
303 pairClose = \case
304 ')' -> Just PairParen
305 ']' -> Just PairBracket
306 '}' -> Just PairBrace
307 '»' -> Just PairFrenchquote
308 _ -> Nothing
309
310 p_Escape :: Parser e s Char
311 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
312
313 p_Link :: P.Tokens s ~ TL.Text => Parser e s TL.Text
314 p_Link =
315 P.try (P.char '<' *> p <* P.char '>') <|>
316 p
317 where
318 p :: P.Tokens s ~ TL.Text => Parser e s TL.Text
319 p =
320 (\scheme addr -> scheme <> "//" <> addr)
321 <$> P.option "" (P.try p_scheme)
322 <* P.string "//"
323 <*> p_addr
324 p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text
325 p_scheme =
326 (<> ":")
327 <$> (P.takeWhile1P (Just "scheme") $ \c ->
328 Char.isAlphaNum c
329 || c=='_'
330 || c=='-'
331 || c=='+')
332 <* P.char ':'
333 p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text
334 p_addr =
335 P.takeWhileP (Just "addr") $ \c ->
336 Char.isAlphaNum c
337 || c=='%'
338 || c=='/'
339 || c=='('
340 || c==')'
341 || c=='-'
342 || c=='_'
343 || c=='.'
344 || c=='#'
345 || c=='?'
346 || c=='='
347
348 -- | Build 'Tokens' from many 'Token's.
349 tokens :: [Cell Token] -> Tokens
350 tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts
351
352 -- | Build 'Tokens' from one 'Token'.
353 tokens1 :: Tree (Cell Node) -> Tokens
354 tokens1 = Seq.singleton
355
356 unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens))
357 unTokenElem toks =
358 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
359 [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts))
360 _ -> Nothing
361
362 isTokenElem :: Tokens -> Bool
363 isTokenElem toks =
364 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
365 [Tree (unCell -> NodePair PairElem{}) _] -> True
366 _ -> False
367
368 pairBordersWithoutContent :: Pair -> (TL.Text,TL.Text)
369 pairBordersWithoutContent = \case
370 PairElem n as ->
371 ("<"<>n<>foldMap f as<>"/>","")
372 where f (elemAttr_white,ElemAttr{..}) =
373 elemAttr_white <>
374 elemAttr_name <>
375 elemAttr_open <>
376 elemAttr_value <>
377 elemAttr_close
378 p -> pairBorders p
379
380 pairBorders :: Pair -> (TL.Text,TL.Text)
381 pairBorders = \case
382 PairElem n as -> ("<"<>n<>foldMap f as<>">","</"<>n<>">")
383 where f (elemAttr_white,ElemAttr{..}) =
384 elemAttr_white <>
385 elemAttr_name <>
386 elemAttr_open <>
387 elemAttr_value <>
388 elemAttr_close
389 PairHash -> ("#","#")
390 PairStar -> ("*","*")
391 PairSlash -> ("/","/")
392 PairUnderscore -> ("_","_")
393 PairDash -> ("-","-")
394 PairBackquote -> ("`","`")
395 PairSinglequote -> ("'","'")
396 PairDoublequote -> ("\"","\"")
397 PairFrenchquote -> ("«","»")
398 PairParen -> ("(",")")
399 PairBrace -> ("{","}")
400 PairBracket -> ("[","]")
401
402 -- * Class 'TagFrom'
403 class TagFrom a where
404 tagFrom :: a -> Maybe (Cell Tag, a)
405 instance TagFrom Tokens where
406 tagFrom ts =
407 case Seq.viewl ts of
408 EmptyL -> Nothing
409 Tree0 (Cell b0 e0 n) :< ns ->
410 case n of
411 NodeToken (TokenText t) ->
412 case tagFrom $ Cell b0 e0 t of
413 Nothing -> Nothing
414 Just (t0,r0) ->
415 if TL.null (unCell r0)
416 then
417 case tagFrom ns of
418 Just (t1@(Cell b1 _e1 _), r1) | e0 == b1 ->
419 Just (t0<>t1, r1)
420 _ -> Just (t0, n0 <| ns)
421 else Just (t0, n0 <| ns)
422 where n0 = (Tree0 $ NodeToken . TokenText <$> r0)
423 _ -> Nothing
424 _ -> Nothing
425 instance TagFrom (Cell TL.Text) where
426 tagFrom (Cell bp ep t)
427 | (w,r) <- TL.span isTagChar t
428 , not $ TL.null w
429 , ew <- pos_column bp + sum (Text.length <$> (TL.toChunks w)) =
430 Just
431 ( Cell bp bp{pos_column=ew} w
432 , Cell bp{pos_column=ew} ep r )
433 tagFrom _ = Nothing
434
435 isTagChar :: Char -> Bool
436 isTagChar c =
437 Char.isAlphaNum c ||
438 c=='·' ||
439 case Char.generalCategory c of
440 Char.DashPunctuation -> True
441 Char.ConnectorPunctuation -> True
442 _ -> False