1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# OPTIONS_GHC -fno-warn-orphans #-}
8 module Hdoc.TCT.Read.Token where
10 import Control.Applicative (Applicative(..), Alternative(..))
11 import Control.Monad (Monad(..))
13 import Data.Char (Char)
14 import Data.Either (Either(..))
15 import Data.Eq (Eq(..))
16 import Data.Foldable (Foldable(..))
17 import Data.Function (($), (.))
18 import Data.Functor ((<$>), ($>), (<$))
19 import Data.List.NonEmpty (NonEmpty(..))
20 import Data.Maybe (Maybe(..))
21 import Data.Monoid (Monoid(..))
22 import Data.Semigroup (Semigroup(..))
23 import Data.Sequence (ViewL(..), ViewR(..), (<|))
24 import Data.TreeSeq.Strict (Tree(..), Trees)
25 import Data.Tuple (fst,snd)
26 import Data.Void (Void)
27 import Prelude (Num(..))
28 import Text.Show (Show(..))
29 import qualified Data.Char as Char
30 import qualified Data.List.NonEmpty as NonEmpty
31 import qualified Data.Sequence as Seq
32 import qualified Data.Text as Text
33 import qualified Data.Text.Lazy as TL
34 import qualified Text.Megaparsec as P
35 import qualified Text.Megaparsec.Char as P
41 import Hdoc.TCT.Read.Elem
42 import Hdoc.TCT.Read.Cell
45 -- | Right-only Dyck language,
46 -- to keep track of opened 'Pair's.
47 type Pairs = (Tokens,[Opening])
48 type Tokens = Trees (Cell Node)
51 -- | An opened 'Pair' and its content so far.
52 type Opening = (Cell Pair,Tokens)
54 appendPairsToken :: Pairs -> Tree (Cell Node) -> Pairs
55 appendPairsToken ps t = appendPairsTokens ps (pure t)
57 appendPairsText :: Pairs -> Cell TL.Text -> Pairs
58 appendPairsText ps (Cell sp t) =
61 NodeToken $ TokenText t
63 appendPairsTokens :: Pairs -> Tokens -> Pairs
64 appendPairsTokens (ts,[]) toks = (ts`unionTokens`toks,[])
65 appendPairsTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0`unionTokens`toks):ps)
67 -- | Unify two 'Tokens', merging border 'TokenText's if any.
68 unionTokens :: Tokens -> Tokens -> Tokens
70 case (Seq.viewr x, Seq.viewl y) of
71 (xs :> x0, y0 :< ys) ->
73 ( Tree (Cell (Span fx bx _ex:| lx) (NodeToken (TokenText tx))) tsx
74 , Tree (Cell (Span _fy _by ey:|_ly) (NodeToken (TokenText ty))) tsy ) ->
76 pure (Tree (Cell (Span fx bx ey:|lx) $ NodeToken $ TokenText $ tx <> ty) (tsx<>tsy)) `unionTokens`
82 unionsTokens :: Foldable f => f Tokens -> Tokens
83 unionsTokens = foldl' unionTokens mempty
85 openPair :: Pairs -> Cell Pair -> Pairs
86 openPair (t,ps) p = (t,(p,mempty):ps)
88 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
89 closePair :: Pairs -> Cell Pair -> Pairs
90 closePair ps@(_,[]) (Cell loc p) = -- debug0 "closePair" $
91 appendPairsText ps $ Cell loc $
92 snd $ pairBordersDouble p
93 closePair (t,(cx@(Cell (Span fx bx _ex:| lx) px),t1):ts)
94 cy@(Cell (Span _fy _by ey:|_ly) py) = -- debug0 "closePair" $
96 (PairElem nx ax, PairElem ny ay) | nx == ny ->
97 appendPairsToken (t,ts) $
98 Tree (Cell (Span fx bx ey:|lx) $ NodePair $ PairElem nx as) t1
99 where as | null ay = ax
102 appendPairsToken (t,ts) $
103 Tree (Cell (Span fx bx ey:|lx) $ NodePair px) t1
108 (closeImpaired mempty (cx,t1))
110 -- | Close a 'Pair' when there is no matching 'LexemePairClose'.
111 closeImpaired :: Tokens -> (Cell Pair, Tokens) -> Tokens
112 closeImpaired acc (Cell loc@(s0:|lp) pair, toks) = -- debug0 "closeImpaired" $
114 -- NOTE: try to close 'PairTag' as 'TokenTag' instead of 'TokenText'.
115 PairTag isBackref | Just (Cell (Span{span_end}:|_lt) ref, rest) <- parseRef body ->
116 Tree0 (Cell (s0{span_end}:|lp) $ NodeToken $ TokenTag isBackref ref) <| rest
117 -- NOTE: use bp (not bt) to include the '#'
118 -- NOTE: try to close 'PairAt' as 'TokenAt' instead of 'TokenText'.
119 PairAt isBackref | Just (Cell (Span{span_end}:|_lt) ref, rest) <- parseRef body ->
120 Tree0 (Cell (s0{span_end}:|lp) $ NodeToken $ TokenAt isBackref ref) <| rest
121 -- NOTE: use bp (not bt) to include the '@'
122 _ -> pure open `unionTokens` body
124 body = toks `unionTokens` acc
125 open = Tree0 $ Cell loc $ NodeToken $ TokenText $ fst $ pairBordersDouble pair
127 -- | Close remaining 'Pair's at end of parsing.
128 closePairs :: Pairs -> Tokens
129 closePairs (t0,ps) = -- debug0 "closePairs" $
130 t0 `unionTokens` foldl' closeImpaired mempty ps
132 appendLexeme :: Lexeme -> Pairs -> Pairs
133 appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc ->
135 LexemePairOpen ps -> foldl' open acc ps
137 -- NOTE: insert an empty node to encode <elem></elem>, not <elem/>
138 open a p@(Cell (Span{span_end, ..}:|sp) PairElem{}) = openPair a p `appendPairsText` Cell (Span{span_begin=span_end, ..}:|sp) ""
139 open a p = openPair a p
140 LexemePairClose ps -> foldl' closePair acc ps
141 LexemePairAny ps -> foldl' openPair acc ps
142 LexemePairBoth ps -> appendPairsTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps
143 LexemeEscape c -> appendPairsToken acc $ Tree0 $ NodeToken . TokenEscape <$> c
144 LexemeLink t -> appendPairsToken acc $ Tree0 $ NodeToken . TokenLink <$> t
145 {-LexemeWhite (unCell -> "") -> acc-}
146 -- LexemeWhite (unCell -> Text.all (==' ') -> True) -> acc
147 LexemeWhite t -> appendPairsText acc t
148 LexemeAlphaNum t -> appendPairsText acc t
149 LexemeOther t -> appendPairsText acc t
150 LexemeTree t -> appendPairsToken acc t
153 appendLexemes :: Pairs -> [Lexeme] -> Pairs
154 appendLexemes = foldr appendLexeme
157 -- | 'Lexeme's cut the input in the longest chunks of common semantic,
158 -- this enables 'orientLexemePairAny' to work with a more meaningful context.
160 = LexemePairOpen !(NonEmpty (Cell Pair))
161 | LexemePairClose !(NonEmpty (Cell Pair))
162 | LexemePairAny !(NonEmpty (Cell Pair))
163 -- ^ orientation depending on the surrounding 'Lexeme's,
164 -- see 'orientLexemePairAny'
165 | LexemePairBoth !(NonEmpty (Cell Pair))
166 | LexemeEscape !(Cell Char)
167 | LexemeLink !(Cell TL.Text)
168 | LexemeWhite !(Cell TL.Text)
169 | LexemeAlphaNum !(Cell TL.Text)
170 | LexemeOther !(Cell TL.Text)
171 | LexemeTree !(Tree (Cell Node)) -- FIXME: useless?
174 instance Pretty Lexeme
176 parseTokens :: [Lexeme] -> Tokens
179 appendLexemes mempty $
180 -- debug0 "Lexemes (post orient)" $
181 orientLexemePairAny $ LexemeEnd :
186 Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme]
187 parseLexemes = runParserOnCell (p_Lexemes <* P.eof)
189 -- | Parse 'Lexeme's, returning them in reverse order
190 -- to apply 'orientLexemePairAny'.
191 p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme]
192 p_Lexemes = debugParser "Lexemes" $ go []
194 go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme]
197 (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
199 -- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme',
200 -- so that it can try to orient nearby 'LexemePairAny'
201 -- to 'LexemePairOpen' or 'LexemePairClose'.
202 orientLexemePairAny :: [Lexeme] -> [Lexeme]
203 orientLexemePairAny = \case
204 -- LexemeOther (Cell _bx ex x):LexemeOther (Cell by _ey y):acc -> LexemeOther (Cell by ex (x<>y)):acc
207 t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc
208 w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
209 LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc
211 LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc
212 LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
213 LexemePairAny p:[] -> LexemePairOpen p:[]
216 LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
217 LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[]
219 w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
220 LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc
223 an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
224 an@LexemeEscape{} :a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
226 an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
227 an@LexemeEscape{} :LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
230 c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
232 LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
235 o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
237 LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairClose p:c:acc
240 a1@LexemeAlphaNum{}:LexemePairAny p:a2@LexemeAlphaNum{}:acc ->
241 a1:LexemeOther (sconcat (((fst . pairBordersDouble) <$>) <$> p)):a2:acc
245 p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme
246 p_Lexeme = debugParser "Lexeme" $
248 [ P.try $ LexemeWhite <$> p_Cell p_Spaces1
249 , P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny)
250 , P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle)
251 , P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen <|> P.try p_BackOpen)
252 , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
253 , P.try $ LexemeEscape <$> p_Cell p_Escape
254 , P.try $ LexemeLink <$> p_Cell p_Link
255 , P.try $ LexemeAlphaNum <$> p_Cell p_AlphaNums1
256 , LexemeOther <$> p_Cell (TL.singleton <$> P.anyChar)
259 p_some :: Parser e s a -> Parser e s (NonEmpty a)
260 p_some p = NonEmpty.fromList <$> P.some p
262 pairAny :: Char -> Maybe Pair
265 '/' -> Just PairSlash
266 '"' -> Just PairDoublequote
267 '\'' -> Just PairSinglequote
268 '`' -> Just PairBackquote
269 '_' -> Just PairUnderscore
271 '#' -> Just $ PairTag False
272 '@' -> Just $ PairAt False
275 pairOpen :: Char -> Maybe Pair
277 '(' -> Just PairParen
278 '[' -> Just PairBracket
279 '{' -> Just PairBrace
280 '«' -> Just PairFrenchquote
283 pairClose :: Char -> Maybe Pair
285 ')' -> Just PairParen
286 ']' -> Just PairBracket
287 '}' -> Just PairBrace
288 '»' -> Just PairFrenchquote
291 p_BackOpen :: P.Tokens s ~ TL.Text => Parser e s Pair
292 p_BackOpen = debugParser "BackOpen" $
294 *> (PairAt True <$ P.char '@'
295 <|> PairTag True <$ P.char '#')
297 p_Escape :: Parser e s Char
298 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
300 p_Link :: P.Tokens s ~ TL.Text => Parser e s TL.Text
302 P.try (P.char '<' *> p <* P.char '>') <|>
305 p :: P.Tokens s ~ TL.Text => Parser e s TL.Text
307 (\scheme addr -> scheme <> "//" <> addr)
308 <$> P.option "" (P.try p_scheme)
311 p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text
314 <$> (P.takeWhile1P (Just "scheme") $ \c ->
320 p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text
322 P.takeWhileP (Just "addr") $ \c ->
336 pairBorders :: Foldable f => Pair -> f a -> (TL.Text,TL.Text)
337 pairBorders p ts | null ts = pairBordersSingle p
338 | otherwise = pairBordersDouble p
340 pairBordersSingle :: Pair -> (TL.Text,TL.Text)
341 pairBordersSingle = \case
343 ("<"<>n<>foldMap f as<>"/>","")
344 where f (elemAttr_white,ElemAttr{..}) =
350 p -> pairBordersDouble p
352 pairBordersDouble :: Pair -> (TL.Text,TL.Text)
353 pairBordersDouble = \case
354 PairElem n as -> ("<"<>n<>foldMap f as<>">","</"<>n<>">")
355 where f (elemAttr_white,ElemAttr{..}) =
361 PairTag isBackref | isBackref -> ("^#","#")
362 | otherwise -> ("#","#")
363 PairAt isBackref | isBackref -> ("^@","@")
364 | otherwise -> ("@","@")
365 PairStar -> ("*","*")
366 PairSlash -> ("/","/")
367 PairUnderscore -> ("_","_")
368 PairDash -> ("-","-")
369 PairBackquote -> ("`","`")
370 PairSinglequote -> ("'","'")
371 PairDoublequote -> ("\"","\"")
372 PairFrenchquote -> ("«","»")
373 PairParen -> ("(",")")
374 PairBrace -> ("{","}")
375 PairBracket -> ("[","]")
377 -- * Class 'ParseRef'
378 class ParseRef a where
379 parseRef :: a -> Maybe (Cell Ref, a)
380 instance ParseRef Tokens where
384 Tree0 (Cell loc0@(Span _f0 _b0 e0:|_l0) n) :< ns ->
386 NodeToken (TokenText t) ->
387 case parseRef $ Cell loc0 t of
390 if TL.null $ unCell r0
393 Just (t1@(Cell (Span _f1 b1 _e1:|_l1) _), r1) | e0 == b1 ->
396 else Just (t0, pure n0 `unionTokens` ns)
397 where n0 = Tree0 $ NodeToken . TokenText <$> r0
400 instance ParseRef (Cell TL.Text) where
401 parseRef (Cell (Span fp bp ep:|sp) t)
402 | (w,r) <- TL.span isTagChar t
404 , ew <- pos_column bp + sum (Text.length <$> TL.toChunks w) =
406 ( Cell (Span fp bp bp{pos_column=ew}:|sp) w
407 , Cell (Span fp bp{pos_column=ew} ep:|sp) r )
410 isTagChar :: Char -> Bool
414 case Char.generalCategory c of
415 Char.DashPunctuation -> True
416 Char.ConnectorPunctuation -> True
420 -- | Build 'Tokens' from many 'Token's.
421 tokens :: [Cell Token] -> Tokens
422 tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts
424 -- | Build 'Tokens' from one 'Token'.
425 tokens1 :: Tree (Cell Node) -> Tokens
426 tokens1 = Seq.singleton
428 unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens))
430 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
431 [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts))
434 isTokenElem :: Tokens -> Bool
436 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
437 [Tree (unCell -> NodePair PairElem{}) _] -> True