{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE Rank2Types #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeFamilies #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Hdoc.TCT.Read.Token where
import Control.Applicative (Applicative(..), Alternative(..))
import Control.Monad (Monad(..))
import Data.Bool
import Data.Char (Char)
import Data.Either (Either(..))
import Data.Eq (Eq(..))
import Data.Foldable (Foldable(..))
import Data.Function (($), (.))
import Data.Functor ((<$>), ($>), (<$))
import Data.List.NonEmpty (NonEmpty(..))
import Data.Maybe (Maybe(..))
import Data.Monoid (Monoid(..))
import Data.Semigroup (Semigroup(..))
import Data.Sequence (ViewL(..), ViewR(..), (<|))
import Data.TreeSeq.Strict (Tree(..), Trees)
import Data.Tuple (fst,snd)
import Data.Void (Void)
import Prelude (Num(..))
import Text.Show (Show(..))
import qualified Data.Char as Char
import qualified Data.List.NonEmpty as NonEmpty
import qualified Data.Sequence as Seq
import qualified Data.Text as Text
import qualified Data.Text.Lazy as TL
import qualified Text.Megaparsec as P
import qualified Text.Megaparsec.Char as P
import Hdoc.TCT.Debug
import Hdoc.TCT.Cell
import Hdoc.TCT.Elem
import Hdoc.TCT.Tree
import Hdoc.TCT.Read.Elem
import Hdoc.TCT.Read.Cell
-- * Type 'Pairs'
-- | Right-only Dyck language,
-- to keep track of opened 'Pair's.
type Pairs = (Tokens,[Opening])
type Tokens = Trees (Cell Node)
-- ** Type 'Opening'
-- | An opened 'Pair' and its content so far.
type Opening = (Cell Pair,Tokens)
appendPairsToken :: Pairs -> Tree (Cell Node) -> Pairs
appendPairsToken ps t = appendPairsTokens ps (pure t)
appendPairsText :: Pairs -> Cell TL.Text -> Pairs
appendPairsText ps (Cell sp t) =
appendPairsToken ps $
Tree0 $ Cell sp $
NodeToken $ TokenText t
appendPairsTokens :: Pairs -> Tokens -> Pairs
appendPairsTokens (ts,[]) toks = (ts`unionTokens`toks,[])
appendPairsTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0`unionTokens`toks):ps)
-- | Unify two 'Tokens', merging border 'TokenText's if any.
unionTokens :: Tokens -> Tokens -> Tokens
unionTokens x y =
case (Seq.viewr x, Seq.viewl y) of
(xs :> x0, y0 :< ys) ->
case (x0,y0) of
( Tree (Cell (Span fx bx _ex:| lx) (NodeToken (TokenText tx))) tsx
, Tree (Cell (Span _fy _by ey:|_ly) (NodeToken (TokenText ty))) tsy ) ->
xs `unionTokens`
pure (Tree (Cell (Span fx bx ey:|lx) $ NodeToken $ TokenText $ tx <> ty) (tsx<>tsy)) `unionTokens`
ys
_ -> x <> y
(EmptyR, _) -> y
(_, EmptyL) -> x
unionsTokens :: Foldable f => f Tokens -> Tokens
unionsTokens = foldl' unionTokens mempty
openPair :: Pairs -> Cell Pair -> Pairs
openPair (t,ps) p = (t,(p,mempty):ps)
-- | Close a 'Pair' when there is a matching 'LexemePairClose'.
closePair :: Pairs -> Cell Pair -> Pairs
closePair ps@(_,[]) (Cell loc p) = -- debug0 "closePair" $
appendPairsText ps $ Cell loc $
snd $ pairBordersDouble p
closePair (t,(cx@(Cell (Span fx bx _ex:| lx) px),t1):ts)
cy@(Cell (Span _fy _by ey:|_ly) py) = -- debug0 "closePair" $
case (px,py) of
(PairElem nx ax, PairElem ny ay) | nx == ny ->
appendPairsToken (t,ts) $
Tree (Cell (Span fx bx ey:|lx) $ NodePair $ PairElem nx as) t1
where as | null ay = ax
| otherwise = ax<>ay
_ | px == py ->
appendPairsToken (t,ts) $
Tree (Cell (Span fx bx ey:|lx) $ NodePair px) t1
_ ->
(`closePair` cy) $
appendPairsTokens
(t,ts)
(closeImpaired mempty (cx,t1))
-- | Close a 'Pair' when there is no matching 'LexemePairClose'.
closeImpaired :: Tokens -> (Cell Pair, Tokens) -> Tokens
closeImpaired acc (Cell loc@(s0:|lp) pair, toks) = -- debug0 "closeImpaired" $
case pair of
-- NOTE: try to close 'PairTag' as 'TokenTag' instead of 'TokenText'.
PairTag isBackref | Just (Cell (Span{span_end}:|_lt) ref, rest) <- parseRef body ->
Tree0 (Cell (s0{span_end}:|lp) $ NodeToken $ TokenTag isBackref ref) <| rest
-- NOTE: use bp (not bt) to include the '#'
-- NOTE: try to close 'PairAt' as 'TokenAt' instead of 'TokenText'.
PairAt isBackref | Just (Cell (Span{span_end}:|_lt) ref, rest) <- parseRef body ->
Tree0 (Cell (s0{span_end}:|lp) $ NodeToken $ TokenAt isBackref ref) <| rest
-- NOTE: use bp (not bt) to include the '@'
_ -> pure open `unionTokens` body
where
body = toks `unionTokens` acc
open = Tree0 $ Cell loc $ NodeToken $ TokenText $ fst $ pairBordersDouble pair
-- | Close remaining 'Pair's at end of parsing.
closePairs :: Pairs -> Tokens
closePairs (t0,ps) = -- debug0 "closePairs" $
t0 `unionTokens` foldl' closeImpaired mempty ps
appendLexeme :: Lexeme -> Pairs -> Pairs
appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc ->
case lex of
LexemePairOpen ps -> foldl' open acc ps
where
-- NOTE: insert an empty node to encode , not
open a p@(Cell (Span{span_end, ..}:|sp) PairElem{}) = openPair a p `appendPairsText` Cell (Span{span_begin=span_end, ..}:|sp) ""
open a p = openPair a p
LexemePairClose ps -> foldl' closePair acc ps
LexemePairAny ps -> foldl' openPair acc ps
LexemePairBoth ps -> appendPairsTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps
LexemeEscape c -> appendPairsToken acc $ Tree0 $ NodeToken . TokenEscape <$> c
LexemeLink t -> appendPairsToken acc $ Tree0 $ NodeToken . TokenLink <$> t
{-LexemeWhite (unCell -> "") -> acc-}
-- LexemeWhite (unCell -> Text.all (==' ') -> True) -> acc
LexemeWhite t -> appendPairsText acc t
LexemeAlphaNum t -> appendPairsText acc t
LexemeOther t -> appendPairsText acc t
LexemeTree t -> appendPairsToken acc t
LexemeEnd -> acc
appendLexemes :: Pairs -> [Lexeme] -> Pairs
appendLexemes = foldr appendLexeme
-- * Type 'Lexeme'
-- | 'Lexeme's cut the input in the longest chunks of common semantic,
-- this enables 'orientLexemePairAny' to work with a more meaningful context.
data Lexeme
= LexemePairOpen !(NonEmpty (Cell Pair))
| LexemePairClose !(NonEmpty (Cell Pair))
| LexemePairAny !(NonEmpty (Cell Pair))
-- ^ orientation depending on the surrounding 'Lexeme's,
-- see 'orientLexemePairAny'
| LexemePairBoth !(NonEmpty (Cell Pair))
| LexemeEscape !(Cell Char)
| LexemeLink !(Cell TL.Text)
| LexemeWhite !(Cell TL.Text)
| LexemeAlphaNum !(Cell TL.Text)
| LexemeOther !(Cell TL.Text)
| LexemeTree !(Tree (Cell Node)) -- FIXME: useless?
| LexemeEnd
deriving (Eq, Show)
instance Pretty Lexeme
parseTokens :: [Lexeme] -> Tokens
parseTokens ps =
closePairs $
appendLexemes mempty $
-- debug0 "Lexemes (post orient)" $
orientLexemePairAny $ LexemeEnd :
ps
parseLexemes ::
Cell TL.Text ->
Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme]
parseLexemes = runParserOnCell (p_Lexemes <* P.eof)
-- | Parse 'Lexeme's, returning them in reverse order
-- to apply 'orientLexemePairAny'.
p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme]
p_Lexemes = debugParser "Lexemes" $ go []
where
go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme]
go acc =
(P.eof $> acc) <|>
(p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
-- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme',
-- so that it can try to orient nearby 'LexemePairAny'
-- to 'LexemePairOpen' or 'LexemePairClose'.
orientLexemePairAny :: [Lexeme] -> [Lexeme]
orientLexemePairAny = \case
-- LexemeOther (Cell _bx ex x):LexemeOther (Cell by _ey y):acc -> LexemeOther (Cell by ex (x<>y)):acc
-- "
t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc
w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc
-- "
LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc
LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
LexemePairAny p:[] -> LexemePairOpen p:[]
-- ,,,"
LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[]
-- ",,,
w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc
-- ",,,AAA
an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
an@LexemeEscape{} :a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
-- ,,,"AAA
an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
an@LexemeEscape{} :LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
-- ")
c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
-- ("
LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
-- "(
o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
-- )"
LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairClose p:c:acc
-- AAA#AAA
a1@LexemeAlphaNum{}:LexemePairAny p:a2@LexemeAlphaNum{}:acc ->
a1:LexemeOther (sconcat (((fst . pairBordersDouble) <$>) <$> p)):a2:acc
acc -> acc
p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme
p_Lexeme = debugParser "Lexeme" $
P.choice
[ P.try $ LexemeWhite <$> p_Cell p_Spaces1
, P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny)
, P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle)
, P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen <|> P.try p_BackOpen)
, P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
, P.try $ LexemeEscape <$> p_Cell p_Escape
, P.try $ LexemeLink <$> p_Cell p_Link
, P.try $ LexemeAlphaNum <$> p_Cell p_AlphaNums1
, LexemeOther <$> p_Cell (TL.singleton <$> P.anyChar)
]
p_some :: Parser e s a -> Parser e s (NonEmpty a)
p_some p = NonEmpty.fromList <$> P.some p
pairAny :: Char -> Maybe Pair
pairAny = \case
'-' -> Just PairDash
'/' -> Just PairSlash
'"' -> Just PairDoublequote
'\'' -> Just PairSinglequote
'`' -> Just PairBackquote
'_' -> Just PairUnderscore
'*' -> Just PairStar
'#' -> Just $ PairTag False
'@' -> Just $ PairAt False
_ -> Nothing
pairOpen :: Char -> Maybe Pair
pairOpen = \case
'(' -> Just PairParen
'[' -> Just PairBracket
'{' -> Just PairBrace
'«' -> Just PairFrenchquote
_ -> Nothing
pairClose :: Char -> Maybe Pair
pairClose = \case
')' -> Just PairParen
']' -> Just PairBracket
'}' -> Just PairBrace
'»' -> Just PairFrenchquote
_ -> Nothing
p_BackOpen :: P.Tokens s ~ TL.Text => Parser e s Pair
p_BackOpen = debugParser "BackOpen" $
P.char '~'
*> (PairAt True <$ P.char '@'
<|> PairTag True <$ P.char '#')
p_Escape :: Parser e s Char
p_Escape = P.char '\\' *> P.satisfy Char.isPrint
p_Link :: P.Tokens s ~ TL.Text => Parser e s TL.Text
p_Link =
P.try (P.char '<' *> p <* P.char '>') <|>
p
where
p :: P.Tokens s ~ TL.Text => Parser e s TL.Text
p =
(\scheme addr -> scheme <> "//" <> addr)
<$> P.option "" (P.try p_scheme)
<* P.string "//"
<*> p_addr
p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text
p_scheme =
(<> ":")
<$> (P.takeWhile1P (Just "scheme") $ \c ->
Char.isAlphaNum c
|| c=='_'
|| c=='-'
|| c=='+')
<* P.char ':'
p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text
p_addr =
P.takeWhileP (Just "addr") $ \c ->
Char.isAlphaNum c
|| c=='%'
|| c=='/'
|| c=='('
|| c==')'
|| c=='-'
|| c=='_'
|| c=='.'
|| c=='#'
|| c=='?'
|| c=='='
|| c=='@'
pairBorders :: Foldable f => Pair -> f a -> (TL.Text,TL.Text)
pairBorders p ts | null ts = pairBordersSingle p
| otherwise = pairBordersDouble p
pairBordersSingle :: Pair -> (TL.Text,TL.Text)
pairBordersSingle = \case
PairElem n as ->
("<"<>n<>foldMap f as<>"/>","")
where f (elemAttr_white,ElemAttr{..}) =
elemAttr_white <>
elemAttr_name <>
elemAttr_open <>
elemAttr_value <>
elemAttr_close
p -> pairBordersDouble p
pairBordersDouble :: Pair -> (TL.Text,TL.Text)
pairBordersDouble = \case
PairElem n as -> ("<"<>n<>foldMap f as<>">",""<>n<>">")
where f (elemAttr_white,ElemAttr{..}) =
elemAttr_white <>
elemAttr_name <>
elemAttr_open <>
elemAttr_value <>
elemAttr_close
PairTag isBackref | isBackref -> ("^#","#")
| otherwise -> ("#","#")
PairAt isBackref | isBackref -> ("^@","@")
| otherwise -> ("@","@")
PairStar -> ("*","*")
PairSlash -> ("/","/")
PairUnderscore -> ("_","_")
PairDash -> ("-","-")
PairBackquote -> ("`","`")
PairSinglequote -> ("'","'")
PairDoublequote -> ("\"","\"")
PairFrenchquote -> ("«","»")
PairParen -> ("(",")")
PairBrace -> ("{","}")
PairBracket -> ("[","]")
-- * Class 'ParseRef'
class ParseRef a where
parseRef :: a -> Maybe (Cell Ref, a)
instance ParseRef Tokens where
parseRef ts =
case Seq.viewl ts of
EmptyL -> Nothing
Tree0 (Cell loc0@(Span _f0 _b0 e0:|_l0) n) :< ns ->
case n of
NodeToken (TokenText t) ->
case parseRef $ Cell loc0 t of
Nothing -> Nothing
Just (t0,r0) ->
if TL.null $ unCell r0
then
case parseRef ns of
Just (t1@(Cell (Span _f1 b1 _e1:|_l1) _), r1) | e0 == b1 ->
Just (t0<>t1, r1)
_ -> Just (t0, ns)
else Just (t0, pure n0 `unionTokens` ns)
where n0 = Tree0 $ NodeToken . TokenText <$> r0
_ -> Nothing
_ -> Nothing
instance ParseRef (Cell TL.Text) where
parseRef (Cell (Span fp bp ep:|sp) t)
| (w,r) <- TL.span isTagChar t
, not $ TL.null w
, ew <- pos_column bp + sum (Text.length <$> TL.toChunks w) =
Just
( Cell (Span fp bp bp{pos_column=ew}:|sp) w
, Cell (Span fp bp{pos_column=ew} ep:|sp) r )
parseRef _ = Nothing
isTagChar :: Char -> Bool
isTagChar c =
Char.isAlphaNum c ||
c=='·' ||
case Char.generalCategory c of
Char.DashPunctuation -> True
Char.ConnectorPunctuation -> True
_ -> False
{-
-- | Build 'Tokens' from many 'Token's.
tokens :: [Cell Token] -> Tokens
tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts
-- | Build 'Tokens' from one 'Token'.
tokens1 :: Tree (Cell Node) -> Tokens
tokens1 = Seq.singleton
unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens))
unTokenElem toks =
case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
[Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts))
_ -> Nothing
isTokenElem :: Tokens -> Bool
isTokenElem toks =
case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
[Tree (unCell -> NodePair PairElem{}) _] -> True
_ -> False
-}