{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE Rank2Types #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeFamilies #-} {-# OPTIONS_GHC -fno-warn-orphans #-} module Hdoc.TCT.Read.Token where import Control.Applicative (Applicative(..), Alternative(..)) import Control.Monad (Monad(..)) import Data.Bool import Data.Char (Char) import Data.Either (Either(..)) import Data.Eq (Eq(..)) import Data.Foldable (Foldable(..)) import Data.Function (($), (.)) import Data.Functor ((<$>), ($>)) import Data.List.NonEmpty (NonEmpty(..)) import Data.Maybe (Maybe(..)) import Data.Monoid (Monoid(..)) import Data.Semigroup (Semigroup(..)) import Data.Sequence (ViewL(..), ViewR(..), (<|)) import Data.TreeSeq.Strict (Tree(..), Trees) import Data.Tuple (fst,snd) import Data.Void (Void) import Prelude (Num(..)) import Text.Show (Show(..)) import qualified Data.Char as Char import qualified Data.List.NonEmpty as NonEmpty import qualified Data.Sequence as Seq import qualified Data.Text as Text import qualified Data.Text.Lazy as TL import qualified Text.Megaparsec as P import qualified Text.Megaparsec.Char as P import Hdoc.TCT.Debug import Hdoc.TCT.Cell import Hdoc.TCT.Elem import Hdoc.TCT.Tree import Hdoc.TCT.Read.Elem import Hdoc.TCT.Read.Cell -- * Type 'Pairs' -- | Right-only Dyck language, -- to keep track of opened 'Pair's. type Pairs = (Tokens,[Opening]) type Tokens = Trees (Cell Node) -- ** Type 'Opening' -- | An opened 'Pair' and its content so far. type Opening = (Cell Pair,Tokens) appendPairsToken :: Pairs -> Tree (Cell Node) -> Pairs appendPairsToken ps t = appendPairsTokens ps (pure t) appendPairsText :: Pairs -> Cell TL.Text -> Pairs appendPairsText ps (Cell sp t) = appendPairsToken ps $ Tree0 $ Cell sp $ NodeToken $ TokenText t appendPairsTokens :: Pairs -> Tokens -> Pairs appendPairsTokens (ts,[]) toks = (ts`unionTokens`toks,[]) appendPairsTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0`unionTokens`toks):ps) -- | Unify two 'Tokens', merging border 'TokenText's if any. unionTokens :: Tokens -> Tokens -> Tokens unionTokens x y = case (Seq.viewr x, Seq.viewl y) of (xs :> x0, y0 :< ys) -> case (x0,y0) of ( Tree (Cell (Span fx bx _ex:| lx) (NodeToken (TokenText tx))) tsx , Tree (Cell (Span _fy _by ey:|_ly) (NodeToken (TokenText ty))) tsy ) -> xs `unionTokens` pure (Tree (Cell (Span fx bx ey:|lx) $ NodeToken $ TokenText $ tx <> ty) (tsx<>tsy)) `unionTokens` ys _ -> x <> y (EmptyR, _) -> y (_, EmptyL) -> x unionsTokens :: Foldable f => f Tokens -> Tokens unionsTokens = foldl' unionTokens mempty openPair :: Pairs -> Cell Pair -> Pairs openPair (t,ps) p = (t,(p,mempty):ps) -- | Close a 'Pair' when there is a matching 'LexemePairClose'. closePair :: Pairs -> Cell Pair -> Pairs closePair ps@(_,[]) (Cell loc p) = -- debug0 "closePair" $ appendPairsText ps $ Cell loc $ snd $ pairBordersDouble p closePair (t,(cx@(Cell (Span fx bx _ex:| lx) px),t1):ts) cy@(Cell (Span _fy _by ey:|_ly) py) = -- debug0 "closePair" $ case (px,py) of (PairElem nx ax, PairElem ny ay) | nx == ny -> appendPairsToken (t,ts) $ Tree (Cell (Span fx bx ey:|lx) $ NodePair $ PairElem nx as) t1 where as | null ay = ax | otherwise = ax<>ay _ | px == py -> appendPairsToken (t,ts) $ Tree (Cell (Span fx bx ey:|lx) $ NodePair px) t1 _ -> (`closePair` cy) $ appendPairsTokens (t,ts) (closeImpaired mempty (cx,t1)) -- | Close a 'Pair' when there is no matching 'LexemePairClose'. closeImpaired :: Tokens -> (Cell Pair, Tokens) -> Tokens closeImpaired acc (Cell loc@(s0:|lp) pair, toks) = -- debug0 "closeImpaired" $ case pair of -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'. PairHash | Just (Cell (Span{span_end}:|_lt) tag, rest) <- tagFrom body -> Tree0 (Cell (s0{span_end}:|lp) $ NodeToken $ TokenTag tag) <| rest -- NOTE: use bp (not bt) to include the '#' _ -> pure open `unionTokens` body where body = toks `unionTokens` acc open = Tree0 $ Cell loc $ NodeToken $ TokenText $ fst $ pairBordersDouble pair -- | Close remaining 'Pair's at end of parsing. closePairs :: Pairs -> Tokens closePairs (t0,ps) = -- debug0 "closePairs" $ t0 `unionTokens` foldl' closeImpaired mempty ps appendLexeme :: Lexeme -> Pairs -> Pairs appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc -> case lex of LexemePairOpen ps -> foldl' open acc ps where -- NOTE: insert an empty node to encode , not open a p@(Cell (Span{span_end, ..}:|sp) PairElem{}) = openPair a p `appendPairsText` Cell (Span{span_begin=span_end, ..}:|sp) "" open a p = openPair a p LexemePairClose ps -> foldl' closePair acc ps LexemePairAny ps -> foldl' openPair acc ps {- LexemePairAny ps -> appendPairsText acc $ sconcat $ ((fst . pairBordersSingle) <$>) <$> ps -} LexemePairBoth ps -> appendPairsTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps LexemeEscape c -> appendPairsToken acc $ Tree0 $ NodeToken . TokenEscape <$> c LexemeLink t -> appendPairsToken acc $ Tree0 $ NodeToken . TokenLink <$> t {-LexemeWhite (unCell -> "") -> acc-} -- LexemeWhite (unCell -> Text.all (==' ') -> True) -> acc LexemeWhite t -> appendPairsText acc t LexemeAlphaNum t -> appendPairsText acc t LexemeOther t -> appendPairsText acc t LexemeTree t -> appendPairsToken acc t LexemeEnd -> acc appendLexemes :: Pairs -> [Lexeme] -> Pairs appendLexemes = foldr appendLexeme -- * Type 'Lexeme' -- | 'Lexeme's cut the input in the longest chunks of common semantic, -- this enables 'orientLexemePairAny' to work with a more meaningful context. data Lexeme = LexemePairOpen !(NonEmpty (Cell Pair)) | LexemePairClose !(NonEmpty (Cell Pair)) | LexemePairAny !(NonEmpty (Cell Pair)) -- ^ orientation depending on the surrounding 'Lexeme's, -- see 'orientLexemePairAny' | LexemePairBoth !(NonEmpty (Cell Pair)) | LexemeEscape !(Cell Char) | LexemeLink !(Cell TL.Text) | LexemeWhite !(Cell TL.Text) | LexemeAlphaNum !(Cell TL.Text) | LexemeOther !(Cell TL.Text) | LexemeTree !(Tree (Cell Node)) | LexemeEnd deriving (Eq, Show) instance Pretty Lexeme parseTokens :: [Lexeme] -> Tokens parseTokens ps = closePairs $ appendLexemes mempty $ -- debug0 "Lexemes (post orient)" $ orientLexemePairAny $ LexemeEnd : ps parseLexemes :: Cell TL.Text -> Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme] parseLexemes = runParserOnCell (p_Lexemes <* P.eof) -- | Parse 'Lexeme's, returning them in reverse order -- to apply 'orientLexemePairAny'. p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme] p_Lexemes = debugParser "Lexemes" $ go [] where go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme] go acc = (P.eof $> acc) <|> (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc) -- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme', -- so that it can try to orient nearby 'LexemePairAny' -- to 'LexemePairOpen' or 'LexemePairClose'. orientLexemePairAny :: [Lexeme] -> [Lexeme] orientLexemePairAny = \case -- LexemeOther (Cell _bx ex x):LexemeOther (Cell by _ey y):acc -> LexemeOther (Cell by ex (x<>y)):acc -- "    t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc --    " LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc LexemePairAny p:[] -> LexemePairOpen p:[] --    ,,," LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[] -- ",,,    w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc -- ",,,AAA an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc -- ,,,"AAA an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc -- ") c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc -- (" LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc -- "( o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc -- )" LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairClose p:c:acc acc -> acc p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme p_Lexeme = debugParser "Lexeme" $ P.choice [ P.try $ LexemeWhite <$> p_Cell p_Spaces1 , P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny) , P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle) , P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen) , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose) , P.try $ LexemeEscape <$> p_Cell p_Escape , P.try $ LexemeLink <$> p_Cell p_Link , P.try $ LexemeAlphaNum <$> p_Cell (P.takeWhile1P (Just "AlphaNum") Char.isAlphaNum) , LexemeOther <$> p_Cell (TL.singleton <$> P.anyChar) ] p_some :: Parser e s a -> Parser e s (NonEmpty a) p_some p = NonEmpty.fromList <$> P.some p pairAny :: Char -> Maybe Pair pairAny = \case '-' -> Just PairDash '/' -> Just PairSlash '"' -> Just PairDoublequote '\'' -> Just PairSinglequote '`' -> Just PairBackquote '_' -> Just PairUnderscore '*' -> Just PairStar '#' -> Just PairHash _ -> Nothing pairOpen :: Char -> Maybe Pair pairOpen = \case '(' -> Just PairParen '[' -> Just PairBracket '{' -> Just PairBrace '«' -> Just PairFrenchquote _ -> Nothing pairClose :: Char -> Maybe Pair pairClose = \case ')' -> Just PairParen ']' -> Just PairBracket '}' -> Just PairBrace '»' -> Just PairFrenchquote _ -> Nothing p_Escape :: Parser e s Char p_Escape = P.char '\\' *> P.satisfy Char.isPrint p_Link :: P.Tokens s ~ TL.Text => Parser e s TL.Text p_Link = P.try (P.char '<' *> p <* P.char '>') <|> p where p :: P.Tokens s ~ TL.Text => Parser e s TL.Text p = (\scheme addr -> scheme <> "//" <> addr) <$> P.option "" (P.try p_scheme) <* P.string "//" <*> p_addr p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text p_scheme = (<> ":") <$> (P.takeWhile1P (Just "scheme") $ \c -> Char.isAlphaNum c || c=='_' || c=='-' || c=='+') <* P.char ':' p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text p_addr = P.takeWhileP (Just "addr") $ \c -> Char.isAlphaNum c || c=='%' || c=='/' || c=='(' || c==')' || c=='-' || c=='_' || c=='.' || c=='#' || c=='?' || c=='=' || c=='@' pairBorders :: Foldable f => Pair -> f a -> (TL.Text,TL.Text) pairBorders p ts | null ts = pairBordersSingle p | otherwise = pairBordersDouble p pairBordersSingle :: Pair -> (TL.Text,TL.Text) pairBordersSingle = \case PairElem n as -> ("<"<>n<>foldMap f as<>"/>","") where f (elemAttr_white,ElemAttr{..}) = elemAttr_white <> elemAttr_name <> elemAttr_open <> elemAttr_value <> elemAttr_close p -> pairBordersDouble p pairBordersDouble :: Pair -> (TL.Text,TL.Text) pairBordersDouble = \case PairElem n as -> ("<"<>n<>foldMap f as<>">","n<>">") where f (elemAttr_white,ElemAttr{..}) = elemAttr_white <> elemAttr_name <> elemAttr_open <> elemAttr_value <> elemAttr_close PairHash -> ("#","#") PairStar -> ("*","*") PairSlash -> ("/","/") PairUnderscore -> ("_","_") PairDash -> ("-","-") PairBackquote -> ("`","`") PairSinglequote -> ("'","'") PairDoublequote -> ("\"","\"") PairFrenchquote -> ("«","»") PairParen -> ("(",")") PairBrace -> ("{","}") PairBracket -> ("[","]") -- * Class 'TagFrom' class TagFrom a where tagFrom :: a -> Maybe (Cell Tag, a) instance TagFrom Tokens where tagFrom ts = case Seq.viewl ts of EmptyL -> Nothing Tree0 (Cell loc0@(Span _f0 _b0 e0:|_l0) n) :< ns -> case n of NodeToken (TokenText t) -> case tagFrom $ Cell loc0 t of Nothing -> Nothing Just (t0,r0) -> if TL.null $ unCell r0 then case tagFrom ns of Just (t1@(Cell (Span _f1 b1 _e1:|_l1) _), r1) | e0 == b1 -> Just (t0<>t1, r1) _ -> Just (t0, ns) else Just (t0, pure n0 `unionTokens` ns) where n0 = Tree0 $ NodeToken . TokenText <$> r0 _ -> Nothing _ -> Nothing instance TagFrom (Cell TL.Text) where tagFrom (Cell (Span fp bp ep:|sp) t) | (w,r) <- TL.span isTagChar t , not $ TL.null w , ew <- pos_column bp + sum (Text.length <$> TL.toChunks w) = Just ( Cell (Span fp bp bp{pos_column=ew}:|sp) w , Cell (Span fp bp{pos_column=ew} ep:|sp) r ) tagFrom _ = Nothing isTagChar :: Char -> Bool isTagChar c = Char.isAlphaNum c || c=='·' || case Char.generalCategory c of Char.DashPunctuation -> True Char.ConnectorPunctuation -> True _ -> False {- -- | Build 'Tokens' from many 'Token's. tokens :: [Cell Token] -> Tokens tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts -- | Build 'Tokens' from one 'Token'. tokens1 :: Tree (Cell Node) -> Tokens tokens1 = Seq.singleton unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens)) unTokenElem toks = case toList $ {-Seq.dropWhileR isTokenWhite-} toks of [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts)) _ -> Nothing isTokenElem :: Tokens -> Bool isTokenElem toks = case toList $ {-Seq.dropWhileR isTokenWhite-} toks of [Tree (unCell -> NodePair PairElem{}) _] -> True _ -> False -}