{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE Rank2Types #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeFamilies #-} {-# LANGUAGE ViewPatterns #-} {-# OPTIONS_GHC -fno-warn-orphans #-} module Language.TCT.Read.Token where import Control.Applicative (Applicative(..), Alternative(..)) import Control.Monad (Monad(..)) import Data.Bool import Data.Char (Char) import Data.Either (Either(..)) import Data.Eq (Eq(..)) import Data.Foldable (Foldable(..)) import Data.Function (($), (.)) import Data.Functor ((<$>), ($>)) import Data.List.NonEmpty (NonEmpty(..)) import Data.Maybe (Maybe(..)) import Data.Monoid (Monoid(..)) import Data.Semigroup (Semigroup(..)) import Data.Sequence (ViewL(..), ViewR(..), (<|)) import Data.String (String) import Data.TreeSeq.Strict (Tree(..), Trees) import Data.Tuple (fst,snd) import Data.Void (Void) import Prelude (Num(..)) import Text.Show (Show(..)) import qualified Data.Char as Char import qualified Data.List.NonEmpty as NonEmpty import qualified Data.Sequence as Seq import qualified Data.Text as Text import qualified Data.Text.Lazy as TL import qualified Text.Megaparsec as P import qualified Text.Megaparsec.Char as P import Language.TCT.Debug import Language.TCT.Cell import Language.TCT.Elem import Language.TCT.Tree import Language.TCT.Read.Elem import Language.TCT.Read.Cell -- * Type 'Pairs' -- | Right-only Dyck language, -- to keep track of opened 'Pair's. type Pairs = (Tokens,[Opening]) type Tokens = Trees (Cell Node) -- ** Type 'Opening' -- | An opened 'Pair' and its content so far. type Opening = (Cell Pair,Tokens) appendPairsToken :: Pairs -> Tree (Cell Node) -> Pairs appendPairsToken ps t = appendPairsTokens ps (pure t) appendPairsText :: Pairs -> Cell TL.Text -> Pairs appendPairsText ps (Cell bp ep t) = appendPairsToken ps $ Tree0 $ Cell bp ep $ NodeToken $ TokenText t appendPairsTokens :: Pairs -> Tokens -> Pairs appendPairsTokens (ts,[]) toks = (ts`unionTokens`toks,[]) appendPairsTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0`unionTokens`toks):ps) -- | Unify two 'Tokens', merging border 'TokenText's if any. unionTokens :: Tokens -> Tokens -> Tokens unionTokens x y = case (Seq.viewr x, Seq.viewl y) of (xs :> x0, y0 :< ys) -> case (x0,y0) of ( Tree (Cell bx _ex (NodeToken (TokenText tx))) sx , Tree (Cell _by ey (NodeToken (TokenText ty))) sy ) -> xs `unionTokens` pure (Tree (Cell bx ey $ NodeToken $ TokenText $ tx <> ty) (sx<>sy)) `unionTokens` ys _ -> x <> y (EmptyR, _) -> y (_, EmptyL) -> x unionsTokens :: Foldable f => f Tokens -> Tokens unionsTokens = foldl' unionTokens mempty openPair :: Pairs -> Cell Pair -> Pairs openPair (t,ps) p = (t,(p,mempty):ps) -- | Close a 'Pair' when there is a matching 'LexemePairClose'. closePair :: Pairs -> Cell Pair -> Pairs closePair ps@(_,[]) (Cell bp ep p) = -- debug0 "closePair" $ appendPairsText ps $ Cell bp ep $ snd $ pairBordersDouble p closePair (t,(p1,t1):ts) p = -- debug0 "closePair" $ case (p1,p) of (Cell bx _ex (PairElem nx ax), Cell _by ey (PairElem ny ay)) | nx == ny -> appendPairsToken (t,ts) $ Tree (Cell bx ey $ NodePair $ PairElem nx as) t1 where as | null ay = ax | otherwise = ax<>ay (Cell bx _ex x, Cell _by ey y) | x == y -> appendPairsToken (t,ts) $ Tree (Cell bx ey $ NodePair x) t1 _ -> (`closePair` p) $ appendPairsTokens (t,ts) (closeImpaired mempty (p1,t1)) -- | Close a 'Pair' when there is no matching 'LexemePairClose'. closeImpaired :: Tokens -> (Cell Pair, Tokens) -> Tokens closeImpaired acc (Cell bp ep pair, toks) = -- debug0 "closeImpaired" $ case pair of -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'. PairHash | Just (Cell _bt et tag, rest) <- tagFrom body -> Tree0 (Cell bp et $ NodeToken $ TokenTag tag) <| rest -- NOTE: use bp (not bt) to include the '#' _ -> pure open `unionTokens` body where body = toks `unionTokens` acc open = Tree0 $ Cell bp ep $ NodeToken $ TokenText $ fst $ pairBordersDouble pair -- | Close remaining 'Pair's at end of parsing. closePairs :: Pairs -> Tokens closePairs (t0,ps) = -- debug0 "closePairs" $ t0 `unionTokens` foldl' closeImpaired mempty ps appendLexeme :: Lexeme -> Pairs -> Pairs appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc -> case lex of LexemePairOpen ps -> foldl' open acc ps where -- NOTE: insert an empty node to encode , not open a p@(Cell _bp ep (PairElem{})) = openPair a p `appendPairsText` Cell ep ep "" open a p = openPair a p LexemePairClose ps -> foldl' closePair acc ps LexemePairAny ps -> foldl' openPair acc ps {- LexemePairAny ps -> appendPairsText acc $ sconcat $ ((fst . pairBordersSingle) <$>) <$> ps -} LexemePairBoth ps -> appendPairsTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps LexemeEscape c -> appendPairsToken acc $ Tree0 $ NodeToken . TokenEscape <$> c LexemeLink t -> appendPairsToken acc $ Tree0 $ NodeToken . TokenLink <$> t {-LexemeWhite (unCell -> "") -> acc-} -- LexemeWhite (unCell -> Text.all (==' ') -> True) -> acc LexemeWhite t -> appendPairsText acc t LexemeAlphaNum t -> appendPairsText acc t LexemeOther t -> appendPairsText acc t LexemeTree t -> appendPairsToken acc t LexemeEnd -> acc appendLexemes :: Pairs -> [Lexeme] -> Pairs appendLexemes = foldr appendLexeme -- * Type 'Lexeme' -- | 'Lexeme's cut the input in the longest chunks of common semantic, -- this enables 'orientLexemePairAny' to work with a more meaningful context. data Lexeme = LexemePairOpen !(NonEmpty (Cell Pair)) | LexemePairClose !(NonEmpty (Cell Pair)) | LexemePairAny !(NonEmpty (Cell Pair)) -- ^ orientation depending on the surrounding 'Lexeme's, -- see 'orientLexemePairAny' | LexemePairBoth !(NonEmpty (Cell Pair)) | LexemeEscape !(Cell Char) | LexemeLink !(Cell TL.Text) | LexemeWhite !(Cell TL.Text) | LexemeAlphaNum !(Cell TL.Text) | LexemeOther !(Cell TL.Text) | LexemeTree !(Tree (Cell Node)) | LexemeEnd deriving (Eq, Show) instance Pretty Lexeme parseTokens :: [Lexeme] -> Tokens parseTokens ps = closePairs $ appendLexemes mempty $ -- debug0 "Lexemes (post orient)" $ orientLexemePairAny $ LexemeEnd : ps parseLexemes :: String -> Cell TL.Text -> Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme] parseLexemes inp = runParserOnCell inp (p_Lexemes <* P.eof) -- | Parse 'Lexeme's, returning them in reverse order -- to apply 'orientLexemePairAny'. p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme] p_Lexemes = debugParser "Lexemes" $ go [] where go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme] go acc = (P.eof $> acc) <|> (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc) -- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme', -- so that it can try to orient nearby 'LexemePairAny' -- to 'LexemePairOpen' or 'LexemePairClose'. orientLexemePairAny :: [Lexeme] -> [Lexeme] orientLexemePairAny = \case -- LexemeOther (Cell _bx ex x):LexemeOther (Cell by _ey y):acc -> LexemeOther (Cell by ex (x<>y)):acc -- "    t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc --    " LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc LexemePairAny p:[] -> LexemePairOpen p:[] --    ,,," LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[] -- ",,,    w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc -- ",,,AAA an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc -- ,,,"AAA an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc -- ") c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc -- (" LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc -- "( o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc -- )" LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairClose p:c:acc acc -> acc p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme p_Lexeme = debugParser "Lexeme" $ P.choice [ P.try $ LexemeWhite <$> p_Cell p_Spaces1 , P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny) , P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle) , P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen) , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose) , P.try $ LexemeEscape <$> p_Cell p_Escape , P.try $ LexemeLink <$> p_Cell p_Link , P.try $ LexemeAlphaNum <$> p_Cell (P.takeWhile1P (Just "AlphaNum") Char.isAlphaNum) , LexemeOther <$> p_Cell (TL.singleton <$> P.anyChar) ] p_some :: Parser e s a -> Parser e s (NonEmpty a) p_some p = NonEmpty.fromList <$> P.some p pairAny :: Char -> Maybe Pair pairAny = \case '-' -> Just PairDash '/' -> Just PairSlash '"' -> Just PairDoublequote '\'' -> Just PairSinglequote '`' -> Just PairBackquote '_' -> Just PairUnderscore '*' -> Just PairStar '#' -> Just PairHash _ -> Nothing pairOpen :: Char -> Maybe Pair pairOpen = \case '(' -> Just PairParen '[' -> Just PairBracket '{' -> Just PairBrace '«' -> Just PairFrenchquote _ -> Nothing pairClose :: Char -> Maybe Pair pairClose = \case ')' -> Just PairParen ']' -> Just PairBracket '}' -> Just PairBrace '»' -> Just PairFrenchquote _ -> Nothing p_Escape :: Parser e s Char p_Escape = P.char '\\' *> P.satisfy Char.isPrint p_Link :: P.Tokens s ~ TL.Text => Parser e s TL.Text p_Link = P.try (P.char '<' *> p <* P.char '>') <|> p where p :: P.Tokens s ~ TL.Text => Parser e s TL.Text p = (\scheme addr -> scheme <> "//" <> addr) <$> P.option "" (P.try p_scheme) <* P.string "//" <*> p_addr p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text p_scheme = (<> ":") <$> (P.takeWhile1P (Just "scheme") $ \c -> Char.isAlphaNum c || c=='_' || c=='-' || c=='+') <* P.char ':' p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text p_addr = P.takeWhileP (Just "addr") $ \c -> Char.isAlphaNum c || c=='%' || c=='/' || c=='(' || c==')' || c=='-' || c=='_' || c=='.' || c=='#' || c=='?' || c=='=' pairBorders :: Foldable f => Pair -> f a -> (TL.Text,TL.Text) pairBorders p ts | null ts = pairBordersSingle p | otherwise = pairBordersDouble p pairBordersSingle :: Pair -> (TL.Text,TL.Text) pairBordersSingle = \case PairElem n as -> ("<"<>n<>foldMap f as<>"/>","") where f (elemAttr_white,ElemAttr{..}) = elemAttr_white <> elemAttr_name <> elemAttr_open <> elemAttr_value <> elemAttr_close p -> pairBordersDouble p pairBordersDouble :: Pair -> (TL.Text,TL.Text) pairBordersDouble = \case PairElem n as -> ("<"<>n<>foldMap f as<>">","n<>">") where f (elemAttr_white,ElemAttr{..}) = elemAttr_white <> elemAttr_name <> elemAttr_open <> elemAttr_value <> elemAttr_close PairHash -> ("#","#") PairStar -> ("*","*") PairSlash -> ("/","/") PairUnderscore -> ("_","_") PairDash -> ("-","-") PairBackquote -> ("`","`") PairSinglequote -> ("'","'") PairDoublequote -> ("\"","\"") PairFrenchquote -> ("«","»") PairParen -> ("(",")") PairBrace -> ("{","}") PairBracket -> ("[","]") -- * Class 'TagFrom' class TagFrom a where tagFrom :: a -> Maybe (Cell Tag, a) instance TagFrom Tokens where tagFrom ts = case Seq.viewl ts of EmptyL -> Nothing Tree0 (Cell b0 e0 n) :< ns -> case n of NodeToken (TokenText t) -> case tagFrom $ Cell b0 e0 t of Nothing -> Nothing Just (t0,r0) -> if TL.null (unCell r0) then case tagFrom ns of Just (t1@(Cell b1 _e1 _), r1) | e0 == b1 -> Just (t0<>t1, r1) _ -> Just (t0, pure n0 `unionTokens` ns) else Just (t0, pure n0 `unionTokens` ns) where n0 = (Tree0 $ NodeToken . TokenText <$> r0) _ -> Nothing _ -> Nothing instance TagFrom (Cell TL.Text) where tagFrom (Cell bp ep t) | (w,r) <- TL.span isTagChar t , not $ TL.null w , ew <- pos_column bp + sum (Text.length <$> (TL.toChunks w)) = Just ( Cell bp bp{pos_column=ew} w , Cell bp{pos_column=ew} ep r ) tagFrom _ = Nothing isTagChar :: Char -> Bool isTagChar c = Char.isAlphaNum c || c=='·' || case Char.generalCategory c of Char.DashPunctuation -> True Char.ConnectorPunctuation -> True _ -> False {- -- | Build 'Tokens' from many 'Token's. tokens :: [Cell Token] -> Tokens tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts -- | Build 'Tokens' from one 'Token'. tokens1 :: Tree (Cell Node) -> Tokens tokens1 = Seq.singleton unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens)) unTokenElem toks = case toList $ {-Seq.dropWhileR isTokenWhite-} toks of [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts)) _ -> Nothing isTokenElem :: Tokens -> Bool isTokenElem toks = case toList $ {-Seq.dropWhileR isTokenWhite-} toks of [Tree (unCell -> NodePair PairElem{}) _] -> True _ -> False -}