{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE Rank2Types #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeFamilies #-} {-# LANGUAGE ViewPatterns #-} {-# OPTIONS_GHC -fno-warn-orphans #-} module Language.TCT.Read.Token where import Control.Applicative (Applicative(..), Alternative(..)) import Control.Monad (Monad(..)) import Data.Bool import Data.Char (Char) import Data.Either (Either(..)) import Data.Eq (Eq(..)) import Data.Foldable (Foldable(..)) import Data.Function (($), (.)) import Data.Functor ((<$>), ($>)) import Data.List.NonEmpty (NonEmpty(..)) import Data.Maybe (Maybe(..)) import Data.Monoid (Monoid(..)) import Data.Semigroup (Semigroup(..)) import Data.Sequence (ViewL(..), ViewR(..), (<|), (|>)) import Data.String (String) import Data.TreeSeq.Strict (Tree(..), Trees) import Data.Tuple (fst,snd) import Data.Void (Void) import Prelude (Num(..)) import Text.Show (Show(..)) import qualified Data.Char as Char import qualified Data.List.NonEmpty as NonEmpty import qualified Data.Sequence as Seq import qualified Data.Text as Text import qualified Data.Text.Lazy as TL import qualified Text.Megaparsec as P import qualified Text.Megaparsec.Char as P import Language.TCT.Debug import Language.TCT.Cell import Language.TCT.Elem import Language.TCT.Tree import Language.TCT.Read.Elem import Language.TCT.Read.Cell -- * Type 'Pairs' -- | Right-only Dyck language, -- to keep track of opened 'Pair's. type Pairs = (Tokens,[Opening]) type Tokens = Trees (Cell Node) -- ** Type 'Opening' -- | An opened 'Pair' and its content so far. type Opening = (Cell Pair,Tokens) appendToken :: Pairs -> Tree (Cell Node) -> Pairs appendToken (ts,[]) tok = (ts|>tok,[]) appendToken (ts,(p0,t0):ps) tok = (ts,(p0,t0|>tok):ps) appendTokens :: Pairs -> Tokens -> Pairs appendTokens (ts,[]) toks = (ts<>toks,[]) appendTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0<>toks):ps) -- | Appending 'TL.Text' is a special case -- to append at the 'TokenText' level is possible, -- instead of the higher 'NodeToken' level. appendText :: Pairs -> Cell TL.Text -> Pairs appendText ps tok = case ps of (ts,[]) -> (appendTokenText ts tok,[]) (ts,(p0,ts0):pss) -> (ts,(p0,appendTokenText ts0 tok):pss) appendTokenText :: Tokens -> Cell TL.Text -> Tokens appendTokenText ts (Cell bn en n) = {- | TL.null n = ts | otherwise = -} case Seq.viewr ts of EmptyR -> pure $ Tree0 $ Cell bn en $ NodeToken $ TokenText n is :> Tree (Cell bo _eo nod) st -> case nod of NodeToken (TokenText o) -> is |> i where i = Tree (Cell bo en (NodeToken $ TokenText (o <> n))) st _ -> ts |> Tree0 (Cell bn en $ NodeToken $ TokenText n) prependTokenText :: Tokens -> Cell TL.Text -> Tokens prependTokenText ts (Cell bn en n) {- | TL.null n = ts | otherwise-} = case Seq.viewl ts of EmptyL -> pure $ Tree0 $ Cell bn en $ NodeToken $ TokenText n Tree (Cell _bo eo nod) st :< is -> case nod of NodeToken (TokenText o) -> i <| is where i = Tree (Cell bn eo (NodeToken $ TokenText (n <> o))) st _ -> Tree0 (Cell bn en $ NodeToken $ TokenText n) <| ts openPair :: Pairs -> Cell Pair -> Pairs openPair (t,ps) p = (t,(p,mempty):ps) -- | Close a 'Pair' when there is a matching 'LexemePairClose'. closePair :: Pairs -> Cell Pair -> Pairs closePair ps@(_,[]) (Cell bp ep p) = -- debug0 "closePair" $ appendText ps $ Cell bp ep $ snd $ pairBorders p closePair (t,(p1,t1):ts) p = -- debug0 "closePair" $ case (p1,p) of (Cell bx _ex (PairElem nx ax), Cell _by ey (PairElem ny ay)) | nx == ny -> appendToken (t,ts) $ Tree (Cell bx ey $ NodePair $ PairElem nx as) t1 where as | null ay = ax | otherwise = ax<>ay (Cell bx _ex x, Cell _by ey y) | x == y -> appendToken (t,ts) $ Tree (Cell bx ey $ NodePair x) t1 _ -> (`closePair` p) $ appendTokens (t,ts) (closeImpaired mempty (p1,t1)) -- | Close a 'Pair' when there is no matching 'LexemePairClose'. closeImpaired :: Tokens -> (Cell Pair,Tokens) -> Tokens closeImpaired acc (Cell bp ep p,toks) = -- debug0 "closeImpaired" $ case p of -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'. PairHash | Just (Cell _bt et t, ts) <- tagFrom $ toks <> acc -> Tree0 (Cell bp et $ NodeToken $ TokenTag t) <| ts {- PairHash | (Tree0 (Cell bt et (TokenPlain t))) :< ts <- Seq.viewl $ toks <> acc -> case Text.span isTagChar t of ("",_) | Text.null t -> toksHash mempty <> toks <> acc | otherwise -> Tree0 (Cell bp et (TokenTag t)) <| ts (tag,t') -> let len = Text.length tag in Tree0 (Cell bp bt{columnPos = columnPos bt + len} (TokenTag tag)) <| Tree0 (Cell bt{columnPos = columnPos bt + len + 1} et (TokenPlain t')) <| ts -} _ -> prependTokenText (toks <> acc) toksHash where toksHash :: Cell TL.Text toksHash = Cell bp ep $ fst $ pairBorders p -- | Close remaining 'Pair's at end of parsing. closePairs :: Pairs -> Tokens closePairs (t0,ps) = -- debug0 "closePairs" $ t0 <> foldl' closeImpaired mempty ps appendLexeme :: Lexeme -> Pairs -> Pairs appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc -> case lex of LexemePairOpen ps -> foldl' open acc ps where -- NOTE: insert an empty node to encode , not open a p@(Cell _bp ep (PairElem{})) = openPair a p `appendToken` (Tree0 $ Cell ep ep $ NodeToken $ TokenText "") open a p = openPair a p LexemePairClose ps -> foldl' closePair acc ps LexemePairAny ps -> foldl' openPair acc ps {- LexemePairAny ps -> appendText acc $ sconcat $ ((fst . pairBordersWithoutContent) <$>) <$> ps -} LexemePairBoth ps -> appendTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps LexemeEscape c -> appendToken acc $ Tree0 $ NodeToken . TokenEscape <$> c LexemeLink t -> appendToken acc $ Tree0 $ NodeToken . TokenLink <$> t {-LexemeWhite (unCell -> "") -> acc-} -- LexemeWhite (unCell -> Text.all (==' ') -> True) -> acc LexemeWhite t -> appendText acc t LexemeAlphaNum t -> appendText acc t LexemeOther t -> appendText acc t LexemeTree t -> appendToken acc t LexemeEnd -> acc appendLexemes :: Pairs -> [Lexeme] -> Pairs appendLexemes = foldr appendLexeme -- * Type 'Lexeme' -- | 'Lexeme's cut the input in the longest chunks of common semantic, -- this enables 'orientLexemePairAny' to work with a more meaningful context. data Lexeme = LexemePairOpen !(NonEmpty (Cell Pair)) | LexemePairClose !(NonEmpty (Cell Pair)) | LexemePairAny !(NonEmpty (Cell Pair)) -- ^ orientation depending on the surrounding 'Lexeme's, -- see 'orientLexemePairAny' | LexemePairBoth !(NonEmpty (Cell Pair)) | LexemeEscape !(Cell Char) | LexemeLink !(Cell TL.Text) | LexemeWhite !(Cell TL.Text) | LexemeAlphaNum !(Cell TL.Text) | LexemeOther !(Cell TL.Text) | LexemeTree !(Tree (Cell Node)) | LexemeEnd deriving (Eq, Show) instance Pretty Lexeme parseTokens :: [Lexeme] -> Tokens parseTokens ps = closePairs $ appendLexemes mempty $ -- debug0 "Lexemes (post orient)" $ orientLexemePairAny $ LexemeEnd : ps parseLexemes :: String -> Cell TL.Text -> Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme] parseLexemes inp = runParserOnCell inp (p_Lexemes <* P.eof) -- | Parse 'Lexeme's, returning them in reverse order -- to apply 'orientLexemePairAny'. p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme] p_Lexemes = debugParser "Lexemes" $ go [] where go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme] go acc = (P.eof $> acc) <|> (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc) -- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme', -- so that it can try to orient nearby 'LexemePairAny' -- to 'LexemePairOpen' or 'LexemePairClose'. orientLexemePairAny :: [Lexeme] -> [Lexeme] orientLexemePairAny = \case -- LexemeOther (Cell _bx ex x):LexemeOther (Cell by _ey y):acc -> LexemeOther (Cell by ex (x<>y)):acc -- "    t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc --    " LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc LexemePairAny p:[] -> LexemePairOpen p:[] --    ,,," LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[] -- ",,,    w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc -- ",,,AAA an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc -- ,,,"AAA an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc -- ") c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc -- (" LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc -- "( o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc -- )" LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairClose p:c:acc acc -> acc p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme p_Lexeme = debugParser "Lexeme" $ P.choice [ P.try $ LexemeWhite <$> p_Cell p_Spaces1 , P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny) , P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle) , P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen) , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose) , P.try $ LexemeEscape <$> p_Cell p_Escape , P.try $ LexemeLink <$> p_Cell p_Link , P.try $ LexemeAlphaNum <$> p_Cell (P.takeWhile1P (Just "AlphaNum") Char.isAlphaNum) , LexemeOther <$> p_Cell (TL.singleton <$> P.anyChar) ] p_some :: Parser e s a -> Parser e s (NonEmpty a) p_some p = NonEmpty.fromList <$> P.some p pairAny :: Char -> Maybe Pair pairAny = \case '-' -> Just PairDash '/' -> Just PairSlash '"' -> Just PairDoublequote '\'' -> Just PairSinglequote '`' -> Just PairBackquote '_' -> Just PairUnderscore '*' -> Just PairStar '#' -> Just PairHash _ -> Nothing pairOpen :: Char -> Maybe Pair pairOpen = \case '(' -> Just PairParen '[' -> Just PairBracket '{' -> Just PairBrace '«' -> Just PairFrenchquote _ -> Nothing pairClose :: Char -> Maybe Pair pairClose = \case ')' -> Just PairParen ']' -> Just PairBracket '}' -> Just PairBrace '»' -> Just PairFrenchquote _ -> Nothing p_Escape :: Parser e s Char p_Escape = P.char '\\' *> P.satisfy Char.isPrint p_Link :: P.Tokens s ~ TL.Text => Parser e s TL.Text p_Link = P.try (P.char '<' *> p <* P.char '>') <|> p where p :: P.Tokens s ~ TL.Text => Parser e s TL.Text p = (\scheme addr -> scheme <> "//" <> addr) <$> P.option "" (P.try p_scheme) <* P.string "//" <*> p_addr p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text p_scheme = (<> ":") <$> (P.takeWhile1P (Just "scheme") $ \c -> Char.isAlphaNum c || c=='_' || c=='-' || c=='+') <* P.char ':' p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text p_addr = P.takeWhileP (Just "addr") $ \c -> Char.isAlphaNum c || c=='%' || c=='/' || c=='(' || c==')' || c=='-' || c=='_' || c=='.' || c=='#' || c=='?' || c=='=' -- | Build 'Tokens' from many 'Token's. tokens :: [Cell Token] -> Tokens tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts -- | Build 'Tokens' from one 'Token'. tokens1 :: Tree (Cell Node) -> Tokens tokens1 = Seq.singleton unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens)) unTokenElem toks = case toList $ {-Seq.dropWhileR isTokenWhite-} toks of [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts)) _ -> Nothing isTokenElem :: Tokens -> Bool isTokenElem toks = case toList $ {-Seq.dropWhileR isTokenWhite-} toks of [Tree (unCell -> NodePair PairElem{}) _] -> True _ -> False pairBordersWithoutContent :: Pair -> (TL.Text,TL.Text) pairBordersWithoutContent = \case PairElem n as -> ("<"<>n<>foldMap f as<>"/>","") where f (elemAttr_white,ElemAttr{..}) = elemAttr_white <> elemAttr_name <> elemAttr_open <> elemAttr_value <> elemAttr_close p -> pairBorders p pairBorders :: Pair -> (TL.Text,TL.Text) pairBorders = \case PairElem n as -> ("<"<>n<>foldMap f as<>">","n<>">") where f (elemAttr_white,ElemAttr{..}) = elemAttr_white <> elemAttr_name <> elemAttr_open <> elemAttr_value <> elemAttr_close PairHash -> ("#","#") PairStar -> ("*","*") PairSlash -> ("/","/") PairUnderscore -> ("_","_") PairDash -> ("-","-") PairBackquote -> ("`","`") PairSinglequote -> ("'","'") PairDoublequote -> ("\"","\"") PairFrenchquote -> ("«","»") PairParen -> ("(",")") PairBrace -> ("{","}") PairBracket -> ("[","]") -- * Class 'TagFrom' class TagFrom a where tagFrom :: a -> Maybe (Cell Tag, a) instance TagFrom Tokens where tagFrom ts = case Seq.viewl ts of EmptyL -> Nothing Tree0 (Cell b0 e0 n) :< ns -> case n of NodeToken (TokenText t) -> case tagFrom $ Cell b0 e0 t of Nothing -> Nothing Just (t0,r0) -> if TL.null (unCell r0) then case tagFrom ns of Just (t1@(Cell b1 _e1 _), r1) | e0 == b1 -> Just (t0<>t1, r1) _ -> Just (t0, n0 <| ns) else Just (t0, n0 <| ns) where n0 = (Tree0 $ NodeToken . TokenText <$> r0) _ -> Nothing _ -> Nothing instance TagFrom (Cell TL.Text) where tagFrom (Cell bp ep t) | (w,r) <- TL.span isTagChar t , not $ TL.null w , ew <- pos_column bp + sum (Text.length <$> (TL.toChunks w)) = Just ( Cell bp bp{pos_column=ew} w , Cell bp{pos_column=ew} ep r ) tagFrom _ = Nothing isTagChar :: Char -> Bool isTagChar c = Char.isAlphaNum c || c=='·' || case Char.generalCategory c of Char.DashPunctuation -> True Char.ConnectorPunctuation -> True _ -> False