{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE Rank2Types #-} {-# LANGUAGE TypeFamilies #-} module Language.TCT.Read.Token where import Control.Applicative (Applicative(..), Alternative(..)) import Control.Monad (Monad(..)) import Data.Bool import Data.Char (Char) import Data.Eq (Eq(..)) import Data.Foldable (Foldable(..)) import Data.Function (($), (.), flip) import Data.Functor ((<$>), ($>), (<$)) import Data.Maybe (Maybe(..)) import Data.Monoid (Monoid(..)) import Data.Semigroup (Semigroup(..)) import Data.Sequence (ViewL(..), (<|)) import Data.Text (Text) import Data.Text.Buildable (Buildable(..)) import Data.Tuple (fst,snd) import Text.Show (Show(..)) import qualified Data.Char as Char import qualified Data.Sequence as Seq import qualified Data.Text as Text import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Builder as Builder import qualified Text.Megaparsec as P import Language.TCT.Token import Language.TCT.Elem -- hiding (dbg) import Language.TCT.Read.Elem -- hiding (pdbg) {- import Debug.Trace (trace) dbg m x = trace (m <> ": " <> show x) x pdbg m p = P.dbg m p -} textOf :: Buildable a => a -> Text textOf = TL.toStrict . Builder.toLazyText . build -- * Type 'Pairs' type Pairs = (Tokens,[(Pair,Tokens)]) openPair :: Pair -> Pairs -> Pairs openPair g (t,ms) = (t,(g,mempty):ms) insertToken :: Token -> Pairs -> Pairs insertToken tok (t,[]) = (t<>tokens [tok],[]) insertToken tok (t,(p0,t0):ps) = (t,(p0,t0<>tokens [tok]):ps) insertTokens :: Tokens -> Pairs -> Pairs insertTokens toks (t,[]) = (t<>toks,[]) insertTokens toks (t,(p0,t0):ps) = (t,(p0,t0<>toks):ps) -- | Close a 'Pair' when there is a matching 'LexemePairClose'. closePair :: Pair -> Pairs -> Pairs closePair p (t,[]) = dbg "closePair" $ (t<>tokens [TokenPlain $ snd $ pairBorders p $ tokens [TokenPlain ""]],[]) closePair p (t,(p1,t1):ts) = dbg "closePair" $ case (p,p1) of (PairElem x ax, PairElem y ay) | x == y -> insertToken (TokenPair (PairElem x (ax<>ay)) t1) (t,ts) (x,y) | x == y -> insertToken (TokenPair p1 t1) (t,ts) _ -> closePair p $ insertTokens (closeUnpaired mempty (p1,t1)) (t,ts) -- | Close a 'Pair' when there is not a matching 'LexemePairClose'. closeUnpaired :: Tokens -> (Pair,Tokens) -> Tokens closeUnpaired acc (p,tn) = dbg "closeUnpaired" $ case p of -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'. PairHash | TokenPlain t :< ts <- Seq.viewl $ unTokens $ tn <> acc -> case Text.findIndex (not . isTagChar) t of Just 0 -> tokens [TokenPlain $ fst $ pairBorders p mempty] <> tn <> acc Just i -> Tokens $ TokenTag tag <| TokenPlain t' <| ts where (tag,t') = Text.splitAt i t Nothing -> Tokens $ TokenTag t <| ts _ -> tokens [TokenPlain $ fst $ pairBorders p $ tokens [TokenPlain ""]] <> tn <> acc where isTagChar c = Char.isAlphaNum c || c=='·' || case Char.generalCategory c of Char.DashPunctuation -> True Char.ConnectorPunctuation -> True _ -> False -- | Close remaining 'Pair's at end of parsing. closePairs :: Pairs -> Tokens closePairs (t0,ps) = dbg "closePairs" $ t0 <> foldl' closeUnpaired mempty ps -- * Type 'Lexeme' data Lexeme = LexemePairOpen Pair | LexemePairClose Pair | LexemePunctOrSym Char | LexemeWhite Text | LexemeWord Text | LexemeToken Tokens | LexemeEscape Char | LexemeLink Text deriving (Show, Eq) appendLexeme :: Lexeme -> Pairs -> Pairs appendLexeme lex ps = dbg "appendLexeme" $ case dbg "appendLexeme" lex of LexemePairOpen p -> openPair p ps LexemePairClose p -> closePair p ps LexemePunctOrSym c -> insertToken (TokenPlain (Text.singleton c)) ps LexemeWhite wh -> insertToken (TokenPlain wh) ps LexemeWord wo -> insertToken (TokenPlain wo) ps LexemeToken ts -> insertTokens ts ps LexemeEscape c -> insertToken (TokenEscape c) ps LexemeLink lnk -> insertToken (TokenLink lnk) ps appendLexemes :: Pairs -> [Lexeme] -> Pairs appendLexemes = foldl' (flip appendLexeme) -- * Parsers p_Tokens :: Parser e s Tokens p_Tokens = closePairs <$> p_Pairs (mempty,[]) p_Pairs :: Pairs -> Parser e s Pairs p_Pairs gs = pdbg "Pairs" $ (p_Lexemes (mempty == gs) >>= p_Pairs . appendLexemes gs) <|> (P.eof $> gs) p_Lexemes :: Bool -> Parser e s [Lexeme] p_Lexemes isBOF = pdbg "Lexemes" $ P.choice [ P.try $ p_PairCloseWhite , P.try $ p_PairWhiteOpen isBOF , P.try $ p_PairCloseBorder , P.try $ p_PairBorderOpen , P.try $ p_PairClose , P.try $ p_PairOpen , P.try $ pure . LexemePunctOrSym <$> p_PunctOrSym , P.try $ pure <$> p_White , pure . LexemeWord <$> p_Word ] p_White :: Parser e s Lexeme p_White = pdbg "White" $ LexemeWhite <$> p_Spaces p_PunctOrSym :: Parser e s Char p_PunctOrSym = P.satisfy $ \c -> Char.isPunctuation c || Char.isSymbol c p_PairCloseWhite :: Parser e s [Lexeme] p_PairCloseWhite = pdbg "PairCloseWhite" $ (\c b o -> mconcat c <> b <> mconcat o) <$> P.some (P.try $ P.try p_ElemOpen <|> P.try p_ElemClose <|> P.try p_PairClose <|> pure . LexemePunctOrSym <$> p_PunctOrSym) <*> (pure <$> p_White <|> P.eof $> []) <*> P.many (P.try $ P.try p_ElemOpen <|> P.try p_ElemClose <|> P.try p_PairOpen <|> pure . LexemePunctOrSym <$> p_PunctOrSym) p_PairWhiteOpen :: Bool -> Parser e s [Lexeme] p_PairWhiteOpen isBOF = pdbg "PairWhiteOpen" $ (\b o -> b <> mconcat o) <$> (if isBOF then return [] else pure <$> p_White) <*> P.some (P.try $ P.try p_ElemOpen <|> P.try p_ElemClose <|> P.try p_PairOpen <|> pure . LexemePunctOrSym <$> p_PunctOrSym) p_PairCloseBorder :: Parser e s [Lexeme] p_PairCloseBorder = pdbg "PairCloseBorder" $ P.try p0 <|> p1 where p0 = (\c b -> mconcat $ c <> b) <$> P.some (P.try p_PairClose) <*> P.some (P.try $ P.choice [ P.try p_ElemOpen , P.try p_ElemClose , do c <- p_PunctOrSym case l_PairOpen c <|> l_PairClose c of Just l -> return [l] Nothing -> fail "PairCloseBorder" ]) p1 = (\c b -> mconcat c <> [LexemePunctOrSym b]) <$> P.some (P.try p_PairClose) <*> p_PunctOrSym p_PairBorderOpen :: Parser e s [Lexeme] p_PairBorderOpen = pdbg "PairBorderOpen" $ P.try p0 <|> p1 where p0 = (\b o -> mconcat $ b <> o) <$> P.some (P.try $ P.choice [ P.try p_ElemOpen , P.try p_ElemClose , do c <- p_PunctOrSym case l_PairOpen c <|> l_PairClose c of Just l -> return [l] Nothing -> fail "PairBorderOpen" ]) <*> P.some (P.try p_PairOpen) p1 = (\b o -> LexemePunctOrSym b : mconcat o) <$> p_PunctOrSym <*> P.some (P.try p_PairOpen) p_PairOpen :: Parser e s [Lexeme] p_PairOpen = pdbg "PairOpen" $ do P.choice [ P.try p_ElemOpen , P.try (pure <$> p_Escape) , P.try (pure <$> p_Link) , do c <- p_PunctOrSym case l_PairOpenOrClose LexemePairOpen c of Just l -> return [l] _ -> fail "PairOpen" ] p_PairClose :: Parser e s [Lexeme] p_PairClose = pdbg "PairClose" $ do P.choice [ P.try p_ElemClose , P.try p_ElemSingle , P.try (pure <$> p_Escape) , P.try (pure <$> p_Link) , do c <- p_PunctOrSym case l_PairOpenOrClose LexemePairClose c of Just l -> return [l] _ -> fail "PairClose" ] l_PairOpenOrClose :: (Pair -> Lexeme) -> Char -> Maybe Lexeme l_PairOpenOrClose lxm c = l_PairOpenAndClose lxm c <|> l_PairOpen c <|> l_PairClose c l_PairOpenAndClose :: (Pair -> Lexeme) -> Char -> Maybe Lexeme l_PairOpenAndClose lxm c = case c of '/' -> Just $ lxm PairSlash '-' -> Just $ lxm PairDash '"' -> Just $ lxm PairDoublequote '\'' -> Just $ lxm PairSinglequote '`' -> Just $ lxm PairBackquote '_' -> Just $ lxm PairUnderscore '*' -> Just $ lxm PairStar '#' -> Just $ lxm PairHash _ -> Nothing l_PairOpen :: Char -> Maybe Lexeme l_PairOpen c = case c of '(' -> Just $ LexemePairOpen PairParen '[' -> Just $ LexemePairOpen PairBracket '{' -> Just $ LexemePairOpen PairBrace '«' -> Just $ LexemePairOpen PairFrenchquote _ -> Nothing l_PairClose :: Char -> Maybe Lexeme l_PairClose c = case c of ')' -> Just $ LexemePairClose PairParen ']' -> Just $ LexemePairClose PairBracket '}' -> Just $ LexemePairClose PairBrace '»' -> Just $ LexemePairClose PairFrenchquote _ -> Nothing p_Link :: Parser e s Lexeme p_Link = (\scheme ss addr -> LexemeLink $ Text.pack $ scheme <> ss <> addr) <$> P.option "" (P.try p_scheme) <*> P.string "//" <*> p_addr where p_scheme = (<>) <$> P.some (P.try $ P.satisfy $ \c -> Char.isAlphaNum c || c=='_' || c=='-' || c=='+') <*> P.string ":" p_addr = P.many $ P.try $ P.satisfy $ \c -> Char.isAlphaNum c || c=='%' || c=='/' || c=='(' || c==')' || c=='-' || c=='_' || c=='.' p_Escape :: Parser e s Lexeme p_Escape = LexemeEscape <$ P.char '\\' <*> P.satisfy Char.isPrint p_ElemSingle :: Parser e s [Lexeme] p_ElemSingle = pdbg "ElemSingle" $ (\e as -> [ LexemePairOpen $ PairElem e as , LexemeToken $ mempty , LexemePairClose $ PairElem e [] ]) <$ P.char '<' <*> p_Word <*> p_Attrs <* P.string "/>" p_ElemOpen :: Parser e s [Lexeme] p_ElemOpen = pdbg "ElemOpen" $ (\e as oc -> case oc of True -> [ LexemePairOpen $ PairElem e as , LexemeToken $ mempty , LexemePairClose $ PairElem e [] ] False -> [ LexemePairOpen $ PairElem e as , LexemeToken $ tokens [TokenPlain ""] ]) <$ P.char '<' <*> p_Word <*> p_Attrs <*> P.option False (True <$ P.char '/') <* P.char '>' p_ElemClose :: Parser e s [Lexeme] p_ElemClose = pdbg "ElemClose" $ (\e -> [LexemePairClose $ PairElem e []]) <$ P.string " p_Word <* P.char '>'