{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE Rank2Types #-} {-# LANGUAGE TypeFamilies #-} module Language.TCT.Read.Token where import Control.Applicative (Applicative(..), Alternative(..)) import Control.Monad (Monad(..)) import Data.Bool import Data.Char (Char) import Data.Eq (Eq(..)) import Data.Foldable (Foldable(..)) import Data.Function (($), (.), flip) import Data.Functor ((<$>), ($>), (<$)) import Data.Maybe (Maybe(..), fromMaybe) import Data.Monoid (Monoid(..)) import Data.Semigroup (Semigroup(..)) import Data.Sequence (ViewL(..), (<|)) import Data.Text (Text) import Data.Text.Buildable (Buildable(..)) import Data.Tuple (fst,snd) import Text.Show (Show(..)) import qualified Data.Char as Char import qualified Data.Sequence as Seq import qualified Data.Text as Text import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Builder as Builder import qualified Text.Megaparsec as P import Language.TCT.Token import Language.TCT.Elem import Language.TCT.Read.Elem -- hiding (pdbg) -- pdbg m p = P.dbg m p textOf :: Buildable a => a -> Text textOf = TL.toStrict . Builder.toLazyText . build -- * Type 'Pairs' type Pairs = (Token,[(Pair,Token)]) openPair :: Pair -> Pairs -> Pairs openPair g (t,ms) = (t,(g,mempty):ms) insertToken :: Token -> Pairs -> Pairs insertToken tok (t,[]) = (t<>tok,[]) insertToken tok (t,(g0,t0):gs) = (t,(g0,t0<>tok):gs) -- | Close a 'Pair' when there is a matching 'LexemePairClose'. closePair :: Pair -> Pairs -> Pairs closePair g (t,[]) = dbg "closePair" $ (t<>TokenPlain (snd $ pairBorders g mempty),[]) closePair g (t,(g1,m1):ms) = dbg "closePair" $ case (g,g1) of (PairElem x ax, PairElem y ay) | x == y -> insertToken (TokenPair (PairElem x (ax<>ay)) m1) (t,ms) (x,y) | x == y -> insertToken (TokenPair g1 m1) (t,ms) _ -> closePair g $ insertToken (closelessPair mempty (g1,m1)) (t,ms) -- | Close a 'Pair' when there is not a matching 'LexemePairClose'. closelessPair :: Token -> (Pair,Token) -> Token closelessPair acc (g,t) = dbg "closelessPair" $ case g of -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'. PairHash | TokenPlain p :< toks <- Seq.viewl $ unTokens $ t <> acc -> case Text.findIndex (not . isTagChar) p of Just 0 -> TokenPlain (fst $ pairBorders g mempty) <> t <> acc Just i -> Tokens $ TokenTag tag <| TokenPlain p' <| toks where (tag,p') = Text.splitAt i p Nothing -> Tokens $ TokenTag p <| toks _ -> TokenPlain (fst $ pairBorders g mempty) <> t <> acc where isTagChar c = Char.isAlphaNum c || c=='·' || case Char.generalCategory c of Char.DashPunctuation -> True Char.ConnectorPunctuation -> True _ -> False -- | Close remaining 'Pair's at end of parsing. closePairs :: Pairs -> Token closePairs (t0,gs) = dbg "closePairs" $ t0 <> foldl' closelessPair mempty gs -- * Type 'Lexeme' data Lexeme = LexemePairOpen Pair | LexemePairClose Pair | LexemePunctOrSym Char | LexemeWhite Text | LexemeWord Text | LexemeToken Token | LexemeEscape Char | LexemeLink Text deriving (Show, Eq) appendLexeme :: Lexeme -> Pairs -> Pairs appendLexeme lex gs = dbg "appendLexeme" $ case dbg "appendLexeme" lex of LexemePairOpen g -> openPair g gs LexemePairClose g -> closePair g gs LexemePunctOrSym c -> insertToken (TokenPlain (Text.singleton c)) gs LexemeWhite wh -> insertToken (TokenPlain wh) gs LexemeWord wo -> insertToken (TokenPlain wo) gs LexemeToken tok -> insertToken tok gs LexemeEscape c -> insertToken (TokenEscape c) gs LexemeLink lnk -> insertToken (TokenLink lnk) gs appendLexemes :: Pairs -> [Lexeme] -> Pairs appendLexemes = foldl' (flip appendLexeme) -- * Parsers p_Token :: Parser e s Token p_Token = closePairs <$> p_Pairs (mempty,[]) p_Pairs :: Pairs -> Parser e s Pairs p_Pairs gs = pdbg "Pairs" $ (p_Lexemes (mempty == gs) >>= p_Pairs . appendLexemes gs) <|> (P.eof $> gs) p_Lexemes :: Bool -> Parser e s [Lexeme] p_Lexemes isBOF = pdbg "Lexemes" $ P.choice [ P.try $ p_PairCloseWhite , P.try $ p_PairWhiteOpen isBOF , P.try $ p_PairCloseBorder , P.try $ p_PairBorderOpen , P.try $ p_PairClose , P.try $ p_PairOpen , P.try $ pure . LexemePunctOrSym <$> p_PunctOrSym , P.try $ pure <$> p_White , pure . LexemeWord <$> p_Word ] p_White :: Parser e s Lexeme p_White = pdbg "White" $ LexemeWhite <$> p_Spaces p_PunctOrSym :: Parser e s Char p_PunctOrSym = P.satisfy $ \c -> Char.isPunctuation c || Char.isSymbol c p_PairCloseWhite :: Parser e s [Lexeme] p_PairCloseWhite = pdbg "PairCloseWhite" $ (\c b -> mconcat c <> b) <$> P.some (P.try p_PairClose <|> pure . LexemePunctOrSym <$> p_PunctOrSym) <*> ((pure <$> p_White) <|> P.eof $> []) p_PairWhiteOpen :: Bool -> Parser e s [Lexeme] p_PairWhiteOpen isBOF = pdbg "PairWhiteOpen" $ (\b o -> b <> mconcat o) <$> (if isBOF then return [] else pure <$> p_White) <*> P.some (P.try p_PairOpen <|> pure . LexemePunctOrSym <$> p_PunctOrSym) p_PairCloseBorder :: Parser e s [Lexeme] p_PairCloseBorder = pdbg "PairCloseBorder" $ P.try p0 <|> p1 where p0 = (\c b -> mconcat $ c <> b) <$> P.some (P.try p_PairClose) <*> P.some (P.try $ P.choice [ P.try p_ElemOpen , P.try p_ElemClose , do c <- p_PunctOrSym case l_PairClose c of Just l -> return [l] Nothing -> case l_PairOpenAndClose LexemePairOpen c <|> l_PairOpen c of Nothing -> return [LexemePunctOrSym c] _ -> fail "" ]) p1 = (\c b -> mconcat c <> [LexemePunctOrSym b]) <$> P.some (P.try p_PairClose) <*> p_PunctOrSym p_PairBorderOpen :: Parser e s [Lexeme] p_PairBorderOpen = pdbg "PairBorderOpen" $ P.try p0 <|> p1 where p0 = (\b o -> mconcat $ b <> o) <$> P.some (P.try $ P.choice [ P.try p_ElemOpen , P.try p_ElemClose , do c <- p_PunctOrSym case l_PairOpen c <|> l_PairClose c of Just l -> return [l] Nothing -> fail "" ]) <*> P.some (P.try p_PairOpen) p1 = (\b o -> LexemePunctOrSym b : mconcat o) <$> p_PunctOrSym <*> P.some (P.try p_PairOpen) p_PairOpen :: Parser e s [Lexeme] p_PairOpen = pdbg "PairOpen" $ do P.choice [ P.try p_ElemOpen , P.try (pure <$> p_Escape) , P.try (pure <$> p_Link) , do c <- p_PunctOrSym case l_PairOpenOrClose LexemePairOpen c of Just l -> return [l] _ -> fail "" ] p_PairClose :: Parser e s [Lexeme] p_PairClose = pdbg "PairClose" $ do P.choice [ P.try p_ElemClose , P.try p_ElemSingle , P.try (pure <$> p_Escape) , P.try (pure <$> p_Link) , do c <- p_PunctOrSym case l_PairOpenOrClose LexemePairClose c of Just l -> return [l] _ -> fail "" ] p_PairPlain :: (Char -> Maybe Lexeme) -> Parser e s [Lexeme] p_PairPlain pair = pdbg "PairPlain" $ do (<$> p_PunctOrSym) $ \c -> pure $ LexemePunctOrSym c `fromMaybe` pair c l_PairOpenAndClose :: (Pair -> Lexeme) -> Char -> Maybe Lexeme l_PairOpenAndClose lxm c = case c of '/' -> Just $ lxm PairSlash '-' -> Just $ lxm PairDash '"' -> Just $ lxm PairDoublequote '\'' -> Just $ lxm PairSinglequote '`' -> Just $ lxm PairBackquote '_' -> Just $ lxm PairUnderscore '*' -> Just $ lxm PairStar '#' -> Just $ lxm PairHash _ -> Nothing l_PairOpenOrClose :: (Pair -> Lexeme) -> Char -> Maybe Lexeme l_PairOpenOrClose lxm c = l_PairOpenAndClose lxm c <|> l_PairOpen c <|> l_PairClose c l_PairOpen :: Char -> Maybe Lexeme l_PairOpen c = case c of '(' -> Just $ LexemePairOpen PairParen '[' -> Just $ LexemePairOpen PairBracket '{' -> Just $ LexemePairOpen PairBrace '«' -> Just $ LexemePairOpen PairFrenchquote _ -> Nothing l_PairClose :: Char -> Maybe Lexeme l_PairClose c = case c of ')' -> Just $ LexemePairClose PairParen ']' -> Just $ LexemePairClose PairBracket '}' -> Just $ LexemePairClose PairBrace '»' -> Just $ LexemePairClose PairFrenchquote _ -> Nothing p_Link :: Parser e s Lexeme p_Link = (\scheme ss addr -> LexemeLink $ Text.pack $ scheme <> ss <> addr) <$> P.option "" (P.try p_scheme) <*> P.string "//" <*> p_addr where p_scheme = (<>) <$> P.some (P.satisfy $ \c -> Char.isAlphaNum c || c=='_' || c=='-' || c=='+') <*> P.string ":" p_addr = P.many $ P.satisfy $ \c -> Char.isAlphaNum c || c=='%' || c=='/' || c=='(' || c==')' || c=='-' || c=='_' || c=='.' p_Escape :: Parser e s Lexeme p_Escape = LexemeEscape <$ P.char '\\' <*> P.satisfy Char.isPrint p_ElemSingle :: Parser e s [Lexeme] p_ElemSingle = pdbg "ElemOpen" $ (\e as -> [ LexemePairOpen $ PairElem e as , LexemeToken $ Tokens mempty -- NOTE: encode that it's the same Elem for open and close , LexemePairClose $ PairElem e [] ]) <$ P.char '<' <*> p_Word <*> p_Attrs <* P.string "/>" p_ElemOpen :: Parser e s [Lexeme] p_ElemOpen = pdbg "ElemOpen" $ (\e as oc -> case oc of True -> [ LexemePairOpen $ PairElem e as , LexemeToken $ Tokens mempty , LexemePairClose $ PairElem e [] ] False -> [LexemePairOpen $ PairElem e as]) <$ P.char '<' <*> p_Word <*> p_Attrs <*> P.option False (True <$ P.char '/') <* P.char '>' p_ElemClose :: Parser e s [Lexeme] p_ElemClose = pdbg "ElemClose" $ (\e -> [LexemePairClose $ PairElem e []]) <$ P.string " p_Word <* P.char '>'