{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE Rank2Types #-} {-# LANGUAGE TypeFamilies #-} module Language.TCT.Read.Token where import Control.Applicative (Applicative(..), Alternative(..)) import Control.Monad (Monad(..)) import Data.Bool import Data.Char (Char) import Data.Eq (Eq(..)) import Data.Foldable (Foldable(..)) import Data.Function (($), (.), flip) import Data.Functor ((<$>), ($>), (<$)) import Data.Maybe (Maybe(..), fromMaybe) import Data.Monoid (Monoid(..)) import Data.Semigroup (Semigroup(..)) import Data.Sequence (ViewL(..), (<|)) import Data.Text (Text) import Data.Text.Buildable (Buildable(..)) import Data.Tuple (fst,snd) import Text.Show (Show(..)) import qualified Data.Char as Char import qualified Data.Sequence as Seq import qualified Data.Text as Text import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Builder as Builder import qualified Text.Megaparsec as P import Language.TCT.Token import Language.TCT.Elem import Language.TCT.Read.Elem -- hiding (pdbg) -- pdbg m p = P.dbg m p textOf :: Buildable a => a -> Text textOf = TL.toStrict . Builder.toLazyText . build -- * Type 'Pairs' type Pairs = (Tokens,[(Pair,Tokens)]) openPair :: Pair -> Pairs -> Pairs openPair g (t,ms) = (t,(g,mempty):ms) insertToken :: Token -> Pairs -> Pairs insertToken tok (t,[]) = (t<>tokens [tok],[]) insertToken tok (t,(p0,t0):ps) = (t,(p0,t0<>tokens [tok]):ps) insertTokens :: Tokens -> Pairs -> Pairs insertTokens toks (t,[]) = (t<>toks,[]) insertTokens toks (t,(p0,t0):ps) = (t,(p0,t0<>toks):ps) -- | Close a 'Pair' when there is a matching 'LexemePairClose'. closePair :: Pair -> Pairs -> Pairs closePair p (t,[]) = dbg "closePair" $ (t<>tokens [TokenPlain (snd $ pairBorders p mempty)],[]) closePair p (t,(p1,t1):ts) = dbg "closePair" $ case (p,p1) of (PairElem x ax, PairElem y ay) | x == y -> insertToken (TokenPair (PairElem x (ax<>ay)) t1) (t,ts) (x,y) | x == y -> insertToken (TokenPair p1 t1) (t,ts) _ -> closePair p $ insertTokens (closeUnpaired mempty (p1,t1)) (t,ts) -- | Close a 'Pair' when there is not a matching 'LexemePairClose'. closeUnpaired :: Tokens -> (Pair,Tokens) -> Tokens closeUnpaired acc (p,tn) = dbg "closeUnpaired" $ case p of -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'. PairHash | TokenPlain t :< ts <- Seq.viewl $ unTokens $ tn <> acc -> case Text.findIndex (not . isTagChar) t of Just 0 -> tokens [TokenPlain $ fst $ pairBorders p mempty] <> tn <> acc Just i -> Tokens $ TokenTag tag <| TokenPlain t' <| ts where (tag,t') = Text.splitAt i t Nothing -> Tokens $ TokenTag t <| ts _ -> tokens [TokenPlain $ fst $ pairBorders p mempty] <> tn <> acc where isTagChar c = Char.isAlphaNum c || c=='·' || case Char.generalCategory c of Char.DashPunctuation -> True Char.ConnectorPunctuation -> True _ -> False -- | Close remaining 'Pair's at end of parsing. closePairs :: Pairs -> Tokens closePairs (t0,ps) = dbg "closePairs" $ t0 <> foldl' closeUnpaired mempty ps -- * Type 'Lexeme' data Lexeme = LexemePairOpen Pair | LexemePairClose Pair | LexemePunctOrSym Char | LexemeWhite Text | LexemeWord Text | LexemeToken Tokens | LexemeEscape Char | LexemeLink Text deriving (Show, Eq) appendLexeme :: Lexeme -> Pairs -> Pairs appendLexeme lex ps = dbg "appendLexeme" $ case dbg "appendLexeme" lex of LexemePairOpen p -> openPair p ps LexemePairClose p -> closePair p ps LexemePunctOrSym c -> insertToken (TokenPlain (Text.singleton c)) ps LexemeWhite wh -> insertToken (TokenPlain wh) ps LexemeWord wo -> insertToken (TokenPlain wo) ps LexemeToken ts -> insertTokens ts ps LexemeEscape c -> insertToken (TokenEscape c) ps LexemeLink lnk -> insertToken (TokenLink lnk) ps appendLexemes :: Pairs -> [Lexeme] -> Pairs appendLexemes = foldl' (flip appendLexeme) -- * Parsers p_Tokens :: Parser e s Tokens p_Tokens = closePairs <$> p_Pairs (mempty,[]) p_Pairs :: Pairs -> Parser e s Pairs p_Pairs gs = pdbg "Pairs" $ (p_Lexemes (mempty == gs) >>= p_Pairs . appendLexemes gs) <|> (P.eof $> gs) p_Lexemes :: Bool -> Parser e s [Lexeme] p_Lexemes isBOF = pdbg "Lexemes" $ P.choice [ P.try $ p_PairCloseWhite , P.try $ p_PairWhiteOpen isBOF , P.try $ p_PairCloseBorder , P.try $ p_PairBorderOpen , P.try $ p_PairClose , P.try $ p_PairOpen , P.try $ pure . LexemePunctOrSym <$> p_PunctOrSym , P.try $ pure <$> p_White , pure . LexemeWord <$> p_Word ] p_White :: Parser e s Lexeme p_White = pdbg "White" $ LexemeWhite <$> p_Spaces p_PunctOrSym :: Parser e s Char p_PunctOrSym = P.satisfy $ \c -> Char.isPunctuation c || Char.isSymbol c p_PairCloseWhite :: Parser e s [Lexeme] p_PairCloseWhite = pdbg "PairCloseWhite" $ (\c b -> mconcat c <> b) <$> P.some (P.try p_PairClose <|> pure . LexemePunctOrSym <$> p_PunctOrSym) <*> ((pure <$> p_White) <|> P.eof $> []) p_PairWhiteOpen :: Bool -> Parser e s [Lexeme] p_PairWhiteOpen isBOF = pdbg "PairWhiteOpen" $ (\b o -> b <> mconcat o) <$> (if isBOF then return [] else pure <$> p_White) <*> P.some (P.try p_PairOpen <|> pure . LexemePunctOrSym <$> p_PunctOrSym) p_PairCloseBorder :: Parser e s [Lexeme] p_PairCloseBorder = pdbg "PairCloseBorder" $ P.try p0 <|> p1 where p0 = (\c b -> mconcat $ c <> b) <$> P.some (P.try p_PairClose) <*> P.some (P.try $ P.choice [ P.try p_ElemOpen , P.try p_ElemClose , do c <- p_PunctOrSym case l_PairClose c of Just l -> return [l] Nothing -> case l_PairOpenAndClose LexemePairOpen c <|> l_PairOpen c of Nothing -> return [LexemePunctOrSym c] _ -> fail "" ]) p1 = (\c b -> mconcat c <> [LexemePunctOrSym b]) <$> P.some (P.try p_PairClose) <*> p_PunctOrSym p_PairBorderOpen :: Parser e s [Lexeme] p_PairBorderOpen = pdbg "PairBorderOpen" $ P.try p0 <|> p1 where p0 = (\b o -> mconcat $ b <> o) <$> P.some (P.try $ P.choice [ P.try p_ElemOpen , P.try p_ElemClose , do c <- p_PunctOrSym case l_PairOpen c <|> l_PairClose c of Just l -> return [l] Nothing -> fail "" ]) <*> P.some (P.try p_PairOpen) p1 = (\b o -> LexemePunctOrSym b : mconcat o) <$> p_PunctOrSym <*> P.some (P.try p_PairOpen) p_PairOpen :: Parser e s [Lexeme] p_PairOpen = pdbg "PairOpen" $ do P.choice [ P.try p_ElemOpen , P.try (pure <$> p_Escape) , P.try (pure <$> p_Link) , do c <- p_PunctOrSym case l_PairOpenOrClose LexemePairOpen c of Just l -> return [l] _ -> fail "" ] p_PairClose :: Parser e s [Lexeme] p_PairClose = pdbg "PairClose" $ do P.choice [ P.try p_ElemClose , P.try p_ElemSingle , P.try (pure <$> p_Escape) , P.try (pure <$> p_Link) , do c <- p_PunctOrSym case l_PairOpenOrClose LexemePairClose c of Just l -> return [l] _ -> fail "" ] p_PairPlain :: (Char -> Maybe Lexeme) -> Parser e s [Lexeme] p_PairPlain pair = pdbg "PairPlain" $ do (<$> p_PunctOrSym) $ \c -> pure $ LexemePunctOrSym c `fromMaybe` pair c l_PairOpenAndClose :: (Pair -> Lexeme) -> Char -> Maybe Lexeme l_PairOpenAndClose lxm c = case c of '/' -> Just $ lxm PairSlash '-' -> Just $ lxm PairDash '"' -> Just $ lxm PairDoublequote '\'' -> Just $ lxm PairSinglequote '`' -> Just $ lxm PairBackquote '_' -> Just $ lxm PairUnderscore '*' -> Just $ lxm PairStar '#' -> Just $ lxm PairHash _ -> Nothing l_PairOpenOrClose :: (Pair -> Lexeme) -> Char -> Maybe Lexeme l_PairOpenOrClose lxm c = l_PairOpenAndClose lxm c <|> l_PairOpen c <|> l_PairClose c l_PairOpen :: Char -> Maybe Lexeme l_PairOpen c = case c of '(' -> Just $ LexemePairOpen PairParen '[' -> Just $ LexemePairOpen PairBracket '{' -> Just $ LexemePairOpen PairBrace '«' -> Just $ LexemePairOpen PairFrenchquote _ -> Nothing l_PairClose :: Char -> Maybe Lexeme l_PairClose c = case c of ')' -> Just $ LexemePairClose PairParen ']' -> Just $ LexemePairClose PairBracket '}' -> Just $ LexemePairClose PairBrace '»' -> Just $ LexemePairClose PairFrenchquote _ -> Nothing p_Link :: Parser e s Lexeme p_Link = (\scheme ss addr -> LexemeLink $ Text.pack $ scheme <> ss <> addr) <$> P.option "" (P.try p_scheme) <*> P.string "//" <*> p_addr where p_scheme = (<>) <$> P.some (P.satisfy $ \c -> Char.isAlphaNum c || c=='_' || c=='-' || c=='+') <*> P.string ":" p_addr = P.many $ P.satisfy $ \c -> Char.isAlphaNum c || c=='%' || c=='/' || c=='(' || c==')' || c=='-' || c=='_' || c=='.' p_Escape :: Parser e s Lexeme p_Escape = LexemeEscape <$ P.char '\\' <*> P.satisfy Char.isPrint p_ElemSingle :: Parser e s [Lexeme] p_ElemSingle = pdbg "ElemOpen" $ (\e as -> [ LexemePairOpen $ PairElem e as , LexemeToken $ Tokens mempty -- NOTE: encode that it's the same Elem for open and close , LexemePairClose $ PairElem e [] ]) <$ P.char '<' <*> p_Word <*> p_Attrs <* P.string "/>" p_ElemOpen :: Parser e s [Lexeme] p_ElemOpen = pdbg "ElemOpen" $ (\e as oc -> case oc of True -> [ LexemePairOpen $ PairElem e as , LexemeToken $ Tokens mempty , LexemePairClose $ PairElem e [] ] False -> [LexemePairOpen $ PairElem e as]) <$ P.char '<' <*> p_Word <*> p_Attrs <*> P.option False (True <$ P.char '/') <* P.char '>' p_ElemClose :: Parser e s [Lexeme] p_ElemClose = pdbg "ElemClose" $ (\e -> [LexemePairClose $ PairElem e []]) <$ P.string " p_Word <* P.char '>'