{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE Rank2Types #-} {-# LANGUAGE TypeFamilies #-} module Language.TCT.Read.Token where import Control.Applicative (Applicative(..), Alternative(..)) import Control.Monad (Monad(..)) import Data.Bool import Data.Char (Char) import Data.Eq (Eq(..)) import Data.Foldable (Foldable(..)) import Data.Function (($), (.), flip) import Data.Functor ((<$>), ($>), (<$)) import Data.Maybe (Maybe(..), fromMaybe) import Data.Monoid (Monoid(..)) import Data.Semigroup (Semigroup(..)) import Data.Sequence (ViewL(..), (<|)) import Data.Text (Text) import Data.Text.Buildable (Buildable(..)) import Data.Tuple (fst,snd) import Text.Show (Show(..)) import qualified Data.Char as Char import qualified Data.Sequence as Seq import qualified Data.Text as Text import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Builder as Builder import qualified Text.Megaparsec as P import Language.TCT.Token import Language.TCT.Elem import Language.TCT.Read.Elem -- hiding (pdbg) -- pdbg m p = P.dbg m p textOf :: Buildable a => a -> Text textOf = TL.toStrict . Builder.toLazyText . build -- * Type 'Groups' type Groups = (Token,[(Group,Token)]) openGroup :: Group -> Groups -> Groups openGroup g (t,ms) = (t,(g,mempty):ms) insertToken :: Token -> Groups -> Groups insertToken tok (t,[]) = (t<>tok,[]) insertToken tok (t,(g0,t0):gs) = (t,(g0,t0<>tok):gs) -- | Close a 'Group' when there is a matching 'LexemeGroupClose'. closeGroup :: Group -> Groups -> Groups closeGroup g (t,[]) = dbg "closeGroup" $ (t<>TokenPlain (snd $ groupBorders g mempty),[]) closeGroup g (t,(g1,m1):ms) = dbg "closeGroup" $ case (g,g1) of (GroupElem x ax, GroupElem y ay) | x == y -> insertToken (TokenGroup (GroupElem x (ax<>ay)) m1) (t,ms) (x,y) | x == y -> insertToken (TokenGroup g1 m1) (t,ms) _ -> closeGroup g $ insertToken (closelessGroup mempty (g1,m1)) (t,ms) -- | Close a 'Group' when there is not a matching 'LexemeGroupClose'. closelessGroup :: Token -> (Group,Token) -> Token closelessGroup acc (g,t) = dbg "closelessGroup" $ case g of -- NOTE: try to close 'GroupHash' as 'TokenTag' instead of 'TokenPlain'. GroupHash | TokenPlain p :< toks <- Seq.viewl $ unTokens $ t <> acc -> case Text.findIndex (not . isTagChar) p of Just 0 -> TokenPlain (fst $ groupBorders g mempty) <> t <> acc Just i -> Tokens $ TokenTag tag <| TokenPlain p' <| toks where (tag,p') = Text.splitAt i p Nothing -> Tokens $ TokenTag p <| toks _ -> TokenPlain (fst $ groupBorders g mempty) <> t <> acc where isTagChar c = Char.isAlphaNum c || c=='·' || case Char.generalCategory c of Char.DashPunctuation -> True Char.ConnectorPunctuation -> True _ -> False -- | Close remaining 'Group's at end of parsing. closeGroups :: Groups -> Token closeGroups (t0,gs) = dbg "closeGroups" $ t0 <> foldl' closelessGroup mempty gs -- * Type 'Lexeme' data Lexeme = LexemeGroupOpen Group | LexemeGroupClose Group | LexemePunctOrSym Char | LexemeWhite Text | LexemeWord Text | LexemeToken Token | LexemeEscape Char | LexemeLink Text deriving (Show, Eq) appendLexeme :: Lexeme -> Groups -> Groups appendLexeme lex gs = dbg "appendLexeme" $ case dbg "appendLexeme" lex of LexemeGroupOpen g -> openGroup g gs LexemeGroupClose g -> closeGroup g gs LexemePunctOrSym c -> insertToken (TokenPlain (Text.singleton c)) gs LexemeWhite wh -> insertToken (TokenPlain wh) gs LexemeWord wo -> insertToken (TokenPlain wo) gs LexemeToken tok -> insertToken tok gs LexemeEscape c -> insertToken (TokenEscape c) gs LexemeLink lnk -> insertToken (TokenLink lnk) gs appendLexemes :: Groups -> [Lexeme] -> Groups appendLexemes = foldl' (flip appendLexeme) -- * Parsers p_Token :: Parser e s Token p_Token = closeGroups <$> p_Groups (mempty,[]) p_Groups :: Groups -> Parser e s Groups p_Groups gs = pdbg "Groups" $ (p_Lexemes (mempty == gs) >>= p_Groups . appendLexemes gs) <|> (P.eof $> gs) p_Lexemes :: Bool -> Parser e s [Lexeme] p_Lexemes isBOF = pdbg "Lexemes" $ P.choice [ P.try $ p_GroupCloseWhite , P.try $ p_GroupWhiteOpen isBOF , P.try $ p_GroupCloseBorder , P.try $ p_GroupBorderOpen , P.try $ p_GroupClose , P.try $ p_GroupOpen , P.try $ pure . LexemePunctOrSym <$> p_PunctOrSym , P.try $ pure <$> p_White , pure . LexemeWord <$> p_Word ] p_White :: Parser e s Lexeme p_White = pdbg "White" $ LexemeWhite <$> p_Spaces p_PunctOrSym :: Parser e s Char p_PunctOrSym = P.satisfy $ \c -> Char.isPunctuation c || Char.isSymbol c p_GroupCloseWhite :: Parser e s [Lexeme] p_GroupCloseWhite = pdbg "GroupCloseWhite" $ (\c b -> mconcat c <> b) <$> P.some (P.try p_GroupClose <|> pure . LexemePunctOrSym <$> p_PunctOrSym) <*> ((pure <$> p_White) <|> P.eof $> []) p_GroupWhiteOpen :: Bool -> Parser e s [Lexeme] p_GroupWhiteOpen isBOF = pdbg "GroupWhiteOpen" $ (\b o -> b <> mconcat o) <$> (if isBOF then return [] else pure <$> p_White) <*> P.some (P.try p_GroupOpen <|> pure . LexemePunctOrSym <$> p_PunctOrSym) p_GroupCloseBorder :: Parser e s [Lexeme] p_GroupCloseBorder = pdbg "GroupCloseBorder" $ (\c b -> mconcat $ c <> b) <$> P.some (P.try p_GroupClose) <*> P.some (P.try $ P.choice [ P.try p_ElemOpen , P.try p_ElemClose , do c <- p_PunctOrSym case l_GroupClose c of Just l -> return [l] Nothing -> case l_GroupOpenOrClose LexemeGroupOpen c <|> l_GroupOpen c of Nothing -> return [LexemePunctOrSym c] _ -> fail "" ]) p_GroupBorderOpen :: Parser e s [Lexeme] p_GroupBorderOpen = pdbg "GroupBorderOpen" $ (\b o -> mconcat $ b <> o) <$> P.some (P.try $ P.choice [ P.try p_ElemOpen , P.try p_ElemClose , do c <- p_PunctOrSym case l_GroupOpen c of Just l -> return [l] Nothing -> case l_GroupOpenOrClose LexemeGroupClose c <|> l_GroupClose c of Nothing -> return [LexemePunctOrSym c] _ -> fail "" ]) <*> P.some (P.try p_GroupOpen) p_GroupOpen :: Parser e s [Lexeme] p_GroupOpen = pdbg "GroupOpen" $ do P.choice [ P.try p_ElemOpen , P.try (pure <$> p_Escape) , P.try (pure <$> p_Link) , do c <- p_PunctOrSym case l_GroupOpenAndClose LexemeGroupOpen c of Just l -> return [l] _ -> fail "" ] p_GroupClose :: Parser e s [Lexeme] p_GroupClose = pdbg "GroupClose" $ do P.choice [ P.try p_ElemClose , P.try p_ElemSingle , P.try (pure <$> p_Escape) , P.try (pure <$> p_Link) , do c <- p_PunctOrSym case l_GroupOpenAndClose LexemeGroupClose c of Just l -> return [l] _ -> fail "" ] p_GroupPlain :: (Char -> Maybe Lexeme) -> Parser e s [Lexeme] p_GroupPlain grp = pdbg "GroupPlain" $ do (<$> p_PunctOrSym) $ \c -> pure $ LexemePunctOrSym c `fromMaybe` grp c l_GroupOpenOrClose :: (Group -> Lexeme) -> Char -> Maybe Lexeme l_GroupOpenOrClose lxm c = case c of '/' -> Just $ lxm GroupSlash '-' -> Just $ lxm GroupDash '"' -> Just $ lxm GroupDoublequote '\'' -> Just $ lxm GroupSinglequote '`' -> Just $ lxm GroupBackquote '_' -> Just $ lxm GroupUnderscore '*' -> Just $ lxm GroupStar '#' -> Just $ lxm GroupHash _ -> Nothing l_GroupOpenAndClose :: (Group -> Lexeme) -> Char -> Maybe Lexeme l_GroupOpenAndClose lxm c = l_GroupOpenOrClose lxm c <|> l_GroupOpen c <|> l_GroupClose c l_GroupOpen :: Char -> Maybe Lexeme l_GroupOpen c = case c of '(' -> Just $ LexemeGroupOpen GroupParen '[' -> Just $ LexemeGroupOpen GroupBracket '{' -> Just $ LexemeGroupOpen GroupBrace '«' -> Just $ LexemeGroupOpen GroupFrenchquote _ -> Nothing l_GroupClose :: Char -> Maybe Lexeme l_GroupClose c = case c of ')' -> Just $ LexemeGroupClose GroupParen ']' -> Just $ LexemeGroupClose GroupBracket '}' -> Just $ LexemeGroupClose GroupBrace '»' -> Just $ LexemeGroupClose GroupFrenchquote _ -> Nothing p_Link :: Parser e s Lexeme p_Link = (\scheme ss addr -> LexemeLink $ Text.pack $ scheme <> ss <> addr) <$> P.option "" (P.try p_scheme) <*> P.string "//" <*> p_addr where p_scheme = (<>) <$> P.some (P.satisfy $ \c -> Char.isAlphaNum c || c=='_' || c=='-' || c=='+') <*> P.string ":" p_addr = P.many $ P.satisfy $ \c -> Char.isAlphaNum c || c=='%' || c=='/' || c=='(' || c==')' || c=='-' || c=='_' || c=='.' p_Escape :: Parser e s Lexeme p_Escape = LexemeEscape <$ P.char '\\' <*> P.satisfy Char.isPrint p_ElemSingle :: Parser e s [Lexeme] p_ElemSingle = pdbg "ElemOpen" $ (\e as -> [ LexemeGroupOpen $ GroupElem e as , LexemeToken $ Tokens mempty -- NOTE: encode that it's the same Elem for open and close , LexemeGroupClose $ GroupElem e [] ]) <$ P.char '<' <*> p_Word <*> p_Attrs <* P.string "/>" p_ElemOpen :: Parser e s [Lexeme] p_ElemOpen = pdbg "ElemOpen" $ (\e as oc -> case oc of True -> [ LexemeGroupOpen $ GroupElem e as , LexemeToken $ Tokens mempty , LexemeGroupClose $ GroupElem e [] ] False -> [LexemeGroupOpen $ GroupElem e as]) <$ P.char '<' <*> p_Word <*> p_Attrs <*> P.option False (True <$ P.char '/') <* P.char '>' p_ElemClose :: Parser e s [Lexeme] p_ElemClose = pdbg "ElemClose" $ (\e -> [LexemeGroupClose $ GroupElem e []]) <$ P.string " p_Word <* P.char '>'