{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE Rank2Types #-} {-# LANGUAGE TypeFamilies #-} {-# LANGUAGE ViewPatterns #-} module Language.TCT.Read.Token where import Control.Applicative (Applicative(..), Alternative(..)) import Control.Monad (Monad(..)) import Data.Bool import Data.Char (Char) import Data.Eq (Eq(..)) import Data.Foldable (Foldable(..)) import Data.Function (($), (.)) import Data.Functor ((<$>), ($>), (<$)) import Data.Maybe (Maybe(..)) import Data.Monoid (Monoid(..)) import Data.Semigroup (Semigroup(..)) import Data.Sequence (ViewL(..), (<|)) import Data.Text (Text) -- import Data.Text.Buildable (Buildable(..)) import Data.TreeSeq.Strict (Tree(..)) import Data.Tuple (fst,snd) import Prelude (Num(..)) import Text.Show (Show(..)) import qualified Data.Char as Char import qualified Data.Sequence as Seq import qualified Data.Text as Text -- import qualified Data.Text.Lazy as TL -- import qualified Data.Text.Lazy.Builder as Builder import qualified Text.Megaparsec as P import qualified Text.Megaparsec.Char as P import Language.TCT.Token import Language.TCT.Cell import Language.TCT.Elem import Language.TCT.Read.Elem import Language.TCT.Read.Cell {- textOf :: Buildable a => a -> Text textOf = TL.toStrict . Builder.toLazyText . build -} -- * Type 'Pairs' type Pairs = (Tokens,[(Cell Pair,Tokens)]) appendToken :: Pairs -> Token -> Pairs appendToken ps = appendTokens ps . Seq.singleton appendTokens :: Pairs -> Tokens -> Pairs appendTokens (t,[]) toks = (t<>toks,[]) appendTokens (t,(p0,t0):ps) toks = (t,(p0,t0<>toks):ps) openPair :: Pairs -> Cell Pair -> Pairs openPair (t,ms) p = (t,(p,mempty):ms) -- | Close a 'Pair' when there is a matching 'LexemePairClose'. closePair :: Pairs -> Cell Pair -> Pairs closePair ps@(_,[]) (Cell bp ep p) = dbg "closePair" $ appendToken ps $ Tree0 $ Cell bp ep $ TokenPlain $ snd $ pairBorders p tokensPlainEmpty closePair (t,(p1,t1):ts) p = dbg "closePair" $ case (p1,p) of (Cell bx _ex (PairElem x ax), Cell _by ey (PairElem y ay)) | x == y -> appendToken (t,ts) $ TreeN (Cell bx ey $ PairElem x (ax<>ay)) t1 (Cell bx _ex x, Cell _by ey y) | x == y -> appendToken (t,ts) $ TreeN (Cell bx ey x) t1 _ -> (`closePair` p) $ appendTokens (t,ts) (closeUnpaired mempty (p1,t1)) -- | Close a 'Pair' when there is not a matching 'LexemePairClose'. closeUnpaired :: Tokens -> (Cell Pair,Tokens) -> Tokens closeUnpaired acc (Cell bp ep p,toks) = dbg "closeUnpaired" $ case p of -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'. PairHash | (Tree0 (Cell bt et (TokenPlain t))) :< ts <- Seq.viewl $ toks <> acc -> case Text.findIndex (not . isTagChar) t of -- Just 0 -> toksHash mempty <> toks <> acc Just i -> Tree0 (Cell bp bt{columnPos = columnPos bt + i} (TokenTag tag)) <| Tree0 (Cell bt{columnPos = columnPos bt + i + 1} et (TokenPlain t')) <| ts where (tag,t') = Text.splitAt i t Nothing | Text.null t -> toksHash mempty <> toks <> acc Nothing -> Tree0 (Cell bp et (TokenTag t)) <| ts _ -> toksHash tokensPlainEmpty <> toks <> acc where toksHash = tokens1 . Tree0 . Cell bp ep . TokenPlain . fst . pairBorders p isTagChar c = Char.isAlphaNum c || c=='·' || case Char.generalCategory c of Char.DashPunctuation -> True Char.ConnectorPunctuation -> True _ -> False -- | Close remaining 'Pair's at end of parsing. closePairs :: Pairs -> Tokens closePairs (t0,ps) = dbg "closePairs" $ t0 <> foldl' closeUnpaired mempty ps appendLexeme :: Lexeme -> Pairs -> Pairs appendLexeme lex acc = dbg "appendLexeme" $ case lex of LexemePairOpen ps -> foldl' open acc ps where open a p@(Cell _bp ep (PairElem{})) = openPair a p `appendToken` (Tree0 $ Cell ep ep $ TokenPlain "") open a p = openPair a p LexemePairClose ps -> foldl' closePair acc ps LexemePairAny ps -> appendTokens acc $ tokens $ Tree0 . ((\p -> TokenPlain $ fst $ pairBorders p mempty) <$>) <$> ps LexemePairBoth ps -> appendTokens acc $ tokens $ (`TreeN`mempty) <$> ps LexemeEscape c -> appendToken acc $ Tree0 $ TokenEscape <$> c LexemeLink t -> appendToken acc $ Tree0 $ TokenLink <$> t LexemeWhite (unCell -> "") -> acc LexemeWhite cs -> appendToken acc $ Tree0 $ TokenPlain <$> cs LexemeAlphaNum cs -> appendToken acc $ Tree0 $ TokenPlain . Text.pack <$> cs LexemeAny cs -> appendToken acc $ Tree0 $ TokenPlain . Text.pack <$> cs LexemeToken ts -> appendTokens acc ts -- * Type 'Lexeme' data Lexeme = LexemePairOpen ![Cell Pair] | LexemePairClose ![Cell Pair] | LexemePairAny ![Cell Pair] | LexemePairBoth ![Cell Pair] | LexemeEscape !(Cell Char) | LexemeLink !(Cell Text) | LexemeWhite !(Cell White) | LexemeAlphaNum !(Cell [Char]) | LexemeAny !(Cell [Char]) | LexemeToken !Tokens deriving (Eq, Show) p_Tokens :: Parser e s Tokens p_Tokens = pdbg "Tokens" $ closePairs . foldr appendLexeme mempty . dbg "Lexemes" . mangleLexemes . (LexemeWhite (cell0 "") :) <$> go [LexemeWhite (cell0 "")] where go :: [Lexeme] -> Parser e s [Lexeme] go acc = (P.eof $> acc) <|> (p_Lexeme >>= \next -> go $ mangleLexemes $ next:acc) mangleLexemes = \case LexemeAny (Cell _bx ex x):LexemeAny (Cell by _ey y):acc -> LexemeAny (Cell by ex (x<>y)):acc -- "    w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc --    " LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc --    ,,," LexemePairAny p:a@LexemeAny{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc -- ",,,    w@LexemeWhite{}:a@LexemeAny{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc -- ",,,AAA an@LexemeAlphaNum{}:a@LexemeAny{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc -- ,,,"AAA an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeAny{}:acc -> an:LexemePairOpen p:a:acc -- ") c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc -- (" LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc -- "( o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc -- )" LexemePairAny p:c@LexemePairClose{}:acc -> c:LexemePairClose p:acc acc -> acc pairAny :: Char -> Maybe Pair pairAny = \case '-' -> Just PairDash '/' -> Just PairSlash '"' -> Just PairDoublequote '\'' -> Just PairSinglequote '`' -> Just PairBackquote '_' -> Just PairUnderscore '*' -> Just PairStar '#' -> Just PairHash _ -> Nothing pairOpen :: Char -> Maybe Pair pairOpen = \case '(' -> Just PairParen '[' -> Just PairBracket '{' -> Just PairBrace '«' -> Just PairFrenchquote _ -> Nothing pairClose :: Char -> Maybe Pair pairClose = \case ')' -> Just PairParen ']' -> Just PairBracket '}' -> Just PairBrace '»' -> Just PairFrenchquote _ -> Nothing p_Cell :: Parser e s a -> Parser e s (Cell a) p_Cell pa = do bp <- p_Position a <- pa ep <- p_Position return $ Cell bp ep a p_Lexeme :: Parser e s Lexeme p_Lexeme = pdbg "Lexeme" $ P.choice [ P.try $ LexemeWhite <$> p_Cell p_Spaces , P.try $ LexemePairAny <$> P.some (p_Cell $ p_satisfyMaybe pairAny) , P.try $ LexemePairBoth <$> P.some (P.try $ p_Cell p_ElemSingle) , P.try $ LexemePairOpen <$> P.some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen) , P.try $ LexemePairClose <$> P.some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose) , P.try $ LexemeEscape <$> p_Cell p_Escape , P.try $ LexemeLink <$> p_Cell p_Link , P.try $ LexemeAlphaNum <$> p_Cell (P.some p_AlphaNum) , LexemeAny <$> p_Cell (pure <$> P.anyChar) ] p_AlphaNum :: Parser e s Char p_AlphaNum = P.satisfy Char.isAlphaNum p_Escape :: Parser e s Char p_Escape = P.char '\\' *> P.satisfy Char.isPrint p_Link :: Parser e s Text p_Link = P.try (P.char '<' *> p <* P.char '>') <|> p where p = (\scheme addr -> Text.pack $ scheme <> "//" <> addr) <$> P.option "" (P.try p_scheme) <* P.string "//" <*> p_addr p_scheme = (<> ":") <$> P.some (P.satisfy $ \c -> Char.isAlphaNum c || c=='_' || c=='-' || c=='+') <* P.char ':' p_addr = P.many $ P.satisfy $ \c -> Char.isAlphaNum c || c=='%' || c=='/' || c=='(' || c==')' || c=='-' || c=='_' || c=='.' || c=='#' || c=='?' || c=='=' p_ElemSingle :: Parser e s Pair p_ElemSingle = pdbg "ElemSingle" $ PairElem <$ P.char '<' <*> p_Word <*> p_Attrs <* P.string "/>" p_ElemOpen :: Parser e s Pair p_ElemOpen = pdbg "ElemOpen" $ PairElem <$ P.char '<' <*> p_Word <*> p_Attrs <* P.char '>' p_ElemClose :: Parser e s Pair p_ElemClose = pdbg "ElemClose" $ (`PairElem` []) <$ P.string " p_Word <* P.char '>' {- p_ElemOpenOrSingle :: Parser e s Pair p_ElemOpenOrSingle = p_ElemOpen >>= \p -> P.char '>' $> LexemePairOpen p <|> P.string "/>" $> LexemePairAny p -}