{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE Rank2Types #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeFamilies #-} {-# LANGUAGE ViewPatterns #-} module Language.TCT.Read.Token where import Control.Applicative (Applicative(..), Alternative(..)) import Control.Monad (Monad(..)) import Data.Bool import Data.Char (Char) import Data.Eq (Eq(..)) import Data.Either (Either(..)) import Data.Foldable (Foldable(..)) import Data.Function (($), (.)) import Data.Functor ((<$>), ($>)) import Data.List.NonEmpty (NonEmpty(..)) import Data.Maybe (Maybe(..)) import Data.Monoid (Monoid(..)) import Data.Semigroup (Semigroup(..)) import Data.Sequence (Seq, ViewL(..), ViewR(..), (<|), (|>)) import Data.String (String) import Data.TreeSeq.Strict (Tree(..), Trees) import Data.Tuple (fst,snd) import Data.Void (Void) import Prelude (Num(..)) import Text.Show (Show(..)) import qualified Data.Char as Char import qualified Data.List.NonEmpty as NonEmpty import qualified Data.Sequence as Seq import qualified Data.Text as Text import qualified Data.Text.Lazy as TL import qualified Text.Megaparsec as P import qualified Text.Megaparsec.Char as P import Language.TCT.Cell import Language.TCT.Elem -- import Language.TCT.Token import Language.TCT.Tree import Language.TCT.Read.Elem import Language.TCT.Read.Cell instance Pretty Pair where pretty = return . show instance Pretty a => Pretty (Cell a) where pretty (Cell bp ep m) = do s <- pretty m return $ "Cell "<>show bp<>":"<>show ep<>" "<>s instance Pretty Lexeme where pretty = return . show -- * Type 'Pairs' -- | Right-only Dyck language type Pairs = (Tokens,[Opening]) type Tokens = Trees (Cell Node) -- ** Type 'Opening' type Opening = (Cell Pair,Tokens) appendToken :: Pairs -> Tree (Cell Node) -> Pairs appendToken (ts,[]) tok = (ts|>tok,[]) appendToken (ts,(p0,t0):ps) tok = (ts,(p0,t0|>tok):ps) appendTokens :: Pairs -> Tokens -> Pairs appendTokens (ts,[]) toks = (ts<>toks,[]) appendTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0<>toks):ps) appendText :: Pairs -> Cell TL.Text -> Pairs appendText ps tok = case ps of (ts,[]) -> (appendTokenText ts tok,[]) (ts,(p0,ts0):pss) -> (ts,(p0,appendTokenText ts0 tok):pss) appendTokenText :: Tokens -> Cell TL.Text -> Tokens appendTokenText ts (Cell bn en n) {- | TL.null n = ts | otherwise-} = case Seq.viewr ts of EmptyR -> pure $ Tree0 $ Cell bn en $ NodeToken $ TokenText n is :> Tree (Cell bo _eo nod) st -> case nod of NodeToken (TokenText o) -> is |> i where i = Tree (Cell bo en (NodeToken $ TokenText (o <> n))) st _ -> ts |> Tree0 (Cell bn en $ NodeToken $ TokenText n) prependTokenText :: Tokens -> Cell TL.Text -> Tokens prependTokenText ts (Cell bn en n) {- | TL.null n = ts | otherwise-} = case Seq.viewl ts of EmptyL -> pure $ Tree0 $ Cell bn en $ NodeToken $ TokenText n Tree (Cell _bo eo nod) st :< is -> case nod of NodeToken (TokenText o) -> i <| is where i = Tree (Cell bn eo (NodeToken $ TokenText (n <> o))) st _ -> Tree0 (Cell bn en $ NodeToken $ TokenText n) <| ts openPair :: Pairs -> Cell Pair -> Pairs openPair (t,ps) p = (t,(p,mempty):ps) -- | Close a 'Pair' when there is a matching 'LexemePairClose'. closePair :: Pairs -> Cell Pair -> Pairs closePair ps@(_,[]) (Cell bp ep p) = -- dbg "closePair" $ appendText ps $ Cell bp ep $ snd $ pairBorders p closePair (t,(p1,t1):ts) p = -- dbg "closePair" $ case (p1,p) of (Cell bx _ex (PairElem nx ax), Cell _by ey (PairElem ny ay)) | nx == ny -> appendToken (t,ts) $ Tree (Cell bx ey $ NodePair $ PairElem nx as) t1 where as | null ay = ax | otherwise = ax<>ay (Cell bx _ex x, Cell _by ey y) | x == y -> appendToken (t,ts) $ Tree (Cell bx ey $ NodePair x) t1 _ -> (`closePair` p) $ appendTokens (t,ts) (closeImpaired mempty (p1,t1)) -- | Close a 'Pair' when there is no matching 'LexemePairClose'. closeImpaired :: Tokens -> (Cell Pair,Tokens) -> Tokens closeImpaired acc (Cell bp ep p,toks) = dbg "closeImpaired" $ case p of -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'. PairHash | Just (Cell bt et t, ts) <- tagFrom $ toks <> acc -> Tree0 (Cell bt et $ NodeToken $ TokenTag t) <| ts {- PairHash | (Tree0 (Cell bt et (TokenPlain t))) :< ts <- Seq.viewl $ toks <> acc -> case Text.span isTagChar t of ("",_) | Text.null t -> toksHash mempty <> toks <> acc | otherwise -> Tree0 (Cell bp et (TokenTag t)) <| ts (tag,t') -> let len = Text.length tag in Tree0 (Cell bp bt{columnPos = columnPos bt + len} (TokenTag tag)) <| Tree0 (Cell bt{columnPos = columnPos bt + len + 1} et (TokenPlain t')) <| ts -} _ -> prependTokenText (toks <> acc) toksHash where toksHash :: Cell TL.Text toksHash = Cell bp ep $ fst $ pairBorders p isTagChar :: Char -> Bool isTagChar c = Char.isAlphaNum c || c=='·' || case Char.generalCategory c of Char.DashPunctuation -> True Char.ConnectorPunctuation -> True _ -> False -- * Class 'TagFrom' class TagFrom a where tagFrom :: a -> Maybe (Cell Tag, a) instance TagFrom Tokens where tagFrom ts = case Seq.viewl ts of EmptyL -> Nothing Tree0 (Cell b0 e0 n) :< ns -> case n of NodeToken (TokenText t) -> case tagFrom $ Cell b0 e0 t of Nothing -> Nothing Just (t0,r0) -> if TL.null (unCell r0) then case tagFrom ns of Just (t1@(Cell b1 _e1 _), r1) | e0 == b1 -> Just (t0<>t1, r1) _ -> Just (t0, n0 <| ns) else Just (t0, n0 <| ns) where n0 = (Tree0 $ NodeToken . TokenText <$> r0) _ -> Nothing _ -> Nothing instance TagFrom (Cell TL.Text) where tagFrom (Cell bp ep t) | (w,r) <- TL.span isTagChar t , not $ TL.null w , ew <- pos_column bp + sum (Text.length <$> (TL.toChunks w)) = Just ( Cell bp bp{pos_column=ew} w , Cell bp{pos_column=ew} ep r ) tagFrom _ = Nothing -- | Close remaining 'Pair's at end of parsing. closePairs :: Pairs -> Tokens closePairs (t0,ps) = -- dbg "closePairs" $ t0 <> foldl' closeImpaired mempty ps appendLexeme :: Lexeme -> Pairs -> Pairs appendLexeme lex acc = -- dbg "appendLexeme" $ case lex of LexemePairOpen ps -> foldl' openPair acc ps {- where open a p@(Cell _bp ep (PairElem{})) = openPair a p `appendToken` (Tree0 $ Cell ep ep $ TokenPhrase $ PhraseWhite "") open a p = openPair a p -} LexemePairClose ps -> foldl' closePair acc ps LexemePairAny ps -> appendText acc $ sconcat $ ((fst . pairBordersWithoutContent) <$>) <$> ps LexemePairBoth ps -> appendTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps LexemeEscape c -> appendToken acc $ Tree0 $ NodeToken . TokenEscape <$> c LexemeLink t -> appendToken acc $ Tree0 $ NodeToken . TokenLink <$> t {-LexemeWhite (unCell -> "") -> acc-} -- LexemeWhite (unCell -> Text.all (==' ') -> True) -> acc LexemeWhite t -> appendText acc t LexemeAlphaNum t -> appendText acc t LexemeOther t -> appendText acc t LexemeTree t -> appendToken acc t LexemeEnd -> acc {- TODEL appendTokenChild :: Pairs -> Tree (Cell Node) -> Pairs appendTokenChild pairs tree = debug "appendTokenChild" "pairs" pairs $ debug "appendTokenChild" "tree" tree $ dbg "appendTokenChild" $ go pairs tree where go (ts@(toList -> [unTree -> Cell bo _eo NodeText{}]),[]) tok@(Tree (Cell _bn en _n) _ns) = (pure $ Tree (Cell bo en NodePara) (ts |> tok),[]) go (ts,[]) tok = (ts |> tok,[]) go (ts,(p0,t0):ps) tok = (ts,(p0,t0|>tok):ps) -} appendLexemes :: Pairs -> [Lexeme] -> Pairs appendLexemes = foldr appendLexeme -- * Type 'Lexeme' data Lexeme = LexemePairOpen !(NonEmpty (Cell Pair)) | LexemePairClose !(NonEmpty (Cell Pair)) | LexemePairAny !(NonEmpty (Cell Pair)) | LexemePairBoth !(NonEmpty (Cell Pair)) | LexemeEscape !(Cell Char) | LexemeLink !(Cell TL.Text) | LexemeWhite !(Cell TL.Text) | LexemeAlphaNum !(Cell TL.Text) | LexemeOther !(Cell TL.Text) | LexemeTree !(Tree (Cell Node)) | LexemeEnd deriving (Eq, Show) -- ** Type 'Lexemes' type Lexemes = Seq Lexeme parseTokens :: [Lexeme] -> Tokens parseTokens ps = closePairs $ appendLexemes mempty $ -- dbg "Lexemes (post orient)" $ orientLexemePairAny $ LexemeEnd : ps parseLexemes :: String -> Cell TL.Text -> Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme] parseLexemes inp = runParserOnCell inp (p_Lexemes <* P.eof) -- | Parse 'Lexeme's, returning them in reverse order to apply 'orientLexemePairAny'. p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme] p_Lexemes = pdbg "Lexemes" $ go [] where go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme] go acc = (P.eof $> acc) <|> (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc) orientLexemePairAny :: [Lexeme] -> [Lexeme] orientLexemePairAny = \case -- LexemeOther (Cell _bx ex x):LexemeOther (Cell by _ey y):acc -> LexemeOther (Cell by ex (x<>y)):acc -- "    t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc --    " LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc LexemePairAny p:[] -> LexemePairOpen p:[] --    ,,," LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[] -- ",,,    w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc -- ",,,AAA an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc -- ,,,"AAA an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc -- ") c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc -- (" LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc -- "( o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc -- )" LexemePairAny p:c@LexemePairClose{}:acc -> c:LexemePairClose p:acc acc -> acc p_some :: Parser e s a -> Parser e s (NonEmpty a) p_some p = NonEmpty.fromList <$> P.some p p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme p_Lexeme = pdbg "Lexeme" $ P.choice [ P.try $ LexemeWhite <$> p_Cell p_Spaces1 , P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny) , P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle) , P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen) , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose) , P.try $ LexemeEscape <$> p_Cell p_Escape , P.try $ LexemeLink <$> p_Cell p_Link , P.try $ LexemeAlphaNum <$> p_Cell (P.takeWhile1P (Just "AlphaNum") Char.isAlphaNum) , LexemeOther <$> p_Cell (TL.singleton <$> P.anyChar) ] pairAny :: Char -> Maybe Pair pairAny = \case '-' -> Just PairDash '/' -> Just PairSlash '"' -> Just PairDoublequote '\'' -> Just PairSinglequote '`' -> Just PairBackquote '_' -> Just PairUnderscore '*' -> Just PairStar '#' -> Just PairHash _ -> Nothing pairOpen :: Char -> Maybe Pair pairOpen = \case '(' -> Just PairParen '[' -> Just PairBracket '{' -> Just PairBrace '«' -> Just PairFrenchquote _ -> Nothing pairClose :: Char -> Maybe Pair pairClose = \case ')' -> Just PairParen ']' -> Just PairBracket '}' -> Just PairBrace '»' -> Just PairFrenchquote _ -> Nothing p_Escape :: Parser e s Char p_Escape = P.char '\\' *> P.satisfy Char.isPrint p_Link :: P.Tokens s ~ TL.Text => Parser e s TL.Text p_Link = P.try (P.char '<' *> p <* P.char '>') <|> p where p :: P.Tokens s ~ TL.Text => Parser e s TL.Text p = (\scheme addr -> scheme <> "//" <> addr) <$> P.option "" (P.try p_scheme) <* P.string "//" <*> p_addr p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text p_scheme = (<> ":") <$> (P.takeWhile1P (Just "scheme") $ \c -> Char.isAlphaNum c || c=='_' || c=='-' || c=='+') <* P.char ':' p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text p_addr = P.takeWhileP (Just "addr") $ \c -> Char.isAlphaNum c || c=='%' || c=='/' || c=='(' || c==')' || c=='-' || c=='_' || c=='.' || c=='#' || c=='?' || c=='=' -- | Build 'Tokens' from many 'Token's. tokens :: [Cell Token] -> Tokens tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts -- | Build 'Tokens' from one 'Token'. tokens1 :: Tree (Cell Node) -> Tokens tokens1 = Seq.singleton unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens)) unTokenElem toks = case toList $ {-Seq.dropWhileR isTokenWhite-} toks of [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts)) _ -> Nothing isTokenElem :: Tokens -> Bool isTokenElem toks = case toList $ {-Seq.dropWhileR isTokenWhite-} toks of [Tree (unCell -> NodePair PairElem{}) _] -> True _ -> False pairBordersWithoutContent :: Pair -> (TL.Text,TL.Text) pairBordersWithoutContent = \case PairElem n as -> ("<"<>n<>foldMap f as<>"/>","") where f (elemAttr_white,ElemAttr{..}) = elemAttr_white <> elemAttr_name <> elemAttr_open <> elemAttr_value <> elemAttr_close p -> pairBorders p pairBorders :: Pair -> (TL.Text,TL.Text) pairBorders = \case PairElem n as -> ("<"<>n<>foldMap f as<>">","n<>">") where f (elemAttr_white,ElemAttr{..}) = elemAttr_white <> elemAttr_name <> elemAttr_open <> elemAttr_value <> elemAttr_close PairHash -> ("#","#") PairStar -> ("*","*") PairSlash -> ("/","/") PairUnderscore -> ("_","_") PairDash -> ("-","-") PairBackquote -> ("`","`") PairSinglequote -> ("'","'") PairDoublequote -> ("\"","\"") PairFrenchquote -> ("«","»") PairParen -> ("(",")") PairBrace -> ("{","}") PairBracket -> ("[","]")