{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE Rank2Types #-} {-# LANGUAGE TypeFamilies #-} {-# LANGUAGE ViewPatterns #-} module Language.TCT.Read.Token where -- import Data.Text.Buildable (Buildable(..)) -- import qualified Data.Text.Lazy as TL -- import qualified Data.Text.Lazy.Builder as Builder import Control.Applicative (Applicative(..), Alternative(..)) import Control.Monad (Monad(..)) import Data.Bool import Data.Char (Char) import Data.Int (Int) import Data.Eq (Eq(..)) import Data.Ord (Ord(..)) import Data.Foldable (Foldable(..)) import Data.Sequence (Seq) import Data.Function (($), (.)) import Data.Functor ((<$>), ($>), (<$)) import Data.Maybe (Maybe(..)) import Data.Monoid (Monoid(..)) import Data.Semigroup (Semigroup(..)) import Data.Sequence (ViewL(..), (<|)) import Data.Text (Text) import Data.TreeSeq.Strict (Tree(..)) import Data.Tuple (fst,snd) import Prelude (Num(..)) import Text.Show (Show(..)) import qualified Data.Char as Char import qualified Data.Sequence as Seq import qualified Data.Text as Text import qualified Text.Megaparsec as P import qualified Text.Megaparsec.Char as P import qualified System.FilePath as FP import Language.TCT.Cell import Language.TCT.Elem import Language.TCT.Read.Elem import Language.TCT.Read.Cell -- * Type 'Row' -- | In normal order: a list of 'Key's, maybe ended by 'Value', all read on the same line. type Row = [Tree (Cell Key) (Cell Value)] -- * Type 'Key' data Key = KeyColon !Name !White -- ^ @name: @ | KeyEqual !Name !White -- ^ @name=@ | KeyBar !Name !White -- ^ @name|@ | KeyGreat !Name !White -- ^ @name>@ | KeyLower !Name !Attrs -- ^ @value@ | PairStar -- ^ @*value*@ | PairSlash -- ^ @/value/@ | PairUnderscore -- ^ @_value_@ | PairDash -- ^ @-value-@ | PairBackquote -- ^ @`value`@ | PairSinglequote -- ^ @'value'@ | PairDoublequote -- ^ @"value"@ | PairFrenchquote -- ^ @«value»@ | PairParen -- ^ @(value)@ | PairBrace -- ^ @{value}@ | PairBracket -- ^ @[value]@ deriving (Eq,Ord,Show) -- ** Type 'TokenValue' data TokenValue = TokenPlain !Text | TokenTag !Tag | TokenEscape !Char | TokenLink !Text | TokenTree (Tree (Cell Key) (Cell Value)) deriving (Eq,Ord,Show) -- ** Type 'Tag' type Tag = Text -- * Type 'Pairs' -- | Right-only Dyck language type Pairs = (Tokens,[Opening]) -- ** Type 'Opening' type Opening = (Cell Pair,Tokens) appendToken :: Pairs -> Token -> Pairs appendToken ps = appendTokens ps . Seq.singleton appendTokens :: Pairs -> Tokens -> Pairs appendTokens (t,[]) toks = (t<>toks,[]) appendTokens (t,(p0,t0):ps) toks = (t,(p0,t0<>toks):ps) openPair :: Pairs -> Cell Pair -> Pairs openPair (t,ms) p = (t,(p,mempty):ms) -- | Close a 'Pair' when there is a matching 'LexemePairClose'. closePair :: Pairs -> Cell Pair -> Pairs closePair ps@(_,[]) (Cell bp ep p) = dbg "closePair" $ appendToken ps $ Tree0 $ Cell bp ep $ TokenPlain $ snd $ pairBorders p tokensPlainEmpty closePair (t,(p1,t1):ts) p = dbg "closePair" $ case (p1,p) of (Cell bx _ex (PairElem x ax), Cell _by ey (PairElem y ay)) | x == y -> appendToken (t,ts) $ TreeN (Cell bx ey $ PairElem x (ax<>ay)) t1 (Cell bx _ex x, Cell _by ey y) | x == y -> appendToken (t,ts) $ TreeN (Cell bx ey x) t1 _ -> (`closePair` p) $ appendTokens (t,ts) (closeImpaired mempty (p1,t1)) -- | Close a 'Pair' when there is no matching 'LexemePairClose'. closeImpaired :: Tokens -> (Cell Pair,Tokens) -> Tokens closeImpaired acc (Cell bp ep p,toks) = dbg "closeImpaired" $ case p of -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'. PairHash | (Tree0 (Cell bt et (TokenPlain t))) :< ts <- Seq.viewl $ toks <> acc -> case Text.span isTagChar t of ("",_) | Text.null t -> toksHash mempty <> toks <> acc | otherwise -> Tree0 (Cell bp et (TokenTag t)) <| ts (tag,t') -> let len = Text.length tag in Tree0 (Cell bp bt{columnPos = columnPos bt + len} (TokenTag tag)) <| Tree0 (Cell bt{columnPos = columnPos bt + len + 1} et (TokenPlain t')) <| ts _ -> toksHash tokensPlainEmpty <> toks <> acc where toksHash = tokens1 . Tree0 . Cell bp ep . TokenPlain . fst . pairBorders p isTagChar c = Char.isAlphaNum c || c=='·' || case Char.generalCategory c of Char.DashPunctuation -> True Char.ConnectorPunctuation -> True _ -> False -- | Close remaining 'Pair's at end of parsing. closePairs :: Pairs -> Tokens closePairs (t0,ps) = dbg "closePairs" $ t0 <> foldl' closeImpaired mempty ps appendLexeme :: Lexeme -> Pairs -> Pairs appendLexeme lex acc = dbg "appendLexeme" $ case lex of LexemePairOpen ps -> foldl' open acc ps where open a p@(Cell _bp ep (PairElem{})) = openPair a p `appendToken` (Tree0 $ Cell ep ep $ TokenPlain "") open a p = openPair a p LexemePairClose ps -> foldl' closePair acc ps LexemePairAny ps -> appendTokens acc $ tokens $ Tree0 . ((\p -> TokenPlain $ fst $ pairBorders p mempty) <$>) <$> ps LexemePairBoth ps -> appendTokens acc $ tokens $ (`TreeN`mempty) <$> ps LexemeEscape c -> appendToken acc $ Tree0 $ TokenEscape <$> c LexemeLink t -> appendToken acc $ Tree0 $ TokenLink <$> t {-LexemeWhite (unCell -> "") -> acc-} LexemeWhite (unCell -> Text.all (==' ') -> True) -> acc LexemeWhite cs -> appendToken acc $ Tree0 $ TokenPlain <$> cs LexemeAlphaNum cs -> appendToken acc $ Tree0 $ TokenPlain . Text.pack <$> cs LexemeAny cs -> appendToken acc $ Tree0 $ TokenPlain . Text.pack <$> cs -- LexemeToken ts -> appendTokens acc ts appendLexemes :: Pairs -> [Lexeme] -> Pairs appendLexemes = foldr appendLexeme -- * Type 'Lexeme' data Lexeme = LexemePairOpen ![Cell Pair] | LexemePairClose ![Cell Pair] | LexemePairAny ![Cell Pair] | LexemePairBoth ![Cell Pair] | LexemeEscape !(Cell Char) | LexemeLink !(Cell Text) | LexemeWhite !(Cell White) | LexemeAlphaNum !(Cell [Char]) | LexemeAny !(Cell [Char]) | LexemeTree !(Tree (Cell Key) Tokens) deriving (Eq, Ord, Show) -- ** Type 'Lexemes' type Lexemes = Seq Lexeme parseTokens :: [Lexeme] -> Tokens parseTokens ps = closePairs $ appendLexemes mempty $ dbg "Lexemes" $ orientLexemePairAny $ LexemeWhite (cell0 "") : ps -- | Parse 'Lexeme's, returning them in reverse order to apply 'orientLexemePairAny'. p_Lexemes :: Parser e s [Lexeme] p_Lexemes = pdbg "Lexemes" $ go [] where go :: [Lexeme] -> Parser e s [Lexeme] go acc = (P.eof $> acc) <|> (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc) orientLexemePairAny :: [Lexeme] -> [Lexeme] orientLexemePairAny = \case LexemeAny (Cell _bx ex x):LexemeAny (Cell by _ey y):acc -> LexemeAny (Cell by ex (x<>y)):acc -- "    w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc --    " LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc LexemePairAny p:[] -> LexemePairOpen p:[] --    ,,," LexemePairAny p:a@LexemeAny{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc LexemePairAny p:a@LexemeAny{}:[] -> LexemePairOpen p:a:[] -- ",,,    w@LexemeWhite{}:a@LexemeAny{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc -- ",,,AAA an@LexemeAlphaNum{}:a@LexemeAny{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc -- ,,,"AAA an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeAny{}:acc -> an:LexemePairOpen p:a:acc -- ") c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc -- (" LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc -- "( o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc -- )" LexemePairAny p:c@LexemePairClose{}:acc -> c:LexemePairClose p:acc acc -> acc p_Lexeme :: Parser e s Lexeme p_Lexeme = pdbg "Lexeme" $ P.choice [ P.try $ LexemeWhite <$> p_Cell p_Spaces , P.try $ LexemePairAny <$> P.some (p_Cell $ p_satisfyMaybe pairAny) , P.try $ LexemePairBoth <$> P.some (P.try $ p_Cell p_ElemSingle) , P.try $ LexemePairOpen <$> P.some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen) , P.try $ LexemePairClose <$> P.some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose) , P.try $ LexemeEscape <$> p_Cell p_Escape , P.try $ LexemeLink <$> p_Cell p_Link , P.try $ LexemeAlphaNum <$> p_Cell (P.some p_AlphaNum) , LexemeAny <$> p_Cell (pure <$> P.anyChar) ] p_Cell :: Parser e s a -> Parser e s (Cell a) p_Cell pa = do bp <- p_Position a <- pa ep <- p_Position return $ Cell bp ep a pairAny :: Char -> Maybe Pair pairAny = \case '-' -> Just PairDash '/' -> Just PairSlash '"' -> Just PairDoublequote '\'' -> Just PairSinglequote '`' -> Just PairBackquote '_' -> Just PairUnderscore '*' -> Just PairStar '#' -> Just PairHash _ -> Nothing pairOpen :: Char -> Maybe Pair pairOpen = \case '(' -> Just PairParen '[' -> Just PairBracket '{' -> Just PairBrace '«' -> Just PairFrenchquote _ -> Nothing pairClose :: Char -> Maybe Pair pairClose = \case ')' -> Just PairParen ']' -> Just PairBracket '}' -> Just PairBrace '»' -> Just PairFrenchquote _ -> Nothing p_AlphaNum :: Parser e s Char p_AlphaNum = P.satisfy Char.isAlphaNum p_Escape :: Parser e s Char p_Escape = P.char '\\' *> P.satisfy Char.isPrint p_Link :: Parser e s Text p_Link = P.try (P.char '<' *> p <* P.char '>') <|> p where p = (\scheme addr -> Text.pack $ scheme <> "//" <> addr) <$> P.option "" (P.try p_scheme) <* P.string "//" <*> p_addr p_scheme = (<> ":") <$> P.some (P.satisfy $ \c -> Char.isAlphaNum c || c=='_' || c=='-' || c=='+') <* P.char ':' p_addr = P.many $ P.satisfy $ \c -> Char.isAlphaNum c || c=='%' || c=='/' || c=='(' || c==')' || c=='-' || c=='_' || c=='.' || c=='#' || c=='?' || c=='=' p_ElemSingle :: Parser e s Pair p_ElemSingle = pdbg "ElemSingle" $ PairElem <$ P.char '<' <*> p_Word <*> p_Attrs <* P.string "/>" p_ElemOpen :: Parser e s Pair p_ElemOpen = pdbg "ElemOpen" $ PairElem <$ P.char '<' <*> p_Word <*> p_Attrs <* P.char '>' p_ElemClose :: Parser e s Pair p_ElemClose = pdbg "ElemClose" $ (`PairElem` []) <$ P.string " p_Word <* P.char '>' {- p_ElemOpenOrSingle :: Parser e s Pair p_ElemOpenOrSingle = p_ElemOpen >>= \p -> P.char '>' $> LexemePairOpen p <|> P.string "/>" $> LexemePairAny p -} -- | Build 'Tokens' from many 'Token's. tokens :: [Token] -> Tokens tokens = Seq.fromList -- | Build 'Tokens' from one 'Token'. tokens1 :: Token -> Tokens tokens1 = Seq.singleton tokensPlainEmpty :: Tokens tokensPlainEmpty = tokens1 $ Tree0 $ cell1 $ TokenPlain "" isTokenWhite :: Token -> Bool isTokenWhite (Tree0 (unCell -> TokenPlain t)) = Text.all Char.isSpace t isTokenWhite _ = False unTokenElem :: Tokens -> Maybe (Cell (Elem,Attrs,Tokens)) unTokenElem toks = case toList $ Seq.dropWhileR isTokenWhite toks of [TreeN (Cell bp ep (PairElem e as)) ts] -> Just (Cell bp ep (e,as,ts)) _ -> Nothing isTokenElem :: Tokens -> Bool isTokenElem toks = case toList $ Seq.dropWhileR isTokenWhite toks of [TreeN (unCell -> PairElem{}) _] -> True _ -> False pairBorders :: TokenKey -> Tokens -> (Text,Text) pairBorders p ts = case p of PairElem e attrs -> if Seq.null ts then ("<"<>e<>foldMap f attrs<>"/>","") else ("<"<>e<>foldMap f attrs<>">","e<>">") where f (attr_white,Attr{..}) = attr_white <> attr_name <> attr_open <> attr_value <> attr_close PairHash -> ("#","#") PairStar -> ("*","*") PairSlash -> ("/","/") PairUnderscore -> ("_","_") PairDash -> ("-","-") PairBackquote -> ("`","`") PairSinglequote -> ("'","'") PairDoublequote -> ("\"","\"") PairFrenchquote -> ("«","»") PairParen -> ("(",")") PairBrace -> ("{","}") PairBracket -> ("[","]")