{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE Rank2Types #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeFamilies #-} {-# OPTIONS_GHC -fno-warn-orphans #-} module Hdoc.TCT.Read.Token where import Control.Applicative (Applicative(..), Alternative(..)) import Control.Monad (Monad(..)) import Data.Bool import Data.Char (Char) import Data.Either (Either(..)) import Data.Eq (Eq(..)) import Data.Foldable (Foldable(..)) import Data.Function (($), (.)) import Data.Functor ((<$>), ($>), (<$)) import Data.List.NonEmpty (NonEmpty(..)) import Data.Maybe (Maybe(..)) import Data.Monoid (Monoid(..)) import Data.Semigroup (Semigroup(..)) import Data.Sequence (ViewL(..), ViewR(..), (<|)) import Data.TreeSeq.Strict (Tree(..), Trees) import Data.Tuple (fst,snd) import Data.Void (Void) import Prelude (Num(..)) import Text.Show (Show(..)) import qualified Data.Char as Char import qualified Data.List.NonEmpty as NonEmpty import qualified Data.Sequence as Seq import qualified Data.Text as Text import qualified Data.Text.Lazy as TL import qualified Language.Symantic.XML as XML import qualified Text.Megaparsec as P import qualified Text.Megaparsec.Char as P import Hdoc.TCT.Debug import Hdoc.TCT.Cell import Hdoc.TCT.Elem import Hdoc.TCT.Tree import Hdoc.TCT.Read.Elem import Hdoc.TCT.Read.Cell -- * Type 'Pairs' -- | Right-only Dyck language, -- to keep track of opened 'Pair's. type Pairs = (Tokens,[Opening]) type Tokens = Trees (Cell Node) -- ** Type 'Opening' -- | An opened 'Pair' and its content so far. type Opening = (Cell Pair,Tokens) appendPairsToken :: Pairs -> Tree (Cell Node) -> Pairs appendPairsToken ps t = appendPairsTokens ps (pure t) appendPairsText :: Pairs -> Cell TL.Text -> Pairs appendPairsText ps (Sourced sp t) = appendPairsToken ps $ Tree0 $ Sourced sp $ NodeToken $ TokenText t appendPairsTokens :: Pairs -> Tokens -> Pairs appendPairsTokens (ts,[]) toks = (ts`unionTokens`toks,[]) appendPairsTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0`unionTokens`toks):ps) -- | Unify two 'Tokens', merging border 'TokenText's if any. unionTokens :: Tokens -> Tokens -> Tokens unionTokens x y = case (Seq.viewr x, Seq.viewl y) of (xs :> x0, y0 :< ys) -> case (x0,y0) of ( Tree (Sourced (FileRange fx bx _ex:| lx) (NodeToken (TokenText tx))) tsx , Tree (Sourced (FileRange _fy _by ey:|_ly) (NodeToken (TokenText ty))) tsy ) -> xs `unionTokens` pure (Tree (Sourced (FileRange fx bx ey:|lx) $ NodeToken $ TokenText $ tx <> ty) (tsx<>tsy) ) `unionTokens` ys _ -> x <> y (EmptyR, _) -> y (_, EmptyL) -> x unionsTokens :: Foldable f => f Tokens -> Tokens unionsTokens = foldl' unionTokens mempty openPair :: Pairs -> Cell Pair -> Pairs openPair (t,ps) p = (t,(p,mempty):ps) -- | Close a 'Pair' when there is a matching 'LexemePairClose'. closePair :: Pairs -> Cell Pair -> Pairs closePair ps@(_,[]) (Sourced loc p) = -- debug0 "closePair" $ appendPairsText ps $ Sourced loc $ snd $ pairBordersDouble p closePair (t,(cx@(Sourced (FileRange fx bx _ex:| lx) px),t1):ts) cy@(Sourced (FileRange _fy _by ey:|_ly) py) = -- debug0 "closePair" $ case (px,py) of (PairElem nx ax, PairElem ny ay) | nx == ny -> appendPairsToken (t,ts) $ Tree (Sourced (FileRange fx bx ey:|lx) $ NodePair $ PairElem nx as) t1 where as | null ay = ax | otherwise = ax<>ay _ | px == py -> appendPairsToken (t,ts) $ Tree (Sourced (FileRange fx bx ey:|lx) $ NodePair px) t1 _ -> (`closePair` cy) $ appendPairsTokens (t,ts) (closeImpaired mempty (cx,t1)) -- | Close a 'Pair' when there is no matching 'LexemePairClose'. closeImpaired :: Tokens -> (Cell Pair, Tokens) -> Tokens closeImpaired acc (Sourced loc@(s0:|lp) pair, toks) = -- debug0 "closeImpaired" $ case pair of -- NOTE: try to close 'PairTag' as 'TokenTag' instead of 'TokenText'. PairTag isBackref | Just (Sourced (FileRange{fileRange_end}:|_lt) ref, rest) <- parseRef body -> Tree0 (Sourced (s0{fileRange_end}:|lp) $ NodeToken $ TokenTag isBackref ref) <| rest -- NOTE: use bp (not bt) to include the '#' -- NOTE: try to close 'PairAt' as 'TokenAt' instead of 'TokenText'. PairAt isBackref | Just (Sourced (FileRange{fileRange_end}:|_lt) ref, rest) <- parseRef body -> Tree0 (Sourced (s0{fileRange_end}:|lp) $ NodeToken $ TokenAt isBackref ref) <| rest -- NOTE: use bp (not bt) to include the '@' _ -> pure open `unionTokens` body where body = toks `unionTokens` acc open = Tree0 $ Sourced loc $ NodeToken $ TokenText $ fst $ pairBordersDouble pair -- | Close remaining 'Pair's at end of parsing. closePairs :: Pairs -> Tokens closePairs (t0,ps) = -- debug0 "closePairs" $ t0 `unionTokens` foldl' closeImpaired mempty ps appendLexeme :: Lexeme -> Pairs -> Pairs appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc -> case lex of LexemePairOpen ps -> foldl' open acc ps where -- NOTE: insert an empty node to encode , not open a p@(Sourced (FileRange{fileRange_end, ..}:|sp) PairElem{}) = openPair a p `appendPairsText` Sourced (FileRange{fileRange_begin=fileRange_end, ..}:|sp) "" open a p = openPair a p LexemePairClose ps -> foldl' closePair acc ps LexemePairAny ps -> foldl' openPair acc ps LexemePairBoth ps -> appendPairsTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps LexemeEscape c -> appendPairsToken acc $ Tree0 $ NodeToken . TokenEscape <$> c LexemeLink t -> appendPairsToken acc $ Tree0 $ NodeToken . TokenLink <$> t {-LexemeWhite (unSourced -> "") -> acc-} -- LexemeWhite (unSourced -> Text.all (==' ') -> True) -> acc LexemeWhite t -> appendPairsText acc t LexemeAlphaNum t -> appendPairsText acc t LexemeOther t -> appendPairsText acc t LexemeTree t -> appendPairsToken acc t LexemeEnd -> acc appendLexemes :: Pairs -> [Lexeme] -> Pairs appendLexemes = foldr appendLexeme -- * Type 'Lexeme' -- | 'Lexeme's cut the input in the longest chunks of common semantic, -- this enables 'orientLexemePairAny' to work with a more meaningful context. data Lexeme = LexemePairOpen !(NonEmpty (Cell Pair)) | LexemePairClose !(NonEmpty (Cell Pair)) | LexemePairAny !(NonEmpty (Cell Pair)) -- ^ orientation depending on the surrounding 'Lexeme's, -- see 'orientLexemePairAny' | LexemePairBoth !(NonEmpty (Cell Pair)) | LexemeEscape !(Cell Char) | LexemeLink !(Cell TL.Text) | LexemeWhite !(Cell TL.Text) | LexemeAlphaNum !(Cell TL.Text) | LexemeOther !(Cell TL.Text) | LexemeTree !(Tree (Cell Node)) -- FIXME: useless? | LexemeEnd deriving (Eq, Show) instance Pretty Lexeme parseTokens :: [Lexeme] -> Tokens parseTokens ps = closePairs $ appendLexemes mempty $ -- debug0 "Lexemes (post orient)" $ orientLexemePairAny $ LexemeEnd : ps parseLexemes :: Cell TL.Text -> Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme] parseLexemes = runParserOnCell (p_Lexemes <* P.eof) -- | Parse 'Lexeme's, returning them in reverse order -- to apply 'orientLexemePairAny'. p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme] p_Lexemes = debugParser "Lexemes" $ go [] where go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme] go acc = (P.eof $> acc) <|> (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc) -- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme', -- so that it can try to orient nearby 'LexemePairAny' -- to 'LexemePairOpen' or 'LexemePairClose'. orientLexemePairAny :: [Lexeme] -> [Lexeme] orientLexemePairAny = \case -- LexemeOther (Sourced _bx ex x):LexemeOther (Sourced by _ey y):acc -> LexemeOther (Sourced by ex (x<>y)):acc -- "    t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc --    " LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc LexemePairAny p:[] -> LexemePairOpen p:[] --    ,,," LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[] -- ",,,    w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc -- ",,,AAA an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc an@LexemeEscape{} :a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc -- ,,,"AAA an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc an@LexemeEscape{} :LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc -- ") c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc -- (" LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc -- "( o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc -- )" LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairClose p:c:acc -- AAA#AAA a1@LexemeAlphaNum{}:LexemePairAny p:a2@LexemeAlphaNum{}:acc -> a1:LexemeOther (sconcat (((fst . pairBordersDouble) <$>) <$> p)):a2:acc acc -> acc p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme p_Lexeme = debugParser "Lexeme" $ P.choice [ P.try $ LexemeWhite <$> p_Cell p_Spaces1 , P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny) , P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle) , P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen <|> P.try p_BackOpen) , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose) , P.try $ LexemeEscape <$> p_Cell p_Escape , P.try $ LexemeLink <$> p_Cell p_Link , P.try $ LexemeAlphaNum <$> p_Cell p_AlphaNums1 , LexemeOther <$> p_Cell (TL.singleton <$> P.anyChar) ] p_some :: Parser e s a -> Parser e s (NonEmpty a) p_some p = NonEmpty.fromList <$> P.some p pairAny :: Char -> Maybe Pair pairAny = \case '-' -> Just PairDash '/' -> Just PairSlash '"' -> Just PairDoublequote '\'' -> Just PairSinglequote '`' -> Just PairBackquote '_' -> Just PairUnderscore '*' -> Just PairStar '#' -> Just $ PairTag False '@' -> Just $ PairAt False _ -> Nothing pairOpen :: Char -> Maybe Pair pairOpen = \case '(' -> Just PairParen '[' -> Just PairBracket '{' -> Just PairBrace '«' -> Just PairFrenchquote _ -> Nothing pairClose :: Char -> Maybe Pair pairClose = \case ')' -> Just PairParen ']' -> Just PairBracket '}' -> Just PairBrace '»' -> Just PairFrenchquote _ -> Nothing p_BackOpen :: P.Tokens s ~ TL.Text => Parser e s Pair p_BackOpen = debugParser "BackOpen" $ P.char '~' *> (PairAt True <$ P.char '@' <|> PairTag True <$ P.char '#') p_Escape :: Parser e s Char p_Escape = P.char '\\' *> P.satisfy Char.isPrint p_Link :: P.Tokens s ~ TL.Text => Parser e s TL.Text p_Link = P.try (P.char '<' *> p <* P.char '>') <|> p where p :: P.Tokens s ~ TL.Text => Parser e s TL.Text p = (\scheme addr -> scheme <> "//" <> addr) <$> P.option "" (P.try p_scheme) <* P.string "//" <*> p_addr p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text p_scheme = (<> ":") <$> (P.takeWhile1P (Just "scheme") $ \c -> Char.isAlphaNum c || c=='_' || c=='-' || c=='+') <* P.char ':' p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text p_addr = P.takeWhileP (Just "addr") $ \c -> Char.isAlphaNum c || c=='%' || c=='/' || c=='(' || c==')' || c=='-' || c=='_' || c=='.' || c=='#' || c=='?' || c=='=' || c=='@' pairBorders :: Foldable f => Pair -> f a -> (TL.Text, TL.Text) pairBorders p ts | null ts = pairBordersSingle p | otherwise = pairBordersDouble p pairBordersSingle :: Pair -> (TL.Text,TL.Text) pairBordersSingle = \case PairElem (XML.NCName n) as -> ("<"<>n<>foldMap f as<>"/>","") where f (elemAttr_white,ElemAttr{..}) = elemAttr_white <> XML.unNCName elemAttr_name <> elemAttr_open <> elemAttr_value <> elemAttr_close p -> pairBordersDouble p pairBordersDouble :: Pair -> (TL.Text, TL.Text) pairBordersDouble = \case PairElem (XML.NCName n) as -> ("<"<>n<>foldMap f as<>">","n<>">") where f (elemAttr_white,ElemAttr{..}) = elemAttr_white <> XML.unNCName elemAttr_name <> elemAttr_open <> elemAttr_value <> elemAttr_close PairTag isBackref | isBackref -> ("^#","#") | otherwise -> ("#","#") PairAt isBackref | isBackref -> ("^@","@") | otherwise -> ("@","@") PairStar -> ("*","*") PairSlash -> ("/","/") PairUnderscore -> ("_","_") PairDash -> ("-","-") PairBackquote -> ("`","`") PairSinglequote -> ("'","'") PairDoublequote -> ("\"","\"") PairFrenchquote -> ("«","»") PairParen -> ("(",")") PairBrace -> ("{","}") PairBracket -> ("[","]") -- * Class 'ParseRef' class ParseRef a where parseRef :: a -> Maybe (Cell Ref, a) instance ParseRef Tokens where parseRef ts = case Seq.viewl ts of EmptyL -> Nothing Tree0 (Sourced src0@(FileRange _f0 _b0 e0:|_l0) n) :< ns -> case n of NodeToken (TokenText t) -> case parseRef $ Sourced src0 t of Nothing -> Nothing Just (t0,r0) -> if TL.null $ unSourced r0 then case parseRef ns of Just (t1@(Sourced (FileRange _f1 b1 _e1:|_l1) _), r1) | e0 == b1 -> Just (t0<>t1, r1) _ -> Just (t0, ns) else Just (t0, pure n0 `unionTokens` ns) where n0 = Tree0 $ NodeToken . TokenText <$> r0 _ -> Nothing _ -> Nothing instance ParseRef (Cell TL.Text) where parseRef (Sourced (FileRange fp bp ep:|sp) t) | (w,r) <- TL.span isTagChar t , not $ TL.null w , ew <- filePos_column bp + sum (Text.length <$> TL.toChunks w) = Just ( Sourced (FileRange fp bp bp{filePos_column=ew}:|sp) w , Sourced (FileRange fp bp{filePos_column=ew} ep:|sp) r ) parseRef _ = Nothing isTagChar :: Char -> Bool isTagChar c = Char.isAlphaNum c || c=='·' || case Char.generalCategory c of Char.DashPunctuation -> True Char.ConnectorPunctuation -> True _ -> False {- -- | Build 'Tokens' from many 'Token's. tokens :: [Cell Token] -> Tokens tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts -- | Build 'Tokens' from one 'Token'. tokens1 :: Tree (Cell Node) -> Tokens tokens1 = Seq.singleton unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens)) unTokenElem toks = case toList $ {-Seq.dropWhileR isTokenWhite-} toks of [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts)) _ -> Nothing isTokenElem :: Tokens -> Bool isTokenElem toks = case toList $ {-Seq.dropWhileR isTokenWhite-} toks of [Tree (unSourced -> NodePair PairElem{}) _] -> True _ -> False -}