{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE Rank2Types #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE ViewPatterns #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Language.TCT.Read.Token where
import Control.Applicative (Applicative(..), Alternative(..))
import Control.Monad (Monad(..))
import Data.Bool
import Data.Char (Char)
import Data.Either (Either(..))
import Data.Eq (Eq(..))
import Data.Foldable (Foldable(..))
import Data.Function (($), (.))
import Data.Functor ((<$>), ($>))
import Data.List.NonEmpty (NonEmpty(..))
import Data.Maybe (Maybe(..))
import Data.Monoid (Monoid(..))
import Data.Semigroup (Semigroup(..))
import Data.Sequence (ViewL(..), ViewR(..), (<|))
import Data.String (String)
import Data.TreeSeq.Strict (Tree(..), Trees)
import Data.Tuple (fst,snd)
import Data.Void (Void)
import Prelude (Num(..))
import Text.Show (Show(..))
import qualified Data.Char as Char
import qualified Data.List.NonEmpty as NonEmpty
import qualified Data.Sequence as Seq
import qualified Data.Text as Text
import qualified Data.Text.Lazy as TL
import qualified Text.Megaparsec as P
import qualified Text.Megaparsec.Char as P
import Language.TCT.Debug
import Language.TCT.Cell
import Language.TCT.Elem
import Language.TCT.Tree
import Language.TCT.Read.Elem
import Language.TCT.Read.Cell
-- * Type 'Pairs'
-- | Right-only Dyck language,
-- to keep track of opened 'Pair's.
type Pairs = (Tokens,[Opening])
type Tokens = Trees (Cell Node)
-- ** Type 'Opening'
-- | An opened 'Pair' and its content so far.
type Opening = (Cell Pair,Tokens)
appendPairsToken :: Pairs -> Tree (Cell Node) -> Pairs
appendPairsToken ps t = appendPairsTokens ps (pure t)
appendPairsText :: Pairs -> Cell TL.Text -> Pairs
appendPairsText ps (Cell bp ep t) =
appendPairsToken ps $
Tree0 $ Cell bp ep $
NodeToken $ TokenText t
appendPairsTokens :: Pairs -> Tokens -> Pairs
appendPairsTokens (ts,[]) toks = (ts`unionTokens`toks,[])
appendPairsTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0`unionTokens`toks):ps)
-- | Unify two 'Tokens', merging border 'TokenText's if any.
unionTokens :: Tokens -> Tokens -> Tokens
unionTokens x y =
case (Seq.viewr x, Seq.viewl y) of
(xs :> x0, y0 :< ys) ->
case (x0,y0) of
( Tree (Cell bx _ex (NodeToken (TokenText tx))) sx
, Tree (Cell _by ey (NodeToken (TokenText ty))) sy ) ->
xs `unionTokens`
pure (Tree (Cell bx ey $ NodeToken $ TokenText $ tx <> ty) (sx<>sy)) `unionTokens`
ys
_ -> x <> y
(EmptyR, _) -> y
(_, EmptyL) -> x
unionsTokens :: Foldable f => f Tokens -> Tokens
unionsTokens = foldl' unionTokens mempty
openPair :: Pairs -> Cell Pair -> Pairs
openPair (t,ps) p = (t,(p,mempty):ps)
-- | Close a 'Pair' when there is a matching 'LexemePairClose'.
closePair :: Pairs -> Cell Pair -> Pairs
closePair ps@(_,[]) (Cell bp ep p) = -- debug0 "closePair" $
appendPairsText ps $ Cell bp ep $
snd $ pairBordersDouble p
closePair (t,(p1,t1):ts) p = -- debug0 "closePair" $
case (p1,p) of
(Cell bx _ex (PairElem nx ax), Cell _by ey (PairElem ny ay)) | nx == ny ->
appendPairsToken (t,ts) $
Tree (Cell bx ey $ NodePair $ PairElem nx as) t1
where as | null ay = ax
| otherwise = ax<>ay
(Cell bx _ex x, Cell _by ey y) | x == y ->
appendPairsToken (t,ts) $
Tree (Cell bx ey $ NodePair x) t1
_ ->
(`closePair` p) $
appendPairsTokens
(t,ts)
(closeImpaired mempty (p1,t1))
-- | Close a 'Pair' when there is no matching 'LexemePairClose'.
closeImpaired :: Tokens -> (Cell Pair, Tokens) -> Tokens
closeImpaired acc (Cell bp ep pair, toks) = -- debug0 "closeImpaired" $
case pair of
-- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'.
PairHash | Just (Cell _bt et tag, rest) <- tagFrom body ->
Tree0 (Cell bp et $ NodeToken $ TokenTag tag) <| rest
-- NOTE: use bp (not bt) to include the '#'
_ -> pure open `unionTokens` body
where
body = toks `unionTokens` acc
open = Tree0 $ Cell bp ep $ NodeToken $ TokenText $ fst $ pairBordersDouble pair
-- | Close remaining 'Pair's at end of parsing.
closePairs :: Pairs -> Tokens
closePairs (t0,ps) = -- debug0 "closePairs" $
t0 `unionTokens` foldl' closeImpaired mempty ps
appendLexeme :: Lexeme -> Pairs -> Pairs
appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc ->
case lex of
LexemePairOpen ps -> foldl' open acc ps
where
-- NOTE: insert an empty node to encode , not
open a p@(Cell _bp ep (PairElem{})) = openPair a p `appendPairsText` Cell ep ep ""
open a p = openPair a p
LexemePairClose ps -> foldl' closePair acc ps
LexemePairAny ps -> foldl' openPair acc ps
{-
LexemePairAny ps ->
appendPairsText acc $ sconcat $
((fst . pairBordersSingle) <$>) <$> ps
-}
LexemePairBoth ps -> appendPairsTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps
LexemeEscape c -> appendPairsToken acc $ Tree0 $ NodeToken . TokenEscape <$> c
LexemeLink t -> appendPairsToken acc $ Tree0 $ NodeToken . TokenLink <$> t
{-LexemeWhite (unCell -> "") -> acc-}
-- LexemeWhite (unCell -> Text.all (==' ') -> True) -> acc
LexemeWhite t -> appendPairsText acc t
LexemeAlphaNum t -> appendPairsText acc t
LexemeOther t -> appendPairsText acc t
LexemeTree t -> appendPairsToken acc t
LexemeEnd -> acc
appendLexemes :: Pairs -> [Lexeme] -> Pairs
appendLexemes = foldr appendLexeme
-- * Type 'Lexeme'
-- | 'Lexeme's cut the input in the longest chunks of common semantic,
-- this enables 'orientLexemePairAny' to work with a more meaningful context.
data Lexeme
= LexemePairOpen !(NonEmpty (Cell Pair))
| LexemePairClose !(NonEmpty (Cell Pair))
| LexemePairAny !(NonEmpty (Cell Pair))
-- ^ orientation depending on the surrounding 'Lexeme's,
-- see 'orientLexemePairAny'
| LexemePairBoth !(NonEmpty (Cell Pair))
| LexemeEscape !(Cell Char)
| LexemeLink !(Cell TL.Text)
| LexemeWhite !(Cell TL.Text)
| LexemeAlphaNum !(Cell TL.Text)
| LexemeOther !(Cell TL.Text)
| LexemeTree !(Tree (Cell Node))
| LexemeEnd
deriving (Eq, Show)
instance Pretty Lexeme
parseTokens :: [Lexeme] -> Tokens
parseTokens ps =
closePairs $
appendLexemes mempty $
-- debug0 "Lexemes (post orient)" $
orientLexemePairAny $ LexemeEnd :
ps
parseLexemes ::
String ->
Cell TL.Text ->
Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme]
parseLexemes inp = runParserOnCell inp (p_Lexemes <* P.eof)
-- | Parse 'Lexeme's, returning them in reverse order
-- to apply 'orientLexemePairAny'.
p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme]
p_Lexemes = debugParser "Lexemes" $ go []
where
go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme]
go acc =
(P.eof $> acc) <|>
(p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
-- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme',
-- so that it can try to orient nearby 'LexemePairAny'
-- to 'LexemePairOpen' or 'LexemePairClose'.
orientLexemePairAny :: [Lexeme] -> [Lexeme]
orientLexemePairAny = \case
-- LexemeOther (Cell _bx ex x):LexemeOther (Cell by _ey y):acc -> LexemeOther (Cell by ex (x<>y)):acc
-- "
t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc
w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc
-- "
LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc
LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
LexemePairAny p:[] -> LexemePairOpen p:[]
-- ,,,"
LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[]
-- ",,,
w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc
-- ",,,AAA
an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
-- ,,,"AAA
an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
-- ")
c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
-- ("
LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
-- "(
o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
-- )"
LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairClose p:c:acc
acc -> acc
p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme
p_Lexeme = debugParser "Lexeme" $
P.choice
[ P.try $ LexemeWhite <$> p_Cell p_Spaces1
, P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny)
, P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle)
, P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen)
, P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
, P.try $ LexemeEscape <$> p_Cell p_Escape
, P.try $ LexemeLink <$> p_Cell p_Link
, P.try $ LexemeAlphaNum <$> p_Cell (P.takeWhile1P (Just "AlphaNum") Char.isAlphaNum)
, LexemeOther <$> p_Cell (TL.singleton <$> P.anyChar)
]
p_some :: Parser e s a -> Parser e s (NonEmpty a)
p_some p = NonEmpty.fromList <$> P.some p
pairAny :: Char -> Maybe Pair
pairAny = \case
'-' -> Just PairDash
'/' -> Just PairSlash
'"' -> Just PairDoublequote
'\'' -> Just PairSinglequote
'`' -> Just PairBackquote
'_' -> Just PairUnderscore
'*' -> Just PairStar
'#' -> Just PairHash
_ -> Nothing
pairOpen :: Char -> Maybe Pair
pairOpen = \case
'(' -> Just PairParen
'[' -> Just PairBracket
'{' -> Just PairBrace
'«' -> Just PairFrenchquote
_ -> Nothing
pairClose :: Char -> Maybe Pair
pairClose = \case
')' -> Just PairParen
']' -> Just PairBracket
'}' -> Just PairBrace
'»' -> Just PairFrenchquote
_ -> Nothing
p_Escape :: Parser e s Char
p_Escape = P.char '\\' *> P.satisfy Char.isPrint
p_Link :: P.Tokens s ~ TL.Text => Parser e s TL.Text
p_Link =
P.try (P.char '<' *> p <* P.char '>') <|>
p
where
p :: P.Tokens s ~ TL.Text => Parser e s TL.Text
p =
(\scheme addr -> scheme <> "//" <> addr)
<$> P.option "" (P.try p_scheme)
<* P.string "//"
<*> p_addr
p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text
p_scheme =
(<> ":")
<$> (P.takeWhile1P (Just "scheme") $ \c ->
Char.isAlphaNum c
|| c=='_'
|| c=='-'
|| c=='+')
<* P.char ':'
p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text
p_addr =
P.takeWhileP (Just "addr") $ \c ->
Char.isAlphaNum c
|| c=='%'
|| c=='/'
|| c=='('
|| c==')'
|| c=='-'
|| c=='_'
|| c=='.'
|| c=='#'
|| c=='?'
|| c=='='
pairBorders :: Foldable f => Pair -> f a -> (TL.Text,TL.Text)
pairBorders p ts | null ts = pairBordersSingle p
| otherwise = pairBordersDouble p
pairBordersSingle :: Pair -> (TL.Text,TL.Text)
pairBordersSingle = \case
PairElem n as ->
("<"<>n<>foldMap f as<>"/>","")
where f (elemAttr_white,ElemAttr{..}) =
elemAttr_white <>
elemAttr_name <>
elemAttr_open <>
elemAttr_value <>
elemAttr_close
p -> pairBordersDouble p
pairBordersDouble :: Pair -> (TL.Text,TL.Text)
pairBordersDouble = \case
PairElem n as -> ("<"<>n<>foldMap f as<>">",""<>n<>">")
where f (elemAttr_white,ElemAttr{..}) =
elemAttr_white <>
elemAttr_name <>
elemAttr_open <>
elemAttr_value <>
elemAttr_close
PairHash -> ("#","#")
PairStar -> ("*","*")
PairSlash -> ("/","/")
PairUnderscore -> ("_","_")
PairDash -> ("-","-")
PairBackquote -> ("`","`")
PairSinglequote -> ("'","'")
PairDoublequote -> ("\"","\"")
PairFrenchquote -> ("«","»")
PairParen -> ("(",")")
PairBrace -> ("{","}")
PairBracket -> ("[","]")
-- * Class 'TagFrom'
class TagFrom a where
tagFrom :: a -> Maybe (Cell Tag, a)
instance TagFrom Tokens where
tagFrom ts =
case Seq.viewl ts of
EmptyL -> Nothing
Tree0 (Cell b0 e0 n) :< ns ->
case n of
NodeToken (TokenText t) ->
case tagFrom $ Cell b0 e0 t of
Nothing -> Nothing
Just (t0,r0) ->
if TL.null $ unCell r0
then
case tagFrom ns of
Just (t1@(Cell b1 _e1 _), r1) | e0 == b1 ->
Just (t0<>t1, r1)
_ -> Just (t0, ns)
else Just (t0, pure n0 `unionTokens` ns)
where n0 = Tree0 $ NodeToken . TokenText <$> r0
_ -> Nothing
_ -> Nothing
instance TagFrom (Cell TL.Text) where
tagFrom (Cell bp ep t)
| (w,r) <- TL.span isTagChar t
, not $ TL.null w
, ew <- pos_column bp + sum (Text.length <$> (TL.toChunks w)) =
Just
( Cell bp bp{pos_column=ew} w
, Cell bp{pos_column=ew} ep r )
tagFrom _ = Nothing
isTagChar :: Char -> Bool
isTagChar c =
Char.isAlphaNum c ||
c=='·' ||
case Char.generalCategory c of
Char.DashPunctuation -> True
Char.ConnectorPunctuation -> True
_ -> False
{-
-- | Build 'Tokens' from many 'Token's.
tokens :: [Cell Token] -> Tokens
tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts
-- | Build 'Tokens' from one 'Token'.
tokens1 :: Tree (Cell Node) -> Tokens
tokens1 = Seq.singleton
unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens))
unTokenElem toks =
case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
[Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts))
_ -> Nothing
isTokenElem :: Tokens -> Bool
isTokenElem toks =
case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
[Tree (unCell -> NodePair PairElem{}) _] -> True
_ -> False
-}