{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE Rank2Types #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeFamilies #-}
{-# LANGUAGE ViewPatterns #-}
{-# OPTIONS_GHC -fno-warn-orphans #-}
module Language.TCT.Read.Token where
import Control.Applicative (Applicative(..), Alternative(..))
import Control.Monad (Monad(..))
import Data.Bool
import Data.Char (Char)
import Data.Either (Either(..))
import Data.Eq (Eq(..))
import Data.Foldable (Foldable(..))
import Data.Function (($), (.))
import Data.Functor ((<$>), ($>))
import Data.List.NonEmpty (NonEmpty(..))
import Data.Maybe (Maybe(..))
import Data.Monoid (Monoid(..))
import Data.Semigroup (Semigroup(..))
import Data.Sequence (ViewL(..), ViewR(..), (<|), (|>))
import Data.String (String)
import Data.TreeSeq.Strict (Tree(..), Trees)
import Data.Tuple (fst,snd)
import Data.Void (Void)
import Prelude (Num(..))
import Text.Show (Show(..))
import qualified Data.Char as Char
import qualified Data.List.NonEmpty as NonEmpty
import qualified Data.Sequence as Seq
import qualified Data.Text as Text
import qualified Data.Text.Lazy as TL
import qualified Text.Megaparsec as P
import qualified Text.Megaparsec.Char as P
import Language.TCT.Debug
import Language.TCT.Cell
import Language.TCT.Elem
import Language.TCT.Tree
import Language.TCT.Read.Elem
import Language.TCT.Read.Cell
-- * Type 'Pairs'
-- | Right-only Dyck language,
-- to keep track of opened 'Pair's.
type Pairs = (Tokens,[Opening])
type Tokens = Trees (Cell Node)
-- ** Type 'Opening'
-- | An opened 'Pair' and its content so far.
type Opening = (Cell Pair,Tokens)
appendToken :: Pairs -> Tree (Cell Node) -> Pairs
appendToken (ts,[]) tok = (ts|>tok,[])
appendToken (ts,(p0,t0):ps) tok = (ts,(p0,t0|>tok):ps)
appendTokens :: Pairs -> Tokens -> Pairs
appendTokens (ts,[]) toks = (ts<>toks,[])
appendTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0<>toks):ps)
-- | Appending 'TL.Text' is a special case
-- to append at the 'TokenText' level is possible,
-- instead of the higher 'NodeToken' level.
appendText :: Pairs -> Cell TL.Text -> Pairs
appendText ps tok =
case ps of
(ts,[]) -> (appendTokenText ts tok,[])
(ts,(p0,ts0):pss) -> (ts,(p0,appendTokenText ts0 tok):pss)
appendTokenText :: Tokens -> Cell TL.Text -> Tokens
appendTokenText ts (Cell bn en n) =
{-
| TL.null n = ts
| otherwise = -}
case Seq.viewr ts of
EmptyR -> pure $ Tree0 $ Cell bn en $ NodeToken $ TokenText n
is :> Tree (Cell bo _eo nod) st ->
case nod of
NodeToken (TokenText o) -> is |> i
where i = Tree (Cell bo en (NodeToken $ TokenText (o <> n))) st
_ -> ts |> Tree0 (Cell bn en $ NodeToken $ TokenText n)
prependTokenText :: Tokens -> Cell TL.Text -> Tokens
prependTokenText ts (Cell bn en n)
{-
| TL.null n = ts
| otherwise-} =
case Seq.viewl ts of
EmptyL -> pure $ Tree0 $ Cell bn en $ NodeToken $ TokenText n
Tree (Cell _bo eo nod) st :< is ->
case nod of
NodeToken (TokenText o) -> i <| is
where i = Tree (Cell bn eo (NodeToken $ TokenText (n <> o))) st
_ -> Tree0 (Cell bn en $ NodeToken $ TokenText n) <| ts
openPair :: Pairs -> Cell Pair -> Pairs
openPair (t,ps) p = (t,(p,mempty):ps)
-- | Close a 'Pair' when there is a matching 'LexemePairClose'.
closePair :: Pairs -> Cell Pair -> Pairs
closePair ps@(_,[]) (Cell bp ep p) = -- debug0 "closePair" $
appendText ps $ Cell bp ep $ snd $ pairBorders p
closePair (t,(p1,t1):ts) p = -- debug0 "closePair" $
case (p1,p) of
(Cell bx _ex (PairElem nx ax), Cell _by ey (PairElem ny ay)) | nx == ny ->
appendToken (t,ts) $
Tree (Cell bx ey $ NodePair $ PairElem nx as) t1
where as | null ay = ax
| otherwise = ax<>ay
(Cell bx _ex x, Cell _by ey y) | x == y ->
appendToken (t,ts) $
Tree (Cell bx ey $ NodePair x) t1
_ ->
(`closePair` p) $
appendTokens
(t,ts)
(closeImpaired mempty (p1,t1))
-- | Close a 'Pair' when there is no matching 'LexemePairClose'.
closeImpaired :: Tokens -> (Cell Pair,Tokens) -> Tokens
closeImpaired acc (Cell bp ep p,toks) = -- debug0 "closeImpaired" $
case p of
-- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'.
PairHash | Just (Cell _bt et t, ts) <- tagFrom $ toks <> acc ->
Tree0 (Cell bp et $ NodeToken $ TokenTag t) <| ts
{-
PairHash | (Tree0 (Cell bt et (TokenPlain t))) :< ts <- Seq.viewl $ toks <> acc ->
case Text.span isTagChar t of
("",_) | Text.null t -> toksHash mempty <> toks <> acc
| otherwise -> Tree0 (Cell bp et (TokenTag t)) <| ts
(tag,t') ->
let len = Text.length tag in
Tree0 (Cell bp bt{columnPos = columnPos bt + len} (TokenTag tag)) <|
Tree0 (Cell bt{columnPos = columnPos bt + len + 1} et (TokenPlain t')) <|
ts
-}
_ -> prependTokenText (toks <> acc) toksHash
where
toksHash :: Cell TL.Text
toksHash = Cell bp ep $ fst $ pairBorders p
-- | Close remaining 'Pair's at end of parsing.
closePairs :: Pairs -> Tokens
closePairs (t0,ps) = -- debug0 "closePairs" $
t0 <> foldl' closeImpaired mempty ps
appendLexeme :: Lexeme -> Pairs -> Pairs
appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc ->
case lex of
LexemePairOpen ps -> foldl' open acc ps
where
-- NOTE: insert an empty node to encode , not
open a p@(Cell _bp ep (PairElem{})) =
openPair a p `appendToken`
(Tree0 $ Cell ep ep $ NodeToken $ TokenText "")
open a p = openPair a p
LexemePairClose ps -> foldl' closePair acc ps
LexemePairAny ps -> foldl' openPair acc ps
{-
LexemePairAny ps ->
appendText acc $ sconcat $
((fst . pairBordersWithoutContent) <$>) <$> ps
-}
LexemePairBoth ps -> appendTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps
LexemeEscape c -> appendToken acc $ Tree0 $ NodeToken . TokenEscape <$> c
LexemeLink t -> appendToken acc $ Tree0 $ NodeToken . TokenLink <$> t
{-LexemeWhite (unCell -> "") -> acc-}
-- LexemeWhite (unCell -> Text.all (==' ') -> True) -> acc
LexemeWhite t -> appendText acc t
LexemeAlphaNum t -> appendText acc t
LexemeOther t -> appendText acc t
LexemeTree t -> appendToken acc t
LexemeEnd -> acc
appendLexemes :: Pairs -> [Lexeme] -> Pairs
appendLexemes = foldr appendLexeme
-- * Type 'Lexeme'
-- | 'Lexeme's cut the input in the longest chunks of common semantic,
-- this enables 'orientLexemePairAny' to work with a more meaningful context.
data Lexeme
= LexemePairOpen !(NonEmpty (Cell Pair))
| LexemePairClose !(NonEmpty (Cell Pair))
| LexemePairAny !(NonEmpty (Cell Pair))
-- ^ orientation depending on the surrounding 'Lexeme's,
-- see 'orientLexemePairAny'
| LexemePairBoth !(NonEmpty (Cell Pair))
| LexemeEscape !(Cell Char)
| LexemeLink !(Cell TL.Text)
| LexemeWhite !(Cell TL.Text)
| LexemeAlphaNum !(Cell TL.Text)
| LexemeOther !(Cell TL.Text)
| LexemeTree !(Tree (Cell Node))
| LexemeEnd
deriving (Eq, Show)
instance Pretty Lexeme
parseTokens :: [Lexeme] -> Tokens
parseTokens ps =
closePairs $
appendLexemes mempty $
-- debug0 "Lexemes (post orient)" $
orientLexemePairAny $ LexemeEnd :
ps
parseLexemes ::
String ->
Cell TL.Text ->
Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme]
parseLexemes inp = runParserOnCell inp (p_Lexemes <* P.eof)
-- | Parse 'Lexeme's, returning them in reverse order
-- to apply 'orientLexemePairAny'.
p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme]
p_Lexemes = debugParser "Lexemes" $ go []
where
go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme]
go acc =
(P.eof $> acc) <|>
(p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
-- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme',
-- so that it can try to orient nearby 'LexemePairAny'
-- to 'LexemePairOpen' or 'LexemePairClose'.
orientLexemePairAny :: [Lexeme] -> [Lexeme]
orientLexemePairAny = \case
-- LexemeOther (Cell _bx ex x):LexemeOther (Cell by _ey y):acc -> LexemeOther (Cell by ex (x<>y)):acc
-- "
t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc
w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc
-- "
LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc
LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
LexemePairAny p:[] -> LexemePairOpen p:[]
-- ,,,"
LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[]
-- ",,,
w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc
-- ",,,AAA
an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
-- ,,,"AAA
an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
-- ")
c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
-- ("
LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
-- "(
o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
-- )"
LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairClose p:c:acc
acc -> acc
p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme
p_Lexeme = debugParser "Lexeme" $
P.choice
[ P.try $ LexemeWhite <$> p_Cell p_Spaces1
, P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny)
, P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle)
, P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen)
, P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
, P.try $ LexemeEscape <$> p_Cell p_Escape
, P.try $ LexemeLink <$> p_Cell p_Link
, P.try $ LexemeAlphaNum <$> p_Cell (P.takeWhile1P (Just "AlphaNum") Char.isAlphaNum)
, LexemeOther <$> p_Cell (TL.singleton <$> P.anyChar)
]
p_some :: Parser e s a -> Parser e s (NonEmpty a)
p_some p = NonEmpty.fromList <$> P.some p
pairAny :: Char -> Maybe Pair
pairAny = \case
'-' -> Just PairDash
'/' -> Just PairSlash
'"' -> Just PairDoublequote
'\'' -> Just PairSinglequote
'`' -> Just PairBackquote
'_' -> Just PairUnderscore
'*' -> Just PairStar
'#' -> Just PairHash
_ -> Nothing
pairOpen :: Char -> Maybe Pair
pairOpen = \case
'(' -> Just PairParen
'[' -> Just PairBracket
'{' -> Just PairBrace
'«' -> Just PairFrenchquote
_ -> Nothing
pairClose :: Char -> Maybe Pair
pairClose = \case
')' -> Just PairParen
']' -> Just PairBracket
'}' -> Just PairBrace
'»' -> Just PairFrenchquote
_ -> Nothing
p_Escape :: Parser e s Char
p_Escape = P.char '\\' *> P.satisfy Char.isPrint
p_Link :: P.Tokens s ~ TL.Text => Parser e s TL.Text
p_Link =
P.try (P.char '<' *> p <* P.char '>') <|>
p
where
p :: P.Tokens s ~ TL.Text => Parser e s TL.Text
p =
(\scheme addr -> scheme <> "//" <> addr)
<$> P.option "" (P.try p_scheme)
<* P.string "//"
<*> p_addr
p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text
p_scheme =
(<> ":")
<$> (P.takeWhile1P (Just "scheme") $ \c ->
Char.isAlphaNum c
|| c=='_'
|| c=='-'
|| c=='+')
<* P.char ':'
p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text
p_addr =
P.takeWhileP (Just "addr") $ \c ->
Char.isAlphaNum c
|| c=='%'
|| c=='/'
|| c=='('
|| c==')'
|| c=='-'
|| c=='_'
|| c=='.'
|| c=='#'
|| c=='?'
|| c=='='
-- | Build 'Tokens' from many 'Token's.
tokens :: [Cell Token] -> Tokens
tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts
-- | Build 'Tokens' from one 'Token'.
tokens1 :: Tree (Cell Node) -> Tokens
tokens1 = Seq.singleton
unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens))
unTokenElem toks =
case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
[Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts))
_ -> Nothing
isTokenElem :: Tokens -> Bool
isTokenElem toks =
case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
[Tree (unCell -> NodePair PairElem{}) _] -> True
_ -> False
pairBordersWithoutContent :: Pair -> (TL.Text,TL.Text)
pairBordersWithoutContent = \case
PairElem n as ->
("<"<>n<>foldMap f as<>"/>","")
where f (elemAttr_white,ElemAttr{..}) =
elemAttr_white <>
elemAttr_name <>
elemAttr_open <>
elemAttr_value <>
elemAttr_close
p -> pairBorders p
pairBorders :: Pair -> (TL.Text,TL.Text)
pairBorders = \case
PairElem n as -> ("<"<>n<>foldMap f as<>">",""<>n<>">")
where f (elemAttr_white,ElemAttr{..}) =
elemAttr_white <>
elemAttr_name <>
elemAttr_open <>
elemAttr_value <>
elemAttr_close
PairHash -> ("#","#")
PairStar -> ("*","*")
PairSlash -> ("/","/")
PairUnderscore -> ("_","_")
PairDash -> ("-","-")
PairBackquote -> ("`","`")
PairSinglequote -> ("'","'")
PairDoublequote -> ("\"","\"")
PairFrenchquote -> ("«","»")
PairParen -> ("(",")")
PairBrace -> ("{","}")
PairBracket -> ("[","]")
-- * Class 'TagFrom'
class TagFrom a where
tagFrom :: a -> Maybe (Cell Tag, a)
instance TagFrom Tokens where
tagFrom ts =
case Seq.viewl ts of
EmptyL -> Nothing
Tree0 (Cell b0 e0 n) :< ns ->
case n of
NodeToken (TokenText t) ->
case tagFrom $ Cell b0 e0 t of
Nothing -> Nothing
Just (t0,r0) ->
if TL.null (unCell r0)
then
case tagFrom ns of
Just (t1@(Cell b1 _e1 _), r1) | e0 == b1 ->
Just (t0<>t1, r1)
_ -> Just (t0, n0 <| ns)
else Just (t0, n0 <| ns)
where n0 = (Tree0 $ NodeToken . TokenText <$> r0)
_ -> Nothing
_ -> Nothing
instance TagFrom (Cell TL.Text) where
tagFrom (Cell bp ep t)
| (w,r) <- TL.span isTagChar t
, not $ TL.null w
, ew <- pos_column bp + sum (Text.length <$> (TL.toChunks w)) =
Just
( Cell bp bp{pos_column=ew} w
, Cell bp{pos_column=ew} ep r )
tagFrom _ = Nothing
isTagChar :: Char -> Bool
isTagChar c =
Char.isAlphaNum c ||
c=='·' ||
case Char.generalCategory c of
Char.DashPunctuation -> True
Char.ConnectorPunctuation -> True
_ -> False