{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE Rank2Types #-}
{-# LANGUAGE TypeFamilies #-}
+{-# LANGUAGE ViewPatterns #-}
module Language.TCT.Read.Token where
import Control.Applicative (Applicative(..), Alternative(..))
import Data.Char (Char)
import Data.Eq (Eq(..))
import Data.Foldable (Foldable(..))
-import Data.Function (($), (.), flip)
+import Data.Function (($), (.))
import Data.Functor ((<$>), ($>), (<$))
-import Data.Maybe (Maybe(..), fromMaybe)
+import Data.Maybe (Maybe(..))
import Data.Monoid (Monoid(..))
import Data.Semigroup (Semigroup(..))
+import Data.Sequence (ViewL(..), (<|))
import Data.Text (Text)
import Data.Text.Buildable (Buildable(..))
import Data.Tuple (fst,snd)
+import Prelude (Num(..))
import Text.Show (Show(..))
import qualified Data.Char as Char
+import qualified Data.Sequence as Seq
import qualified Data.Text as Text
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Builder as Builder
import qualified Text.Megaparsec as P
+import qualified Text.Megaparsec.Char as P
import Language.TCT.Token
+import Language.TCT.Cell
import Language.TCT.Elem
import Language.TCT.Read.Elem
+import Language.TCT.Read.Cell
textOf :: Buildable a => a -> Text
textOf = TL.toStrict . Builder.toLazyText . build
--- * Type 'Groups'
-type Groups = (Token,[(Group,Token)])
+-- * Type 'Pairs'
+type Pairs = (Tokens,[(Cell Pair,Tokens)])
-openGroup :: Group -> Groups -> Groups
-openGroup g (t,ms) = (t,(g,mempty):ms)
+appendToken :: Pairs -> Cell Token -> Pairs
+appendToken ps = appendTokens ps . Seq.singleton
-groupToken :: Token -> Groups -> Groups
-groupToken mrk (t,[]) = (t<>mrk,[])
-groupToken mrk (t,(g0,m0):gs) = (t,(g0,m0<>mrk):gs)
+appendTokens :: Pairs -> Tokens -> Pairs
+appendTokens (t,[]) toks = (t<>toks,[])
+appendTokens (t,(p0,t0):ps) toks = (t,(p0,t0<>toks):ps)
-closeGroup :: Group -> Groups -> Groups
-closeGroup g (t,[]) = (t<>TokenPlain (snd $ groupBorders g mempty),[])
-closeGroup g (t,(g1,m1):ms) =
- case (g,g1) of
- (GroupElem x ax, GroupElem y ay) | x == y ->
- groupToken (TokenGroup (GroupElem x (ax<>ay)) m1) (t,ms)
- (x,y) | x == y -> groupToken (TokenGroup g1 m1) (t,ms)
+openPair :: Pairs -> Cell Pair -> Pairs
+openPair (t,ms) p = (t,(p,mempty):ms)
+
+-- | Close a 'Pair' when there is a matching 'LexemePairClose'.
+closePair :: Pairs -> Cell Pair -> Pairs
+closePair ps@(_,[]) (Cell bp ep p) = dbg "closePair" $
+ appendToken ps $
+ Cell bp ep $
+ TokenPlain $ snd $ pairBorders p tokensPlainEmpty
+closePair (t,(p1,t1):ts) p = dbg "closePair" $
+ case (p1,p) of
+ (Cell bx _ex (PairElem x ax), Cell _by ey (PairElem y ay)) | x == y ->
+ appendToken (t,ts) $
+ Cell bx ey $
+ TokenPair (PairElem x (ax<>ay)) t1
+ (Cell bx _ex x, Cell _by ey y) | x == y ->
+ appendToken (t,ts) $
+ Cell bx ey $
+ TokenPair x t1
_ ->
- closeGroup g $
- groupToken
- (TokenPlain (fst $ groupBorders g1 mempty) <> m1)
- (t,ms)
+ (`closePair` p) $
+ appendTokens
+ (t,ts)
+ (closeUnpaired mempty (p1,t1))
+
+-- | Close a 'Pair' when there is not a matching 'LexemePairClose'.
+closeUnpaired :: Tokens -> (Cell Pair,Tokens) -> Tokens
+closeUnpaired acc (Cell bp ep p,toks) = dbg "closeUnpaired" $
+ case p of
+ -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'.
+ PairHash | (Cell bt et (TokenPlain t)) :< ts <- Seq.viewl $ toks <> acc ->
+ case Text.findIndex (not . isTagChar) t of
+ -- Just 0 -> toksHash mempty <> toks <> acc
+ Just i ->
+ Cell bp bt{columnPos = columnPos bt + i} (TokenTag tag)
+ <| Cell bt{columnPos = columnPos bt + i + 1} et (TokenPlain t')
+ <| ts
+ where (tag,t') = Text.splitAt i t
+ Nothing | Text.null t -> toksHash mempty <> toks <> acc
+ Nothing -> Cell bp et (TokenTag t) <| ts
+ _ -> toksHash tokensPlainEmpty <> toks <> acc
+ where
+ toksHash = tokens1 . Cell bp ep . TokenPlain . fst . pairBorders p
+ isTagChar c =
+ Char.isAlphaNum c ||
+ c=='·' ||
+ case Char.generalCategory c of
+ Char.DashPunctuation -> True
+ Char.ConnectorPunctuation -> True
+ _ -> False
+
+-- | Close remaining 'Pair's at end of parsing.
+closePairs :: Pairs -> Tokens
+closePairs (t0,ps) = dbg "closePairs" $
+ t0 <> foldl' closeUnpaired mempty ps
-closeGroups :: Groups -> Token
-closeGroups grps =
- let (m0,gs) = appendLexeme (LexemeWhite "") grps in
- foldr (\(g,t) acc ->
- acc <> TokenPlain (fst $ groupBorders g mempty) <> t) m0 gs
+appendLexeme :: Lexeme -> Pairs -> Pairs
+appendLexeme lex acc =
+ dbg "appendLexeme" $
+ case lex of
+ LexemePairOpen ps -> foldl' open acc ps
+ where
+ open a p@(Cell _bp ep (PairElem{})) = openPair a p `appendToken` (Cell ep ep $ TokenPlain "")
+ open a p = openPair a p
+ LexemePairClose ps -> foldl' closePair acc ps
+ LexemePairAny ps -> appendTokens acc $ tokens $ ((\p -> TokenPlain $ fst $ pairBorders p mempty) <$>) <$> ps
+ LexemePairBoth ps -> appendTokens acc $ tokens $ ((`TokenPair`mempty) <$>) <$> ps
+ LexemeEscape c -> appendToken acc $ TokenEscape <$> c
+ LexemeLink t -> appendToken acc $ TokenLink <$> t
+ LexemeWhite (unCell -> "") -> acc
+ LexemeWhite cs -> appendToken acc $ TokenPlain <$> cs
+ LexemeAlphaNum cs -> appendToken acc $ TokenPlain . Text.pack <$> cs
+ LexemeAny cs -> appendToken acc $ TokenPlain . Text.pack <$> cs
+ LexemeToken ts -> appendTokens acc ts
-- * Type 'Lexeme'
data Lexeme
- = LexemeGroupOpen Group
- | LexemeGroupClose Group
- | LexemeGroupPlain Char
- | LexemeWhite Text
- | LexemeWord Text
- | LexemeToken Token
- | LexemeEscape Char
- | LexemeLink Text
- deriving (Show, Eq)
-
-appendLexeme :: Lexeme -> Groups -> Groups
-appendLexeme lex gs =
- case dbg "appendLexeme" lex of
- _ | (tok,(GroupHash,tag):gs') <- gs
- , (case lex of
- LexemeWord{} -> False
- LexemeEscape{} -> False
- LexemeGroupClose GroupHash -> False
- _ -> True) ->
- appendLexeme lex $
- groupToken (TokenTag (textOf tag)) (tok,gs')
- LexemeGroupOpen g -> openGroup g gs
- LexemeGroupClose g -> closeGroup g gs
- LexemeGroupPlain c -> groupToken (TokenPlain (Text.singleton c)) gs
- LexemeWhite wh -> groupToken (TokenPlain wh) gs
- LexemeWord wo -> groupToken (TokenPlain wo) gs
- LexemeToken tok -> groupToken tok gs
- LexemeEscape c -> groupToken (TokenEscape c) gs
- LexemeLink lnk -> groupToken (TokenLink lnk) gs
-
-appendLexemes :: Groups -> [Lexeme] -> Groups
-appendLexemes = foldl' (flip appendLexeme)
-
--- * Parsers
-
-p_Token :: Parser e s Token
-p_Token = closeGroups <$> p_Groups (mempty,[])
-
-p_Groups :: Groups -> Parser e s Groups
-p_Groups gs = pdbg "Groups" $
- (<|>)
- (p_Lexemes (mempty == gs) >>= p_Groups . appendLexemes gs)
- (P.eof $> gs)
-
-p_Lexemes :: Bool -> Parser e s [Lexeme]
-p_Lexemes isBOF = pdbg "Lexemes" $
- P.choice
- [ P.try $ p_GroupClose
- , P.try $ p_GroupOpen isBOF
- , P.try $ p_GroupOpenOrClose l_GroupOpenOrClose
- , P.try $ pure <$> p_White
- , pure . LexemeWord <$> p_Word
- ]
+ = LexemePairOpen ![Cell Pair]
+ | LexemePairClose ![Cell Pair]
+ | LexemePairAny ![Cell Pair]
+ | LexemePairBoth ![Cell Pair]
+ | LexemeEscape !(Cell Char)
+ | LexemeLink !(Cell Text)
+ | LexemeWhite !(Cell White)
+ | LexemeAlphaNum !(Cell [Char])
+ | LexemeAny !(Cell [Char])
+ | LexemeToken !Tokens
+ deriving (Eq, Show)
-p_White :: Parser e s Lexeme
-p_White = pdbg "White" $
- LexemeWhite <$> p_Spaces
+p_Tokens :: Parser e s Tokens
+p_Tokens = pdbg "Tokens" $
+ closePairs .
+ foldr appendLexeme mempty .
+ dbg "Lexemes" .
+ mangleLexemes .
+ (LexemeWhite (cell0 "") :) <$>
+ go [LexemeWhite (cell0 "")]
+ where
+ go :: [Lexeme] -> Parser e s [Lexeme]
+ go acc =
+ (P.eof $> acc) <|>
+ (p_Lexeme >>= \next -> go $ mangleLexemes $ next:acc)
+
+ mangleLexemes = \case
+ LexemeAny (Cell _bx ex x):LexemeAny (Cell by _ey y):acc -> LexemeAny (Cell by ex (x<>y)):acc
+
+ -- "
+ w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
+ -- "
+ LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
+
+ -- ,,,"
+ LexemePairAny p:a@LexemeAny{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
+ -- ",,,
+ w@LexemeWhite{}:a@LexemeAny{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
+
+ -- ",,,AAA
+ an@LexemeAlphaNum{}:a@LexemeAny{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
+ -- ,,,"AAA
+ an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeAny{}:acc -> an:LexemePairOpen p:a:acc
+
+ -- ")
+ c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
+ -- ("
+ LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
+
+ -- "(
+ o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
+ -- )"
+ LexemePairAny p:c@LexemePairClose{}:acc -> c:LexemePairClose p:acc
+
+ acc -> acc
-p_PunctOrSym :: Parser e s Char
-p_PunctOrSym = P.satisfy $ \c -> Char.isPunctuation c || Char.isSymbol c
+pairAny :: Char -> Maybe Pair
+pairAny = \case
+ '-' -> Just PairDash
+ '/' -> Just PairSlash
+ '"' -> Just PairDoublequote
+ '\'' -> Just PairSinglequote
+ '`' -> Just PairBackquote
+ '_' -> Just PairUnderscore
+ '*' -> Just PairStar
+ '#' -> Just PairHash
+ _ -> Nothing
-p_GroupOpen :: Bool -> Parser e s [Lexeme]
-p_GroupOpen isBOF = pdbg "GroupOpen" $ do
- wh <- if isBOF then return [] else pure <$> p_White
- ps <- mconcat <$> P.some (P.try $ p_GroupOpenOrClose $ l_GroupOpenAndClose LexemeGroupOpen)
- return $ wh<>ps
+pairOpen :: Char -> Maybe Pair
+pairOpen = \case
+ '(' -> Just PairParen
+ '[' -> Just PairBracket
+ '{' -> Just PairBrace
+ '«' -> Just PairFrenchquote
+ _ -> Nothing
-p_GroupClose :: Parser e s [Lexeme]
-p_GroupClose = pdbg "GroupClose" $ do
- ps <- mconcat <$> P.some (P.try $ p_GroupOpenOrClose $ l_GroupOpenAndClose LexemeGroupClose)
- wh <- pure <$> p_White <|> P.eof $> []
- return $ ps<>wh
+pairClose :: Char -> Maybe Pair
+pairClose = \case
+ ')' -> Just PairParen
+ ']' -> Just PairBracket
+ '}' -> Just PairBrace
+ '»' -> Just PairFrenchquote
+ _ -> Nothing
-p_GroupOpenOrClose :: (Char -> Maybe Lexeme) -> Parser e s [Lexeme]
-p_GroupOpenOrClose grp = pdbg "GroupOpenOrClose" $ do
+p_Cell :: Parser e s a -> Parser e s (Cell a)
+p_Cell pa = do
+ bp <- p_Position
+ a <- pa
+ ep <- p_Position
+ return $ Cell bp ep a
+
+p_Lexeme :: Parser e s Lexeme
+p_Lexeme = pdbg "Lexeme" $
P.choice
- [ P.try p_Elem
- , P.try (pure <$> p_Escape)
- , P.try (pure <$> p_Link)
- , (<$> p_PunctOrSym) $ \c ->
- pure $
- LexemeGroupPlain c `fromMaybe`
- grp c
+ [ P.try $ LexemeWhite <$> p_Cell p_Spaces
+ , P.try $ LexemePairAny <$> P.some (p_Cell $ p_satisfyMaybe pairAny)
+ , P.try $ LexemePairBoth <$> P.some (P.try $ p_Cell p_ElemSingle)
+ , P.try $ LexemePairOpen <$> P.some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen)
+ , P.try $ LexemePairClose <$> P.some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
+ , P.try $ LexemeEscape <$> p_Cell p_Escape
+ , P.try $ LexemeLink <$> p_Cell p_Link
+ , P.try $ LexemeAlphaNum <$> p_Cell (P.some p_AlphaNum)
+ , LexemeAny <$> p_Cell (pure <$> P.anyChar)
]
-l_GroupOpenAndClose :: (Group -> Lexeme) -> Char -> Maybe Lexeme
-l_GroupOpenAndClose lxm c = dbg "GroupOpenAndClose" $
- case c of
- '/' -> Just $ lxm GroupSlash
- '-' -> Just $ lxm GroupDash
- '"' -> Just $ lxm GroupDoublequote
- '\'' -> Just $ lxm GroupSinglequote
- '`' -> Just $ lxm GroupBackquote
- '_' -> Just $ lxm GroupUnderscore
- '*' -> Just $ lxm GroupStar
- '#' -> Just $ lxm GroupHash
- _ -> l_GroupOpenOrClose c
-
-l_GroupOpenOrClose :: Char -> Maybe Lexeme
-l_GroupOpenOrClose c = dbg "GroupOpenOrClose" $
- case c of
- '(' -> Just $ LexemeGroupOpen GroupParen
- '[' -> Just $ LexemeGroupOpen GroupBracket
- '{' -> Just $ LexemeGroupOpen GroupBrace
- '«' -> Just $ LexemeGroupOpen GroupFrenchquote
- ')' -> Just $ LexemeGroupClose GroupParen
- ']' -> Just $ LexemeGroupClose GroupBracket
- '}' -> Just $ LexemeGroupClose GroupBrace
- '»' -> Just $ LexemeGroupClose GroupFrenchquote
- _ -> Nothing
-
-p_Link :: Parser e s Lexeme
+p_AlphaNum :: Parser e s Char
+p_AlphaNum = P.satisfy Char.isAlphaNum
+
+p_Escape :: Parser e s Char
+p_Escape = P.char '\\' *> P.satisfy Char.isPrint
+
+p_Link :: Parser e s Text
p_Link =
- (\scheme ss addr -> LexemeLink $ Text.pack $ scheme <> ss <> addr)
+ (\scheme addr -> Text.pack $ scheme <> "//" <> addr)
<$> P.option "" (P.try p_scheme)
- <*> P.string "//"
+ <* P.string "//"
<*> p_addr
where
p_scheme =
- (<>)
+ (<> ":")
<$> P.some (P.satisfy $ \c ->
Char.isAlphaNum c
|| c=='_'
|| c=='-'
|| c=='+')
- <*> P.string ":"
+ <* P.char ':'
p_addr =
P.many $
P.satisfy $ \c ->
|| c=='-'
|| c=='_'
|| c=='.'
+ || c=='#'
+ || c=='?'
+ || c=='='
-p_Escape :: Parser e s Lexeme
-p_Escape =
- LexemeEscape
- <$ P.char '\\'
- <*> P.satisfy Char.isPrint
+p_ElemSingle :: Parser e s Pair
+p_ElemSingle = pdbg "ElemSingle" $
+ PairElem
+ <$ P.char '<'
+ <*> p_Word
+ <*> p_Attrs
+ <* P.string "/>"
-p_Elem :: Parser e s [Lexeme]
-p_Elem = pdbg "Elem" $ P.char '<' >> (p_close <|> p_open)
- where
- p_open =
- (\e as oc ->
- case oc of
- True -> [ LexemeGroupOpen $ GroupElem e as
- , LexemeToken $ Tokens mempty -- same elem for open and close
- , LexemeGroupClose $ GroupElem e [] ]
- False -> [LexemeGroupOpen $ GroupElem e as])
- <$> p_Word
- <*> p_Attrs
- <*> P.option False (True <$ P.char '/')
- <* P.char '>'
- p_close =
- (\e -> [LexemeGroupClose $ GroupElem e []])
- <$ P.char '/'
- <*> p_Word
- <* P.char '>'
+p_ElemOpen :: Parser e s Pair
+p_ElemOpen = pdbg "ElemOpen" $
+ PairElem
+ <$ P.char '<'
+ <*> p_Word
+ <*> p_Attrs
+ <* P.char '>'
+
+p_ElemClose :: Parser e s Pair
+p_ElemClose = pdbg "ElemClose" $
+ (`PairElem` [])
+ <$ P.string "</"
+ <*> p_Word
+ <* P.char '>'
+
+{-
+p_ElemOpenOrSingle :: Parser e s Pair
+p_ElemOpenOrSingle =
+ p_ElemOpen >>= \p ->
+ P.char '>' $> LexemePairOpen p <|>
+ P.string "/>" $> LexemePairAny p
+-}