1 {-# LANGUAGE FlexibleContexts #-}
 
   2 {-# LANGUAGE FlexibleInstances #-}
 
   3 {-# LANGUAGE OverloadedStrings #-}
 
   4 {-# LANGUAGE Rank2Types #-}
 
   5 {-# LANGUAGE ScopedTypeVariables #-}
 
   6 {-# LANGUAGE TypeFamilies #-}
 
   7 {-# OPTIONS_GHC -fno-warn-orphans #-}
 
   8 module Hdoc.TCT.Read.Token where
 
  10 import Control.Applicative (Applicative(..), Alternative(..))
 
  11 import Control.Monad (Monad(..))
 
  13 import Data.Char (Char)
 
  14 import Data.Either (Either(..))
 
  15 import Data.Eq (Eq(..))
 
  16 import Data.Foldable (Foldable(..))
 
  17 import Data.Function (($), (.))
 
  18 import Data.Functor ((<$>), ($>), (<$))
 
  19 import Data.List.NonEmpty (NonEmpty(..))
 
  20 import Data.Maybe (Maybe(..))
 
  21 import Data.Monoid (Monoid(..))
 
  22 import Data.Semigroup (Semigroup(..))
 
  23 import Data.Sequence (ViewL(..), ViewR(..), (<|))
 
  24 import Data.TreeSeq.Strict (Tree(..), Trees)
 
  25 import Data.Tuple (fst,snd)
 
  26 import Data.Void (Void)
 
  27 import Prelude (Num(..))
 
  28 import Text.Show (Show(..))
 
  29 import qualified Data.Char as Char
 
  30 import qualified Data.List.NonEmpty as NonEmpty
 
  31 import qualified Data.Sequence as Seq
 
  32 import qualified Data.Text as Text
 
  33 import qualified Data.Text.Lazy as TL
 
  34 import qualified Text.Megaparsec as P
 
  35 import qualified Text.Megaparsec.Char as P
 
  41 import Hdoc.TCT.Read.Elem
 
  42 import Hdoc.TCT.Read.Cell
 
  45 -- | Right-only Dyck language,
 
  46 -- to keep track of opened 'Pair's.
 
  47 type Pairs = (Tokens,[Opening])
 
  48 type Tokens = Trees (Cell Node)
 
  51 -- | An opened 'Pair' and its content so far.
 
  52 type Opening = (Cell Pair,Tokens)
 
  54 appendPairsToken :: Pairs -> Tree (Cell Node) -> Pairs
 
  55 appendPairsToken ps t = appendPairsTokens ps (pure t)
 
  57 appendPairsText :: Pairs -> Cell TL.Text -> Pairs
 
  58 appendPairsText ps (Cell sp t) =
 
  61                         NodeToken $ TokenText t
 
  63 appendPairsTokens :: Pairs -> Tokens -> Pairs
 
  64 appendPairsTokens (ts,[])         toks = (ts`unionTokens`toks,[])
 
  65 appendPairsTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0`unionTokens`toks):ps)
 
  67 -- | Unify two 'Tokens', merging border 'TokenText's if any.
 
  68 unionTokens :: Tokens -> Tokens -> Tokens
 
  70         case (Seq.viewr x, Seq.viewl y) of
 
  71          (xs :> x0, y0 :< ys) ->
 
  73                  (  Tree (Cell (Span  fx  bx _ex:| lx) (NodeToken (TokenText tx))) tsx
 
  74                   , Tree (Cell (Span _fy _by  ey:|_ly) (NodeToken (TokenText ty))) tsy ) ->
 
  76                         pure (Tree (Cell (Span fx bx ey:|lx) $ NodeToken $ TokenText $ tx <> ty) (tsx<>tsy)) `unionTokens`
 
  82 unionsTokens :: Foldable f => f Tokens -> Tokens
 
  83 unionsTokens = foldl' unionTokens mempty
 
  85 openPair :: Pairs -> Cell Pair -> Pairs
 
  86 openPair (t,ps) p = (t,(p,mempty):ps)
 
  88 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
 
  89 closePair :: Pairs -> Cell Pair -> Pairs
 
  90 closePair ps@(_,[]) (Cell loc p) = -- debug0 "closePair" $
 
  91         appendPairsText ps $ Cell loc $
 
  92                 snd $ pairBordersDouble p
 
  93 closePair (t,(cx@(Cell (Span  fx  bx _ex:| lx) px),t1):ts)
 
  94               cy@(Cell (Span _fy _by  ey:|_ly) py) = -- debug0 "closePair" $
 
  96          (PairElem nx ax, PairElem ny ay) | nx == ny ->
 
  97                 appendPairsToken (t,ts) $
 
  98                 Tree (Cell (Span fx bx ey:|lx) $ NodePair $ PairElem nx as) t1
 
  99                 where as | null ay   = ax
 
 102                 appendPairsToken (t,ts) $
 
 103                 Tree (Cell (Span fx bx ey:|lx) $ NodePair px) t1
 
 108                  (closeImpaired mempty (cx,t1))
 
 110 -- | Close a 'Pair' when there is no matching 'LexemePairClose'.
 
 111 closeImpaired :: Tokens -> (Cell Pair, Tokens) -> Tokens
 
 112 closeImpaired acc (Cell loc@(s0:|lp) pair, toks) = -- debug0 "closeImpaired" $
 
 114          -- NOTE: try to close 'PairTag' as 'TokenTag' instead of 'TokenText'.
 
 115          PairTag isBackref | Just (Cell (Span{span_end}:|_lt) ref, rest) <- parseRef body ->
 
 116                 Tree0 (Cell (s0{span_end}:|lp) $ NodeToken $ TokenTag isBackref ref) <| rest
 
 117                 -- NOTE: use bp (not bt) to include the '#'
 
 118          -- NOTE: try to close 'PairAt' as 'TokenAt' instead of 'TokenText'.
 
 119          PairAt isBackref | Just (Cell (Span{span_end}:|_lt) ref, rest) <- parseRef body ->
 
 120                 Tree0 (Cell (s0{span_end}:|lp) $ NodeToken $ TokenAt isBackref ref) <| rest
 
 121                 -- NOTE: use bp (not bt) to include the '@'
 
 122          _ -> pure open `unionTokens` body
 
 124         body = toks `unionTokens` acc
 
 125         open = Tree0 $ Cell loc $ NodeToken $ TokenText $ fst $ pairBordersDouble pair
 
 127 -- | Close remaining 'Pair's at end of parsing.
 
 128 closePairs :: Pairs -> Tokens
 
 129 closePairs (t0,ps) = -- debug0 "closePairs" $
 
 130         t0 `unionTokens` foldl' closeImpaired mempty ps
 
 132 appendLexeme :: Lexeme -> Pairs -> Pairs
 
 133 appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc ->
 
 135          LexemePairOpen ps -> foldl' open acc ps
 
 137                 -- NOTE: insert an empty node to encode <elem></elem>, not <elem/>
 
 138                 open a p@(Cell (Span{span_end, ..}:|sp) PairElem{}) = openPair a p `appendPairsText` Cell (Span{span_begin=span_end, ..}:|sp) ""
 
 139                 open a p = openPair a p
 
 140          LexemePairClose ps -> foldl' closePair acc ps
 
 141          LexemePairAny   ps -> foldl' openPair  acc ps
 
 142          LexemePairBoth ps -> appendPairsTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps
 
 143          LexemeEscape   c  -> appendPairsToken  acc $ Tree0 $ NodeToken . TokenEscape <$> c
 
 144          LexemeLink     t  -> appendPairsToken  acc $ Tree0 $ NodeToken . TokenLink <$> t
 
 145          {-LexemeWhite (unCell -> "") -> acc-}
 
 146          -- LexemeWhite (unCell -> Text.all (==' ') -> True) -> acc
 
 147          LexemeWhite    t  -> appendPairsText acc t
 
 148          LexemeAlphaNum t  -> appendPairsText acc t
 
 149          LexemeOther    t  -> appendPairsText acc t
 
 150          LexemeTree     t  -> appendPairsToken acc t
 
 153 appendLexemes :: Pairs -> [Lexeme] -> Pairs
 
 154 appendLexemes = foldr appendLexeme
 
 157 -- | 'Lexeme's cut the input in the longest chunks of common semantic,
 
 158 -- this enables 'orientLexemePairAny' to work with a more meaningful context.
 
 160  =   LexemePairOpen  !(NonEmpty (Cell Pair))
 
 161  |   LexemePairClose !(NonEmpty (Cell Pair))
 
 162  |   LexemePairAny   !(NonEmpty (Cell Pair))
 
 163      -- ^ orientation depending on the surrounding 'Lexeme's,
 
 164      -- see 'orientLexemePairAny'
 
 165  |   LexemePairBoth  !(NonEmpty (Cell Pair))
 
 166  |   LexemeEscape    !(Cell Char)
 
 167  |   LexemeLink      !(Cell TL.Text)
 
 168  |   LexemeWhite     !(Cell TL.Text)
 
 169  |   LexemeAlphaNum  !(Cell TL.Text)
 
 170  |   LexemeOther     !(Cell TL.Text)
 
 171  |   LexemeTree      !(Tree (Cell Node)) -- FIXME: useless?
 
 174 instance Pretty Lexeme
 
 176 parseTokens :: [Lexeme] -> Tokens
 
 179         appendLexemes mempty $
 
 180         -- debug0 "Lexemes (post orient)" $
 
 181         orientLexemePairAny $ LexemeEnd :
 
 186  Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme]
 
 187 parseLexemes = runParserOnCell (p_Lexemes <* P.eof)
 
 189 -- | Parse 'Lexeme's, returning them in reverse order
 
 190 -- to apply 'orientLexemePairAny'.
 
 191 p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme]
 
 192 p_Lexemes = debugParser "Lexemes" $ go []
 
 194         go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme]
 
 197                 (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
 
 199 -- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme',
 
 200 -- so that it can try to orient nearby 'LexemePairAny'
 
 201 -- to 'LexemePairOpen' or 'LexemePairClose'.
 
 202 orientLexemePairAny :: [Lexeme] -> [Lexeme]
 
 203 orientLexemePairAny = \case
 
 204          -- LexemeOther (Cell _bx ex x):LexemeOther (Cell by _ey y):acc -> LexemeOther (Cell by ex (x<>y)):acc
 
 207          t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc
 
 208          w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
 
 209          LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc
 
 211          LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc
 
 212          LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
 
 213          LexemePairAny p:[] -> LexemePairOpen p:[]
 
 216          LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
 
 217          LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[]
 
 219          w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
 
 220          LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc
 
 223          an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
 
 224          an@LexemeEscape{}  :a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
 
 226          an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
 
 227          an@LexemeEscape{}  :LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
 
 230          c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
 
 232          LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
 
 235          o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
 
 237          LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairClose p:c:acc
 
 240          a1@LexemeAlphaNum{}:LexemePairAny p:a2@LexemeAlphaNum{}:acc ->
 
 241                 a1:LexemeOther (sconcat (((fst . pairBordersDouble) <$>) <$> p)):a2:acc
 
 245 p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme
 
 246 p_Lexeme = debugParser "Lexeme" $
 
 248          [ P.try $ LexemeWhite     <$> p_Cell p_Spaces1
 
 249          , P.try $ LexemePairAny   <$> p_some (p_Cell $ p_satisfyMaybe pairAny)
 
 250          , P.try $ LexemePairBoth  <$> p_some (P.try $ p_Cell p_ElemSingle)
 
 251          , P.try $ LexemePairOpen  <$> p_some (p_Cell $ p_satisfyMaybe pairOpen  <|> P.try p_ElemOpen <|> P.try p_BackOpen)
 
 252          , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
 
 253          , P.try $ LexemeEscape    <$> p_Cell p_Escape
 
 254          , P.try $ LexemeLink      <$> p_Cell p_Link
 
 255          , P.try $ LexemeAlphaNum  <$> p_Cell p_AlphaNums1
 
 256          ,         LexemeOther     <$> p_Cell (TL.singleton <$> P.anyChar)
 
 259 p_some :: Parser e s a -> Parser e s (NonEmpty a)
 
 260 p_some p = NonEmpty.fromList <$> P.some p
 
 262 pairAny :: Char -> Maybe Pair
 
 265  '/'  -> Just PairSlash
 
 266  '"'  -> Just PairDoublequote
 
 267  '\'' -> Just PairSinglequote
 
 268  '`'  -> Just PairBackquote
 
 269  '_'  -> Just PairUnderscore
 
 271  '#'  -> Just $ PairTag False
 
 272  '@'  -> Just $ PairAt  False
 
 275 pairOpen :: Char -> Maybe Pair
 
 277  '('  -> Just PairParen
 
 278  '['  -> Just PairBracket
 
 279  '{'  -> Just PairBrace
 
 280  '«'  -> Just PairFrenchquote
 
 283 pairClose :: Char -> Maybe Pair
 
 285  ')'  -> Just PairParen
 
 286  ']'  -> Just PairBracket
 
 287  '}'  -> Just PairBrace
 
 288  '»'  -> Just PairFrenchquote
 
 291 p_BackOpen :: P.Tokens s ~ TL.Text => Parser e s Pair
 
 292 p_BackOpen = debugParser "BackOpen" $
 
 294           *> (PairAt True <$ P.char '@'
 
 295          <|> PairTag True <$ P.char '#')
 
 297 p_Escape :: Parser e s Char
 
 298 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
 
 300 p_Link :: P.Tokens s ~ TL.Text => Parser e s TL.Text
 
 302         P.try (P.char '<' *> p <* P.char '>') <|>
 
 305         p :: P.Tokens s ~ TL.Text => Parser e s TL.Text
 
 307                 (\scheme addr -> scheme <> "//" <> addr)
 
 308                  <$> P.option "" (P.try p_scheme)
 
 311         p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text
 
 314                  <$> (P.takeWhile1P (Just "scheme") $ \c ->
 
 320         p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text
 
 322                 P.takeWhileP (Just "addr") $ \c ->
 
 336 pairBorders :: Foldable f => Pair -> f a -> (TL.Text,TL.Text)
 
 337 pairBorders p ts | null ts   = pairBordersSingle p
 
 338                  | otherwise = pairBordersDouble p
 
 340 pairBordersSingle :: Pair -> (TL.Text,TL.Text)
 
 341 pairBordersSingle = \case
 
 343         ("<"<>n<>foldMap f as<>"/>","")
 
 344         where f (elemAttr_white,ElemAttr{..}) =
 
 350  p -> pairBordersDouble p
 
 352 pairBordersDouble :: Pair -> (TL.Text,TL.Text)
 
 353 pairBordersDouble = \case
 
 354  PairElem n as -> ("<"<>n<>foldMap f as<>">","</"<>n<>">")
 
 355         where f (elemAttr_white,ElemAttr{..}) =
 
 361  PairTag isBackref | isBackref -> ("^#","#")
 
 362                    | otherwise -> ("#","#")
 
 363  PairAt  isBackref | isBackref -> ("^@","@")
 
 364                    | otherwise -> ("@","@")
 
 365  PairStar        -> ("*","*")
 
 366  PairSlash       -> ("/","/")
 
 367  PairUnderscore  -> ("_","_")
 
 368  PairDash        -> ("-","-")
 
 369  PairBackquote   -> ("`","`")
 
 370  PairSinglequote -> ("'","'")
 
 371  PairDoublequote -> ("\"","\"")
 
 372  PairFrenchquote -> ("«","»")
 
 373  PairParen       -> ("(",")")
 
 374  PairBrace       -> ("{","}")
 
 375  PairBracket     -> ("[","]")
 
 377 -- * Class 'ParseRef'
 
 378 class ParseRef a where
 
 379         parseRef :: a -> Maybe (Cell Ref, a)
 
 380 instance ParseRef Tokens where
 
 384                  Tree0 (Cell loc0@(Span _f0 _b0 e0:|_l0) n) :< ns ->
 
 386                          NodeToken (TokenText t) ->
 
 387                                 case parseRef $ Cell loc0 t of
 
 390                                         if TL.null $ unCell r0
 
 393                                                  Just (t1@(Cell (Span _f1 b1 _e1:|_l1) _), r1) | e0 == b1 ->
 
 396                                         else Just (t0, pure n0 `unionTokens` ns)
 
 397                                         where n0 = Tree0 $ NodeToken . TokenText <$> r0
 
 400 instance ParseRef (Cell TL.Text) where
 
 401         parseRef (Cell (Span fp bp ep:|sp) t)
 
 402          | (w,r) <- TL.span isTagChar t
 
 404          , ew <- pos_column bp + sum (Text.length <$> TL.toChunks w) =
 
 406                  ( Cell (Span fp bp bp{pos_column=ew}:|sp) w
 
 407                  , Cell (Span fp bp{pos_column=ew} ep:|sp) r )
 
 410 isTagChar :: Char -> Bool
 
 414         case Char.generalCategory c of
 
 415          Char.DashPunctuation -> True
 
 416          Char.ConnectorPunctuation -> True
 
 420 -- | Build 'Tokens' from many 'Token's.
 
 421 tokens :: [Cell Token] -> Tokens
 
 422 tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts
 
 424 -- | Build 'Tokens' from one 'Token'.
 
 425 tokens1 :: Tree (Cell Node) -> Tokens
 
 426 tokens1 = Seq.singleton
 
 428 unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens))
 
 430         case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
 
 431          [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts))
 
 434 isTokenElem :: Tokens -> Bool
 
 436         case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
 
 437          [Tree (unCell -> NodePair PairElem{}) _] -> True