Language/TCT/Read/Token.hs

   1 {-# LANGUAGE FlexibleContexts #-}
   2 {-# LANGUAGE FlexibleInstances #-}
   3 {-# LANGUAGE OverloadedStrings #-}
   4 {-# LANGUAGE Rank2Types #-}
   5 {-# LANGUAGE ScopedTypeVariables #-}
   6 {-# LANGUAGE TypeFamilies #-}
   7 {-# OPTIONS_GHC -fno-warn-orphans #-}
   8 module Language.TCT.Read.Token where
   9
  10 import Control.Applicative (Applicative(..), Alternative(..))
  11 import Control.Monad (Monad(..))
  12 import Data.Bool
  13 import Data.Char (Char)
  14 import Data.Either (Either(..))
  15 import Data.Eq (Eq(..))
  16 import Data.Foldable (Foldable(..))
  17 import Data.Function (($), (.))
  18 import Data.Functor ((<$>), ($>))
  19 import Data.List.NonEmpty (NonEmpty(..))
  20 import Data.Maybe (Maybe(..))
  21 import Data.Monoid (Monoid(..))
  22 import Data.Semigroup (Semigroup(..))
  23 import Data.Sequence (ViewL(..), ViewR(..), (<|))
  24 import Data.String (String)
  25 import Data.TreeSeq.Strict (Tree(..), Trees)
  26 import Data.Tuple (fst,snd)
  27 import Data.Void (Void)
  28 import Prelude (Num(..))
  29 import Text.Show (Show(..))
  30 import qualified Data.Char as Char
  31 import qualified Data.List.NonEmpty as NonEmpty
  32 import qualified Data.Sequence as Seq
  33 import qualified Data.Text as Text
  34 import qualified Data.Text.Lazy as TL
  35 import qualified Text.Megaparsec as P
  36 import qualified Text.Megaparsec.Char as P
  37
  38 import Language.TCT.Debug
  39 import Language.TCT.Cell
  40 import Language.TCT.Elem
  41 import Language.TCT.Tree
  42 import Language.TCT.Read.Elem
  43 import Language.TCT.Read.Cell
  44
  45 -- * Type 'Pairs'
  46 -- | Right-only Dyck language,
  47 -- to keep track of opened 'Pair's.
  48 type Pairs = (Tokens,[Opening])
  49 type Tokens = Trees (Cell Node)
  50
  51 -- ** Type 'Opening'
  52 -- | An opened 'Pair' and its content so far.
  53 type Opening = (Cell Pair,Tokens)
  54
  55 appendPairsToken :: Pairs -> Tree (Cell Node) -> Pairs
  56 appendPairsToken ps t = appendPairsTokens ps (pure t)
  57
  58 appendPairsText :: Pairs -> Cell TL.Text -> Pairs
  59 appendPairsText ps (Cell bp ep t) =
  60         appendPairsToken ps $
  61                 Tree0 $ Cell bp ep $
  62                         NodeToken $ TokenText t
  63
  64 appendPairsTokens :: Pairs -> Tokens -> Pairs
  65 appendPairsTokens (ts,[])         toks = (ts`unionTokens`toks,[])
  66 appendPairsTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0`unionTokens`toks):ps)
  67
  68 -- | Unify two 'Tokens', merging border 'TokenText's if any.
  69 unionTokens :: Tokens -> Tokens -> Tokens
  70 unionTokens x y =
  71         case (Seq.viewr x, Seq.viewl y) of
  72          (xs :> x0, y0 :< ys) ->
  73                 case (x0,y0) of
  74                  (  Tree (Cell bx _ex (NodeToken (TokenText tx))) sx
  75                   , Tree (Cell _by ey (NodeToken (TokenText ty))) sy ) ->
  76                         xs `unionTokens`
  77                         pure (Tree (Cell bx ey $ NodeToken $ TokenText $ tx <> ty) (sx<>sy)) `unionTokens`
  78                         ys
  79                  _ -> x <> y
  80          (EmptyR, _) -> y
  81          (_, EmptyL) -> x
  82
  83 unionsTokens :: Foldable f => f Tokens -> Tokens
  84 unionsTokens = foldl' unionTokens mempty
  85
  86 openPair :: Pairs -> Cell Pair -> Pairs
  87 openPair (t,ps) p = (t,(p,mempty):ps)
  88
  89 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
  90 closePair :: Pairs -> Cell Pair -> Pairs
  91 closePair ps@(_,[]) (Cell bp ep p) = -- debug0 "closePair" $
  92         appendPairsText ps $ Cell bp ep $
  93                 snd $ pairBordersDouble p
  94 closePair (t,(p1,t1):ts) p = -- debug0 "closePair" $
  95         case (p1,p) of
  96          (Cell bx _ex (PairElem nx ax), Cell _by ey (PairElem ny ay)) | nx == ny ->
  97                 appendPairsToken (t,ts) $
  98                 Tree (Cell bx ey $ NodePair $ PairElem nx as) t1
  99                 where as | null ay   = ax
 100                          | otherwise = ax<>ay
 101          (Cell bx _ex x, Cell _by ey y) | x == y ->
 102                 appendPairsToken (t,ts) $
 103                 Tree (Cell bx ey $ NodePair x) t1
 104          _ ->
 105                 (`closePair` p) $
 106                 appendPairsTokens
 107                  (t,ts)
 108                  (closeImpaired mempty (p1,t1))
 109
 110 -- | Close a 'Pair' when there is no matching 'LexemePairClose'.
 111 closeImpaired :: Tokens -> (Cell Pair, Tokens) -> Tokens
 112 closeImpaired acc (Cell bp ep pair, toks) = -- debug0 "closeImpaired" $
 113         case pair of
 114          -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'.
 115          PairHash | Just (Cell _bt et tag, rest) <- tagFrom body ->
 116                 Tree0 (Cell bp et $ NodeToken $ TokenTag tag) <| rest
 117                 -- NOTE: use bp (not bt) to include the '#'
 118          _ -> pure open `unionTokens` body
 119         where
 120         body = toks `unionTokens` acc
 121         open = Tree0 $ Cell bp ep $ NodeToken $ TokenText $ fst $ pairBordersDouble pair
 122
 123 -- | Close remaining 'Pair's at end of parsing.
 124 closePairs :: Pairs -> Tokens
 125 closePairs (t0,ps) = -- debug0 "closePairs" $
 126         t0 `unionTokens` foldl' closeImpaired mempty ps
 127
 128 appendLexeme :: Lexeme -> Pairs -> Pairs
 129 appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc ->
 130         case lex of
 131          LexemePairOpen ps -> foldl' open acc ps
 132                 where
 133                 -- NOTE: insert an empty node to encode <elem></elem>, not <elem/>
 134                 open a p@(Cell _bp ep PairElem{}) = openPair a p `appendPairsText` Cell ep ep ""
 135                 open a p = openPair a p
 136          LexemePairClose ps -> foldl' closePair acc ps
 137          LexemePairAny   ps -> foldl' openPair  acc ps
 138          {-
 139          LexemePairAny ps ->
 140                 appendPairsText acc $ sconcat $
 141                         ((fst . pairBordersSingle) <$>) <$> ps
 142          -}
 143          LexemePairBoth ps -> appendPairsTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps
 144          LexemeEscape   c  -> appendPairsToken  acc $ Tree0 $ NodeToken . TokenEscape <$> c
 145          LexemeLink     t  -> appendPairsToken  acc $ Tree0 $ NodeToken . TokenLink <$> t
 146          {-LexemeWhite (unCell -> "") -> acc-}
 147          -- LexemeWhite (unCell -> Text.all (==' ') -> True) -> acc
 148          LexemeWhite    t  -> appendPairsText acc t
 149          LexemeAlphaNum t  -> appendPairsText acc t
 150          LexemeOther    t  -> appendPairsText acc t
 151          LexemeTree     t  -> appendPairsToken acc t
 152          LexemeEnd -> acc
 153
 154 appendLexemes :: Pairs -> [Lexeme] -> Pairs
 155 appendLexemes = foldr appendLexeme
 156
 157 -- * Type 'Lexeme'
 158 -- | 'Lexeme's cut the input in the longest chunks of common semantic,
 159 -- this enables 'orientLexemePairAny' to work with a more meaningful context.
 160 data Lexeme
 161  =   LexemePairOpen  !(NonEmpty (Cell Pair))
 162  |   LexemePairClose !(NonEmpty (Cell Pair))
 163  |   LexemePairAny   !(NonEmpty (Cell Pair))
 164      -- ^ orientation depending on the surrounding 'Lexeme's,
 165      -- see 'orientLexemePairAny'
 166  |   LexemePairBoth  !(NonEmpty (Cell Pair))
 167  |   LexemeEscape    !(Cell Char)
 168  |   LexemeLink      !(Cell TL.Text)
 169  |   LexemeWhite     !(Cell TL.Text)
 170  |   LexemeAlphaNum  !(Cell TL.Text)
 171  |   LexemeOther     !(Cell TL.Text)
 172  |   LexemeTree      !(Tree (Cell Node))
 173  |   LexemeEnd
 174  deriving (Eq, Show)
 175 instance Pretty Lexeme
 176
 177 parseTokens :: [Lexeme] -> Tokens
 178 parseTokens ps =
 179         closePairs $
 180         appendLexemes mempty $
 181         -- debug0 "Lexemes (post orient)" $
 182         orientLexemePairAny $ LexemeEnd :
 183         ps
 184
 185 parseLexemes ::
 186  String ->
 187  Cell TL.Text ->
 188  Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme]
 189 parseLexemes inp = runParserOnCell inp (p_Lexemes <* P.eof)
 190
 191 -- | Parse 'Lexeme's, returning them in reverse order
 192 -- to apply 'orientLexemePairAny'.
 193 p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme]
 194 p_Lexemes = debugParser "Lexemes" $ go []
 195         where
 196         go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme]
 197         go acc =
 198                 (P.eof $> acc) <|>
 199                 (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
 200
 201 -- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme',
 202 -- so that it can try to orient nearby 'LexemePairAny'
 203 -- to 'LexemePairOpen' or 'LexemePairClose'.
 204 orientLexemePairAny :: [Lexeme] -> [Lexeme]
 205 orientLexemePairAny = \case
 206          -- LexemeOther (Cell _bx ex x):LexemeOther (Cell by _ey y):acc -> LexemeOther (Cell by ex (x<>y)):acc
 207
 208          -- "
 209          t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc
 210          w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
 211          LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc
 212          --    "
 213          LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc
 214          LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
 215          LexemePairAny p:[] -> LexemePairOpen p:[]
 216
 217          --    ,,,"
 218          LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
 219          LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[]
 220          -- ",,,
 221          w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
 222          LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc
 223
 224          -- ",,,AAA
 225          an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
 226          -- ,,,"AAA
 227          an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
 228
 229          -- ")
 230          c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
 231          -- ("
 232          LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
 233
 234          -- "(
 235          o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
 236          -- )"
 237          LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairClose p:c:acc
 238
 239          acc -> acc
 240
 241 p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme
 242 p_Lexeme = debugParser "Lexeme" $
 243         P.choice
 244          [ P.try $ LexemeWhite     <$> p_Cell p_Spaces1
 245          , P.try $ LexemePairAny   <$> p_some (p_Cell $ p_satisfyMaybe pairAny)
 246          , P.try $ LexemePairBoth  <$> p_some (P.try $ p_Cell p_ElemSingle)
 247          , P.try $ LexemePairOpen  <$> p_some (p_Cell $ p_satisfyMaybe pairOpen  <|> P.try p_ElemOpen)
 248          , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
 249          , P.try $ LexemeEscape    <$> p_Cell p_Escape
 250          , P.try $ LexemeLink      <$> p_Cell p_Link
 251          , P.try $ LexemeAlphaNum  <$> p_Cell (P.takeWhile1P (Just "AlphaNum") Char.isAlphaNum)
 252          ,         LexemeOther     <$> p_Cell (TL.singleton <$> P.anyChar)
 253          ]
 254
 255 p_some :: Parser e s a -> Parser e s (NonEmpty a)
 256 p_some p = NonEmpty.fromList <$> P.some p
 257
 258 pairAny :: Char -> Maybe Pair
 259 pairAny = \case
 260  '-'  -> Just PairDash
 261  '/'  -> Just PairSlash
 262  '"'  -> Just PairDoublequote
 263  '\'' -> Just PairSinglequote
 264  '`'  -> Just PairBackquote
 265  '_'  -> Just PairUnderscore
 266  '*'  -> Just PairStar
 267  '#'  -> Just PairHash
 268  _    -> Nothing
 269
 270 pairOpen :: Char -> Maybe Pair
 271 pairOpen = \case
 272  '('  -> Just PairParen
 273  '['  -> Just PairBracket
 274  '{'  -> Just PairBrace
 275  '«'  -> Just PairFrenchquote
 276  _    -> Nothing
 277
 278 pairClose :: Char -> Maybe Pair
 279 pairClose = \case
 280  ')'  -> Just PairParen
 281  ']'  -> Just PairBracket
 282  '}'  -> Just PairBrace
 283  '»'  -> Just PairFrenchquote
 284  _    -> Nothing
 285
 286 p_Escape :: Parser e s Char
 287 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
 288
 289 p_Link :: P.Tokens s ~ TL.Text => Parser e s TL.Text
 290 p_Link =
 291         P.try (P.char '<' *> p <* P.char '>') <|>
 292         p
 293         where
 294         p :: P.Tokens s ~ TL.Text => Parser e s TL.Text
 295         p =
 296                 (\scheme addr -> scheme <> "//" <> addr)
 297                  <$> P.option "" (P.try p_scheme)
 298                  <*  P.string "//"
 299                  <*> p_addr
 300         p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text
 301         p_scheme =
 302                 (<> ":")
 303                  <$> (P.takeWhile1P (Just "scheme") $ \c ->
 304                         Char.isAlphaNum c
 305                         || c=='_'
 306                         || c=='-'
 307                         || c=='+')
 308                  <* P.char ':'
 309         p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text
 310         p_addr =
 311                 P.takeWhileP (Just "addr") $ \c ->
 312                         Char.isAlphaNum c
 313                         || c=='%'
 314                         || c=='/'
 315                         || c=='('
 316                         || c==')'
 317                         || c=='-'
 318                         || c=='_'
 319                         || c=='.'
 320                         || c=='#'
 321                         || c=='?'
 322                         || c=='='
 323
 324 pairBorders :: Foldable f => Pair -> f a -> (TL.Text,TL.Text)
 325 pairBorders p ts | null ts   = pairBordersSingle p
 326                  | otherwise = pairBordersDouble p
 327
 328 pairBordersSingle :: Pair -> (TL.Text,TL.Text)
 329 pairBordersSingle = \case
 330  PairElem n as ->
 331         ("<"<>n<>foldMap f as<>"/>","")
 332         where f (elemAttr_white,ElemAttr{..}) =
 333                 elemAttr_white <>
 334                 elemAttr_name  <>
 335                 elemAttr_open  <>
 336                 elemAttr_value <>
 337                 elemAttr_close
 338  p -> pairBordersDouble p
 339
 340 pairBordersDouble :: Pair -> (TL.Text,TL.Text)
 341 pairBordersDouble = \case
 342  PairElem n as -> ("<"<>n<>foldMap f as<>">","</"<>n<>">")
 343         where f (elemAttr_white,ElemAttr{..}) =
 344                 elemAttr_white <>
 345                 elemAttr_name  <>
 346                 elemAttr_open  <>
 347                 elemAttr_value <>
 348                 elemAttr_close
 349  PairHash        -> ("#","#")
 350  PairStar        -> ("*","*")
 351  PairSlash       -> ("/","/")
 352  PairUnderscore  -> ("_","_")
 353  PairDash        -> ("-","-")
 354  PairBackquote   -> ("`","`")
 355  PairSinglequote -> ("'","'")
 356  PairDoublequote -> ("\"","\"")
 357  PairFrenchquote -> ("«","»")
 358  PairParen       -> ("(",")")
 359  PairBrace       -> ("{","}")
 360  PairBracket     -> ("[","]")
 361
 362 -- * Class 'TagFrom'
 363 class TagFrom a where
 364         tagFrom :: a -> Maybe (Cell Tag, a)
 365 instance TagFrom Tokens where
 366         tagFrom ts =
 367                 case Seq.viewl ts of
 368                  EmptyL -> Nothing
 369                  Tree0 (Cell b0 e0 n) :< ns ->
 370                         case n of
 371                          NodeToken (TokenText t) ->
 372                                 case tagFrom $ Cell b0 e0 t of
 373                                  Nothing -> Nothing
 374                                  Just (t0,r0) ->
 375                                         if TL.null $ unCell r0
 376                                         then
 377                                                 case tagFrom ns of
 378                                                  Just (t1@(Cell b1 _e1 _), r1) | e0 == b1 ->
 379                                                         Just (t0<>t1, r1)
 380                                                  _ -> Just (t0, ns)
 381                                         else Just (t0, pure n0 `unionTokens` ns)
 382                                         where n0 = Tree0 $ NodeToken . TokenText <$> r0
 383                          _ -> Nothing
 384                  _ -> Nothing
 385 instance TagFrom (Cell TL.Text) where
 386         tagFrom (Cell bp ep t)
 387          | (w,r) <- TL.span isTagChar t
 388          , not $ TL.null w
 389          , ew <- pos_column bp + sum (Text.length <$> TL.toChunks w) =
 390                 Just
 391                  ( Cell bp bp{pos_column=ew} w
 392                  , Cell bp{pos_column=ew} ep r )
 393         tagFrom _ = Nothing
 394
 395 isTagChar :: Char -> Bool
 396 isTagChar c =
 397         Char.isAlphaNum c ||
 398         c=='·' ||
 399         case Char.generalCategory c of
 400          Char.DashPunctuation -> True
 401          Char.ConnectorPunctuation -> True
 402          _ -> False
 403
 404 {-
 405 -- | Build 'Tokens' from many 'Token's.
 406 tokens :: [Cell Token] -> Tokens
 407 tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts
 408
 409 -- | Build 'Tokens' from one 'Token'.
 410 tokens1 :: Tree (Cell Node) -> Tokens
 411 tokens1 = Seq.singleton
 412
 413 unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens))
 414 unTokenElem toks =
 415         case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
 416          [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts))
 417          _ -> Nothing
 418
 419 isTokenElem :: Tokens -> Bool
 420 isTokenElem toks =
 421         case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
 422          [Tree (unCell -> NodePair PairElem{}) _] -> True
 423          _ -> False
 424 -}