1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# LANGUAGE ViewPatterns #-}
8 {-# OPTIONS_GHC -fno-warn-orphans #-}
9 module Language.TCT.Read.Token where
11 import Control.Applicative (Applicative(..), Alternative(..))
12 import Control.Monad (Monad(..))
14 import Data.Char (Char)
15 import Data.Either (Either(..))
16 import Data.Eq (Eq(..))
17 import Data.Foldable (Foldable(..))
18 import Data.Function (($), (.))
19 import Data.Functor ((<$>), ($>))
20 import Data.List.NonEmpty (NonEmpty(..))
21 import Data.Maybe (Maybe(..))
22 import Data.Monoid (Monoid(..))
23 import Data.Semigroup (Semigroup(..))
24 import Data.Sequence (ViewL(..), ViewR(..), (<|), (|>))
25 import Data.String (String)
26 import Data.TreeSeq.Strict (Tree(..), Trees)
27 import Data.Tuple (fst,snd)
28 import Data.Void (Void)
29 import Prelude (Num(..))
30 import Text.Show (Show(..))
31 import qualified Data.Char as Char
32 import qualified Data.List.NonEmpty as NonEmpty
33 import qualified Data.Sequence as Seq
34 import qualified Data.Text as Text
35 import qualified Data.Text.Lazy as TL
36 import qualified Text.Megaparsec as P
37 import qualified Text.Megaparsec.Char as P
39 import Language.TCT.Debug
40 import Language.TCT.Cell
41 import Language.TCT.Elem
42 import Language.TCT.Tree
43 import Language.TCT.Read.Elem
44 import Language.TCT.Read.Cell
47 -- | Right-only Dyck language,
48 -- to keep track of opened 'Pair's.
49 type Pairs = (Tokens,[Opening])
50 type Tokens = Trees (Cell Node)
53 -- | An opened 'Pair' and its content so far.
54 type Opening = (Cell Pair,Tokens)
56 appendToken :: Pairs -> Tree (Cell Node) -> Pairs
57 appendToken (ts,[]) tok = (ts|>tok,[])
58 appendToken (ts,(p0,t0):ps) tok = (ts,(p0,t0|>tok):ps)
60 appendTokens :: Pairs -> Tokens -> Pairs
61 appendTokens (ts,[]) toks = (ts<>toks,[])
62 appendTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0<>toks):ps)
64 -- | Appending 'TL.Text' is a special case
65 -- to append at the 'TokenText' level is possible,
66 -- instead of the higher 'NodeToken' level.
67 appendText :: Pairs -> Cell TL.Text -> Pairs
70 (ts,[]) -> (appendTokenText ts tok,[])
71 (ts,(p0,ts0):pss) -> (ts,(p0,appendTokenText ts0 tok):pss)
73 appendTokenText :: Tokens -> Cell TL.Text -> Tokens
74 appendTokenText ts (Cell bn en n) =
79 EmptyR -> pure $ Tree0 $ Cell bn en $ NodeToken $ TokenText n
80 is :> Tree (Cell bo _eo nod) st ->
82 NodeToken (TokenText o) -> is |> i
83 where i = Tree (Cell bo en (NodeToken $ TokenText (o <> n))) st
84 _ -> ts |> Tree0 (Cell bn en $ NodeToken $ TokenText n)
86 prependTokenText :: Tokens -> Cell TL.Text -> Tokens
87 prependTokenText ts (Cell bn en n)
92 EmptyL -> pure $ Tree0 $ Cell bn en $ NodeToken $ TokenText n
93 Tree (Cell _bo eo nod) st :< is ->
95 NodeToken (TokenText o) -> i <| is
96 where i = Tree (Cell bn eo (NodeToken $ TokenText (n <> o))) st
97 _ -> Tree0 (Cell bn en $ NodeToken $ TokenText n) <| ts
99 openPair :: Pairs -> Cell Pair -> Pairs
100 openPair (t,ps) p = (t,(p,mempty):ps)
102 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
103 closePair :: Pairs -> Cell Pair -> Pairs
104 closePair ps@(_,[]) (Cell bp ep p) = -- debug0 "closePair" $
105 appendText ps $ Cell bp ep $ snd $ pairBorders p
106 closePair (t,(p1,t1):ts) p = -- debug0 "closePair" $
108 (Cell bx _ex (PairElem nx ax), Cell _by ey (PairElem ny ay)) | nx == ny ->
110 Tree (Cell bx ey $ NodePair $ PairElem nx as) t1
111 where as | null ay = ax
113 (Cell bx _ex x, Cell _by ey y) | x == y ->
115 Tree (Cell bx ey $ NodePair x) t1
120 (closeImpaired mempty (p1,t1))
122 -- | Close a 'Pair' when there is no matching 'LexemePairClose'.
123 closeImpaired :: Tokens -> (Cell Pair,Tokens) -> Tokens
124 closeImpaired acc (Cell bp ep p,toks) = -- debug0 "closeImpaired" $
126 -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'.
127 PairHash | Just (Cell _bt et t, ts) <- tagFrom $ toks <> acc ->
128 Tree0 (Cell bp et $ NodeToken $ TokenTag t) <| ts
130 PairHash | (Tree0 (Cell bt et (TokenPlain t))) :< ts <- Seq.viewl $ toks <> acc ->
131 case Text.span isTagChar t of
132 ("",_) | Text.null t -> toksHash mempty <> toks <> acc
133 | otherwise -> Tree0 (Cell bp et (TokenTag t)) <| ts
135 let len = Text.length tag in
136 Tree0 (Cell bp bt{columnPos = columnPos bt + len} (TokenTag tag)) <|
137 Tree0 (Cell bt{columnPos = columnPos bt + len + 1} et (TokenPlain t')) <|
140 _ -> prependTokenText (toks <> acc) toksHash
142 toksHash :: Cell TL.Text
143 toksHash = Cell bp ep $ fst $ pairBorders p
145 -- | Close remaining 'Pair's at end of parsing.
146 closePairs :: Pairs -> Tokens
147 closePairs (t0,ps) = -- debug0 "closePairs" $
148 t0 <> foldl' closeImpaired mempty ps
150 appendLexeme :: Lexeme -> Pairs -> Pairs
151 appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc ->
153 LexemePairOpen ps -> foldl' open acc ps
155 -- NOTE: insert an empty node to encode <elem></elem>, not <elem/>
156 open a p@(Cell _bp ep (PairElem{})) =
157 openPair a p `appendToken`
158 (Tree0 $ Cell ep ep $ NodeToken $ TokenText "")
159 open a p = openPair a p
160 LexemePairClose ps -> foldl' closePair acc ps
161 LexemePairAny ps -> foldl' openPair acc ps
164 appendText acc $ sconcat $
165 ((fst . pairBordersWithoutContent) <$>) <$> ps
167 LexemePairBoth ps -> appendTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps
168 LexemeEscape c -> appendToken acc $ Tree0 $ NodeToken . TokenEscape <$> c
169 LexemeLink t -> appendToken acc $ Tree0 $ NodeToken . TokenLink <$> t
170 {-LexemeWhite (unCell -> "") -> acc-}
171 -- LexemeWhite (unCell -> Text.all (==' ') -> True) -> acc
172 LexemeWhite t -> appendText acc t
173 LexemeAlphaNum t -> appendText acc t
174 LexemeOther t -> appendText acc t
175 LexemeTree t -> appendToken acc t
178 appendLexemes :: Pairs -> [Lexeme] -> Pairs
179 appendLexemes = foldr appendLexeme
182 -- | 'Lexeme's cut the input in the longest chunks of common semantic,
183 -- this enables 'orientLexemePairAny' to work with a more meaningful context.
185 = LexemePairOpen !(NonEmpty (Cell Pair))
186 | LexemePairClose !(NonEmpty (Cell Pair))
187 | LexemePairAny !(NonEmpty (Cell Pair))
188 -- ^ orientation depending on the surrounding 'Lexeme's,
189 -- see 'orientLexemePairAny'
190 | LexemePairBoth !(NonEmpty (Cell Pair))
191 | LexemeEscape !(Cell Char)
192 | LexemeLink !(Cell TL.Text)
193 | LexemeWhite !(Cell TL.Text)
194 | LexemeAlphaNum !(Cell TL.Text)
195 | LexemeOther !(Cell TL.Text)
196 | LexemeTree !(Tree (Cell Node))
199 instance Pretty Lexeme
201 parseTokens :: [Lexeme] -> Tokens
204 appendLexemes mempty $
205 -- debug0 "Lexemes (post orient)" $
206 orientLexemePairAny $ LexemeEnd :
212 Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme]
213 parseLexemes inp = runParserOnCell inp (p_Lexemes <* P.eof)
215 -- | Parse 'Lexeme's, returning them in reverse order
216 -- to apply 'orientLexemePairAny'.
217 p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme]
218 p_Lexemes = debugParser "Lexemes" $ go []
220 go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme]
223 (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
225 -- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme',
226 -- so that it can try to orient nearby 'LexemePairAny'
227 -- to 'LexemePairOpen' or 'LexemePairClose'.
228 orientLexemePairAny :: [Lexeme] -> [Lexeme]
229 orientLexemePairAny = \case
230 -- LexemeOther (Cell _bx ex x):LexemeOther (Cell by _ey y):acc -> LexemeOther (Cell by ex (x<>y)):acc
233 t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc
234 w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
235 LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc
237 LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc
238 LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
239 LexemePairAny p:[] -> LexemePairOpen p:[]
242 LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
243 LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[]
245 w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
246 LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc
249 an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
251 an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
254 c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
256 LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
259 o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
261 LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairClose p:c:acc
265 p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme
266 p_Lexeme = debugParser "Lexeme" $
268 [ P.try $ LexemeWhite <$> p_Cell p_Spaces1
269 , P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny)
270 , P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle)
271 , P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen)
272 , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
273 , P.try $ LexemeEscape <$> p_Cell p_Escape
274 , P.try $ LexemeLink <$> p_Cell p_Link
275 , P.try $ LexemeAlphaNum <$> p_Cell (P.takeWhile1P (Just "AlphaNum") Char.isAlphaNum)
276 , LexemeOther <$> p_Cell (TL.singleton <$> P.anyChar)
279 p_some :: Parser e s a -> Parser e s (NonEmpty a)
280 p_some p = NonEmpty.fromList <$> P.some p
282 pairAny :: Char -> Maybe Pair
285 '/' -> Just PairSlash
286 '"' -> Just PairDoublequote
287 '\'' -> Just PairSinglequote
288 '`' -> Just PairBackquote
289 '_' -> Just PairUnderscore
294 pairOpen :: Char -> Maybe Pair
296 '(' -> Just PairParen
297 '[' -> Just PairBracket
298 '{' -> Just PairBrace
299 '«' -> Just PairFrenchquote
302 pairClose :: Char -> Maybe Pair
304 ')' -> Just PairParen
305 ']' -> Just PairBracket
306 '}' -> Just PairBrace
307 '»' -> Just PairFrenchquote
310 p_Escape :: Parser e s Char
311 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
313 p_Link :: P.Tokens s ~ TL.Text => Parser e s TL.Text
315 P.try (P.char '<' *> p <* P.char '>') <|>
318 p :: P.Tokens s ~ TL.Text => Parser e s TL.Text
320 (\scheme addr -> scheme <> "//" <> addr)
321 <$> P.option "" (P.try p_scheme)
324 p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text
327 <$> (P.takeWhile1P (Just "scheme") $ \c ->
333 p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text
335 P.takeWhileP (Just "addr") $ \c ->
348 -- | Build 'Tokens' from many 'Token's.
349 tokens :: [Cell Token] -> Tokens
350 tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts
352 -- | Build 'Tokens' from one 'Token'.
353 tokens1 :: Tree (Cell Node) -> Tokens
354 tokens1 = Seq.singleton
356 unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens))
358 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
359 [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts))
362 isTokenElem :: Tokens -> Bool
364 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
365 [Tree (unCell -> NodePair PairElem{}) _] -> True
368 pairBordersWithoutContent :: Pair -> (TL.Text,TL.Text)
369 pairBordersWithoutContent = \case
371 ("<"<>n<>foldMap f as<>"/>","")
372 where f (elemAttr_white,ElemAttr{..}) =
380 pairBorders :: Pair -> (TL.Text,TL.Text)
382 PairElem n as -> ("<"<>n<>foldMap f as<>">","</"<>n<>">")
383 where f (elemAttr_white,ElemAttr{..}) =
389 PairHash -> ("#","#")
390 PairStar -> ("*","*")
391 PairSlash -> ("/","/")
392 PairUnderscore -> ("_","_")
393 PairDash -> ("-","-")
394 PairBackquote -> ("`","`")
395 PairSinglequote -> ("'","'")
396 PairDoublequote -> ("\"","\"")
397 PairFrenchquote -> ("«","»")
398 PairParen -> ("(",")")
399 PairBrace -> ("{","}")
400 PairBracket -> ("[","]")
403 class TagFrom a where
404 tagFrom :: a -> Maybe (Cell Tag, a)
405 instance TagFrom Tokens where
409 Tree0 (Cell b0 e0 n) :< ns ->
411 NodeToken (TokenText t) ->
412 case tagFrom $ Cell b0 e0 t of
415 if TL.null (unCell r0)
418 Just (t1@(Cell b1 _e1 _), r1) | e0 == b1 ->
420 _ -> Just (t0, n0 <| ns)
421 else Just (t0, n0 <| ns)
422 where n0 = (Tree0 $ NodeToken . TokenText <$> r0)
425 instance TagFrom (Cell TL.Text) where
426 tagFrom (Cell bp ep t)
427 | (w,r) <- TL.span isTagChar t
429 , ew <- pos_column bp + sum (Text.length <$> (TL.toChunks w)) =
431 ( Cell bp bp{pos_column=ew} w
432 , Cell bp{pos_column=ew} ep r )
435 isTagChar :: Char -> Bool
439 case Char.generalCategory c of
440 Char.DashPunctuation -> True
441 Char.ConnectorPunctuation -> True