1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# LANGUAGE ViewPatterns #-}
8 module Language.TCT.Read.Token where
10 import Control.Applicative (Applicative(..), Alternative(..))
11 import Control.Monad (Monad(..))
13 import Data.Char (Char)
14 import Data.Eq (Eq(..))
15 import Data.Either (Either(..))
16 import Data.Foldable (Foldable(..))
17 import Data.Function (($), (.))
18 import Data.Functor ((<$>), ($>))
19 import Data.List.NonEmpty (NonEmpty(..))
20 import Data.Maybe (Maybe(..))
21 import Data.Monoid (Monoid(..))
22 import Data.Semigroup (Semigroup(..))
23 import Data.Sequence (Seq, ViewL(..), ViewR(..), (<|), (|>))
24 import Data.String (String)
25 import Data.TreeSeq.Strict (Tree(..), Trees)
26 import Data.Tuple (fst,snd)
27 import Data.Void (Void)
28 import Prelude (Num(..))
29 import Text.Show (Show(..))
30 import qualified Data.Char as Char
31 import qualified Data.List.NonEmpty as NonEmpty
32 import qualified Data.Sequence as Seq
33 import qualified Data.Text as Text
34 import qualified Data.Text.Lazy as TL
35 import qualified Text.Megaparsec as P
36 import qualified Text.Megaparsec.Char as P
38 import Language.TCT.Cell
39 import Language.TCT.Elem
40 -- import Language.TCT.Token
41 import Language.TCT.Tree
42 import Language.TCT.Read.Elem
43 import Language.TCT.Read.Cell
45 instance Pretty Pair where
46 pretty = return . show
47 instance Pretty a => Pretty (Cell a) where
48 pretty (Cell bp ep m) = do
50 return $ "Cell "<>show bp<>":"<>show ep<>" "<>s
51 instance Pretty Lexeme where
52 pretty = return . show
55 -- | Right-only Dyck language
56 type Pairs = (Tokens,[Opening])
57 type Tokens = Trees (Cell Node)
60 type Opening = (Cell Pair,Tokens)
62 appendToken :: Pairs -> Tree (Cell Node) -> Pairs
63 appendToken (ts,[]) tok = (ts|>tok,[])
64 appendToken (ts,(p0,t0):ps) tok = (ts,(p0,t0|>tok):ps)
66 appendTokens :: Pairs -> Tokens -> Pairs
67 appendTokens (ts,[]) toks = (ts<>toks,[])
68 appendTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0<>toks):ps)
70 appendText :: Pairs -> Cell TL.Text -> Pairs
73 (ts,[]) -> (appendTokenText ts tok,[])
74 (ts,(p0,ts0):pss) -> (ts,(p0,appendTokenText ts0 tok):pss)
76 appendTokenText :: Tokens -> Cell TL.Text -> Tokens
77 appendTokenText ts (Cell bn en n)
82 EmptyR -> pure $ Tree0 $ Cell bn en $ NodeToken $ TokenText n
83 is :> Tree (Cell bo _eo nod) st ->
85 NodeToken (TokenText o) -> is |> i
86 where i = Tree (Cell bo en (NodeToken $ TokenText (o <> n))) st
87 _ -> ts |> Tree0 (Cell bn en $ NodeToken $ TokenText n)
89 prependTokenText :: Tokens -> Cell TL.Text -> Tokens
90 prependTokenText ts (Cell bn en n)
95 EmptyL -> pure $ Tree0 $ Cell bn en $ NodeToken $ TokenText n
96 Tree (Cell _bo eo nod) st :< is ->
98 NodeToken (TokenText o) -> i <| is
99 where i = Tree (Cell bn eo (NodeToken $ TokenText (n <> o))) st
100 _ -> Tree0 (Cell bn en $ NodeToken $ TokenText n) <| ts
102 openPair :: Pairs -> Cell Pair -> Pairs
103 openPair (t,ps) p = (t,(p,mempty):ps)
105 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
106 closePair :: Pairs -> Cell Pair -> Pairs
107 closePair ps@(_,[]) (Cell bp ep p) = -- dbg "closePair" $
108 appendText ps $ Cell bp ep $ snd $ pairBorders p
109 closePair (t,(p1,t1):ts) p = -- dbg "closePair" $
111 (Cell bx _ex (PairElem nx ax), Cell _by ey (PairElem ny ay)) | nx == ny ->
113 Tree (Cell bx ey $ NodePair $ PairElem nx as) t1
114 where as | null ay = ax
116 (Cell bx _ex x, Cell _by ey y) | x == y ->
118 Tree (Cell bx ey $ NodePair x) t1
123 (closeImpaired mempty (p1,t1))
125 -- | Close a 'Pair' when there is no matching 'LexemePairClose'.
126 closeImpaired :: Tokens -> (Cell Pair,Tokens) -> Tokens
127 closeImpaired acc (Cell bp ep p,toks) = dbg "closeImpaired" $
129 -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'.
130 PairHash | Just (Cell bt et t, ts) <- tagFrom $ toks <> acc ->
131 Tree0 (Cell bt et $ NodeToken $ TokenTag t) <| ts
133 PairHash | (Tree0 (Cell bt et (TokenPlain t))) :< ts <- Seq.viewl $ toks <> acc ->
134 case Text.span isTagChar t of
135 ("",_) | Text.null t -> toksHash mempty <> toks <> acc
136 | otherwise -> Tree0 (Cell bp et (TokenTag t)) <| ts
138 let len = Text.length tag in
139 Tree0 (Cell bp bt{columnPos = columnPos bt + len} (TokenTag tag)) <|
140 Tree0 (Cell bt{columnPos = columnPos bt + len + 1} et (TokenPlain t')) <|
143 _ -> prependTokenText (toks <> acc) toksHash
145 toksHash :: Cell TL.Text
146 toksHash = Cell bp ep $ fst $ pairBorders p
148 isTagChar :: Char -> Bool
152 case Char.generalCategory c of
153 Char.DashPunctuation -> True
154 Char.ConnectorPunctuation -> True
158 class TagFrom a where
159 tagFrom :: a -> Maybe (Cell Tag, a)
160 instance TagFrom Tokens where
164 Tree0 (Cell b0 e0 n) :< ns ->
166 NodeToken (TokenText t) ->
167 case tagFrom $ Cell b0 e0 t of
170 if TL.null (unCell r0)
173 Just (t1@(Cell b1 _e1 _), r1) | e0 == b1 ->
175 _ -> Just (t0, n0 <| ns)
176 else Just (t0, n0 <| ns)
177 where n0 = (Tree0 $ NodeToken . TokenText <$> r0)
180 instance TagFrom (Cell TL.Text) where
181 tagFrom (Cell bp ep t)
182 | (w,r) <- TL.span isTagChar t
184 , ew <- pos_column bp + sum (Text.length <$> (TL.toChunks w)) =
186 ( Cell bp bp{pos_column=ew} w
187 , Cell bp{pos_column=ew} ep r )
190 -- | Close remaining 'Pair's at end of parsing.
191 closePairs :: Pairs -> Tokens
192 closePairs (t0,ps) = -- dbg "closePairs" $
193 t0 <> foldl' closeImpaired mempty ps
195 appendLexeme :: Lexeme -> Pairs -> Pairs
196 appendLexeme lex acc =
197 -- dbg "appendLexeme" $
199 LexemePairOpen ps -> foldl' openPair acc ps
202 open a p@(Cell _bp ep (PairElem{})) = openPair a p `appendToken` (Tree0 $ Cell ep ep $ TokenPhrase $ PhraseWhite "")
203 open a p = openPair a p
205 LexemePairClose ps -> foldl' closePair acc ps
207 appendText acc $ sconcat $
208 ((fst . pairBordersWithoutContent) <$>) <$> ps
209 LexemePairBoth ps -> appendTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps
210 LexemeEscape c -> appendToken acc $ Tree0 $ NodeToken . TokenEscape <$> c
211 LexemeLink t -> appendToken acc $ Tree0 $ NodeToken . TokenLink <$> t
212 {-LexemeWhite (unCell -> "") -> acc-}
213 -- LexemeWhite (unCell -> Text.all (==' ') -> True) -> acc
214 LexemeWhite t -> appendText acc t
215 LexemeAlphaNum t -> appendText acc t
216 LexemeOther t -> appendText acc t
217 LexemeTree t -> appendToken acc t
221 appendTokenChild :: Pairs -> Tree (Cell Node) -> Pairs
222 appendTokenChild pairs tree =
223 debug "appendTokenChild" "pairs" pairs $
224 debug "appendTokenChild" "tree" tree $
225 dbg "appendTokenChild" $
228 go (ts@(toList -> [unTree -> Cell bo _eo NodeText{}]),[])
229 tok@(Tree (Cell _bn en _n) _ns) =
230 (pure $ Tree (Cell bo en NodePara) (ts |> tok),[])
231 go (ts,[]) tok = (ts |> tok,[])
232 go (ts,(p0,t0):ps) tok = (ts,(p0,t0|>tok):ps)
235 appendLexemes :: Pairs -> [Lexeme] -> Pairs
236 appendLexemes = foldr appendLexeme
240 = LexemePairOpen !(NonEmpty (Cell Pair))
241 | LexemePairClose !(NonEmpty (Cell Pair))
242 | LexemePairAny !(NonEmpty (Cell Pair))
243 | LexemePairBoth !(NonEmpty (Cell Pair))
244 | LexemeEscape !(Cell Char)
245 | LexemeLink !(Cell TL.Text)
246 | LexemeWhite !(Cell TL.Text)
247 | LexemeAlphaNum !(Cell TL.Text)
248 | LexemeOther !(Cell TL.Text)
249 | LexemeTree !(Tree (Cell Node))
254 type Lexemes = Seq Lexeme
256 parseTokens :: [Lexeme] -> Tokens
259 appendLexemes mempty $
260 -- dbg "Lexemes (post orient)" $
261 orientLexemePairAny $ LexemeEnd :
267 Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme]
268 parseLexemes inp = runParserOnCell inp (p_Lexemes <* P.eof)
270 -- | Parse 'Lexeme's, returning them in reverse order to apply 'orientLexemePairAny'.
271 p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme]
272 p_Lexemes = pdbg "Lexemes" $ go []
274 go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme]
277 (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
279 orientLexemePairAny :: [Lexeme] -> [Lexeme]
280 orientLexemePairAny = \case
281 -- LexemeOther (Cell _bx ex x):LexemeOther (Cell by _ey y):acc -> LexemeOther (Cell by ex (x<>y)):acc
284 t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc
285 w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
286 LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc
288 LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc
289 LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
290 LexemePairAny p:[] -> LexemePairOpen p:[]
293 LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
294 LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[]
296 w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
297 LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc
300 an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
302 an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
305 c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
307 LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
310 o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
312 LexemePairAny p:c@LexemePairClose{}:acc -> c:LexemePairClose p:acc
316 p_some :: Parser e s a -> Parser e s (NonEmpty a)
317 p_some p = NonEmpty.fromList <$> P.some p
319 p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme
320 p_Lexeme = pdbg "Lexeme" $
322 [ P.try $ LexemeWhite <$> p_Cell p_Spaces1
323 , P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny)
324 , P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle)
325 , P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen)
326 , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
327 , P.try $ LexemeEscape <$> p_Cell p_Escape
328 , P.try $ LexemeLink <$> p_Cell p_Link
329 , P.try $ LexemeAlphaNum <$> p_Cell (P.takeWhile1P (Just "AlphaNum") Char.isAlphaNum)
330 , LexemeOther <$> p_Cell (TL.singleton <$> P.anyChar)
333 pairAny :: Char -> Maybe Pair
336 '/' -> Just PairSlash
337 '"' -> Just PairDoublequote
338 '\'' -> Just PairSinglequote
339 '`' -> Just PairBackquote
340 '_' -> Just PairUnderscore
345 pairOpen :: Char -> Maybe Pair
347 '(' -> Just PairParen
348 '[' -> Just PairBracket
349 '{' -> Just PairBrace
350 '«' -> Just PairFrenchquote
353 pairClose :: Char -> Maybe Pair
355 ')' -> Just PairParen
356 ']' -> Just PairBracket
357 '}' -> Just PairBrace
358 '»' -> Just PairFrenchquote
361 p_Escape :: Parser e s Char
362 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
364 p_Link :: P.Tokens s ~ TL.Text => Parser e s TL.Text
366 P.try (P.char '<' *> p <* P.char '>') <|>
369 p :: P.Tokens s ~ TL.Text => Parser e s TL.Text
371 (\scheme addr -> scheme <> "//" <> addr)
372 <$> P.option "" (P.try p_scheme)
375 p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text
378 <$> (P.takeWhile1P (Just "scheme") $ \c ->
384 p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text
386 P.takeWhileP (Just "addr") $ \c ->
399 -- | Build 'Tokens' from many 'Token's.
400 tokens :: [Cell Token] -> Tokens
401 tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts
403 -- | Build 'Tokens' from one 'Token'.
404 tokens1 :: Tree (Cell Node) -> Tokens
405 tokens1 = Seq.singleton
407 unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens))
409 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
410 [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts))
413 isTokenElem :: Tokens -> Bool
415 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
416 [Tree (unCell -> NodePair PairElem{}) _] -> True
419 pairBordersWithoutContent :: Pair -> (TL.Text,TL.Text)
420 pairBordersWithoutContent = \case
422 ("<"<>n<>foldMap f as<>"/>","")
423 where f (elemAttr_white,ElemAttr{..}) =
431 pairBorders :: Pair -> (TL.Text,TL.Text)
433 PairElem n as -> ("<"<>n<>foldMap f as<>">","</"<>n<>">")
434 where f (elemAttr_white,ElemAttr{..}) =
440 PairHash -> ("#","#")
441 PairStar -> ("*","*")
442 PairSlash -> ("/","/")
443 PairUnderscore -> ("_","_")
444 PairDash -> ("-","-")
445 PairBackquote -> ("`","`")
446 PairSinglequote -> ("'","'")
447 PairDoublequote -> ("\"","\"")
448 PairFrenchquote -> ("«","»")
449 PairParen -> ("(",")")
450 PairBrace -> ("{","}")
451 PairBracket -> ("[","]")