1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# LANGUAGE ViewPatterns #-}
8 {-# OPTIONS_GHC -fno-warn-orphans #-}
9 module Language.TCT.Read.Token where
11 import Control.Applicative (Applicative(..), Alternative(..))
12 import Control.Monad (Monad(..))
14 import Data.Char (Char)
15 import Data.Either (Either(..))
16 import Data.Eq (Eq(..))
17 import Data.Foldable (Foldable(..))
18 import Data.Function (($), (.))
19 import Data.Functor ((<$>), ($>))
20 import Data.List.NonEmpty (NonEmpty(..))
21 import Data.Maybe (Maybe(..))
22 import Data.Monoid (Monoid(..))
23 import Data.Semigroup (Semigroup(..))
24 import Data.Sequence (ViewL(..), ViewR(..), (<|))
25 import Data.String (String)
26 import Data.TreeSeq.Strict (Tree(..), Trees)
27 import Data.Tuple (fst,snd)
28 import Data.Void (Void)
29 import Prelude (Num(..))
30 import Text.Show (Show(..))
31 import qualified Data.Char as Char
32 import qualified Data.List.NonEmpty as NonEmpty
33 import qualified Data.Sequence as Seq
34 import qualified Data.Text as Text
35 import qualified Data.Text.Lazy as TL
36 import qualified Text.Megaparsec as P
37 import qualified Text.Megaparsec.Char as P
39 import Language.TCT.Debug
40 import Language.TCT.Cell
41 import Language.TCT.Elem
42 import Language.TCT.Tree
43 import Language.TCT.Read.Elem
44 import Language.TCT.Read.Cell
47 -- | Right-only Dyck language,
48 -- to keep track of opened 'Pair's.
49 type Pairs = (Tokens,[Opening])
50 type Tokens = Trees (Cell Node)
53 -- | An opened 'Pair' and its content so far.
54 type Opening = (Cell Pair,Tokens)
56 appendPairsToken :: Pairs -> Tree (Cell Node) -> Pairs
57 appendPairsToken ps t = appendPairsTokens ps (pure t)
59 appendPairsText :: Pairs -> Cell TL.Text -> Pairs
60 appendPairsText ps (Cell bp ep t) =
63 NodeToken $ TokenText t
65 appendPairsTokens :: Pairs -> Tokens -> Pairs
66 appendPairsTokens (ts,[]) toks = (ts`unionTokens`toks,[])
67 appendPairsTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0`unionTokens`toks):ps)
69 -- | Unify two 'Tokens', merging border 'TokenText's if any.
70 unionTokens :: Tokens -> Tokens -> Tokens
72 case (Seq.viewr x, Seq.viewl y) of
73 (xs :> x0, y0 :< ys) ->
75 ( Tree (Cell bx _ex (NodeToken (TokenText tx))) sx
76 , Tree (Cell _by ey (NodeToken (TokenText ty))) sy ) ->
78 pure (Tree (Cell bx ey $ NodeToken $ TokenText $ tx <> ty) (sx<>sy)) `unionTokens`
84 unionsTokens :: Foldable f => f Tokens -> Tokens
85 unionsTokens = foldl' unionTokens mempty
87 openPair :: Pairs -> Cell Pair -> Pairs
88 openPair (t,ps) p = (t,(p,mempty):ps)
90 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
91 closePair :: Pairs -> Cell Pair -> Pairs
92 closePair ps@(_,[]) (Cell bp ep p) = -- debug0 "closePair" $
93 appendPairsText ps $ Cell bp ep $
94 snd $ pairBordersDouble p
95 closePair (t,(p1,t1):ts) p = -- debug0 "closePair" $
97 (Cell bx _ex (PairElem nx ax), Cell _by ey (PairElem ny ay)) | nx == ny ->
98 appendPairsToken (t,ts) $
99 Tree (Cell bx ey $ NodePair $ PairElem nx as) t1
100 where as | null ay = ax
102 (Cell bx _ex x, Cell _by ey y) | x == y ->
103 appendPairsToken (t,ts) $
104 Tree (Cell bx ey $ NodePair x) t1
109 (closeImpaired mempty (p1,t1))
111 -- | Close a 'Pair' when there is no matching 'LexemePairClose'.
112 closeImpaired :: Tokens -> (Cell Pair, Tokens) -> Tokens
113 closeImpaired acc (Cell bp ep pair, toks) = -- debug0 "closeImpaired" $
115 -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'.
116 PairHash | Just (Cell _bt et tag, rest) <- tagFrom body ->
117 Tree0 (Cell bp et $ NodeToken $ TokenTag tag) <| rest
118 -- NOTE: use bp (not bt) to include the '#'
119 _ -> pure open `unionTokens` body
121 body = toks `unionTokens` acc
122 open = Tree0 $ Cell bp ep $ NodeToken $ TokenText $ fst $ pairBordersDouble pair
124 -- | Close remaining 'Pair's at end of parsing.
125 closePairs :: Pairs -> Tokens
126 closePairs (t0,ps) = -- debug0 "closePairs" $
127 t0 `unionTokens` foldl' closeImpaired mempty ps
129 appendLexeme :: Lexeme -> Pairs -> Pairs
130 appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc ->
132 LexemePairOpen ps -> foldl' open acc ps
134 -- NOTE: insert an empty node to encode <elem></elem>, not <elem/>
135 open a p@(Cell _bp ep (PairElem{})) = openPair a p `appendPairsText` Cell ep ep ""
136 open a p = openPair a p
137 LexemePairClose ps -> foldl' closePair acc ps
138 LexemePairAny ps -> foldl' openPair acc ps
141 appendPairsText acc $ sconcat $
142 ((fst . pairBordersSingle) <$>) <$> ps
144 LexemePairBoth ps -> appendPairsTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps
145 LexemeEscape c -> appendPairsToken acc $ Tree0 $ NodeToken . TokenEscape <$> c
146 LexemeLink t -> appendPairsToken acc $ Tree0 $ NodeToken . TokenLink <$> t
147 {-LexemeWhite (unCell -> "") -> acc-}
148 -- LexemeWhite (unCell -> Text.all (==' ') -> True) -> acc
149 LexemeWhite t -> appendPairsText acc t
150 LexemeAlphaNum t -> appendPairsText acc t
151 LexemeOther t -> appendPairsText acc t
152 LexemeTree t -> appendPairsToken acc t
155 appendLexemes :: Pairs -> [Lexeme] -> Pairs
156 appendLexemes = foldr appendLexeme
159 -- | 'Lexeme's cut the input in the longest chunks of common semantic,
160 -- this enables 'orientLexemePairAny' to work with a more meaningful context.
162 = LexemePairOpen !(NonEmpty (Cell Pair))
163 | LexemePairClose !(NonEmpty (Cell Pair))
164 | LexemePairAny !(NonEmpty (Cell Pair))
165 -- ^ orientation depending on the surrounding 'Lexeme's,
166 -- see 'orientLexemePairAny'
167 | LexemePairBoth !(NonEmpty (Cell Pair))
168 | LexemeEscape !(Cell Char)
169 | LexemeLink !(Cell TL.Text)
170 | LexemeWhite !(Cell TL.Text)
171 | LexemeAlphaNum !(Cell TL.Text)
172 | LexemeOther !(Cell TL.Text)
173 | LexemeTree !(Tree (Cell Node))
176 instance Pretty Lexeme
178 parseTokens :: [Lexeme] -> Tokens
181 appendLexemes mempty $
182 -- debug0 "Lexemes (post orient)" $
183 orientLexemePairAny $ LexemeEnd :
189 Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme]
190 parseLexemes inp = runParserOnCell inp (p_Lexemes <* P.eof)
192 -- | Parse 'Lexeme's, returning them in reverse order
193 -- to apply 'orientLexemePairAny'.
194 p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme]
195 p_Lexemes = debugParser "Lexemes" $ go []
197 go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme]
200 (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
202 -- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme',
203 -- so that it can try to orient nearby 'LexemePairAny'
204 -- to 'LexemePairOpen' or 'LexemePairClose'.
205 orientLexemePairAny :: [Lexeme] -> [Lexeme]
206 orientLexemePairAny = \case
207 -- LexemeOther (Cell _bx ex x):LexemeOther (Cell by _ey y):acc -> LexemeOther (Cell by ex (x<>y)):acc
210 t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc
211 w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
212 LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc
214 LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc
215 LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
216 LexemePairAny p:[] -> LexemePairOpen p:[]
219 LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
220 LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[]
222 w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
223 LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc
226 an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
228 an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
231 c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
233 LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
236 o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
238 LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairClose p:c:acc
242 p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme
243 p_Lexeme = debugParser "Lexeme" $
245 [ P.try $ LexemeWhite <$> p_Cell p_Spaces1
246 , P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny)
247 , P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle)
248 , P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen)
249 , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
250 , P.try $ LexemeEscape <$> p_Cell p_Escape
251 , P.try $ LexemeLink <$> p_Cell p_Link
252 , P.try $ LexemeAlphaNum <$> p_Cell (P.takeWhile1P (Just "AlphaNum") Char.isAlphaNum)
253 , LexemeOther <$> p_Cell (TL.singleton <$> P.anyChar)
256 p_some :: Parser e s a -> Parser e s (NonEmpty a)
257 p_some p = NonEmpty.fromList <$> P.some p
259 pairAny :: Char -> Maybe Pair
262 '/' -> Just PairSlash
263 '"' -> Just PairDoublequote
264 '\'' -> Just PairSinglequote
265 '`' -> Just PairBackquote
266 '_' -> Just PairUnderscore
271 pairOpen :: Char -> Maybe Pair
273 '(' -> Just PairParen
274 '[' -> Just PairBracket
275 '{' -> Just PairBrace
276 '«' -> Just PairFrenchquote
279 pairClose :: Char -> Maybe Pair
281 ')' -> Just PairParen
282 ']' -> Just PairBracket
283 '}' -> Just PairBrace
284 '»' -> Just PairFrenchquote
287 p_Escape :: Parser e s Char
288 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
290 p_Link :: P.Tokens s ~ TL.Text => Parser e s TL.Text
292 P.try (P.char '<' *> p <* P.char '>') <|>
295 p :: P.Tokens s ~ TL.Text => Parser e s TL.Text
297 (\scheme addr -> scheme <> "//" <> addr)
298 <$> P.option "" (P.try p_scheme)
301 p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text
304 <$> (P.takeWhile1P (Just "scheme") $ \c ->
310 p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text
312 P.takeWhileP (Just "addr") $ \c ->
325 pairBorders :: Foldable f => Pair -> f a -> (TL.Text,TL.Text)
326 pairBorders p ts | null ts = pairBordersSingle p
327 | otherwise = pairBordersDouble p
329 pairBordersSingle :: Pair -> (TL.Text,TL.Text)
330 pairBordersSingle = \case
332 ("<"<>n<>foldMap f as<>"/>","")
333 where f (elemAttr_white,ElemAttr{..}) =
339 p -> pairBordersDouble p
341 pairBordersDouble :: Pair -> (TL.Text,TL.Text)
342 pairBordersDouble = \case
343 PairElem n as -> ("<"<>n<>foldMap f as<>">","</"<>n<>">")
344 where f (elemAttr_white,ElemAttr{..}) =
350 PairHash -> ("#","#")
351 PairStar -> ("*","*")
352 PairSlash -> ("/","/")
353 PairUnderscore -> ("_","_")
354 PairDash -> ("-","-")
355 PairBackquote -> ("`","`")
356 PairSinglequote -> ("'","'")
357 PairDoublequote -> ("\"","\"")
358 PairFrenchquote -> ("«","»")
359 PairParen -> ("(",")")
360 PairBrace -> ("{","}")
361 PairBracket -> ("[","]")
364 class TagFrom a where
365 tagFrom :: a -> Maybe (Cell Tag, a)
366 instance TagFrom Tokens where
370 Tree0 (Cell b0 e0 n) :< ns ->
372 NodeToken (TokenText t) ->
373 case tagFrom $ Cell b0 e0 t of
376 if TL.null $ unCell r0
379 Just (t1@(Cell b1 _e1 _), r1) | e0 == b1 ->
382 else Just (t0, pure n0 `unionTokens` ns)
383 where n0 = Tree0 $ NodeToken . TokenText <$> r0
386 instance TagFrom (Cell TL.Text) where
387 tagFrom (Cell bp ep t)
388 | (w,r) <- TL.span isTagChar t
390 , ew <- pos_column bp + sum (Text.length <$> (TL.toChunks w)) =
392 ( Cell bp bp{pos_column=ew} w
393 , Cell bp{pos_column=ew} ep r )
396 isTagChar :: Char -> Bool
400 case Char.generalCategory c of
401 Char.DashPunctuation -> True
402 Char.ConnectorPunctuation -> True
406 -- | Build 'Tokens' from many 'Token's.
407 tokens :: [Cell Token] -> Tokens
408 tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts
410 -- | Build 'Tokens' from one 'Token'.
411 tokens1 :: Tree (Cell Node) -> Tokens
412 tokens1 = Seq.singleton
414 unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens))
416 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
417 [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts))
420 isTokenElem :: Tokens -> Bool
422 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
423 [Tree (unCell -> NodePair PairElem{}) _] -> True