1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# OPTIONS_GHC -fno-warn-orphans #-}
8 module Hdoc.TCT.Read.Token where
10 import Control.Applicative (Applicative(..), Alternative(..))
11 import Control.Monad (Monad(..))
13 import Data.Char (Char)
14 import Data.Either (Either(..))
15 import Data.Eq (Eq(..))
16 import Data.Foldable (Foldable(..))
17 import Data.Function (($), (.))
18 import Data.Functor ((<$>), ($>))
19 import Data.List.NonEmpty (NonEmpty(..))
20 import Data.Maybe (Maybe(..))
21 import Data.Monoid (Monoid(..))
22 import Data.Semigroup (Semigroup(..))
23 import Data.Sequence (ViewL(..), ViewR(..), (<|))
24 import Data.TreeSeq.Strict (Tree(..), Trees)
25 import Data.Tuple (fst,snd)
26 import Data.Void (Void)
27 import Prelude (Num(..))
28 import Text.Show (Show(..))
29 import qualified Data.Char as Char
30 import qualified Data.List.NonEmpty as NonEmpty
31 import qualified Data.Sequence as Seq
32 import qualified Data.Text as Text
33 import qualified Data.Text.Lazy as TL
34 import qualified Text.Megaparsec as P
35 import qualified Text.Megaparsec.Char as P
41 import Hdoc.TCT.Read.Elem
42 import Hdoc.TCT.Read.Cell
45 -- | Right-only Dyck language,
46 -- to keep track of opened 'Pair's.
47 type Pairs = (Tokens,[Opening])
48 type Tokens = Trees (Cell Node)
51 -- | An opened 'Pair' and its content so far.
52 type Opening = (Cell Pair,Tokens)
54 appendPairsToken :: Pairs -> Tree (Cell Node) -> Pairs
55 appendPairsToken ps t = appendPairsTokens ps (pure t)
57 appendPairsText :: Pairs -> Cell TL.Text -> Pairs
58 appendPairsText ps (Cell sp t) =
61 NodeToken $ TokenText t
63 appendPairsTokens :: Pairs -> Tokens -> Pairs
64 appendPairsTokens (ts,[]) toks = (ts`unionTokens`toks,[])
65 appendPairsTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0`unionTokens`toks):ps)
67 -- | Unify two 'Tokens', merging border 'TokenText's if any.
68 unionTokens :: Tokens -> Tokens -> Tokens
70 case (Seq.viewr x, Seq.viewl y) of
71 (xs :> x0, y0 :< ys) ->
73 ( Tree (Cell (Span fx bx _ex:| lx) (NodeToken (TokenText tx))) tsx
74 , Tree (Cell (Span _fy _by ey:|_ly) (NodeToken (TokenText ty))) tsy ) ->
76 pure (Tree (Cell (Span fx bx ey:|lx) $ NodeToken $ TokenText $ tx <> ty) (tsx<>tsy)) `unionTokens`
82 unionsTokens :: Foldable f => f Tokens -> Tokens
83 unionsTokens = foldl' unionTokens mempty
85 openPair :: Pairs -> Cell Pair -> Pairs
86 openPair (t,ps) p = (t,(p,mempty):ps)
88 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
89 closePair :: Pairs -> Cell Pair -> Pairs
90 closePair ps@(_,[]) (Cell loc p) = -- debug0 "closePair" $
91 appendPairsText ps $ Cell loc $
92 snd $ pairBordersDouble p
93 closePair (t,(cx@(Cell (Span fx bx _ex:| lx) px),t1):ts)
94 cy@(Cell (Span _fy _by ey:|_ly) py) = -- debug0 "closePair" $
96 (PairElem nx ax, PairElem ny ay) | nx == ny ->
97 appendPairsToken (t,ts) $
98 Tree (Cell (Span fx bx ey:|lx) $ NodePair $ PairElem nx as) t1
99 where as | null ay = ax
102 appendPairsToken (t,ts) $
103 Tree (Cell (Span fx bx ey:|lx) $ NodePair px) t1
108 (closeImpaired mempty (cx,t1))
110 -- | Close a 'Pair' when there is no matching 'LexemePairClose'.
111 closeImpaired :: Tokens -> (Cell Pair, Tokens) -> Tokens
112 closeImpaired acc (Cell loc@(s0:|lp) pair, toks) = -- debug0 "closeImpaired" $
114 -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'.
115 PairHash | Just (Cell (Span{span_end}:|_lt) tag, rest) <- tagFrom body ->
116 Tree0 (Cell (s0{span_end}:|lp) $ NodeToken $ TokenTag tag) <| rest
117 -- NOTE: use bp (not bt) to include the '#'
118 _ -> pure open `unionTokens` body
120 body = toks `unionTokens` acc
121 open = Tree0 $ Cell loc $ NodeToken $ TokenText $ fst $ pairBordersDouble pair
123 -- | Close remaining 'Pair's at end of parsing.
124 closePairs :: Pairs -> Tokens
125 closePairs (t0,ps) = -- debug0 "closePairs" $
126 t0 `unionTokens` foldl' closeImpaired mempty ps
128 appendLexeme :: Lexeme -> Pairs -> Pairs
129 appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc ->
131 LexemePairOpen ps -> foldl' open acc ps
133 -- NOTE: insert an empty node to encode <elem></elem>, not <elem/>
134 open a p@(Cell (Span{span_end, ..}:|sp) PairElem{}) = openPair a p `appendPairsText` Cell (Span{span_begin=span_end, ..}:|sp) ""
135 open a p = openPair a p
136 LexemePairClose ps -> foldl' closePair acc ps
137 LexemePairAny ps -> foldl' openPair acc ps
140 appendPairsText acc $ sconcat $
141 ((fst . pairBordersSingle) <$>) <$> ps
143 LexemePairBoth ps -> appendPairsTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps
144 LexemeEscape c -> appendPairsToken acc $ Tree0 $ NodeToken . TokenEscape <$> c
145 LexemeLink t -> appendPairsToken acc $ Tree0 $ NodeToken . TokenLink <$> t
146 {-LexemeWhite (unCell -> "") -> acc-}
147 -- LexemeWhite (unCell -> Text.all (==' ') -> True) -> acc
148 LexemeWhite t -> appendPairsText acc t
149 LexemeAlphaNum t -> appendPairsText acc t
150 LexemeOther t -> appendPairsText acc t
151 LexemeTree t -> appendPairsToken acc t
154 appendLexemes :: Pairs -> [Lexeme] -> Pairs
155 appendLexemes = foldr appendLexeme
158 -- | 'Lexeme's cut the input in the longest chunks of common semantic,
159 -- this enables 'orientLexemePairAny' to work with a more meaningful context.
161 = LexemePairOpen !(NonEmpty (Cell Pair))
162 | LexemePairClose !(NonEmpty (Cell Pair))
163 | LexemePairAny !(NonEmpty (Cell Pair))
164 -- ^ orientation depending on the surrounding 'Lexeme's,
165 -- see 'orientLexemePairAny'
166 | LexemePairBoth !(NonEmpty (Cell Pair))
167 | LexemeEscape !(Cell Char)
168 | LexemeLink !(Cell TL.Text)
169 | LexemeWhite !(Cell TL.Text)
170 | LexemeAlphaNum !(Cell TL.Text)
171 | LexemeOther !(Cell TL.Text)
172 | LexemeTree !(Tree (Cell Node))
175 instance Pretty Lexeme
177 parseTokens :: [Lexeme] -> Tokens
180 appendLexemes mempty $
181 -- debug0 "Lexemes (post orient)" $
182 orientLexemePairAny $ LexemeEnd :
187 Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme]
188 parseLexemes = runParserOnCell (p_Lexemes <* P.eof)
190 -- | Parse 'Lexeme's, returning them in reverse order
191 -- to apply 'orientLexemePairAny'.
192 p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme]
193 p_Lexemes = debugParser "Lexemes" $ go []
195 go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme]
198 (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
200 -- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme',
201 -- so that it can try to orient nearby 'LexemePairAny'
202 -- to 'LexemePairOpen' or 'LexemePairClose'.
203 orientLexemePairAny :: [Lexeme] -> [Lexeme]
204 orientLexemePairAny = \case
205 -- LexemeOther (Cell _bx ex x):LexemeOther (Cell by _ey y):acc -> LexemeOther (Cell by ex (x<>y)):acc
208 t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc
209 w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
210 LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc
212 LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc
213 LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
214 LexemePairAny p:[] -> LexemePairOpen p:[]
217 LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
218 LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[]
220 w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
221 LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc
224 an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
226 an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
229 c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
231 LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
234 o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
236 LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairClose p:c:acc
240 p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme
241 p_Lexeme = debugParser "Lexeme" $
243 [ P.try $ LexemeWhite <$> p_Cell p_Spaces1
244 , P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny)
245 , P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle)
246 , P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen)
247 , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
248 , P.try $ LexemeEscape <$> p_Cell p_Escape
249 , P.try $ LexemeLink <$> p_Cell p_Link
250 , P.try $ LexemeAlphaNum <$> p_Cell (P.takeWhile1P (Just "AlphaNum") Char.isAlphaNum)
251 , LexemeOther <$> p_Cell (TL.singleton <$> P.anyChar)
254 p_some :: Parser e s a -> Parser e s (NonEmpty a)
255 p_some p = NonEmpty.fromList <$> P.some p
257 pairAny :: Char -> Maybe Pair
260 '/' -> Just PairSlash
261 '"' -> Just PairDoublequote
262 '\'' -> Just PairSinglequote
263 '`' -> Just PairBackquote
264 '_' -> Just PairUnderscore
269 pairOpen :: Char -> Maybe Pair
271 '(' -> Just PairParen
272 '[' -> Just PairBracket
273 '{' -> Just PairBrace
274 '«' -> Just PairFrenchquote
277 pairClose :: Char -> Maybe Pair
279 ')' -> Just PairParen
280 ']' -> Just PairBracket
281 '}' -> Just PairBrace
282 '»' -> Just PairFrenchquote
285 p_Escape :: Parser e s Char
286 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
288 p_Link :: P.Tokens s ~ TL.Text => Parser e s TL.Text
290 P.try (P.char '<' *> p <* P.char '>') <|>
293 p :: P.Tokens s ~ TL.Text => Parser e s TL.Text
295 (\scheme addr -> scheme <> "//" <> addr)
296 <$> P.option "" (P.try p_scheme)
299 p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text
302 <$> (P.takeWhile1P (Just "scheme") $ \c ->
308 p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text
310 P.takeWhileP (Just "addr") $ \c ->
324 pairBorders :: Foldable f => Pair -> f a -> (TL.Text,TL.Text)
325 pairBorders p ts | null ts = pairBordersSingle p
326 | otherwise = pairBordersDouble p
328 pairBordersSingle :: Pair -> (TL.Text,TL.Text)
329 pairBordersSingle = \case
331 ("<"<>n<>foldMap f as<>"/>","")
332 where f (elemAttr_white,ElemAttr{..}) =
338 p -> pairBordersDouble p
340 pairBordersDouble :: Pair -> (TL.Text,TL.Text)
341 pairBordersDouble = \case
342 PairElem n as -> ("<"<>n<>foldMap f as<>">","</"<>n<>">")
343 where f (elemAttr_white,ElemAttr{..}) =
349 PairHash -> ("#","#")
350 PairStar -> ("*","*")
351 PairSlash -> ("/","/")
352 PairUnderscore -> ("_","_")
353 PairDash -> ("-","-")
354 PairBackquote -> ("`","`")
355 PairSinglequote -> ("'","'")
356 PairDoublequote -> ("\"","\"")
357 PairFrenchquote -> ("«","»")
358 PairParen -> ("(",")")
359 PairBrace -> ("{","}")
360 PairBracket -> ("[","]")
363 class TagFrom a where
364 tagFrom :: a -> Maybe (Cell Tag, a)
365 instance TagFrom Tokens where
369 Tree0 (Cell loc0@(Span _f0 _b0 e0:|_l0) n) :< ns ->
371 NodeToken (TokenText t) ->
372 case tagFrom $ Cell loc0 t of
375 if TL.null $ unCell r0
378 Just (t1@(Cell (Span _f1 b1 _e1:|_l1) _), r1) | e0 == b1 ->
381 else Just (t0, pure n0 `unionTokens` ns)
382 where n0 = Tree0 $ NodeToken . TokenText <$> r0
385 instance TagFrom (Cell TL.Text) where
386 tagFrom (Cell (Span fp bp ep:|sp) t)
387 | (w,r) <- TL.span isTagChar t
389 , ew <- pos_column bp + sum (Text.length <$> TL.toChunks w) =
391 ( Cell (Span fp bp bp{pos_column=ew}:|sp) w
392 , Cell (Span fp bp{pos_column=ew} ep:|sp) r )
395 isTagChar :: Char -> Bool
399 case Char.generalCategory c of
400 Char.DashPunctuation -> True
401 Char.ConnectorPunctuation -> True
405 -- | Build 'Tokens' from many 'Token's.
406 tokens :: [Cell Token] -> Tokens
407 tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts
409 -- | Build 'Tokens' from one 'Token'.
410 tokens1 :: Tree (Cell Node) -> Tokens
411 tokens1 = Seq.singleton
413 unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens))
415 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
416 [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts))
419 isTokenElem :: Tokens -> Bool
421 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
422 [Tree (unCell -> NodePair PairElem{}) _] -> True