1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# OPTIONS_GHC -fno-warn-orphans #-}
8 module Textphile.TCT.Read.Token where
10 import Control.Applicative (Applicative(..), Alternative(..))
11 import Control.Monad (Monad(..))
13 import Data.Char (Char)
14 import Data.Either (Either(..))
15 import Data.Eq (Eq(..))
16 import Data.Foldable (Foldable(..))
17 import Data.Function (($), (.))
18 import Data.Functor ((<$>), ($>), (<$))
19 import Data.List.NonEmpty (NonEmpty(..))
20 import Data.Maybe (Maybe(..))
21 import Data.Monoid (Monoid(..))
22 import Data.Semigroup (Semigroup(..))
23 import Data.Sequence (ViewL(..), ViewR(..), (<|))
24 import Data.TreeSeq.Strict (Tree(..), Trees)
25 import Data.Tuple (fst,snd)
26 import Data.Void (Void)
27 import Prelude (Num(..))
28 import Text.Show (Show(..))
29 import qualified Data.Char as Char
30 import qualified Data.List.NonEmpty as NonEmpty
31 import qualified Data.Sequence as Seq
32 import qualified Data.Text as Text
33 import qualified Data.Text.Lazy as TL
34 import qualified Symantic.XML as XML
35 import qualified Text.Megaparsec as P
36 import qualified Text.Megaparsec.Char as P
38 import Textphile.TCT.Debug
39 import Textphile.TCT.Cell
40 import Textphile.TCT.Elem
41 import Textphile.TCT.Tree
42 import Textphile.TCT.Read.Elem
43 import Textphile.TCT.Read.Cell
46 -- | Right-only Dyck language,
47 -- to keep track of opened 'Pair's.
48 type Pairs = (Tokens,[Opening])
49 type Tokens = Trees (Cell Node)
52 -- | An opened 'Pair' and its content so far.
53 type Opening = (Cell Pair,Tokens)
55 appendPairsToken :: Pairs -> Tree (Cell Node) -> Pairs
56 appendPairsToken ps t = appendPairsTokens ps (pure t)
58 appendPairsText :: Pairs -> Cell TL.Text -> Pairs
59 appendPairsText ps (Sourced sp t) =
62 NodeToken $ TokenText t
64 appendPairsTokens :: Pairs -> Tokens -> Pairs
65 appendPairsTokens (ts,[]) toks = (ts`unionTokens`toks,[])
66 appendPairsTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0`unionTokens`toks):ps)
68 -- | Unify two 'Tokens', merging border 'TokenText's if any.
69 unionTokens :: Tokens -> Tokens -> Tokens
71 case (Seq.viewr x, Seq.viewl y) of
72 (xs :> x0, y0 :< ys) ->
74 ( Tree (Sourced (FileRange fx bx _ex:| lx)
75 (NodeToken (TokenText tx))) tsx
76 , Tree (Sourced (FileRange _fy _by ey:|_ly)
77 (NodeToken (TokenText ty))) tsy ) ->
79 pure (Tree (Sourced (FileRange fx bx ey:|lx) $
80 NodeToken $ TokenText $ tx <> ty) (tsx<>tsy)
87 unionsTokens :: Foldable f => f Tokens -> Tokens
88 unionsTokens = foldl' unionTokens mempty
90 openPair :: Pairs -> Cell Pair -> Pairs
91 openPair (t,ps) p = (t,(p,mempty):ps)
93 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
94 closePair :: Pairs -> Cell Pair -> Pairs
95 closePair ps@(_,[]) (Sourced loc p) = -- debug0 "closePair" $
96 appendPairsText ps $ Sourced loc $
97 snd $ pairBordersDouble p
98 closePair (t,(cx@(Sourced (FileRange fx bx _ex:| lx) px),t1):ts)
99 cy@(Sourced (FileRange _fy _by ey:|_ly) py) = -- debug0 "closePair" $
101 (PairElem nx ax, PairElem ny ay) | nx == ny ->
102 appendPairsToken (t,ts) $
103 Tree (Sourced (FileRange fx bx ey:|lx) $ NodePair $ PairElem nx as) t1
104 where as | null ay = ax
107 appendPairsToken (t,ts) $
108 Tree (Sourced (FileRange fx bx ey:|lx) $ NodePair px) t1
113 (closeImpaired mempty (cx,t1))
115 -- | Close a 'Pair' when there is no matching 'LexemePairClose'.
116 closeImpaired :: Tokens -> (Cell Pair, Tokens) -> Tokens
117 closeImpaired acc (Sourced src@(s0:|ss) pair, toks) = -- debug0 "closeImpaired" $
119 -- NOTE: try to close 'PairTag' as 'TokenTag' instead of 'TokenText'.
120 PairTag isBackref | Just (Sourced (FileRange{fileRange_end}:|_lt) ref, rest) <- parseRef body ->
121 Tree0 (Sourced (s0{fileRange_end}:|ss) $ NodeToken $ TokenTag isBackref ref) <| rest
122 -- NOTE: try to close 'PairAt' as 'TokenAt' instead of 'TokenText'.
123 PairAt isBackref | Just (Sourced (FileRange{fileRange_end}:|_lt) ref, rest) <- parseRef body ->
124 Tree0 (Sourced (s0{fileRange_end}:|ss) $ NodeToken $ TokenAt isBackref ref) <| rest
125 | not isBackref -- NOTE: page-ref without a target anchor.
126 , Tree (Sourced _src (NodePair PairBracket{})) _ts :< _rest <- Seq.viewl body ->
127 Tree0 (Sourced src $ NodeToken $ TokenAt False "") <| body
128 _ -> pure open `unionTokens` body
130 body = toks `unionTokens` acc
131 open = Tree0 $ Sourced src $ NodeToken $
132 TokenText $ fst $ pairBordersDouble pair
134 -- | Close remaining 'Pair's at end of parsing.
135 closePairs :: Pairs -> Tokens
136 closePairs (t0,ps) = -- debug0 "closePairs" $
137 t0 `unionTokens` foldl' closeImpaired mempty ps
139 appendLexeme :: Lexeme -> Pairs -> Pairs
140 appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc ->
142 LexemePairOpen ps -> foldl' open acc ps
144 -- NOTE: insert an empty node to encode <elem></elem>, not <elem/>
145 open a p@(Sourced (FileRange{fileRange_end, ..}:|sp) PairElem{}) = openPair a p `appendPairsText` Sourced (FileRange{fileRange_begin=fileRange_end, ..}:|sp) ""
146 open a p = openPair a p
147 LexemePairClose ps -> foldl' closePair acc ps
148 LexemePairAny ps -> foldl' openPair acc ps
149 LexemePairBoth ps -> appendPairsTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps
150 LexemeEscape c -> appendPairsToken acc $ Tree0 $ NodeToken . TokenEscape <$> c
151 LexemeLink t -> appendPairsToken acc $ Tree0 $ NodeToken . TokenLink <$> t
152 {-LexemeWhite (unSourced -> "") -> acc-}
153 -- LexemeWhite (unSourced -> Text.all (==' ') -> True) -> acc
154 LexemeWhite t -> appendPairsText acc t
155 LexemeAlphaNum t -> appendPairsText acc t
156 LexemeOther t -> appendPairsText acc t
157 LexemeTree t -> appendPairsToken acc t
160 appendLexemes :: Pairs -> [Lexeme] -> Pairs
161 appendLexemes = foldr appendLexeme
164 -- | 'Lexeme's cut the input in the longest chunks of common semantic,
165 -- this enables 'orientLexemePairAny' to work with a more meaningful context.
167 = LexemePairOpen !(NonEmpty (Cell Pair))
168 | LexemePairClose !(NonEmpty (Cell Pair))
169 | LexemePairAny !(NonEmpty (Cell Pair))
170 -- ^ orientation depending on the surrounding 'Lexeme's,
171 -- see 'orientLexemePairAny'
172 | LexemePairBoth !(NonEmpty (Cell Pair))
173 | LexemeEscape !(Cell Char)
174 | LexemeLink !(Cell Link)
175 | LexemeWhite !(Cell TL.Text)
176 | LexemeAlphaNum !(Cell TL.Text)
177 | LexemeOther !(Cell TL.Text)
178 | LexemeTree !(Tree (Cell Node)) -- FIXME: useless?
181 instance Pretty Lexeme
183 parseTokens :: [Lexeme] -> Tokens
186 appendLexemes mempty $
187 -- debug0 "Lexemes (post orient)" $
188 orientLexemePairAny $
189 -- debug0 "Lexemes (pre orient)" $
194 Either (P.ParseErrorBundle StreamCell Void) [Lexeme]
195 parseLexemes = runParserOnCell (p_Lexemes <* P.eof)
197 -- | Parse 'Lexeme's, returning them in reverse order
198 -- to apply 'orientLexemePairAny'.
199 p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme]
200 p_Lexemes = {-debugParser "Lexemes" $-} go []
202 go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme]
205 (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
207 -- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme',
208 -- so that it can try to orient nearby 'LexemePairAny'
209 -- to 'LexemePairOpen' or 'LexemePairClose'.
210 orientLexemePairAny :: [Lexeme] -> [Lexeme]
211 orientLexemePairAny = \case
212 -- LexemeOther (Sourced _bx ex x):LexemeOther (Sourced by _ey y):acc -> LexemeOther (Sourced by ex (x<>y)):acc
215 t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc
216 w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
217 LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc
219 LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc
220 LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
221 LexemePairAny p:[] -> LexemePairOpen p:[]
224 LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
225 LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[]
227 w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
228 LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc
231 an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
232 an@LexemeEscape{} :a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
234 an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
235 an@LexemeEscape{} :LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
238 c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
240 LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
243 o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
245 LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairOpen p:c:acc
248 a1@LexemeAlphaNum{}:LexemePairAny p:a2@LexemeAlphaNum{}:acc ->
249 a1:LexemeOther (sconcat (((fst . pairBordersDouble) <$>) <$> p)):a2:acc
253 p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme
254 p_Lexeme = -- debugParser "Lexeme" $
256 [ P.try $ LexemeWhite <$> p_Cell p_Spaces1
257 , P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny)
258 , P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle)
259 , P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen <|> P.try p_BackOpen)
260 , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
261 , P.try $ LexemeEscape <$> p_Cell p_Escape
262 , P.try $ LexemeLink <$> p_Cell p_Link
263 , P.try $ LexemeAlphaNum <$> p_Cell p_AlphaNums1
264 , LexemeOther <$> p_Cell (TL.singleton <$> P.anySingle)
267 p_some :: Parser e s a -> Parser e s (NonEmpty a)
268 p_some p = NonEmpty.fromList <$> P.some p
270 pairAny :: Char -> Maybe Pair
273 '/' -> Just PairSlash
274 '"' -> Just PairDoublequote
275 '\'' -> Just PairSinglequote
276 '`' -> Just PairBackquote
277 '_' -> Just PairUnderscore
279 '#' -> Just $ PairTag False
280 '@' -> Just $ PairAt False
283 pairOpen :: Char -> Maybe Pair
285 '(' -> Just PairParen
286 '[' -> Just PairBracket
287 '{' -> Just PairBrace
288 '«' -> Just PairFrenchquote
291 pairClose :: Char -> Maybe Pair
293 ')' -> Just PairParen
294 ']' -> Just PairBracket
295 '}' -> Just PairBrace
296 '»' -> Just PairFrenchquote
299 p_BackOpen :: P.Tokens s ~ TL.Text => Parser e s Pair
300 p_BackOpen = -- debugParser "BackOpen" $
302 *> (PairAt True <$ P.char '@'
303 <|> PairTag True <$ P.char '#')
305 p_Escape :: Parser e s Char
306 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
308 p_Link :: P.Tokens s ~ TL.Text => Parser e s Link
310 P.try (P.char '<' *> p <* P.char '>') <|>
313 p :: P.Tokens s ~ TL.Text => Parser e s TL.Text
315 (\scheme addr -> scheme <> "//" <> addr)
316 <$> P.option "" (P.try p_scheme)
319 p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text
322 <$> (P.takeWhile1P (Just "scheme") $ \c ->
328 p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text
330 P.takeWhileP (Just "addr") $ \c ->
344 pairBorders :: Foldable f => Pair -> f a -> (TL.Text, TL.Text)
345 pairBorders p ts | null ts = pairBordersSingle p
346 | otherwise = pairBordersDouble p
348 pairBordersSingle :: Pair -> (TL.Text,TL.Text)
349 pairBordersSingle = \case
350 PairElem (XML.NCName n) as ->
351 ("<"<>n<>foldMap f as<>"/>","")
352 where f (elemAttr_white,ElemAttr{..}) =
354 XML.unNCName elemAttr_name <>
358 p -> pairBordersDouble p
360 pairBordersDouble :: Pair -> (TL.Text, TL.Text)
361 pairBordersDouble = \case
362 PairElem (XML.NCName n) as -> ("<"<>n<>foldMap f as<>">","</"<>n<>">")
363 where f (elemAttr_white,ElemAttr{..}) =
365 XML.unNCName elemAttr_name <>
369 PairTag isBackref | isBackref -> ("^#","#")
370 | otherwise -> ("#","#")
371 PairAt isBackref | isBackref -> ("^@","@")
372 | otherwise -> ("@","@")
373 PairStar -> ("*","*")
374 PairSlash -> ("/","/")
375 PairUnderscore -> ("_","_")
376 PairDash -> ("-","-")
377 PairBackquote -> ("`","`")
378 PairSinglequote -> ("'","'")
379 PairDoublequote -> ("\"","\"")
380 PairFrenchquote -> ("«","»")
381 PairParen -> ("(",")")
382 PairBrace -> ("{","}")
383 PairBracket -> ("[","]")
385 -- * Class 'ParseRef'
386 class ParseRef a where
387 parseRef :: a -> Maybe (Cell Ref, a)
388 instance ParseRef Tokens where
392 Tree0 (Sourced src0@(FileRange _f0 _b0 e0:|_l0) n) :< ns ->
394 NodeToken (TokenText t) ->
395 case parseRef $ Sourced src0 t of
398 if TL.null $ unSourced r0
401 Just (t1@(Sourced (FileRange _f1 b1 _e1:|_l1) _), r1) | e0 == b1 ->
404 else Just (t0, pure n0 `unionTokens` ns)
405 where n0 = Tree0 $ NodeToken . TokenText <$> r0
408 instance ParseRef (Cell TL.Text) where
409 parseRef (Sourced (FileRange fp bp ep:|sp) t)
410 | (w,r) <- TL.span isTagChar t
412 , ew <- sconcat $ colNum bp :| (P.mkPos . Text.length <$> TL.toChunks w) =
414 ( Sourced (FileRange fp bp bp{colNum=ew}:|sp) w
415 , Sourced (FileRange fp bp{colNum=ew} ep:|sp) r )
418 isTagChar :: Char -> Bool
422 case Char.generalCategory c of
423 Char.DashPunctuation -> True
424 Char.ConnectorPunctuation -> True
428 -- | Build 'Tokens' from many 'Token's.
429 tokens :: [Cell Token] -> Tokens
430 tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts
432 -- | Build 'Tokens' from one 'Token'.
433 tokens1 :: Tree (Cell Node) -> Tokens
434 tokens1 = Seq.singleton
436 unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens))
438 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
439 [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts))
442 isTokenElem :: Tokens -> Bool
444 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
445 [Tree (unSourced -> NodePair PairElem{}) _] -> True