1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# OPTIONS_GHC -fno-warn-orphans #-}
8 module Textphile.TCT.Read.Token where
10 import Control.Applicative (Applicative(..), Alternative(..))
11 import Control.Monad (Monad(..))
13 import Data.Char (Char)
14 import Data.Either (Either(..))
15 import Data.Eq (Eq(..))
16 import Data.Foldable (Foldable(..))
17 import Data.Function (($), (.))
18 import Data.Functor ((<$>), ($>), (<$))
19 import Data.List.NonEmpty (NonEmpty(..))
20 import Data.Maybe (Maybe(..))
21 import Data.Monoid (Monoid(..))
22 import Data.Semigroup (Semigroup(..))
23 import Data.Sequence (ViewL(..), ViewR(..), (<|))
24 import Data.TreeSeq.Strict (Tree(..), Trees)
25 import Data.Tuple (fst,snd)
26 import Data.Void (Void)
27 import Text.Show (Show(..))
28 import qualified Data.Char as Char
29 import qualified Data.List.NonEmpty as NonEmpty
30 import qualified Data.Sequence as Seq
31 import qualified Data.Text as Text
32 import qualified Data.Text.Lazy as TL
33 import qualified Symantic.XML as XML
34 import qualified Text.Megaparsec as P
35 import qualified Text.Megaparsec.Char as P
37 import Textphile.TCT.Debug
38 import Textphile.TCT.Cell
39 import Textphile.TCT.Elem
40 import Textphile.TCT.Tree
41 import Textphile.TCT.Read.Elem
42 import Textphile.TCT.Read.Cell
45 -- | Right-only Dyck language,
46 -- to keep track of opened 'Pair's.
47 type Pairs = (Tokens,[Opening])
48 type Tokens = Trees (Cell Node)
51 -- | An opened 'Pair' and its content so far.
52 type Opening = (Cell Pair,Tokens)
54 appendPairsToken :: Pairs -> Tree (Cell Node) -> Pairs
55 appendPairsToken ps t = appendPairsTokens ps (pure t)
57 appendPairsText :: Pairs -> Cell TL.Text -> Pairs
58 appendPairsText ps (Sourced sp t) =
61 NodeToken $ TokenText t
63 appendPairsTokens :: Pairs -> Tokens -> Pairs
64 appendPairsTokens (ts,[]) toks = (ts`unionTokens`toks,[])
65 appendPairsTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0`unionTokens`toks):ps)
67 -- | Unify two 'Tokens', merging border 'TokenText's if any.
68 unionTokens :: Tokens -> Tokens -> Tokens
70 case (Seq.viewr x, Seq.viewl y) of
71 (xs :> x0, y0 :< ys) ->
73 ( Tree (Sourced (FileRange fx bx _ex:| lx)
74 (NodeToken (TokenText tx))) tsx
75 , Tree (Sourced (FileRange _fy _by ey:|_ly)
76 (NodeToken (TokenText ty))) tsy ) ->
78 pure (Tree (Sourced (FileRange fx bx ey:|lx) $
79 NodeToken $ TokenText $ tx <> ty) (tsx<>tsy)
86 unionsTokens :: Foldable f => f Tokens -> Tokens
87 unionsTokens = foldl' unionTokens mempty
89 openPair :: Pairs -> Cell Pair -> Pairs
90 openPair (t,ps) p = (t,(p,mempty):ps)
92 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
93 closePair :: Pairs -> Cell Pair -> Pairs
94 closePair ps@(_,[]) (Sourced loc p) = -- debug0 "closePair" $
95 appendPairsText ps $ Sourced loc $
96 snd $ pairBordersDouble p
97 closePair (t,(cx@(Sourced (FileRange fx bx _ex:| lx) px),t1):ts)
98 cy@(Sourced (FileRange _fy _by ey:|_ly) py) = -- debug0 "closePair" $
100 (PairElem nx ax, PairElem ny ay) | nx == ny ->
101 appendPairsToken (t,ts) $
102 Tree (Sourced (FileRange fx bx ey:|lx) $ NodePair $ PairElem nx as) t1
103 where as | null ay = ax
106 appendPairsToken (t,ts) $
107 Tree (Sourced (FileRange fx bx ey:|lx) $ NodePair px) t1
112 (closeImpaired mempty (cx,t1))
114 -- | Close a 'Pair' when there is no matching 'LexemePairClose'.
115 closeImpaired :: Tokens -> (Cell Pair, Tokens) -> Tokens
116 closeImpaired acc (Sourced src@(s0:|ss) pair, toks) = -- debug0 "closeImpaired" $
118 -- NOTE: try to close 'PairTag' as 'TokenTag' instead of 'TokenText'.
119 PairTag isBackref | Just (Sourced (FileRange{fileRange_end}:|_lt) ref, rest) <- parseRef body ->
120 Tree0 (Sourced (s0{fileRange_end}:|ss) $ NodeToken $ TokenTag isBackref ref) <| rest
121 -- NOTE: try to close 'PairAt' as 'TokenAt' instead of 'TokenText'.
122 PairAt isBackref | Just (Sourced (FileRange{fileRange_end}:|_lt) ref, rest) <- parseRef body ->
123 Tree0 (Sourced (s0{fileRange_end}:|ss) $ NodeToken $ TokenAt isBackref ref) <| rest
124 | not isBackref -- NOTE: page-ref without a target anchor.
125 , Tree (Sourced _src (NodePair PairBracket{})) _ts :< _rest <- Seq.viewl body ->
126 Tree0 (Sourced src $ NodeToken $ TokenAt False "") <| body
127 _ -> pure open `unionTokens` body
129 body = toks `unionTokens` acc
130 open = Tree0 $ Sourced src $ NodeToken $
131 TokenText $ fst $ pairBordersDouble pair
133 -- | Close remaining 'Pair's at end of parsing.
134 closePairs :: Pairs -> Tokens
135 closePairs (t0,ps) = -- debug0 "closePairs" $
136 t0 `unionTokens` foldl' closeImpaired mempty ps
138 appendLexeme :: Lexeme -> Pairs -> Pairs
139 appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc ->
141 LexemePairOpen ps -> foldl' open acc ps
143 -- NOTE: insert an empty node to encode <elem></elem>, not <elem/>
144 open a p@(Sourced (FileRange{fileRange_end, ..}:|sp) PairElem{}) = openPair a p `appendPairsText` Sourced (FileRange{fileRange_begin=fileRange_end, ..}:|sp) ""
145 open a p = openPair a p
146 LexemePairClose ps -> foldl' closePair acc ps
147 LexemePairAny ps -> foldl' openPair acc ps
148 LexemePairBoth ps -> appendPairsTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps
149 LexemeEscape c -> appendPairsToken acc $ Tree0 $ NodeToken . TokenEscape <$> c
150 LexemeLink t -> appendPairsToken acc $ Tree0 $ NodeToken . TokenLink <$> t
151 {-LexemeWhite (unSourced -> "") -> acc-}
152 -- LexemeWhite (unSourced -> Text.all (==' ') -> True) -> acc
153 LexemeWhite t -> appendPairsText acc t
154 LexemeAlphaNum t -> appendPairsText acc t
155 LexemeOther t -> appendPairsText acc t
156 LexemeTree t -> appendPairsToken acc t
159 appendLexemes :: Pairs -> [Lexeme] -> Pairs
160 appendLexemes = foldr appendLexeme
163 -- | 'Lexeme's cut the input in the longest chunks of common semantic,
164 -- this enables 'orientLexemePairAny' to work with a more meaningful context.
166 = LexemePairOpen !(NonEmpty (Cell Pair))
167 | LexemePairClose !(NonEmpty (Cell Pair))
168 | LexemePairAny !(NonEmpty (Cell Pair))
169 -- ^ orientation depending on the surrounding 'Lexeme's,
170 -- see 'orientLexemePairAny'
171 | LexemePairBoth !(NonEmpty (Cell Pair))
172 | LexemeEscape !(Cell Char)
173 | LexemeLink !(Cell Link)
174 | LexemeWhite !(Cell TL.Text)
175 | LexemeAlphaNum !(Cell TL.Text)
176 | LexemeOther !(Cell TL.Text)
177 | LexemeTree !(Tree (Cell Node)) -- FIXME: useless?
180 instance Pretty Lexeme
182 parseTokens :: [Lexeme] -> Tokens
185 appendLexemes mempty $
186 -- debug0 "Lexemes (post orient)" $
187 orientLexemePairAny $
188 -- debug0 "Lexemes (pre orient)" $
193 Either (P.ParseErrorBundle StreamCell Void) [Lexeme]
194 parseLexemes = runParserOnCell (p_Lexemes <* P.eof)
196 -- | Parse 'Lexeme's, returning them in reverse order
197 -- to apply 'orientLexemePairAny'.
198 p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme]
199 p_Lexemes = {-debugParser "Lexemes" $-} go []
201 go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme]
204 (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
206 -- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme',
207 -- so that it can try to orient nearby 'LexemePairAny'
208 -- to 'LexemePairOpen' or 'LexemePairClose'.
209 orientLexemePairAny :: [Lexeme] -> [Lexeme]
210 orientLexemePairAny = \case
211 -- LexemeOther (Sourced _bx ex x):LexemeOther (Sourced by _ey y):acc -> LexemeOther (Sourced by ex (x<>y)):acc
214 t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc
215 w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
216 LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc
218 LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc
219 LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
220 LexemePairAny p:[] -> LexemePairOpen p:[]
223 LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
224 LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[]
226 w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
227 LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc
230 an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
231 an@LexemeEscape{} :a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
233 an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
234 an@LexemeEscape{} :LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
237 c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
239 LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
242 o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
244 LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairOpen p:c:acc
247 a1@LexemeAlphaNum{}:LexemePairAny p:a2@LexemeAlphaNum{}:acc ->
248 a1:LexemeOther (sconcat (((fst . pairBordersDouble) <$>) <$> p)):a2:acc
252 p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme
253 p_Lexeme = -- debugParser "Lexeme" $
255 [ P.try $ LexemeWhite <$> p_Cell p_Spaces1
256 , P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny)
257 , P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle)
258 , P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen <|> P.try p_BackOpen)
259 , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
260 , P.try $ LexemeEscape <$> p_Cell p_Escape
261 , P.try $ LexemeLink <$> p_Cell p_Link
262 , P.try $ LexemeAlphaNum <$> p_Cell p_AlphaNums1
263 , LexemeOther <$> p_Cell (TL.singleton <$> P.anySingle)
266 p_some :: Parser e s a -> Parser e s (NonEmpty a)
267 p_some p = NonEmpty.fromList <$> P.some p
269 pairAny :: Char -> Maybe Pair
272 '/' -> Just PairSlash
273 '"' -> Just PairDoublequote
274 '\'' -> Just PairSinglequote
275 '`' -> Just PairBackquote
276 '_' -> Just PairUnderscore
278 '#' -> Just $ PairTag False
279 '@' -> Just $ PairAt False
282 pairOpen :: Char -> Maybe Pair
284 '(' -> Just PairParen
285 '[' -> Just PairBracket
286 '{' -> Just PairBrace
287 '«' -> Just PairFrenchquote
290 pairClose :: Char -> Maybe Pair
292 ')' -> Just PairParen
293 ']' -> Just PairBracket
294 '}' -> Just PairBrace
295 '»' -> Just PairFrenchquote
298 p_BackOpen :: P.Tokens s ~ TL.Text => Parser e s Pair
299 p_BackOpen = -- debugParser "BackOpen" $
301 *> (PairAt True <$ P.char '@'
302 <|> PairTag True <$ P.char '#')
304 p_Escape :: Parser e s Char
305 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
307 p_Link :: P.Tokens s ~ TL.Text => Parser e s Link
309 P.try (P.char '<' *> p <* P.char '>') <|>
312 p :: P.Tokens s ~ TL.Text => Parser e s TL.Text
314 (\scheme addr -> scheme <> "//" <> addr)
315 <$> P.option "" (P.try p_scheme)
318 p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text
321 <$> (P.takeWhile1P (Just "scheme") $ \c ->
327 p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text
329 P.takeWhileP (Just "addr") $ \c ->
343 pairBorders :: Foldable f => Pair -> f a -> (TL.Text, TL.Text)
344 pairBorders p ts | null ts = pairBordersSingle p
345 | otherwise = pairBordersDouble p
347 pairBordersSingle :: Pair -> (TL.Text,TL.Text)
348 pairBordersSingle = \case
349 PairElem (XML.NCName n) as ->
350 ("<"<>n<>foldMap f as<>"/>","")
351 where f (elemAttr_white,ElemAttr{..}) =
353 XML.unNCName elemAttr_name <>
357 p -> pairBordersDouble p
359 pairBordersDouble :: Pair -> (TL.Text, TL.Text)
360 pairBordersDouble = \case
361 PairElem (XML.NCName n) as -> ("<"<>n<>foldMap f as<>">","</"<>n<>">")
362 where f (elemAttr_white,ElemAttr{..}) =
364 XML.unNCName elemAttr_name <>
368 PairTag isBackref | isBackref -> ("^#","#")
369 | otherwise -> ("#","#")
370 PairAt isBackref | isBackref -> ("^@","@")
371 | otherwise -> ("@","@")
372 PairStar -> ("*","*")
373 PairSlash -> ("/","/")
374 PairUnderscore -> ("_","_")
375 PairDash -> ("-","-")
376 PairBackquote -> ("`","`")
377 PairSinglequote -> ("'","'")
378 PairDoublequote -> ("\"","\"")
379 PairFrenchquote -> ("«","»")
380 PairParen -> ("(",")")
381 PairBrace -> ("{","}")
382 PairBracket -> ("[","]")
384 -- * Class 'ParseRef'
385 class ParseRef a where
386 parseRef :: a -> Maybe (Cell Ref, a)
387 instance ParseRef Tokens where
391 Tree0 (Sourced src0@(FileRange _f0 _b0 e0:|_l0) n) :< ns ->
393 NodeToken (TokenText t) ->
394 case parseRef $ Sourced src0 t of
397 if TL.null $ unSourced r0
400 Just (t1@(Sourced (FileRange _f1 b1 _e1:|_l1) _), r1) | e0 == b1 ->
403 else Just (t0, pure n0 `unionTokens` ns)
404 where n0 = Tree0 $ NodeToken . TokenText <$> r0
407 instance ParseRef (Cell TL.Text) where
408 parseRef (Sourced (FileRange fp bp ep:|sp) t)
409 | (w,r) <- TL.span isTagChar t
411 , ew <- sconcat $ colNum bp :| (P.mkPos . Text.length <$> TL.toChunks w) =
413 ( Sourced (FileRange fp bp bp{colNum=ew}:|sp) w
414 , Sourced (FileRange fp bp{colNum=ew} ep:|sp) r )
417 isTagChar :: Char -> Bool
421 case Char.generalCategory c of
422 Char.DashPunctuation -> True
423 Char.ConnectorPunctuation -> True
427 -- | Build 'Tokens' from many 'Token's.
428 tokens :: [Cell Token] -> Tokens
429 tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts
431 -- | Build 'Tokens' from one 'Token'.
432 tokens1 :: Tree (Cell Node) -> Tokens
433 tokens1 = Seq.singleton
435 unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens))
437 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
438 [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts))
441 isTokenElem :: Tokens -> Bool
443 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
444 [Tree (unSourced -> NodePair PairElem{}) _] -> True