1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# OPTIONS_GHC -fno-warn-orphans #-}
8 module Hdoc.TCT.Read.Token where
10 import Control.Applicative (Applicative(..), Alternative(..))
11 import Control.Monad (Monad(..))
13 import Data.Char (Char)
14 import Data.Either (Either(..))
15 import Data.Eq (Eq(..))
16 import Data.Foldable (Foldable(..))
17 import Data.Function (($), (.))
18 import Data.Functor ((<$>), ($>), (<$))
19 import Data.List.NonEmpty (NonEmpty(..))
20 import Data.Maybe (Maybe(..))
21 import Data.Monoid (Monoid(..))
22 import Data.Semigroup (Semigroup(..))
23 import Data.Sequence (ViewL(..), ViewR(..), (<|))
24 import Data.TreeSeq.Strict (Tree(..), Trees)
25 import Data.Tuple (fst,snd)
26 import Data.Void (Void)
27 import Prelude (Num(..))
28 import Text.Show (Show(..))
29 import qualified Data.Char as Char
30 import qualified Data.List.NonEmpty as NonEmpty
31 import qualified Data.Sequence as Seq
32 import qualified Data.Text as Text
33 import qualified Data.Text.Lazy as TL
34 import qualified Language.Symantic.XML as XML
35 import qualified Text.Megaparsec as P
36 import qualified Text.Megaparsec.Char as P
42 import Hdoc.TCT.Read.Elem
43 import Hdoc.TCT.Read.Cell
46 -- | Right-only Dyck language,
47 -- to keep track of opened 'Pair's.
48 type Pairs = (Tokens,[Opening])
49 type Tokens = Trees (Cell Node)
52 -- | An opened 'Pair' and its content so far.
53 type Opening = (Cell Pair,Tokens)
55 appendPairsToken :: Pairs -> Tree (Cell Node) -> Pairs
56 appendPairsToken ps t = appendPairsTokens ps (pure t)
58 appendPairsText :: Pairs -> Cell TL.Text -> Pairs
59 appendPairsText ps (Sourced sp t) =
62 NodeToken $ TokenText t
64 appendPairsTokens :: Pairs -> Tokens -> Pairs
65 appendPairsTokens (ts,[]) toks = (ts`unionTokens`toks,[])
66 appendPairsTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0`unionTokens`toks):ps)
68 -- | Unify two 'Tokens', merging border 'TokenText's if any.
69 unionTokens :: Tokens -> Tokens -> Tokens
71 case (Seq.viewr x, Seq.viewl y) of
72 (xs :> x0, y0 :< ys) ->
74 ( Tree (Sourced (FileRange fx bx _ex:| lx)
75 (NodeToken (TokenText tx))) tsx
76 , Tree (Sourced (FileRange _fy _by ey:|_ly)
77 (NodeToken (TokenText ty))) tsy ) ->
79 pure (Tree (Sourced (FileRange fx bx ey:|lx) $
80 NodeToken $ TokenText $ tx <> ty) (tsx<>tsy)
87 unionsTokens :: Foldable f => f Tokens -> Tokens
88 unionsTokens = foldl' unionTokens mempty
90 openPair :: Pairs -> Cell Pair -> Pairs
91 openPair (t,ps) p = (t,(p,mempty):ps)
93 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
94 closePair :: Pairs -> Cell Pair -> Pairs
95 closePair ps@(_,[]) (Sourced loc p) = -- debug0 "closePair" $
96 appendPairsText ps $ Sourced loc $
97 snd $ pairBordersDouble p
98 closePair (t,(cx@(Sourced (FileRange fx bx _ex:| lx) px),t1):ts)
99 cy@(Sourced (FileRange _fy _by ey:|_ly) py) = -- debug0 "closePair" $
101 (PairElem nx ax, PairElem ny ay) | nx == ny ->
102 appendPairsToken (t,ts) $
103 Tree (Sourced (FileRange fx bx ey:|lx) $ NodePair $ PairElem nx as) t1
104 where as | null ay = ax
107 appendPairsToken (t,ts) $
108 Tree (Sourced (FileRange fx bx ey:|lx) $ NodePair px) t1
113 (closeImpaired mempty (cx,t1))
115 -- | Close a 'Pair' when there is no matching 'LexemePairClose'.
116 closeImpaired :: Tokens -> (Cell Pair, Tokens) -> Tokens
117 closeImpaired acc (Sourced loc@(s0:|lp) pair, toks) = -- debug0 "closeImpaired" $
119 -- NOTE: try to close 'PairTag' as 'TokenTag' instead of 'TokenText'.
120 PairTag isBackref | Just (Sourced (FileRange{fileRange_end}:|_lt) ref, rest) <- parseRef body ->
121 Tree0 (Sourced (s0{fileRange_end}:|lp) $ NodeToken $ TokenTag isBackref ref) <| rest
122 -- NOTE: use bp (not bt) to include the '#'
123 -- NOTE: try to close 'PairAt' as 'TokenAt' instead of 'TokenText'.
124 PairAt isBackref | Just (Sourced (FileRange{fileRange_end}:|_lt) ref, rest) <- parseRef body ->
125 Tree0 (Sourced (s0{fileRange_end}:|lp) $ NodeToken $ TokenAt isBackref ref) <| rest
126 -- NOTE: use bp (not bt) to include the '@'
127 _ -> pure open `unionTokens` body
129 body = toks `unionTokens` acc
130 open = Tree0 $ Sourced loc $ NodeToken $
131 TokenText $ fst $ pairBordersDouble pair
133 -- | Close remaining 'Pair's at end of parsing.
134 closePairs :: Pairs -> Tokens
135 closePairs (t0,ps) = -- debug0 "closePairs" $
136 t0 `unionTokens` foldl' closeImpaired mempty ps
138 appendLexeme :: Lexeme -> Pairs -> Pairs
139 appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc ->
141 LexemePairOpen ps -> foldl' open acc ps
143 -- NOTE: insert an empty node to encode <elem></elem>, not <elem/>
144 open a p@(Sourced (FileRange{fileRange_end, ..}:|sp) PairElem{}) = openPair a p `appendPairsText` Sourced (FileRange{fileRange_begin=fileRange_end, ..}:|sp) ""
145 open a p = openPair a p
146 LexemePairClose ps -> foldl' closePair acc ps
147 LexemePairAny ps -> foldl' openPair acc ps
148 LexemePairBoth ps -> appendPairsTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps
149 LexemeEscape c -> appendPairsToken acc $ Tree0 $ NodeToken . TokenEscape <$> c
150 LexemeLink t -> appendPairsToken acc $ Tree0 $ NodeToken . TokenLink <$> t
151 {-LexemeWhite (unSourced -> "") -> acc-}
152 -- LexemeWhite (unSourced -> Text.all (==' ') -> True) -> acc
153 LexemeWhite t -> appendPairsText acc t
154 LexemeAlphaNum t -> appendPairsText acc t
155 LexemeOther t -> appendPairsText acc t
156 LexemeTree t -> appendPairsToken acc t
159 appendLexemes :: Pairs -> [Lexeme] -> Pairs
160 appendLexemes = foldr appendLexeme
163 -- | 'Lexeme's cut the input in the longest chunks of common semantic,
164 -- this enables 'orientLexemePairAny' to work with a more meaningful context.
166 = LexemePairOpen !(NonEmpty (Cell Pair))
167 | LexemePairClose !(NonEmpty (Cell Pair))
168 | LexemePairAny !(NonEmpty (Cell Pair))
169 -- ^ orientation depending on the surrounding 'Lexeme's,
170 -- see 'orientLexemePairAny'
171 | LexemePairBoth !(NonEmpty (Cell Pair))
172 | LexemeEscape !(Cell Char)
173 | LexemeLink !(Cell Link)
174 | LexemeWhite !(Cell TL.Text)
175 | LexemeAlphaNum !(Cell TL.Text)
176 | LexemeOther !(Cell TL.Text)
177 | LexemeTree !(Tree (Cell Node)) -- FIXME: useless?
180 instance Pretty Lexeme
182 parseTokens :: [Lexeme] -> Tokens
185 appendLexemes mempty $
186 -- debug0 "Lexemes (post orient)" $
187 orientLexemePairAny $ LexemeEnd :
192 Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme]
193 parseLexemes = runParserOnCell (p_Lexemes <* P.eof)
195 -- | Parse 'Lexeme's, returning them in reverse order
196 -- to apply 'orientLexemePairAny'.
197 p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme]
198 p_Lexemes = debugParser "Lexemes" $ go []
200 go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme]
203 (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
205 -- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme',
206 -- so that it can try to orient nearby 'LexemePairAny'
207 -- to 'LexemePairOpen' or 'LexemePairClose'.
208 orientLexemePairAny :: [Lexeme] -> [Lexeme]
209 orientLexemePairAny = \case
210 -- LexemeOther (Sourced _bx ex x):LexemeOther (Sourced by _ey y):acc -> LexemeOther (Sourced by ex (x<>y)):acc
213 t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc
214 w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
215 LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc
217 LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc
218 LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
219 LexemePairAny p:[] -> LexemePairOpen p:[]
222 LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
223 LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[]
225 w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
226 LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc
229 an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
230 an@LexemeEscape{} :a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
232 an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
233 an@LexemeEscape{} :LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
236 c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
238 LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
241 o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
243 LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairClose p:c:acc
246 a1@LexemeAlphaNum{}:LexemePairAny p:a2@LexemeAlphaNum{}:acc ->
247 a1:LexemeOther (sconcat (((fst . pairBordersDouble) <$>) <$> p)):a2:acc
251 p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme
252 p_Lexeme = debugParser "Lexeme" $
254 [ P.try $ LexemeWhite <$> p_Cell p_Spaces1
255 , P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny)
256 , P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle)
257 , P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen <|> P.try p_BackOpen)
258 , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
259 , P.try $ LexemeEscape <$> p_Cell p_Escape
260 , P.try $ LexemeLink <$> p_Cell p_Link
261 , P.try $ LexemeAlphaNum <$> p_Cell p_AlphaNums1
262 , LexemeOther <$> p_Cell (TL.singleton <$> P.anyChar)
265 p_some :: Parser e s a -> Parser e s (NonEmpty a)
266 p_some p = NonEmpty.fromList <$> P.some p
268 pairAny :: Char -> Maybe Pair
271 '/' -> Just PairSlash
272 '"' -> Just PairDoublequote
273 '\'' -> Just PairSinglequote
274 '`' -> Just PairBackquote
275 '_' -> Just PairUnderscore
277 '#' -> Just $ PairTag False
278 '@' -> Just $ PairAt False
281 pairOpen :: Char -> Maybe Pair
283 '(' -> Just PairParen
284 '[' -> Just PairBracket
285 '{' -> Just PairBrace
286 '«' -> Just PairFrenchquote
289 pairClose :: Char -> Maybe Pair
291 ')' -> Just PairParen
292 ']' -> Just PairBracket
293 '}' -> Just PairBrace
294 '»' -> Just PairFrenchquote
297 p_BackOpen :: P.Tokens s ~ TL.Text => Parser e s Pair
298 p_BackOpen = debugParser "BackOpen" $
300 *> (PairAt True <$ P.char '@'
301 <|> PairTag True <$ P.char '#')
303 p_Escape :: Parser e s Char
304 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
306 p_Link :: P.Tokens s ~ TL.Text => Parser e s Link
308 P.try (P.char '<' *> p <* P.char '>') <|>
311 p :: P.Tokens s ~ TL.Text => Parser e s TL.Text
313 (\scheme addr -> scheme <> "//" <> addr)
314 <$> P.option "" (P.try p_scheme)
317 p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text
320 <$> (P.takeWhile1P (Just "scheme") $ \c ->
326 p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text
328 P.takeWhileP (Just "addr") $ \c ->
342 pairBorders :: Foldable f => Pair -> f a -> (TL.Text, TL.Text)
343 pairBorders p ts | null ts = pairBordersSingle p
344 | otherwise = pairBordersDouble p
346 pairBordersSingle :: Pair -> (TL.Text,TL.Text)
347 pairBordersSingle = \case
348 PairElem (XML.NCName n) as ->
349 ("<"<>n<>foldMap f as<>"/>","")
350 where f (elemAttr_white,ElemAttr{..}) =
352 XML.unNCName elemAttr_name <>
356 p -> pairBordersDouble p
358 pairBordersDouble :: Pair -> (TL.Text, TL.Text)
359 pairBordersDouble = \case
360 PairElem (XML.NCName n) as -> ("<"<>n<>foldMap f as<>">","</"<>n<>">")
361 where f (elemAttr_white,ElemAttr{..}) =
363 XML.unNCName elemAttr_name <>
367 PairTag isBackref | isBackref -> ("^#","#")
368 | otherwise -> ("#","#")
369 PairAt isBackref | isBackref -> ("^@","@")
370 | otherwise -> ("@","@")
371 PairStar -> ("*","*")
372 PairSlash -> ("/","/")
373 PairUnderscore -> ("_","_")
374 PairDash -> ("-","-")
375 PairBackquote -> ("`","`")
376 PairSinglequote -> ("'","'")
377 PairDoublequote -> ("\"","\"")
378 PairFrenchquote -> ("«","»")
379 PairParen -> ("(",")")
380 PairBrace -> ("{","}")
381 PairBracket -> ("[","]")
383 -- * Class 'ParseRef'
384 class ParseRef a where
385 parseRef :: a -> Maybe (Cell Ref, a)
386 instance ParseRef Tokens where
390 Tree0 (Sourced src0@(FileRange _f0 _b0 e0:|_l0) n) :< ns ->
392 NodeToken (TokenText t) ->
393 case parseRef $ Sourced src0 t of
396 if TL.null $ unSourced r0
399 Just (t1@(Sourced (FileRange _f1 b1 _e1:|_l1) _), r1) | e0 == b1 ->
402 else Just (t0, pure n0 `unionTokens` ns)
403 where n0 = Tree0 $ NodeToken . TokenText <$> r0
406 instance ParseRef (Cell TL.Text) where
407 parseRef (Sourced (FileRange fp bp ep:|sp) t)
408 | (w,r) <- TL.span isTagChar t
410 , ew <- filePos_column bp + sum (Text.length <$> TL.toChunks w) =
412 ( Sourced (FileRange fp bp bp{filePos_column=ew}:|sp) w
413 , Sourced (FileRange fp bp{filePos_column=ew} ep:|sp) r )
416 isTagChar :: Char -> Bool
420 case Char.generalCategory c of
421 Char.DashPunctuation -> True
422 Char.ConnectorPunctuation -> True
426 -- | Build 'Tokens' from many 'Token's.
427 tokens :: [Cell Token] -> Tokens
428 tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts
430 -- | Build 'Tokens' from one 'Token'.
431 tokens1 :: Tree (Cell Node) -> Tokens
432 tokens1 = Seq.singleton
434 unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens))
436 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
437 [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts))
440 isTokenElem :: Tokens -> Bool
442 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
443 [Tree (unSourced -> NodePair PairElem{}) _] -> True