]> Git — Sourcephile - doclang.git/blob - Hdoc/TCT/Read/Token.hs
stack: fix locations
[doclang.git] / Hdoc / TCT / Read / Token.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# OPTIONS_GHC -fno-warn-orphans #-}
8 module Hdoc.TCT.Read.Token where
9
10 import Control.Applicative (Applicative(..), Alternative(..))
11 import Control.Monad (Monad(..))
12 import Data.Bool
13 import Data.Char (Char)
14 import Data.Either (Either(..))
15 import Data.Eq (Eq(..))
16 import Data.Foldable (Foldable(..))
17 import Data.Function (($), (.))
18 import Data.Functor ((<$>), ($>), (<$))
19 import Data.List.NonEmpty (NonEmpty(..))
20 import Data.Maybe (Maybe(..))
21 import Data.Monoid (Monoid(..))
22 import Data.Semigroup (Semigroup(..))
23 import Data.Sequence (ViewL(..), ViewR(..), (<|))
24 import Data.TreeSeq.Strict (Tree(..), Trees)
25 import Data.Tuple (fst,snd)
26 import Data.Void (Void)
27 import Prelude (Num(..))
28 import Text.Show (Show(..))
29 import qualified Data.Char as Char
30 import qualified Data.List.NonEmpty as NonEmpty
31 import qualified Data.Sequence as Seq
32 import qualified Data.Text as Text
33 import qualified Data.Text.Lazy as TL
34 import qualified Text.Megaparsec as P
35 import qualified Text.Megaparsec.Char as P
36
37 import Hdoc.TCT.Debug
38 import Hdoc.TCT.Cell
39 import Hdoc.TCT.Elem
40 import Hdoc.TCT.Tree
41 import Hdoc.TCT.Read.Elem
42 import Hdoc.TCT.Read.Cell
43
44 -- * Type 'Pairs'
45 -- | Right-only Dyck language,
46 -- to keep track of opened 'Pair's.
47 type Pairs = (Tokens,[Opening])
48 type Tokens = Trees (Cell Node)
49
50 -- ** Type 'Opening'
51 -- | An opened 'Pair' and its content so far.
52 type Opening = (Cell Pair,Tokens)
53
54 appendPairsToken :: Pairs -> Tree (Cell Node) -> Pairs
55 appendPairsToken ps t = appendPairsTokens ps (pure t)
56
57 appendPairsText :: Pairs -> Cell TL.Text -> Pairs
58 appendPairsText ps (Cell sp t) =
59 appendPairsToken ps $
60 Tree0 $ Cell sp $
61 NodeToken $ TokenText t
62
63 appendPairsTokens :: Pairs -> Tokens -> Pairs
64 appendPairsTokens (ts,[]) toks = (ts`unionTokens`toks,[])
65 appendPairsTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0`unionTokens`toks):ps)
66
67 -- | Unify two 'Tokens', merging border 'TokenText's if any.
68 unionTokens :: Tokens -> Tokens -> Tokens
69 unionTokens x y =
70 case (Seq.viewr x, Seq.viewl y) of
71 (xs :> x0, y0 :< ys) ->
72 case (x0,y0) of
73 ( Tree (Cell (Span fx bx _ex:| lx) (NodeToken (TokenText tx))) tsx
74 , Tree (Cell (Span _fy _by ey:|_ly) (NodeToken (TokenText ty))) tsy ) ->
75 xs `unionTokens`
76 pure (Tree (Cell (Span fx bx ey:|lx) $ NodeToken $ TokenText $ tx <> ty) (tsx<>tsy)) `unionTokens`
77 ys
78 _ -> x <> y
79 (EmptyR, _) -> y
80 (_, EmptyL) -> x
81
82 unionsTokens :: Foldable f => f Tokens -> Tokens
83 unionsTokens = foldl' unionTokens mempty
84
85 openPair :: Pairs -> Cell Pair -> Pairs
86 openPair (t,ps) p = (t,(p,mempty):ps)
87
88 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
89 closePair :: Pairs -> Cell Pair -> Pairs
90 closePair ps@(_,[]) (Cell loc p) = -- debug0 "closePair" $
91 appendPairsText ps $ Cell loc $
92 snd $ pairBordersDouble p
93 closePair (t,(cx@(Cell (Span fx bx _ex:| lx) px),t1):ts)
94 cy@(Cell (Span _fy _by ey:|_ly) py) = -- debug0 "closePair" $
95 case (px,py) of
96 (PairElem nx ax, PairElem ny ay) | nx == ny ->
97 appendPairsToken (t,ts) $
98 Tree (Cell (Span fx bx ey:|lx) $ NodePair $ PairElem nx as) t1
99 where as | null ay = ax
100 | otherwise = ax<>ay
101 _ | px == py ->
102 appendPairsToken (t,ts) $
103 Tree (Cell (Span fx bx ey:|lx) $ NodePair px) t1
104 _ ->
105 (`closePair` cy) $
106 appendPairsTokens
107 (t,ts)
108 (closeImpaired mempty (cx,t1))
109
110 -- | Close a 'Pair' when there is no matching 'LexemePairClose'.
111 closeImpaired :: Tokens -> (Cell Pair, Tokens) -> Tokens
112 closeImpaired acc (Cell loc@(s0:|lp) pair, toks) = -- debug0 "closeImpaired" $
113 case pair of
114 -- NOTE: try to close 'PairTag' as 'TokenTag' instead of 'TokenText'.
115 PairTag isBackref | Just (Cell (Span{span_end}:|_lt) ref, rest) <- parseRef body ->
116 Tree0 (Cell (s0{span_end}:|lp) $ NodeToken $ TokenTag isBackref ref) <| rest
117 -- NOTE: use bp (not bt) to include the '#'
118 -- NOTE: try to close 'PairAt' as 'TokenAt' instead of 'TokenText'.
119 PairAt isBackref | Just (Cell (Span{span_end}:|_lt) ref, rest) <- parseRef body ->
120 Tree0 (Cell (s0{span_end}:|lp) $ NodeToken $ TokenAt isBackref ref) <| rest
121 -- NOTE: use bp (not bt) to include the '@'
122 _ -> pure open `unionTokens` body
123 where
124 body = toks `unionTokens` acc
125 open = Tree0 $ Cell loc $ NodeToken $ TokenText $ fst $ pairBordersDouble pair
126
127 -- | Close remaining 'Pair's at end of parsing.
128 closePairs :: Pairs -> Tokens
129 closePairs (t0,ps) = -- debug0 "closePairs" $
130 t0 `unionTokens` foldl' closeImpaired mempty ps
131
132 appendLexeme :: Lexeme -> Pairs -> Pairs
133 appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc ->
134 case lex of
135 LexemePairOpen ps -> foldl' open acc ps
136 where
137 -- NOTE: insert an empty node to encode <elem></elem>, not <elem/>
138 open a p@(Cell (Span{span_end, ..}:|sp) PairElem{}) = openPair a p `appendPairsText` Cell (Span{span_begin=span_end, ..}:|sp) ""
139 open a p = openPair a p
140 LexemePairClose ps -> foldl' closePair acc ps
141 LexemePairAny ps -> foldl' openPair acc ps
142 LexemePairBoth ps -> appendPairsTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps
143 LexemeEscape c -> appendPairsToken acc $ Tree0 $ NodeToken . TokenEscape <$> c
144 LexemeLink t -> appendPairsToken acc $ Tree0 $ NodeToken . TokenLink <$> t
145 {-LexemeWhite (unCell -> "") -> acc-}
146 -- LexemeWhite (unCell -> Text.all (==' ') -> True) -> acc
147 LexemeWhite t -> appendPairsText acc t
148 LexemeAlphaNum t -> appendPairsText acc t
149 LexemeOther t -> appendPairsText acc t
150 LexemeTree t -> appendPairsToken acc t
151 LexemeEnd -> acc
152
153 appendLexemes :: Pairs -> [Lexeme] -> Pairs
154 appendLexemes = foldr appendLexeme
155
156 -- * Type 'Lexeme'
157 -- | 'Lexeme's cut the input in the longest chunks of common semantic,
158 -- this enables 'orientLexemePairAny' to work with a more meaningful context.
159 data Lexeme
160 = LexemePairOpen !(NonEmpty (Cell Pair))
161 | LexemePairClose !(NonEmpty (Cell Pair))
162 | LexemePairAny !(NonEmpty (Cell Pair))
163 -- ^ orientation depending on the surrounding 'Lexeme's,
164 -- see 'orientLexemePairAny'
165 | LexemePairBoth !(NonEmpty (Cell Pair))
166 | LexemeEscape !(Cell Char)
167 | LexemeLink !(Cell TL.Text)
168 | LexemeWhite !(Cell TL.Text)
169 | LexemeAlphaNum !(Cell TL.Text)
170 | LexemeOther !(Cell TL.Text)
171 | LexemeTree !(Tree (Cell Node)) -- FIXME: useless?
172 | LexemeEnd
173 deriving (Eq, Show)
174 instance Pretty Lexeme
175
176 parseTokens :: [Lexeme] -> Tokens
177 parseTokens ps =
178 closePairs $
179 appendLexemes mempty $
180 -- debug0 "Lexemes (post orient)" $
181 orientLexemePairAny $ LexemeEnd :
182 ps
183
184 parseLexemes ::
185 Cell TL.Text ->
186 Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme]
187 parseLexemes = runParserOnCell (p_Lexemes <* P.eof)
188
189 -- | Parse 'Lexeme's, returning them in reverse order
190 -- to apply 'orientLexemePairAny'.
191 p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme]
192 p_Lexemes = debugParser "Lexemes" $ go []
193 where
194 go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme]
195 go acc =
196 (P.eof $> acc) <|>
197 (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
198
199 -- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme',
200 -- so that it can try to orient nearby 'LexemePairAny'
201 -- to 'LexemePairOpen' or 'LexemePairClose'.
202 orientLexemePairAny :: [Lexeme] -> [Lexeme]
203 orientLexemePairAny = \case
204 -- LexemeOther (Cell _bx ex x):LexemeOther (Cell by _ey y):acc -> LexemeOther (Cell by ex (x<>y)):acc
205
206 -- "   
207 t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc
208 w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
209 LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc
210 --    "
211 LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc
212 LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
213 LexemePairAny p:[] -> LexemePairOpen p:[]
214
215 --    ,,,"
216 LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
217 LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[]
218 -- ",,,   
219 w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
220 LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc
221
222 -- ",,,AAA
223 an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
224 an@LexemeEscape{} :a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
225 -- ,,,"AAA
226 an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
227 an@LexemeEscape{} :LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
228
229 -- ")
230 c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
231 -- ("
232 LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
233
234 -- "(
235 o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
236 -- )"
237 LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairClose p:c:acc
238
239 -- AAA#AAA
240 a1@LexemeAlphaNum{}:LexemePairAny p:a2@LexemeAlphaNum{}:acc ->
241 a1:LexemeOther (sconcat (((fst . pairBordersDouble) <$>) <$> p)):a2:acc
242
243 acc -> acc
244
245 p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme
246 p_Lexeme = debugParser "Lexeme" $
247 P.choice
248 [ P.try $ LexemeWhite <$> p_Cell p_Spaces1
249 , P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny)
250 , P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle)
251 , P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen <|> P.try p_BackOpen)
252 , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
253 , P.try $ LexemeEscape <$> p_Cell p_Escape
254 , P.try $ LexemeLink <$> p_Cell p_Link
255 , P.try $ LexemeAlphaNum <$> p_Cell p_AlphaNums1
256 , LexemeOther <$> p_Cell (TL.singleton <$> P.anyChar)
257 ]
258
259 p_some :: Parser e s a -> Parser e s (NonEmpty a)
260 p_some p = NonEmpty.fromList <$> P.some p
261
262 pairAny :: Char -> Maybe Pair
263 pairAny = \case
264 '-' -> Just PairDash
265 '/' -> Just PairSlash
266 '"' -> Just PairDoublequote
267 '\'' -> Just PairSinglequote
268 '`' -> Just PairBackquote
269 '_' -> Just PairUnderscore
270 '*' -> Just PairStar
271 '#' -> Just $ PairTag False
272 '@' -> Just $ PairAt False
273 _ -> Nothing
274
275 pairOpen :: Char -> Maybe Pair
276 pairOpen = \case
277 '(' -> Just PairParen
278 '[' -> Just PairBracket
279 '{' -> Just PairBrace
280 '«' -> Just PairFrenchquote
281 _ -> Nothing
282
283 pairClose :: Char -> Maybe Pair
284 pairClose = \case
285 ')' -> Just PairParen
286 ']' -> Just PairBracket
287 '}' -> Just PairBrace
288 '»' -> Just PairFrenchquote
289 _ -> Nothing
290
291 p_BackOpen :: P.Tokens s ~ TL.Text => Parser e s Pair
292 p_BackOpen = debugParser "BackOpen" $
293 P.char '~'
294 *> (PairAt True <$ P.char '@'
295 <|> PairTag True <$ P.char '#')
296
297 p_Escape :: Parser e s Char
298 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
299
300 p_Link :: P.Tokens s ~ TL.Text => Parser e s TL.Text
301 p_Link =
302 P.try (P.char '<' *> p <* P.char '>') <|>
303 p
304 where
305 p :: P.Tokens s ~ TL.Text => Parser e s TL.Text
306 p =
307 (\scheme addr -> scheme <> "//" <> addr)
308 <$> P.option "" (P.try p_scheme)
309 <* P.string "//"
310 <*> p_addr
311 p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text
312 p_scheme =
313 (<> ":")
314 <$> (P.takeWhile1P (Just "scheme") $ \c ->
315 Char.isAlphaNum c
316 || c=='_'
317 || c=='-'
318 || c=='+')
319 <* P.char ':'
320 p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text
321 p_addr =
322 P.takeWhileP (Just "addr") $ \c ->
323 Char.isAlphaNum c
324 || c=='%'
325 || c=='/'
326 || c=='('
327 || c==')'
328 || c=='-'
329 || c=='_'
330 || c=='.'
331 || c=='#'
332 || c=='?'
333 || c=='='
334 || c=='@'
335
336 pairBorders :: Foldable f => Pair -> f a -> (TL.Text,TL.Text)
337 pairBorders p ts | null ts = pairBordersSingle p
338 | otherwise = pairBordersDouble p
339
340 pairBordersSingle :: Pair -> (TL.Text,TL.Text)
341 pairBordersSingle = \case
342 PairElem n as ->
343 ("<"<>n<>foldMap f as<>"/>","")
344 where f (elemAttr_white,ElemAttr{..}) =
345 elemAttr_white <>
346 elemAttr_name <>
347 elemAttr_open <>
348 elemAttr_value <>
349 elemAttr_close
350 p -> pairBordersDouble p
351
352 pairBordersDouble :: Pair -> (TL.Text,TL.Text)
353 pairBordersDouble = \case
354 PairElem n as -> ("<"<>n<>foldMap f as<>">","</"<>n<>">")
355 where f (elemAttr_white,ElemAttr{..}) =
356 elemAttr_white <>
357 elemAttr_name <>
358 elemAttr_open <>
359 elemAttr_value <>
360 elemAttr_close
361 PairTag isBackref | isBackref -> ("^#","#")
362 | otherwise -> ("#","#")
363 PairAt isBackref | isBackref -> ("^@","@")
364 | otherwise -> ("@","@")
365 PairStar -> ("*","*")
366 PairSlash -> ("/","/")
367 PairUnderscore -> ("_","_")
368 PairDash -> ("-","-")
369 PairBackquote -> ("`","`")
370 PairSinglequote -> ("'","'")
371 PairDoublequote -> ("\"","\"")
372 PairFrenchquote -> ("«","»")
373 PairParen -> ("(",")")
374 PairBrace -> ("{","}")
375 PairBracket -> ("[","]")
376
377 -- * Class 'ParseRef'
378 class ParseRef a where
379 parseRef :: a -> Maybe (Cell Ref, a)
380 instance ParseRef Tokens where
381 parseRef ts =
382 case Seq.viewl ts of
383 EmptyL -> Nothing
384 Tree0 (Cell loc0@(Span _f0 _b0 e0:|_l0) n) :< ns ->
385 case n of
386 NodeToken (TokenText t) ->
387 case parseRef $ Cell loc0 t of
388 Nothing -> Nothing
389 Just (t0,r0) ->
390 if TL.null $ unCell r0
391 then
392 case parseRef ns of
393 Just (t1@(Cell (Span _f1 b1 _e1:|_l1) _), r1) | e0 == b1 ->
394 Just (t0<>t1, r1)
395 _ -> Just (t0, ns)
396 else Just (t0, pure n0 `unionTokens` ns)
397 where n0 = Tree0 $ NodeToken . TokenText <$> r0
398 _ -> Nothing
399 _ -> Nothing
400 instance ParseRef (Cell TL.Text) where
401 parseRef (Cell (Span fp bp ep:|sp) t)
402 | (w,r) <- TL.span isTagChar t
403 , not $ TL.null w
404 , ew <- pos_column bp + sum (Text.length <$> TL.toChunks w) =
405 Just
406 ( Cell (Span fp bp bp{pos_column=ew}:|sp) w
407 , Cell (Span fp bp{pos_column=ew} ep:|sp) r )
408 parseRef _ = Nothing
409
410 isTagChar :: Char -> Bool
411 isTagChar c =
412 Char.isAlphaNum c ||
413 c=='·' ||
414 case Char.generalCategory c of
415 Char.DashPunctuation -> True
416 Char.ConnectorPunctuation -> True
417 _ -> False
418
419 {-
420 -- | Build 'Tokens' from many 'Token's.
421 tokens :: [Cell Token] -> Tokens
422 tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts
423
424 -- | Build 'Tokens' from one 'Token'.
425 tokens1 :: Tree (Cell Node) -> Tokens
426 tokens1 = Seq.singleton
427
428 unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens))
429 unTokenElem toks =
430 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
431 [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts))
432 _ -> Nothing
433
434 isTokenElem :: Tokens -> Bool
435 isTokenElem toks =
436 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
437 [Tree (unCell -> NodePair PairElem{}) _] -> True
438 _ -> False
439 -}