]> Git — Sourcephile - doclang.git/blob - Hdoc/TCT/Read/Token.hs
Renames in XML, to use it qualified.
[doclang.git] / Hdoc / TCT / Read / Token.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# OPTIONS_GHC -fno-warn-orphans #-}
8 module Hdoc.TCT.Read.Token where
9
10 import Control.Applicative (Applicative(..), Alternative(..))
11 import Control.Monad (Monad(..))
12 import Data.Bool
13 import Data.Char (Char)
14 import Data.Either (Either(..))
15 import Data.Eq (Eq(..))
16 import Data.Foldable (Foldable(..))
17 import Data.Function (($), (.))
18 import Data.Functor ((<$>), ($>))
19 import Data.List.NonEmpty (NonEmpty(..))
20 import Data.Maybe (Maybe(..))
21 import Data.Monoid (Monoid(..))
22 import Data.Semigroup (Semigroup(..))
23 import Data.Sequence (ViewL(..), ViewR(..), (<|))
24 import Data.TreeSeq.Strict (Tree(..), Trees)
25 import Data.Tuple (fst,snd)
26 import Data.Void (Void)
27 import Prelude (Num(..))
28 import Text.Show (Show(..))
29 import qualified Data.Char as Char
30 import qualified Data.List.NonEmpty as NonEmpty
31 import qualified Data.Sequence as Seq
32 import qualified Data.Text as Text
33 import qualified Data.Text.Lazy as TL
34 import qualified Text.Megaparsec as P
35 import qualified Text.Megaparsec.Char as P
36
37 import Hdoc.TCT.Debug
38 import Hdoc.TCT.Cell
39 import Hdoc.TCT.Elem
40 import Hdoc.TCT.Tree
41 import Hdoc.TCT.Read.Elem
42 import Hdoc.TCT.Read.Cell
43
44 -- * Type 'Pairs'
45 -- | Right-only Dyck language,
46 -- to keep track of opened 'Pair's.
47 type Pairs = (Tokens,[Opening])
48 type Tokens = Trees (Cell Node)
49
50 -- ** Type 'Opening'
51 -- | An opened 'Pair' and its content so far.
52 type Opening = (Cell Pair,Tokens)
53
54 appendPairsToken :: Pairs -> Tree (Cell Node) -> Pairs
55 appendPairsToken ps t = appendPairsTokens ps (pure t)
56
57 appendPairsText :: Pairs -> Cell TL.Text -> Pairs
58 appendPairsText ps (Cell sp t) =
59 appendPairsToken ps $
60 Tree0 $ Cell sp $
61 NodeToken $ TokenText t
62
63 appendPairsTokens :: Pairs -> Tokens -> Pairs
64 appendPairsTokens (ts,[]) toks = (ts`unionTokens`toks,[])
65 appendPairsTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0`unionTokens`toks):ps)
66
67 -- | Unify two 'Tokens', merging border 'TokenText's if any.
68 unionTokens :: Tokens -> Tokens -> Tokens
69 unionTokens x y =
70 case (Seq.viewr x, Seq.viewl y) of
71 (xs :> x0, y0 :< ys) ->
72 case (x0,y0) of
73 ( Tree (Cell (Span fx bx _ex:| lx) (NodeToken (TokenText tx))) tsx
74 , Tree (Cell (Span _fy _by ey:|_ly) (NodeToken (TokenText ty))) tsy ) ->
75 xs `unionTokens`
76 pure (Tree (Cell (Span fx bx ey:|lx) $ NodeToken $ TokenText $ tx <> ty) (tsx<>tsy)) `unionTokens`
77 ys
78 _ -> x <> y
79 (EmptyR, _) -> y
80 (_, EmptyL) -> x
81
82 unionsTokens :: Foldable f => f Tokens -> Tokens
83 unionsTokens = foldl' unionTokens mempty
84
85 openPair :: Pairs -> Cell Pair -> Pairs
86 openPair (t,ps) p = (t,(p,mempty):ps)
87
88 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
89 closePair :: Pairs -> Cell Pair -> Pairs
90 closePair ps@(_,[]) (Cell loc p) = -- debug0 "closePair" $
91 appendPairsText ps $ Cell loc $
92 snd $ pairBordersDouble p
93 closePair (t,(cx@(Cell (Span fx bx _ex:| lx) px),t1):ts)
94 cy@(Cell (Span _fy _by ey:|_ly) py) = -- debug0 "closePair" $
95 case (px,py) of
96 (PairElem nx ax, PairElem ny ay) | nx == ny ->
97 appendPairsToken (t,ts) $
98 Tree (Cell (Span fx bx ey:|lx) $ NodePair $ PairElem nx as) t1
99 where as | null ay = ax
100 | otherwise = ax<>ay
101 _ | px == py ->
102 appendPairsToken (t,ts) $
103 Tree (Cell (Span fx bx ey:|lx) $ NodePair px) t1
104 _ ->
105 (`closePair` cy) $
106 appendPairsTokens
107 (t,ts)
108 (closeImpaired mempty (cx,t1))
109
110 -- | Close a 'Pair' when there is no matching 'LexemePairClose'.
111 closeImpaired :: Tokens -> (Cell Pair, Tokens) -> Tokens
112 closeImpaired acc (Cell loc@(s0:|lp) pair, toks) = -- debug0 "closeImpaired" $
113 case pair of
114 -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'.
115 PairHash | Just (Cell (Span{span_end}:|_lt) tag, rest) <- tagFrom body ->
116 Tree0 (Cell (s0{span_end}:|lp) $ NodeToken $ TokenTag tag) <| rest
117 -- NOTE: use bp (not bt) to include the '#'
118 _ -> pure open `unionTokens` body
119 where
120 body = toks `unionTokens` acc
121 open = Tree0 $ Cell loc $ NodeToken $ TokenText $ fst $ pairBordersDouble pair
122
123 -- | Close remaining 'Pair's at end of parsing.
124 closePairs :: Pairs -> Tokens
125 closePairs (t0,ps) = -- debug0 "closePairs" $
126 t0 `unionTokens` foldl' closeImpaired mempty ps
127
128 appendLexeme :: Lexeme -> Pairs -> Pairs
129 appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc ->
130 case lex of
131 LexemePairOpen ps -> foldl' open acc ps
132 where
133 -- NOTE: insert an empty node to encode <elem></elem>, not <elem/>
134 open a p@(Cell (Span{span_end, ..}:|sp) PairElem{}) = openPair a p `appendPairsText` Cell (Span{span_begin=span_end, ..}:|sp) ""
135 open a p = openPair a p
136 LexemePairClose ps -> foldl' closePair acc ps
137 LexemePairAny ps -> foldl' openPair acc ps
138 {-
139 LexemePairAny ps ->
140 appendPairsText acc $ sconcat $
141 ((fst . pairBordersSingle) <$>) <$> ps
142 -}
143 LexemePairBoth ps -> appendPairsTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps
144 LexemeEscape c -> appendPairsToken acc $ Tree0 $ NodeToken . TokenEscape <$> c
145 LexemeLink t -> appendPairsToken acc $ Tree0 $ NodeToken . TokenLink <$> t
146 {-LexemeWhite (unCell -> "") -> acc-}
147 -- LexemeWhite (unCell -> Text.all (==' ') -> True) -> acc
148 LexemeWhite t -> appendPairsText acc t
149 LexemeAlphaNum t -> appendPairsText acc t
150 LexemeOther t -> appendPairsText acc t
151 LexemeTree t -> appendPairsToken acc t
152 LexemeEnd -> acc
153
154 appendLexemes :: Pairs -> [Lexeme] -> Pairs
155 appendLexemes = foldr appendLexeme
156
157 -- * Type 'Lexeme'
158 -- | 'Lexeme's cut the input in the longest chunks of common semantic,
159 -- this enables 'orientLexemePairAny' to work with a more meaningful context.
160 data Lexeme
161 = LexemePairOpen !(NonEmpty (Cell Pair))
162 | LexemePairClose !(NonEmpty (Cell Pair))
163 | LexemePairAny !(NonEmpty (Cell Pair))
164 -- ^ orientation depending on the surrounding 'Lexeme's,
165 -- see 'orientLexemePairAny'
166 | LexemePairBoth !(NonEmpty (Cell Pair))
167 | LexemeEscape !(Cell Char)
168 | LexemeLink !(Cell TL.Text)
169 | LexemeWhite !(Cell TL.Text)
170 | LexemeAlphaNum !(Cell TL.Text)
171 | LexemeOther !(Cell TL.Text)
172 | LexemeTree !(Tree (Cell Node))
173 | LexemeEnd
174 deriving (Eq, Show)
175 instance Pretty Lexeme
176
177 parseTokens :: [Lexeme] -> Tokens
178 parseTokens ps =
179 closePairs $
180 appendLexemes mempty $
181 -- debug0 "Lexemes (post orient)" $
182 orientLexemePairAny $ LexemeEnd :
183 ps
184
185 parseLexemes ::
186 Cell TL.Text ->
187 Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme]
188 parseLexemes = runParserOnCell (p_Lexemes <* P.eof)
189
190 -- | Parse 'Lexeme's, returning them in reverse order
191 -- to apply 'orientLexemePairAny'.
192 p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme]
193 p_Lexemes = debugParser "Lexemes" $ go []
194 where
195 go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme]
196 go acc =
197 (P.eof $> acc) <|>
198 (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
199
200 -- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme',
201 -- so that it can try to orient nearby 'LexemePairAny'
202 -- to 'LexemePairOpen' or 'LexemePairClose'.
203 orientLexemePairAny :: [Lexeme] -> [Lexeme]
204 orientLexemePairAny = \case
205 -- LexemeOther (Cell _bx ex x):LexemeOther (Cell by _ey y):acc -> LexemeOther (Cell by ex (x<>y)):acc
206
207 -- "   
208 t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc
209 w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
210 LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc
211 --    "
212 LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc
213 LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
214 LexemePairAny p:[] -> LexemePairOpen p:[]
215
216 --    ,,,"
217 LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
218 LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[]
219 -- ",,,   
220 w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
221 LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc
222
223 -- ",,,AAA
224 an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
225 -- ,,,"AAA
226 an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
227
228 -- ")
229 c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
230 -- ("
231 LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
232
233 -- "(
234 o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
235 -- )"
236 LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairClose p:c:acc
237
238 acc -> acc
239
240 p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme
241 p_Lexeme = debugParser "Lexeme" $
242 P.choice
243 [ P.try $ LexemeWhite <$> p_Cell p_Spaces1
244 , P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny)
245 , P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle)
246 , P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen)
247 , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
248 , P.try $ LexemeEscape <$> p_Cell p_Escape
249 , P.try $ LexemeLink <$> p_Cell p_Link
250 , P.try $ LexemeAlphaNum <$> p_Cell (P.takeWhile1P (Just "AlphaNum") Char.isAlphaNum)
251 , LexemeOther <$> p_Cell (TL.singleton <$> P.anyChar)
252 ]
253
254 p_some :: Parser e s a -> Parser e s (NonEmpty a)
255 p_some p = NonEmpty.fromList <$> P.some p
256
257 pairAny :: Char -> Maybe Pair
258 pairAny = \case
259 '-' -> Just PairDash
260 '/' -> Just PairSlash
261 '"' -> Just PairDoublequote
262 '\'' -> Just PairSinglequote
263 '`' -> Just PairBackquote
264 '_' -> Just PairUnderscore
265 '*' -> Just PairStar
266 '#' -> Just PairHash
267 _ -> Nothing
268
269 pairOpen :: Char -> Maybe Pair
270 pairOpen = \case
271 '(' -> Just PairParen
272 '[' -> Just PairBracket
273 '{' -> Just PairBrace
274 '«' -> Just PairFrenchquote
275 _ -> Nothing
276
277 pairClose :: Char -> Maybe Pair
278 pairClose = \case
279 ')' -> Just PairParen
280 ']' -> Just PairBracket
281 '}' -> Just PairBrace
282 '»' -> Just PairFrenchquote
283 _ -> Nothing
284
285 p_Escape :: Parser e s Char
286 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
287
288 p_Link :: P.Tokens s ~ TL.Text => Parser e s TL.Text
289 p_Link =
290 P.try (P.char '<' *> p <* P.char '>') <|>
291 p
292 where
293 p :: P.Tokens s ~ TL.Text => Parser e s TL.Text
294 p =
295 (\scheme addr -> scheme <> "//" <> addr)
296 <$> P.option "" (P.try p_scheme)
297 <* P.string "//"
298 <*> p_addr
299 p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text
300 p_scheme =
301 (<> ":")
302 <$> (P.takeWhile1P (Just "scheme") $ \c ->
303 Char.isAlphaNum c
304 || c=='_'
305 || c=='-'
306 || c=='+')
307 <* P.char ':'
308 p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text
309 p_addr =
310 P.takeWhileP (Just "addr") $ \c ->
311 Char.isAlphaNum c
312 || c=='%'
313 || c=='/'
314 || c=='('
315 || c==')'
316 || c=='-'
317 || c=='_'
318 || c=='.'
319 || c=='#'
320 || c=='?'
321 || c=='='
322
323 pairBorders :: Foldable f => Pair -> f a -> (TL.Text,TL.Text)
324 pairBorders p ts | null ts = pairBordersSingle p
325 | otherwise = pairBordersDouble p
326
327 pairBordersSingle :: Pair -> (TL.Text,TL.Text)
328 pairBordersSingle = \case
329 PairElem n as ->
330 ("<"<>n<>foldMap f as<>"/>","")
331 where f (elemAttr_white,ElemAttr{..}) =
332 elemAttr_white <>
333 elemAttr_name <>
334 elemAttr_open <>
335 elemAttr_value <>
336 elemAttr_close
337 p -> pairBordersDouble p
338
339 pairBordersDouble :: Pair -> (TL.Text,TL.Text)
340 pairBordersDouble = \case
341 PairElem n as -> ("<"<>n<>foldMap f as<>">","</"<>n<>">")
342 where f (elemAttr_white,ElemAttr{..}) =
343 elemAttr_white <>
344 elemAttr_name <>
345 elemAttr_open <>
346 elemAttr_value <>
347 elemAttr_close
348 PairHash -> ("#","#")
349 PairStar -> ("*","*")
350 PairSlash -> ("/","/")
351 PairUnderscore -> ("_","_")
352 PairDash -> ("-","-")
353 PairBackquote -> ("`","`")
354 PairSinglequote -> ("'","'")
355 PairDoublequote -> ("\"","\"")
356 PairFrenchquote -> ("«","»")
357 PairParen -> ("(",")")
358 PairBrace -> ("{","}")
359 PairBracket -> ("[","]")
360
361 -- * Class 'TagFrom'
362 class TagFrom a where
363 tagFrom :: a -> Maybe (Cell Tag, a)
364 instance TagFrom Tokens where
365 tagFrom ts =
366 case Seq.viewl ts of
367 EmptyL -> Nothing
368 Tree0 (Cell loc0@(Span _f0 _b0 e0:|_l0) n) :< ns ->
369 case n of
370 NodeToken (TokenText t) ->
371 case tagFrom $ Cell loc0 t of
372 Nothing -> Nothing
373 Just (t0,r0) ->
374 if TL.null $ unCell r0
375 then
376 case tagFrom ns of
377 Just (t1@(Cell (Span _f1 b1 _e1:|_l1) _), r1) | e0 == b1 ->
378 Just (t0<>t1, r1)
379 _ -> Just (t0, ns)
380 else Just (t0, pure n0 `unionTokens` ns)
381 where n0 = Tree0 $ NodeToken . TokenText <$> r0
382 _ -> Nothing
383 _ -> Nothing
384 instance TagFrom (Cell TL.Text) where
385 tagFrom (Cell (Span fp bp ep:|sp) t)
386 | (w,r) <- TL.span isTagChar t
387 , not $ TL.null w
388 , ew <- pos_column bp + sum (Text.length <$> TL.toChunks w) =
389 Just
390 ( Cell (Span fp bp bp{pos_column=ew}:|sp) w
391 , Cell (Span fp bp{pos_column=ew} ep:|sp) r )
392 tagFrom _ = Nothing
393
394 isTagChar :: Char -> Bool
395 isTagChar c =
396 Char.isAlphaNum c ||
397 c=='·' ||
398 case Char.generalCategory c of
399 Char.DashPunctuation -> True
400 Char.ConnectorPunctuation -> True
401 _ -> False
402
403 {-
404 -- | Build 'Tokens' from many 'Token's.
405 tokens :: [Cell Token] -> Tokens
406 tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts
407
408 -- | Build 'Tokens' from one 'Token'.
409 tokens1 :: Tree (Cell Node) -> Tokens
410 tokens1 = Seq.singleton
411
412 unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens))
413 unTokenElem toks =
414 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
415 [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts))
416 _ -> Nothing
417
418 isTokenElem :: Tokens -> Bool
419 isTokenElem toks =
420 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
421 [Tree (unCell -> NodePair PairElem{}) _] -> True
422 _ -> False
423 -}