]> Git — Sourcephile - doclang.git/blob - Hdoc/TCT/Read/Token.hs
XML: use symantic-xml
[doclang.git] / Hdoc / TCT / Read / Token.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# OPTIONS_GHC -fno-warn-orphans #-}
8 module Hdoc.TCT.Read.Token where
9
10 import Control.Applicative (Applicative(..), Alternative(..))
11 import Control.Monad (Monad(..))
12 import Data.Bool
13 import Data.Char (Char)
14 import Data.Either (Either(..))
15 import Data.Eq (Eq(..))
16 import Data.Foldable (Foldable(..))
17 import Data.Function (($), (.))
18 import Data.Functor ((<$>), ($>), (<$))
19 import Data.List.NonEmpty (NonEmpty(..))
20 import Data.Maybe (Maybe(..))
21 import Data.Monoid (Monoid(..))
22 import Data.Semigroup (Semigroup(..))
23 import Data.Sequence (ViewL(..), ViewR(..), (<|))
24 import Data.TreeSeq.Strict (Tree(..), Trees)
25 import Data.Tuple (fst,snd)
26 import Data.Void (Void)
27 import Prelude (Num(..))
28 import Text.Show (Show(..))
29 import qualified Data.Char as Char
30 import qualified Data.List.NonEmpty as NonEmpty
31 import qualified Data.Sequence as Seq
32 import qualified Data.Text as Text
33 import qualified Data.Text.Lazy as TL
34 import qualified Language.Symantic.XML as XML
35 import qualified Text.Megaparsec as P
36 import qualified Text.Megaparsec.Char as P
37
38 import Hdoc.TCT.Debug
39 import Hdoc.TCT.Cell
40 import Hdoc.TCT.Elem
41 import Hdoc.TCT.Tree
42 import Hdoc.TCT.Read.Elem
43 import Hdoc.TCT.Read.Cell
44
45 -- * Type 'Pairs'
46 -- | Right-only Dyck language,
47 -- to keep track of opened 'Pair's.
48 type Pairs = (Tokens,[Opening])
49 type Tokens = Trees (Cell Node)
50
51 -- ** Type 'Opening'
52 -- | An opened 'Pair' and its content so far.
53 type Opening = (Cell Pair,Tokens)
54
55 appendPairsToken :: Pairs -> Tree (Cell Node) -> Pairs
56 appendPairsToken ps t = appendPairsTokens ps (pure t)
57
58 appendPairsText :: Pairs -> Cell TL.Text -> Pairs
59 appendPairsText ps (Sourced sp t) =
60 appendPairsToken ps $
61 Tree0 $ Sourced sp $
62 NodeToken $ TokenText t
63
64 appendPairsTokens :: Pairs -> Tokens -> Pairs
65 appendPairsTokens (ts,[]) toks = (ts`unionTokens`toks,[])
66 appendPairsTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0`unionTokens`toks):ps)
67
68 -- | Unify two 'Tokens', merging border 'TokenText's if any.
69 unionTokens :: Tokens -> Tokens -> Tokens
70 unionTokens x y =
71 case (Seq.viewr x, Seq.viewl y) of
72 (xs :> x0, y0 :< ys) ->
73 case (x0,y0) of
74 ( Tree (Sourced (FileRange fx bx _ex:| lx)
75 (NodeToken (TokenText tx))) tsx
76 , Tree (Sourced (FileRange _fy _by ey:|_ly)
77 (NodeToken (TokenText ty))) tsy ) ->
78 xs `unionTokens`
79 pure (Tree (Sourced (FileRange fx bx ey:|lx) $
80 NodeToken $ TokenText $ tx <> ty) (tsx<>tsy)
81 ) `unionTokens`
82 ys
83 _ -> x <> y
84 (EmptyR, _) -> y
85 (_, EmptyL) -> x
86
87 unionsTokens :: Foldable f => f Tokens -> Tokens
88 unionsTokens = foldl' unionTokens mempty
89
90 openPair :: Pairs -> Cell Pair -> Pairs
91 openPair (t,ps) p = (t,(p,mempty):ps)
92
93 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
94 closePair :: Pairs -> Cell Pair -> Pairs
95 closePair ps@(_,[]) (Sourced loc p) = -- debug0 "closePair" $
96 appendPairsText ps $ Sourced loc $
97 snd $ pairBordersDouble p
98 closePair (t,(cx@(Sourced (FileRange fx bx _ex:| lx) px),t1):ts)
99 cy@(Sourced (FileRange _fy _by ey:|_ly) py) = -- debug0 "closePair" $
100 case (px,py) of
101 (PairElem nx ax, PairElem ny ay) | nx == ny ->
102 appendPairsToken (t,ts) $
103 Tree (Sourced (FileRange fx bx ey:|lx) $ NodePair $ PairElem nx as) t1
104 where as | null ay = ax
105 | otherwise = ax<>ay
106 _ | px == py ->
107 appendPairsToken (t,ts) $
108 Tree (Sourced (FileRange fx bx ey:|lx) $ NodePair px) t1
109 _ ->
110 (`closePair` cy) $
111 appendPairsTokens
112 (t,ts)
113 (closeImpaired mempty (cx,t1))
114
115 -- | Close a 'Pair' when there is no matching 'LexemePairClose'.
116 closeImpaired :: Tokens -> (Cell Pair, Tokens) -> Tokens
117 closeImpaired acc (Sourced loc@(s0:|lp) pair, toks) = -- debug0 "closeImpaired" $
118 case pair of
119 -- NOTE: try to close 'PairTag' as 'TokenTag' instead of 'TokenText'.
120 PairTag isBackref | Just (Sourced (FileRange{fileRange_end}:|_lt) ref, rest) <- parseRef body ->
121 Tree0 (Sourced (s0{fileRange_end}:|lp) $ NodeToken $ TokenTag isBackref ref) <| rest
122 -- NOTE: use bp (not bt) to include the '#'
123 -- NOTE: try to close 'PairAt' as 'TokenAt' instead of 'TokenText'.
124 PairAt isBackref | Just (Sourced (FileRange{fileRange_end}:|_lt) ref, rest) <- parseRef body ->
125 Tree0 (Sourced (s0{fileRange_end}:|lp) $ NodeToken $ TokenAt isBackref ref) <| rest
126 -- NOTE: use bp (not bt) to include the '@'
127 _ -> pure open `unionTokens` body
128 where
129 body = toks `unionTokens` acc
130 open = Tree0 $ Sourced loc $ NodeToken $
131 TokenText $ fst $ pairBordersDouble pair
132
133 -- | Close remaining 'Pair's at end of parsing.
134 closePairs :: Pairs -> Tokens
135 closePairs (t0,ps) = -- debug0 "closePairs" $
136 t0 `unionTokens` foldl' closeImpaired mempty ps
137
138 appendLexeme :: Lexeme -> Pairs -> Pairs
139 appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc ->
140 case lex of
141 LexemePairOpen ps -> foldl' open acc ps
142 where
143 -- NOTE: insert an empty node to encode <elem></elem>, not <elem/>
144 open a p@(Sourced (FileRange{fileRange_end, ..}:|sp) PairElem{}) = openPair a p `appendPairsText` Sourced (FileRange{fileRange_begin=fileRange_end, ..}:|sp) ""
145 open a p = openPair a p
146 LexemePairClose ps -> foldl' closePair acc ps
147 LexemePairAny ps -> foldl' openPair acc ps
148 LexemePairBoth ps -> appendPairsTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps
149 LexemeEscape c -> appendPairsToken acc $ Tree0 $ NodeToken . TokenEscape <$> c
150 LexemeLink t -> appendPairsToken acc $ Tree0 $ NodeToken . TokenLink <$> t
151 {-LexemeWhite (unSourced -> "") -> acc-}
152 -- LexemeWhite (unSourced -> Text.all (==' ') -> True) -> acc
153 LexemeWhite t -> appendPairsText acc t
154 LexemeAlphaNum t -> appendPairsText acc t
155 LexemeOther t -> appendPairsText acc t
156 LexemeTree t -> appendPairsToken acc t
157 LexemeEnd -> acc
158
159 appendLexemes :: Pairs -> [Lexeme] -> Pairs
160 appendLexemes = foldr appendLexeme
161
162 -- * Type 'Lexeme'
163 -- | 'Lexeme's cut the input in the longest chunks of common semantic,
164 -- this enables 'orientLexemePairAny' to work with a more meaningful context.
165 data Lexeme
166 = LexemePairOpen !(NonEmpty (Cell Pair))
167 | LexemePairClose !(NonEmpty (Cell Pair))
168 | LexemePairAny !(NonEmpty (Cell Pair))
169 -- ^ orientation depending on the surrounding 'Lexeme's,
170 -- see 'orientLexemePairAny'
171 | LexemePairBoth !(NonEmpty (Cell Pair))
172 | LexemeEscape !(Cell Char)
173 | LexemeLink !(Cell TL.Text)
174 | LexemeWhite !(Cell TL.Text)
175 | LexemeAlphaNum !(Cell TL.Text)
176 | LexemeOther !(Cell TL.Text)
177 | LexemeTree !(Tree (Cell Node)) -- FIXME: useless?
178 | LexemeEnd
179 deriving (Eq, Show)
180 instance Pretty Lexeme
181
182 parseTokens :: [Lexeme] -> Tokens
183 parseTokens ps =
184 closePairs $
185 appendLexemes mempty $
186 -- debug0 "Lexemes (post orient)" $
187 orientLexemePairAny $ LexemeEnd :
188 ps
189
190 parseLexemes ::
191 Cell TL.Text ->
192 Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme]
193 parseLexemes = runParserOnCell (p_Lexemes <* P.eof)
194
195 -- | Parse 'Lexeme's, returning them in reverse order
196 -- to apply 'orientLexemePairAny'.
197 p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme]
198 p_Lexemes = debugParser "Lexemes" $ go []
199 where
200 go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme]
201 go acc =
202 (P.eof $> acc) <|>
203 (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
204
205 -- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme',
206 -- so that it can try to orient nearby 'LexemePairAny'
207 -- to 'LexemePairOpen' or 'LexemePairClose'.
208 orientLexemePairAny :: [Lexeme] -> [Lexeme]
209 orientLexemePairAny = \case
210 -- LexemeOther (Sourced _bx ex x):LexemeOther (Sourced by _ey y):acc -> LexemeOther (Sourced by ex (x<>y)):acc
211
212 -- "   
213 t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc
214 w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
215 LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc
216 --    "
217 LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc
218 LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
219 LexemePairAny p:[] -> LexemePairOpen p:[]
220
221 --    ,,,"
222 LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
223 LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[]
224 -- ",,,   
225 w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
226 LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc
227
228 -- ",,,AAA
229 an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
230 an@LexemeEscape{} :a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
231 -- ,,,"AAA
232 an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
233 an@LexemeEscape{} :LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
234
235 -- ")
236 c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
237 -- ("
238 LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
239
240 -- "(
241 o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
242 -- )"
243 LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairClose p:c:acc
244
245 -- AAA#AAA
246 a1@LexemeAlphaNum{}:LexemePairAny p:a2@LexemeAlphaNum{}:acc ->
247 a1:LexemeOther (sconcat (((fst . pairBordersDouble) <$>) <$> p)):a2:acc
248
249 acc -> acc
250
251 p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme
252 p_Lexeme = debugParser "Lexeme" $
253 P.choice
254 [ P.try $ LexemeWhite <$> p_Cell p_Spaces1
255 , P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny)
256 , P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle)
257 , P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen <|> P.try p_BackOpen)
258 , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
259 , P.try $ LexemeEscape <$> p_Cell p_Escape
260 , P.try $ LexemeLink <$> p_Cell p_Link
261 , P.try $ LexemeAlphaNum <$> p_Cell p_AlphaNums1
262 , LexemeOther <$> p_Cell (TL.singleton <$> P.anyChar)
263 ]
264
265 p_some :: Parser e s a -> Parser e s (NonEmpty a)
266 p_some p = NonEmpty.fromList <$> P.some p
267
268 pairAny :: Char -> Maybe Pair
269 pairAny = \case
270 '-' -> Just PairDash
271 '/' -> Just PairSlash
272 '"' -> Just PairDoublequote
273 '\'' -> Just PairSinglequote
274 '`' -> Just PairBackquote
275 '_' -> Just PairUnderscore
276 '*' -> Just PairStar
277 '#' -> Just $ PairTag False
278 '@' -> Just $ PairAt False
279 _ -> Nothing
280
281 pairOpen :: Char -> Maybe Pair
282 pairOpen = \case
283 '(' -> Just PairParen
284 '[' -> Just PairBracket
285 '{' -> Just PairBrace
286 '«' -> Just PairFrenchquote
287 _ -> Nothing
288
289 pairClose :: Char -> Maybe Pair
290 pairClose = \case
291 ')' -> Just PairParen
292 ']' -> Just PairBracket
293 '}' -> Just PairBrace
294 '»' -> Just PairFrenchquote
295 _ -> Nothing
296
297 p_BackOpen :: P.Tokens s ~ TL.Text => Parser e s Pair
298 p_BackOpen = debugParser "BackOpen" $
299 P.char '~'
300 *> (PairAt True <$ P.char '@'
301 <|> PairTag True <$ P.char '#')
302
303 p_Escape :: Parser e s Char
304 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
305
306 p_Link :: P.Tokens s ~ TL.Text => Parser e s TL.Text
307 p_Link =
308 P.try (P.char '<' *> p <* P.char '>') <|>
309 p
310 where
311 p :: P.Tokens s ~ TL.Text => Parser e s TL.Text
312 p =
313 (\scheme addr -> scheme <> "//" <> addr)
314 <$> P.option "" (P.try p_scheme)
315 <* P.string "//"
316 <*> p_addr
317 p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text
318 p_scheme =
319 (<> ":")
320 <$> (P.takeWhile1P (Just "scheme") $ \c ->
321 Char.isAlphaNum c
322 || c=='_'
323 || c=='-'
324 || c=='+')
325 <* P.char ':'
326 p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text
327 p_addr =
328 P.takeWhileP (Just "addr") $ \c ->
329 Char.isAlphaNum c
330 || c=='%'
331 || c=='/'
332 || c=='('
333 || c==')'
334 || c=='-'
335 || c=='_'
336 || c=='.'
337 || c=='#'
338 || c=='?'
339 || c=='='
340 || c=='@'
341
342 pairBorders :: Foldable f => Pair -> f a -> (TL.Text, TL.Text)
343 pairBorders p ts | null ts = pairBordersSingle p
344 | otherwise = pairBordersDouble p
345
346 pairBordersSingle :: Pair -> (TL.Text,TL.Text)
347 pairBordersSingle = \case
348 PairElem (XML.NCName n) as ->
349 ("<"<>n<>foldMap f as<>"/>","")
350 where f (elemAttr_white,ElemAttr{..}) =
351 elemAttr_white <>
352 XML.unNCName elemAttr_name <>
353 elemAttr_open <>
354 elemAttr_value <>
355 elemAttr_close
356 p -> pairBordersDouble p
357
358 pairBordersDouble :: Pair -> (TL.Text, TL.Text)
359 pairBordersDouble = \case
360 PairElem (XML.NCName n) as -> ("<"<>n<>foldMap f as<>">","</"<>n<>">")
361 where f (elemAttr_white,ElemAttr{..}) =
362 elemAttr_white <>
363 XML.unNCName elemAttr_name <>
364 elemAttr_open <>
365 elemAttr_value <>
366 elemAttr_close
367 PairTag isBackref | isBackref -> ("^#","#")
368 | otherwise -> ("#","#")
369 PairAt isBackref | isBackref -> ("^@","@")
370 | otherwise -> ("@","@")
371 PairStar -> ("*","*")
372 PairSlash -> ("/","/")
373 PairUnderscore -> ("_","_")
374 PairDash -> ("-","-")
375 PairBackquote -> ("`","`")
376 PairSinglequote -> ("'","'")
377 PairDoublequote -> ("\"","\"")
378 PairFrenchquote -> ("«","»")
379 PairParen -> ("(",")")
380 PairBrace -> ("{","}")
381 PairBracket -> ("[","]")
382
383 -- * Class 'ParseRef'
384 class ParseRef a where
385 parseRef :: a -> Maybe (Cell Ref, a)
386 instance ParseRef Tokens where
387 parseRef ts =
388 case Seq.viewl ts of
389 EmptyL -> Nothing
390 Tree0 (Sourced src0@(FileRange _f0 _b0 e0:|_l0) n) :< ns ->
391 case n of
392 NodeToken (TokenText t) ->
393 case parseRef $ Sourced src0 t of
394 Nothing -> Nothing
395 Just (t0,r0) ->
396 if TL.null $ unSourced r0
397 then
398 case parseRef ns of
399 Just (t1@(Sourced (FileRange _f1 b1 _e1:|_l1) _), r1) | e0 == b1 ->
400 Just (t0<>t1, r1)
401 _ -> Just (t0, ns)
402 else Just (t0, pure n0 `unionTokens` ns)
403 where n0 = Tree0 $ NodeToken . TokenText <$> r0
404 _ -> Nothing
405 _ -> Nothing
406 instance ParseRef (Cell TL.Text) where
407 parseRef (Sourced (FileRange fp bp ep:|sp) t)
408 | (w,r) <- TL.span isTagChar t
409 , not $ TL.null w
410 , ew <- filePos_column bp + sum (Text.length <$> TL.toChunks w) =
411 Just
412 ( Sourced (FileRange fp bp bp{filePos_column=ew}:|sp) w
413 , Sourced (FileRange fp bp{filePos_column=ew} ep:|sp) r )
414 parseRef _ = Nothing
415
416 isTagChar :: Char -> Bool
417 isTagChar c =
418 Char.isAlphaNum c ||
419 c=='·' ||
420 case Char.generalCategory c of
421 Char.DashPunctuation -> True
422 Char.ConnectorPunctuation -> True
423 _ -> False
424
425 {-
426 -- | Build 'Tokens' from many 'Token's.
427 tokens :: [Cell Token] -> Tokens
428 tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts
429
430 -- | Build 'Tokens' from one 'Token'.
431 tokens1 :: Tree (Cell Node) -> Tokens
432 tokens1 = Seq.singleton
433
434 unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens))
435 unTokenElem toks =
436 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
437 [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts))
438 _ -> Nothing
439
440 isTokenElem :: Tokens -> Bool
441 isTokenElem toks =
442 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
443 [Tree (unSourced -> NodePair PairElem{}) _] -> True
444 _ -> False
445 -}