]> Git — Sourcephile - doclang.git/blob - src/Textphile/TCT/Read/Token.hs
Polish code
[doclang.git] / src / Textphile / TCT / Read / Token.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# OPTIONS_GHC -fno-warn-orphans #-}
8 module Textphile.TCT.Read.Token where
9
10 import Control.Applicative (Applicative(..), Alternative(..))
11 import Control.Monad (Monad(..))
12 import Data.Bool
13 import Data.Char (Char)
14 import Data.Either (Either(..))
15 import Data.Eq (Eq(..))
16 import Data.Foldable (Foldable(..))
17 import Data.Function (($), (.))
18 import Data.Functor ((<$>), ($>), (<$))
19 import Data.List.NonEmpty (NonEmpty(..))
20 import Data.Maybe (Maybe(..))
21 import Data.Monoid (Monoid(..))
22 import Data.Semigroup (Semigroup(..))
23 import Data.Sequence (ViewL(..), ViewR(..), (<|))
24 import Data.TreeSeq.Strict (Tree(..), Trees)
25 import Data.Tuple (fst,snd)
26 import Data.Void (Void)
27 import Text.Show (Show(..))
28 import qualified Data.Char as Char
29 import qualified Data.List.NonEmpty as NonEmpty
30 import qualified Data.Sequence as Seq
31 import qualified Data.Text as Text
32 import qualified Data.Text.Lazy as TL
33 import qualified Symantic.XML as XML
34 import qualified Text.Megaparsec as P
35 import qualified Text.Megaparsec.Char as P
36
37 import Textphile.TCT.Debug
38 import Textphile.TCT.Cell
39 import Textphile.TCT.Elem
40 import Textphile.TCT.Tree
41 import Textphile.TCT.Read.Elem
42 import Textphile.TCT.Read.Cell
43
44 -- * Type 'Pairs'
45 -- | Right-only Dyck language,
46 -- to keep track of opened 'Pair's.
47 type Pairs = (Tokens,[Opening])
48 type Tokens = Trees (Cell Node)
49
50 -- ** Type 'Opening'
51 -- | An opened 'Pair' and its content so far.
52 type Opening = (Cell Pair,Tokens)
53
54 appendPairsToken :: Pairs -> Tree (Cell Node) -> Pairs
55 appendPairsToken ps t = appendPairsTokens ps (pure t)
56
57 appendPairsText :: Pairs -> Cell TL.Text -> Pairs
58 appendPairsText ps (Sourced sp t) =
59 appendPairsToken ps $
60 Tree0 $ Sourced sp $
61 NodeToken $ TokenText t
62
63 appendPairsTokens :: Pairs -> Tokens -> Pairs
64 appendPairsTokens (ts,[]) toks = (ts`unionTokens`toks,[])
65 appendPairsTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0`unionTokens`toks):ps)
66
67 -- | Unify two 'Tokens', merging border 'TokenText's if any.
68 unionTokens :: Tokens -> Tokens -> Tokens
69 unionTokens x y =
70 case (Seq.viewr x, Seq.viewl y) of
71 (xs :> x0, y0 :< ys) ->
72 case (x0,y0) of
73 ( Tree (Sourced (FileRange fx bx _ex:| lx)
74 (NodeToken (TokenText tx))) tsx
75 , Tree (Sourced (FileRange _fy _by ey:|_ly)
76 (NodeToken (TokenText ty))) tsy ) ->
77 xs `unionTokens`
78 pure (Tree (Sourced (FileRange fx bx ey:|lx) $
79 NodeToken $ TokenText $ tx <> ty) (tsx<>tsy)
80 ) `unionTokens`
81 ys
82 _ -> x <> y
83 (EmptyR, _) -> y
84 (_, EmptyL) -> x
85
86 unionsTokens :: Foldable f => f Tokens -> Tokens
87 unionsTokens = foldl' unionTokens mempty
88
89 openPair :: Pairs -> Cell Pair -> Pairs
90 openPair (t,ps) p = (t,(p,mempty):ps)
91
92 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
93 closePair :: Pairs -> Cell Pair -> Pairs
94 closePair ps@(_,[]) (Sourced loc p) = -- debug0 "closePair" $
95 appendPairsText ps $ Sourced loc $
96 snd $ pairBordersDouble p
97 closePair (t,(cx@(Sourced (FileRange fx bx _ex:| lx) px),t1):ts)
98 cy@(Sourced (FileRange _fy _by ey:|_ly) py) = -- debug0 "closePair" $
99 case (px,py) of
100 (PairElem nx ax, PairElem ny ay) | nx == ny ->
101 appendPairsToken (t,ts) $
102 Tree (Sourced (FileRange fx bx ey:|lx) $ NodePair $ PairElem nx as) t1
103 where as | null ay = ax
104 | otherwise = ax<>ay
105 _ | px == py ->
106 appendPairsToken (t,ts) $
107 Tree (Sourced (FileRange fx bx ey:|lx) $ NodePair px) t1
108 _ ->
109 (`closePair` cy) $
110 appendPairsTokens
111 (t,ts)
112 (closeImpaired mempty (cx,t1))
113
114 -- | Close a 'Pair' when there is no matching 'LexemePairClose'.
115 closeImpaired :: Tokens -> (Cell Pair, Tokens) -> Tokens
116 closeImpaired acc (Sourced src@(s0:|ss) pair, toks) = -- debug0 "closeImpaired" $
117 case pair of
118 -- NOTE: try to close 'PairTag' as 'TokenTag' instead of 'TokenText'.
119 PairTag isBackref | Just (Sourced (FileRange{fileRange_end}:|_lt) ref, rest) <- parseRef body ->
120 Tree0 (Sourced (s0{fileRange_end}:|ss) $ NodeToken $ TokenTag isBackref ref) <| rest
121 -- NOTE: try to close 'PairAt' as 'TokenAt' instead of 'TokenText'.
122 PairAt isBackref | Just (Sourced (FileRange{fileRange_end}:|_lt) ref, rest) <- parseRef body ->
123 Tree0 (Sourced (s0{fileRange_end}:|ss) $ NodeToken $ TokenAt isBackref ref) <| rest
124 | not isBackref -- NOTE: page-ref without a target anchor.
125 , Tree (Sourced _src (NodePair PairBracket{})) _ts :< _rest <- Seq.viewl body ->
126 Tree0 (Sourced src $ NodeToken $ TokenAt False "") <| body
127 _ -> pure open `unionTokens` body
128 where
129 body = toks `unionTokens` acc
130 open = Tree0 $ Sourced src $ NodeToken $
131 TokenText $ fst $ pairBordersDouble pair
132
133 -- | Close remaining 'Pair's at end of parsing.
134 closePairs :: Pairs -> Tokens
135 closePairs (t0,ps) = -- debug0 "closePairs" $
136 t0 `unionTokens` foldl' closeImpaired mempty ps
137
138 appendLexeme :: Lexeme -> Pairs -> Pairs
139 appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc ->
140 case lex of
141 LexemePairOpen ps -> foldl' open acc ps
142 where
143 -- NOTE: insert an empty node to encode <elem></elem>, not <elem/>
144 open a p@(Sourced (FileRange{fileRange_end, ..}:|sp) PairElem{}) = openPair a p `appendPairsText` Sourced (FileRange{fileRange_begin=fileRange_end, ..}:|sp) ""
145 open a p = openPair a p
146 LexemePairClose ps -> foldl' closePair acc ps
147 LexemePairAny ps -> foldl' openPair acc ps
148 LexemePairBoth ps -> appendPairsTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps
149 LexemeEscape c -> appendPairsToken acc $ Tree0 $ NodeToken . TokenEscape <$> c
150 LexemeLink t -> appendPairsToken acc $ Tree0 $ NodeToken . TokenLink <$> t
151 {-LexemeWhite (unSourced -> "") -> acc-}
152 -- LexemeWhite (unSourced -> Text.all (==' ') -> True) -> acc
153 LexemeWhite t -> appendPairsText acc t
154 LexemeAlphaNum t -> appendPairsText acc t
155 LexemeOther t -> appendPairsText acc t
156 LexemeTree t -> appendPairsToken acc t
157 LexemeEnd -> acc
158
159 appendLexemes :: Pairs -> [Lexeme] -> Pairs
160 appendLexemes = foldr appendLexeme
161
162 -- * Type 'Lexeme'
163 -- | 'Lexeme's cut the input in the longest chunks of common semantic,
164 -- this enables 'orientLexemePairAny' to work with a more meaningful context.
165 data Lexeme
166 = LexemePairOpen !(NonEmpty (Cell Pair))
167 | LexemePairClose !(NonEmpty (Cell Pair))
168 | LexemePairAny !(NonEmpty (Cell Pair))
169 -- ^ orientation depending on the surrounding 'Lexeme's,
170 -- see 'orientLexemePairAny'
171 | LexemePairBoth !(NonEmpty (Cell Pair))
172 | LexemeEscape !(Cell Char)
173 | LexemeLink !(Cell Link)
174 | LexemeWhite !(Cell TL.Text)
175 | LexemeAlphaNum !(Cell TL.Text)
176 | LexemeOther !(Cell TL.Text)
177 | LexemeTree !(Tree (Cell Node)) -- FIXME: useless?
178 | LexemeEnd
179 deriving (Eq, Show)
180 instance Pretty Lexeme
181
182 parseTokens :: [Lexeme] -> Tokens
183 parseTokens ps =
184 closePairs $
185 appendLexemes mempty $
186 -- debug0 "Lexemes (post orient)" $
187 orientLexemePairAny $
188 -- debug0 "Lexemes (pre orient)" $
189 LexemeEnd : ps
190
191 parseLexemes ::
192 Cell TL.Text ->
193 Either (P.ParseErrorBundle StreamCell Void) [Lexeme]
194 parseLexemes = runParserOnCell (p_Lexemes <* P.eof)
195
196 -- | Parse 'Lexeme's, returning them in reverse order
197 -- to apply 'orientLexemePairAny'.
198 p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme]
199 p_Lexemes = {-debugParser "Lexemes" $-} go []
200 where
201 go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme]
202 go acc =
203 (P.eof $> acc) <|>
204 (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
205
206 -- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme',
207 -- so that it can try to orient nearby 'LexemePairAny'
208 -- to 'LexemePairOpen' or 'LexemePairClose'.
209 orientLexemePairAny :: [Lexeme] -> [Lexeme]
210 orientLexemePairAny = \case
211 -- LexemeOther (Sourced _bx ex x):LexemeOther (Sourced by _ey y):acc -> LexemeOther (Sourced by ex (x<>y)):acc
212
213 -- "   
214 t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc
215 w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
216 LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc
217 --    "
218 LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc
219 LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
220 LexemePairAny p:[] -> LexemePairOpen p:[]
221
222 --    ,,,"
223 LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
224 LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[]
225 -- ",,,   
226 w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
227 LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc
228
229 -- ",,,AAA
230 an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
231 an@LexemeEscape{} :a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
232 -- ,,,"AAA
233 an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
234 an@LexemeEscape{} :LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
235
236 -- ")
237 c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
238 -- ("
239 LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
240
241 -- "(
242 o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
243 -- )"
244 LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairOpen p:c:acc
245
246 -- AAA#AAA
247 a1@LexemeAlphaNum{}:LexemePairAny p:a2@LexemeAlphaNum{}:acc ->
248 a1:LexemeOther (sconcat (((fst . pairBordersDouble) <$>) <$> p)):a2:acc
249
250 acc -> acc
251
252 p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme
253 p_Lexeme = -- debugParser "Lexeme" $
254 P.choice
255 [ P.try $ LexemeWhite <$> p_Cell p_Spaces1
256 , P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny)
257 , P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle)
258 , P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen <|> P.try p_BackOpen)
259 , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
260 , P.try $ LexemeEscape <$> p_Cell p_Escape
261 , P.try $ LexemeLink <$> p_Cell p_Link
262 , P.try $ LexemeAlphaNum <$> p_Cell p_AlphaNums1
263 , LexemeOther <$> p_Cell (TL.singleton <$> P.anySingle)
264 ]
265
266 p_some :: Parser e s a -> Parser e s (NonEmpty a)
267 p_some p = NonEmpty.fromList <$> P.some p
268
269 pairAny :: Char -> Maybe Pair
270 pairAny = \case
271 '-' -> Just PairDash
272 '/' -> Just PairSlash
273 '"' -> Just PairDoublequote
274 '\'' -> Just PairSinglequote
275 '`' -> Just PairBackquote
276 '_' -> Just PairUnderscore
277 '*' -> Just PairStar
278 '#' -> Just $ PairTag False
279 '@' -> Just $ PairAt False
280 _ -> Nothing
281
282 pairOpen :: Char -> Maybe Pair
283 pairOpen = \case
284 '(' -> Just PairParen
285 '[' -> Just PairBracket
286 '{' -> Just PairBrace
287 '«' -> Just PairFrenchquote
288 _ -> Nothing
289
290 pairClose :: Char -> Maybe Pair
291 pairClose = \case
292 ')' -> Just PairParen
293 ']' -> Just PairBracket
294 '}' -> Just PairBrace
295 '»' -> Just PairFrenchquote
296 _ -> Nothing
297
298 p_BackOpen :: P.Tokens s ~ TL.Text => Parser e s Pair
299 p_BackOpen = -- debugParser "BackOpen" $
300 P.char '~'
301 *> (PairAt True <$ P.char '@'
302 <|> PairTag True <$ P.char '#')
303
304 p_Escape :: Parser e s Char
305 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
306
307 p_Link :: P.Tokens s ~ TL.Text => Parser e s Link
308 p_Link =
309 P.try (P.char '<' *> p <* P.char '>') <|>
310 p
311 where
312 p :: P.Tokens s ~ TL.Text => Parser e s TL.Text
313 p =
314 (\scheme addr -> scheme <> "//" <> addr)
315 <$> P.option "" (P.try p_scheme)
316 <* P.string "//"
317 <*> p_addr
318 p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text
319 p_scheme =
320 (<> ":")
321 <$> (P.takeWhile1P (Just "scheme") $ \c ->
322 Char.isAlphaNum c
323 || c=='_'
324 || c=='-'
325 || c=='+')
326 <* P.char ':'
327 p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text
328 p_addr =
329 P.takeWhileP (Just "addr") $ \c ->
330 Char.isAlphaNum c
331 || c=='%'
332 || c=='/'
333 || c=='('
334 || c==')'
335 || c=='-'
336 || c=='_'
337 || c=='.'
338 || c=='#'
339 || c=='?'
340 || c=='='
341 || c=='@'
342
343 pairBorders :: Foldable f => Pair -> f a -> (TL.Text, TL.Text)
344 pairBorders p ts | null ts = pairBordersSingle p
345 | otherwise = pairBordersDouble p
346
347 pairBordersSingle :: Pair -> (TL.Text,TL.Text)
348 pairBordersSingle = \case
349 PairElem (XML.NCName n) as ->
350 ("<"<>n<>foldMap f as<>"/>","")
351 where f (elemAttr_white,ElemAttr{..}) =
352 elemAttr_white <>
353 XML.unNCName elemAttr_name <>
354 elemAttr_open <>
355 elemAttr_value <>
356 elemAttr_close
357 p -> pairBordersDouble p
358
359 pairBordersDouble :: Pair -> (TL.Text, TL.Text)
360 pairBordersDouble = \case
361 PairElem (XML.NCName n) as -> ("<"<>n<>foldMap f as<>">","</"<>n<>">")
362 where f (elemAttr_white,ElemAttr{..}) =
363 elemAttr_white <>
364 XML.unNCName elemAttr_name <>
365 elemAttr_open <>
366 elemAttr_value <>
367 elemAttr_close
368 PairTag isBackref | isBackref -> ("^#","#")
369 | otherwise -> ("#","#")
370 PairAt isBackref | isBackref -> ("^@","@")
371 | otherwise -> ("@","@")
372 PairStar -> ("*","*")
373 PairSlash -> ("/","/")
374 PairUnderscore -> ("_","_")
375 PairDash -> ("-","-")
376 PairBackquote -> ("`","`")
377 PairSinglequote -> ("'","'")
378 PairDoublequote -> ("\"","\"")
379 PairFrenchquote -> ("«","»")
380 PairParen -> ("(",")")
381 PairBrace -> ("{","}")
382 PairBracket -> ("[","]")
383
384 -- * Class 'ParseRef'
385 class ParseRef a where
386 parseRef :: a -> Maybe (Cell Ref, a)
387 instance ParseRef Tokens where
388 parseRef ts =
389 case Seq.viewl ts of
390 EmptyL -> Nothing
391 Tree0 (Sourced src0@(FileRange _f0 _b0 e0:|_l0) n) :< ns ->
392 case n of
393 NodeToken (TokenText t) ->
394 case parseRef $ Sourced src0 t of
395 Nothing -> Nothing
396 Just (t0,r0) ->
397 if TL.null $ unSourced r0
398 then
399 case parseRef ns of
400 Just (t1@(Sourced (FileRange _f1 b1 _e1:|_l1) _), r1) | e0 == b1 ->
401 Just (t0<>t1, r1)
402 _ -> Just (t0, ns)
403 else Just (t0, pure n0 `unionTokens` ns)
404 where n0 = Tree0 $ NodeToken . TokenText <$> r0
405 _ -> Nothing
406 _ -> Nothing
407 instance ParseRef (Cell TL.Text) where
408 parseRef (Sourced (FileRange fp bp ep:|sp) t)
409 | (w,r) <- TL.span isTagChar t
410 , not $ TL.null w
411 , ew <- sconcat $ colNum bp :| (P.mkPos . Text.length <$> TL.toChunks w) =
412 Just
413 ( Sourced (FileRange fp bp bp{colNum=ew}:|sp) w
414 , Sourced (FileRange fp bp{colNum=ew} ep:|sp) r )
415 parseRef _ = Nothing
416
417 isTagChar :: Char -> Bool
418 isTagChar c =
419 Char.isAlphaNum c ||
420 c=='·' ||
421 case Char.generalCategory c of
422 Char.DashPunctuation -> True
423 Char.ConnectorPunctuation -> True
424 _ -> False
425
426 {-
427 -- | Build 'Tokens' from many 'Token's.
428 tokens :: [Cell Token] -> Tokens
429 tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts
430
431 -- | Build 'Tokens' from one 'Token'.
432 tokens1 :: Tree (Cell Node) -> Tokens
433 tokens1 = Seq.singleton
434
435 unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens))
436 unTokenElem toks =
437 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
438 [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts))
439 _ -> Nothing
440
441 isTokenElem :: Tokens -> Bool
442 isTokenElem toks =
443 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
444 [Tree (unSourced -> NodePair PairElem{}) _] -> True
445 _ -> False
446 -}