]> Git — Sourcephile - doclang.git/blob - src/Textphile/TCT/Read/Token.hs
Rename {hdoc => textphile}
[doclang.git] / src / Textphile / TCT / Read / Token.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# OPTIONS_GHC -fno-warn-orphans #-}
8 module Textphile.TCT.Read.Token where
9
10 import Control.Applicative (Applicative(..), Alternative(..))
11 import Control.Monad (Monad(..))
12 import Data.Bool
13 import Data.Char (Char)
14 import Data.Either (Either(..))
15 import Data.Eq (Eq(..))
16 import Data.Foldable (Foldable(..))
17 import Data.Function (($), (.))
18 import Data.Functor ((<$>), ($>), (<$))
19 import Data.List.NonEmpty (NonEmpty(..))
20 import Data.Maybe (Maybe(..))
21 import Data.Monoid (Monoid(..))
22 import Data.Semigroup (Semigroup(..))
23 import Data.Sequence (ViewL(..), ViewR(..), (<|))
24 import Data.TreeSeq.Strict (Tree(..), Trees)
25 import Data.Tuple (fst,snd)
26 import Data.Void (Void)
27 import Prelude (Num(..))
28 import Text.Show (Show(..))
29 import qualified Data.Char as Char
30 import qualified Data.List.NonEmpty as NonEmpty
31 import qualified Data.Sequence as Seq
32 import qualified Data.Text as Text
33 import qualified Data.Text.Lazy as TL
34 import qualified Symantic.XML as XML
35 import qualified Text.Megaparsec as P
36 import qualified Text.Megaparsec.Char as P
37
38 import Textphile.TCT.Debug
39 import Textphile.TCT.Cell
40 import Textphile.TCT.Elem
41 import Textphile.TCT.Tree
42 import Textphile.TCT.Read.Elem
43 import Textphile.TCT.Read.Cell
44
45 -- * Type 'Pairs'
46 -- | Right-only Dyck language,
47 -- to keep track of opened 'Pair's.
48 type Pairs = (Tokens,[Opening])
49 type Tokens = Trees (Cell Node)
50
51 -- ** Type 'Opening'
52 -- | An opened 'Pair' and its content so far.
53 type Opening = (Cell Pair,Tokens)
54
55 appendPairsToken :: Pairs -> Tree (Cell Node) -> Pairs
56 appendPairsToken ps t = appendPairsTokens ps (pure t)
57
58 appendPairsText :: Pairs -> Cell TL.Text -> Pairs
59 appendPairsText ps (Sourced sp t) =
60 appendPairsToken ps $
61 Tree0 $ Sourced sp $
62 NodeToken $ TokenText t
63
64 appendPairsTokens :: Pairs -> Tokens -> Pairs
65 appendPairsTokens (ts,[]) toks = (ts`unionTokens`toks,[])
66 appendPairsTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0`unionTokens`toks):ps)
67
68 -- | Unify two 'Tokens', merging border 'TokenText's if any.
69 unionTokens :: Tokens -> Tokens -> Tokens
70 unionTokens x y =
71 case (Seq.viewr x, Seq.viewl y) of
72 (xs :> x0, y0 :< ys) ->
73 case (x0,y0) of
74 ( Tree (Sourced (FileRange fx bx _ex:| lx)
75 (NodeToken (TokenText tx))) tsx
76 , Tree (Sourced (FileRange _fy _by ey:|_ly)
77 (NodeToken (TokenText ty))) tsy ) ->
78 xs `unionTokens`
79 pure (Tree (Sourced (FileRange fx bx ey:|lx) $
80 NodeToken $ TokenText $ tx <> ty) (tsx<>tsy)
81 ) `unionTokens`
82 ys
83 _ -> x <> y
84 (EmptyR, _) -> y
85 (_, EmptyL) -> x
86
87 unionsTokens :: Foldable f => f Tokens -> Tokens
88 unionsTokens = foldl' unionTokens mempty
89
90 openPair :: Pairs -> Cell Pair -> Pairs
91 openPair (t,ps) p = (t,(p,mempty):ps)
92
93 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
94 closePair :: Pairs -> Cell Pair -> Pairs
95 closePair ps@(_,[]) (Sourced loc p) = -- debug0 "closePair" $
96 appendPairsText ps $ Sourced loc $
97 snd $ pairBordersDouble p
98 closePair (t,(cx@(Sourced (FileRange fx bx _ex:| lx) px),t1):ts)
99 cy@(Sourced (FileRange _fy _by ey:|_ly) py) = -- debug0 "closePair" $
100 case (px,py) of
101 (PairElem nx ax, PairElem ny ay) | nx == ny ->
102 appendPairsToken (t,ts) $
103 Tree (Sourced (FileRange fx bx ey:|lx) $ NodePair $ PairElem nx as) t1
104 where as | null ay = ax
105 | otherwise = ax<>ay
106 _ | px == py ->
107 appendPairsToken (t,ts) $
108 Tree (Sourced (FileRange fx bx ey:|lx) $ NodePair px) t1
109 _ ->
110 (`closePair` cy) $
111 appendPairsTokens
112 (t,ts)
113 (closeImpaired mempty (cx,t1))
114
115 -- | Close a 'Pair' when there is no matching 'LexemePairClose'.
116 closeImpaired :: Tokens -> (Cell Pair, Tokens) -> Tokens
117 closeImpaired acc (Sourced src@(s0:|ss) pair, toks) = -- debug0 "closeImpaired" $
118 case pair of
119 -- NOTE: try to close 'PairTag' as 'TokenTag' instead of 'TokenText'.
120 PairTag isBackref | Just (Sourced (FileRange{fileRange_end}:|_lt) ref, rest) <- parseRef body ->
121 Tree0 (Sourced (s0{fileRange_end}:|ss) $ NodeToken $ TokenTag isBackref ref) <| rest
122 -- NOTE: try to close 'PairAt' as 'TokenAt' instead of 'TokenText'.
123 PairAt isBackref | Just (Sourced (FileRange{fileRange_end}:|_lt) ref, rest) <- parseRef body ->
124 Tree0 (Sourced (s0{fileRange_end}:|ss) $ NodeToken $ TokenAt isBackref ref) <| rest
125 | not isBackref -- NOTE: page-ref without a target anchor.
126 , Tree (Sourced _src (NodePair PairBracket{})) _ts :< _rest <- Seq.viewl body ->
127 Tree0 (Sourced src $ NodeToken $ TokenAt False "") <| body
128 _ -> pure open `unionTokens` body
129 where
130 body = toks `unionTokens` acc
131 open = Tree0 $ Sourced src $ NodeToken $
132 TokenText $ fst $ pairBordersDouble pair
133
134 -- | Close remaining 'Pair's at end of parsing.
135 closePairs :: Pairs -> Tokens
136 closePairs (t0,ps) = -- debug0 "closePairs" $
137 t0 `unionTokens` foldl' closeImpaired mempty ps
138
139 appendLexeme :: Lexeme -> Pairs -> Pairs
140 appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc ->
141 case lex of
142 LexemePairOpen ps -> foldl' open acc ps
143 where
144 -- NOTE: insert an empty node to encode <elem></elem>, not <elem/>
145 open a p@(Sourced (FileRange{fileRange_end, ..}:|sp) PairElem{}) = openPair a p `appendPairsText` Sourced (FileRange{fileRange_begin=fileRange_end, ..}:|sp) ""
146 open a p = openPair a p
147 LexemePairClose ps -> foldl' closePair acc ps
148 LexemePairAny ps -> foldl' openPair acc ps
149 LexemePairBoth ps -> appendPairsTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps
150 LexemeEscape c -> appendPairsToken acc $ Tree0 $ NodeToken . TokenEscape <$> c
151 LexemeLink t -> appendPairsToken acc $ Tree0 $ NodeToken . TokenLink <$> t
152 {-LexemeWhite (unSourced -> "") -> acc-}
153 -- LexemeWhite (unSourced -> Text.all (==' ') -> True) -> acc
154 LexemeWhite t -> appendPairsText acc t
155 LexemeAlphaNum t -> appendPairsText acc t
156 LexemeOther t -> appendPairsText acc t
157 LexemeTree t -> appendPairsToken acc t
158 LexemeEnd -> acc
159
160 appendLexemes :: Pairs -> [Lexeme] -> Pairs
161 appendLexemes = foldr appendLexeme
162
163 -- * Type 'Lexeme'
164 -- | 'Lexeme's cut the input in the longest chunks of common semantic,
165 -- this enables 'orientLexemePairAny' to work with a more meaningful context.
166 data Lexeme
167 = LexemePairOpen !(NonEmpty (Cell Pair))
168 | LexemePairClose !(NonEmpty (Cell Pair))
169 | LexemePairAny !(NonEmpty (Cell Pair))
170 -- ^ orientation depending on the surrounding 'Lexeme's,
171 -- see 'orientLexemePairAny'
172 | LexemePairBoth !(NonEmpty (Cell Pair))
173 | LexemeEscape !(Cell Char)
174 | LexemeLink !(Cell Link)
175 | LexemeWhite !(Cell TL.Text)
176 | LexemeAlphaNum !(Cell TL.Text)
177 | LexemeOther !(Cell TL.Text)
178 | LexemeTree !(Tree (Cell Node)) -- FIXME: useless?
179 | LexemeEnd
180 deriving (Eq, Show)
181 instance Pretty Lexeme
182
183 parseTokens :: [Lexeme] -> Tokens
184 parseTokens ps =
185 closePairs $
186 appendLexemes mempty $
187 -- debug0 "Lexemes (post orient)" $
188 orientLexemePairAny $
189 -- debug0 "Lexemes (pre orient)" $
190 LexemeEnd : ps
191
192 parseLexemes ::
193 Cell TL.Text ->
194 Either (P.ParseErrorBundle StreamCell Void) [Lexeme]
195 parseLexemes = runParserOnCell (p_Lexemes <* P.eof)
196
197 -- | Parse 'Lexeme's, returning them in reverse order
198 -- to apply 'orientLexemePairAny'.
199 p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme]
200 p_Lexemes = {-debugParser "Lexemes" $-} go []
201 where
202 go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme]
203 go acc =
204 (P.eof $> acc) <|>
205 (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
206
207 -- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme',
208 -- so that it can try to orient nearby 'LexemePairAny'
209 -- to 'LexemePairOpen' or 'LexemePairClose'.
210 orientLexemePairAny :: [Lexeme] -> [Lexeme]
211 orientLexemePairAny = \case
212 -- LexemeOther (Sourced _bx ex x):LexemeOther (Sourced by _ey y):acc -> LexemeOther (Sourced by ex (x<>y)):acc
213
214 -- "   
215 t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc
216 w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
217 LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc
218 --    "
219 LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc
220 LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
221 LexemePairAny p:[] -> LexemePairOpen p:[]
222
223 --    ,,,"
224 LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
225 LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[]
226 -- ",,,   
227 w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
228 LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc
229
230 -- ",,,AAA
231 an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
232 an@LexemeEscape{} :a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
233 -- ,,,"AAA
234 an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
235 an@LexemeEscape{} :LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
236
237 -- ")
238 c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
239 -- ("
240 LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
241
242 -- "(
243 o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
244 -- )"
245 LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairOpen p:c:acc
246
247 -- AAA#AAA
248 a1@LexemeAlphaNum{}:LexemePairAny p:a2@LexemeAlphaNum{}:acc ->
249 a1:LexemeOther (sconcat (((fst . pairBordersDouble) <$>) <$> p)):a2:acc
250
251 acc -> acc
252
253 p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme
254 p_Lexeme = -- debugParser "Lexeme" $
255 P.choice
256 [ P.try $ LexemeWhite <$> p_Cell p_Spaces1
257 , P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny)
258 , P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle)
259 , P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen <|> P.try p_BackOpen)
260 , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
261 , P.try $ LexemeEscape <$> p_Cell p_Escape
262 , P.try $ LexemeLink <$> p_Cell p_Link
263 , P.try $ LexemeAlphaNum <$> p_Cell p_AlphaNums1
264 , LexemeOther <$> p_Cell (TL.singleton <$> P.anySingle)
265 ]
266
267 p_some :: Parser e s a -> Parser e s (NonEmpty a)
268 p_some p = NonEmpty.fromList <$> P.some p
269
270 pairAny :: Char -> Maybe Pair
271 pairAny = \case
272 '-' -> Just PairDash
273 '/' -> Just PairSlash
274 '"' -> Just PairDoublequote
275 '\'' -> Just PairSinglequote
276 '`' -> Just PairBackquote
277 '_' -> Just PairUnderscore
278 '*' -> Just PairStar
279 '#' -> Just $ PairTag False
280 '@' -> Just $ PairAt False
281 _ -> Nothing
282
283 pairOpen :: Char -> Maybe Pair
284 pairOpen = \case
285 '(' -> Just PairParen
286 '[' -> Just PairBracket
287 '{' -> Just PairBrace
288 '«' -> Just PairFrenchquote
289 _ -> Nothing
290
291 pairClose :: Char -> Maybe Pair
292 pairClose = \case
293 ')' -> Just PairParen
294 ']' -> Just PairBracket
295 '}' -> Just PairBrace
296 '»' -> Just PairFrenchquote
297 _ -> Nothing
298
299 p_BackOpen :: P.Tokens s ~ TL.Text => Parser e s Pair
300 p_BackOpen = -- debugParser "BackOpen" $
301 P.char '~'
302 *> (PairAt True <$ P.char '@'
303 <|> PairTag True <$ P.char '#')
304
305 p_Escape :: Parser e s Char
306 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
307
308 p_Link :: P.Tokens s ~ TL.Text => Parser e s Link
309 p_Link =
310 P.try (P.char '<' *> p <* P.char '>') <|>
311 p
312 where
313 p :: P.Tokens s ~ TL.Text => Parser e s TL.Text
314 p =
315 (\scheme addr -> scheme <> "//" <> addr)
316 <$> P.option "" (P.try p_scheme)
317 <* P.string "//"
318 <*> p_addr
319 p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text
320 p_scheme =
321 (<> ":")
322 <$> (P.takeWhile1P (Just "scheme") $ \c ->
323 Char.isAlphaNum c
324 || c=='_'
325 || c=='-'
326 || c=='+')
327 <* P.char ':'
328 p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text
329 p_addr =
330 P.takeWhileP (Just "addr") $ \c ->
331 Char.isAlphaNum c
332 || c=='%'
333 || c=='/'
334 || c=='('
335 || c==')'
336 || c=='-'
337 || c=='_'
338 || c=='.'
339 || c=='#'
340 || c=='?'
341 || c=='='
342 || c=='@'
343
344 pairBorders :: Foldable f => Pair -> f a -> (TL.Text, TL.Text)
345 pairBorders p ts | null ts = pairBordersSingle p
346 | otherwise = pairBordersDouble p
347
348 pairBordersSingle :: Pair -> (TL.Text,TL.Text)
349 pairBordersSingle = \case
350 PairElem (XML.NCName n) as ->
351 ("<"<>n<>foldMap f as<>"/>","")
352 where f (elemAttr_white,ElemAttr{..}) =
353 elemAttr_white <>
354 XML.unNCName elemAttr_name <>
355 elemAttr_open <>
356 elemAttr_value <>
357 elemAttr_close
358 p -> pairBordersDouble p
359
360 pairBordersDouble :: Pair -> (TL.Text, TL.Text)
361 pairBordersDouble = \case
362 PairElem (XML.NCName n) as -> ("<"<>n<>foldMap f as<>">","</"<>n<>">")
363 where f (elemAttr_white,ElemAttr{..}) =
364 elemAttr_white <>
365 XML.unNCName elemAttr_name <>
366 elemAttr_open <>
367 elemAttr_value <>
368 elemAttr_close
369 PairTag isBackref | isBackref -> ("^#","#")
370 | otherwise -> ("#","#")
371 PairAt isBackref | isBackref -> ("^@","@")
372 | otherwise -> ("@","@")
373 PairStar -> ("*","*")
374 PairSlash -> ("/","/")
375 PairUnderscore -> ("_","_")
376 PairDash -> ("-","-")
377 PairBackquote -> ("`","`")
378 PairSinglequote -> ("'","'")
379 PairDoublequote -> ("\"","\"")
380 PairFrenchquote -> ("«","»")
381 PairParen -> ("(",")")
382 PairBrace -> ("{","}")
383 PairBracket -> ("[","]")
384
385 -- * Class 'ParseRef'
386 class ParseRef a where
387 parseRef :: a -> Maybe (Cell Ref, a)
388 instance ParseRef Tokens where
389 parseRef ts =
390 case Seq.viewl ts of
391 EmptyL -> Nothing
392 Tree0 (Sourced src0@(FileRange _f0 _b0 e0:|_l0) n) :< ns ->
393 case n of
394 NodeToken (TokenText t) ->
395 case parseRef $ Sourced src0 t of
396 Nothing -> Nothing
397 Just (t0,r0) ->
398 if TL.null $ unSourced r0
399 then
400 case parseRef ns of
401 Just (t1@(Sourced (FileRange _f1 b1 _e1:|_l1) _), r1) | e0 == b1 ->
402 Just (t0<>t1, r1)
403 _ -> Just (t0, ns)
404 else Just (t0, pure n0 `unionTokens` ns)
405 where n0 = Tree0 $ NodeToken . TokenText <$> r0
406 _ -> Nothing
407 _ -> Nothing
408 instance ParseRef (Cell TL.Text) where
409 parseRef (Sourced (FileRange fp bp ep:|sp) t)
410 | (w,r) <- TL.span isTagChar t
411 , not $ TL.null w
412 , ew <- sconcat $ colNum bp :| (P.mkPos . Text.length <$> TL.toChunks w) =
413 Just
414 ( Sourced (FileRange fp bp bp{colNum=ew}:|sp) w
415 , Sourced (FileRange fp bp{colNum=ew} ep:|sp) r )
416 parseRef _ = Nothing
417
418 isTagChar :: Char -> Bool
419 isTagChar c =
420 Char.isAlphaNum c ||
421 c=='·' ||
422 case Char.generalCategory c of
423 Char.DashPunctuation -> True
424 Char.ConnectorPunctuation -> True
425 _ -> False
426
427 {-
428 -- | Build 'Tokens' from many 'Token's.
429 tokens :: [Cell Token] -> Tokens
430 tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts
431
432 -- | Build 'Tokens' from one 'Token'.
433 tokens1 :: Tree (Cell Node) -> Tokens
434 tokens1 = Seq.singleton
435
436 unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens))
437 unTokenElem toks =
438 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
439 [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts))
440 _ -> Nothing
441
442 isTokenElem :: Tokens -> Bool
443 isTokenElem toks =
444 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
445 [Tree (unSourced -> NodePair PairElem{}) _] -> True
446 _ -> False
447 -}