]> Git — Sourcephile - doclang.git/blob - Language/TCT/Read/Token.hs
Fix HeaderGreat parsing.
[doclang.git] / Language / TCT / Read / Token.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# LANGUAGE ViewPatterns #-}
8 {-# OPTIONS_GHC -fno-warn-orphans #-}
9 module Language.TCT.Read.Token where
10
11 import Control.Applicative (Applicative(..), Alternative(..))
12 import Control.Monad (Monad(..))
13 import Data.Bool
14 import Data.Char (Char)
15 import Data.Either (Either(..))
16 import Data.Eq (Eq(..))
17 import Data.Foldable (Foldable(..))
18 import Data.Function (($), (.))
19 import Data.Functor ((<$>), ($>))
20 import Data.List.NonEmpty (NonEmpty(..))
21 import Data.Maybe (Maybe(..))
22 import Data.Monoid (Monoid(..))
23 import Data.Semigroup (Semigroup(..))
24 import Data.Sequence (ViewL(..), ViewR(..), (<|))
25 import Data.String (String)
26 import Data.TreeSeq.Strict (Tree(..), Trees)
27 import Data.Tuple (fst,snd)
28 import Data.Void (Void)
29 import Prelude (Num(..))
30 import Text.Show (Show(..))
31 import qualified Data.Char as Char
32 import qualified Data.List.NonEmpty as NonEmpty
33 import qualified Data.Sequence as Seq
34 import qualified Data.Text as Text
35 import qualified Data.Text.Lazy as TL
36 import qualified Text.Megaparsec as P
37 import qualified Text.Megaparsec.Char as P
38
39 import Language.TCT.Debug
40 import Language.TCT.Cell
41 import Language.TCT.Elem
42 import Language.TCT.Tree
43 import Language.TCT.Read.Elem
44 import Language.TCT.Read.Cell
45
46 -- * Type 'Pairs'
47 -- | Right-only Dyck language,
48 -- to keep track of opened 'Pair's.
49 type Pairs = (Tokens,[Opening])
50 type Tokens = Trees (Cell Node)
51
52 -- ** Type 'Opening'
53 -- | An opened 'Pair' and its content so far.
54 type Opening = (Cell Pair,Tokens)
55
56 appendPairsToken :: Pairs -> Tree (Cell Node) -> Pairs
57 appendPairsToken ps t = appendPairsTokens ps (pure t)
58
59 appendPairsText :: Pairs -> Cell TL.Text -> Pairs
60 appendPairsText ps (Cell bp ep t) =
61 appendPairsToken ps $
62 Tree0 $ Cell bp ep $
63 NodeToken $ TokenText t
64
65 appendPairsTokens :: Pairs -> Tokens -> Pairs
66 appendPairsTokens (ts,[]) toks = (ts`unionTokens`toks,[])
67 appendPairsTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0`unionTokens`toks):ps)
68
69 -- | Unify two 'Tokens', merging border 'TokenText's if any.
70 unionTokens :: Tokens -> Tokens -> Tokens
71 unionTokens x y =
72 case (Seq.viewr x, Seq.viewl y) of
73 (xs :> x0, y0 :< ys) ->
74 case (x0,y0) of
75 ( Tree (Cell bx _ex (NodeToken (TokenText tx))) sx
76 , Tree (Cell _by ey (NodeToken (TokenText ty))) sy ) ->
77 xs `unionTokens`
78 pure (Tree (Cell bx ey $ NodeToken $ TokenText $ tx <> ty) (sx<>sy)) `unionTokens`
79 ys
80 _ -> x <> y
81 (EmptyR, _) -> y
82 (_, EmptyL) -> x
83
84 unionsTokens :: Foldable f => f Tokens -> Tokens
85 unionsTokens = foldl' unionTokens mempty
86
87 openPair :: Pairs -> Cell Pair -> Pairs
88 openPair (t,ps) p = (t,(p,mempty):ps)
89
90 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
91 closePair :: Pairs -> Cell Pair -> Pairs
92 closePair ps@(_,[]) (Cell bp ep p) = -- debug0 "closePair" $
93 appendPairsText ps $ Cell bp ep $
94 snd $ pairBordersDouble p
95 closePair (t,(p1,t1):ts) p = -- debug0 "closePair" $
96 case (p1,p) of
97 (Cell bx _ex (PairElem nx ax), Cell _by ey (PairElem ny ay)) | nx == ny ->
98 appendPairsToken (t,ts) $
99 Tree (Cell bx ey $ NodePair $ PairElem nx as) t1
100 where as | null ay = ax
101 | otherwise = ax<>ay
102 (Cell bx _ex x, Cell _by ey y) | x == y ->
103 appendPairsToken (t,ts) $
104 Tree (Cell bx ey $ NodePair x) t1
105 _ ->
106 (`closePair` p) $
107 appendPairsTokens
108 (t,ts)
109 (closeImpaired mempty (p1,t1))
110
111 -- | Close a 'Pair' when there is no matching 'LexemePairClose'.
112 closeImpaired :: Tokens -> (Cell Pair, Tokens) -> Tokens
113 closeImpaired acc (Cell bp ep pair, toks) = -- debug0 "closeImpaired" $
114 case pair of
115 -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'.
116 PairHash | Just (Cell _bt et tag, rest) <- tagFrom body ->
117 Tree0 (Cell bp et $ NodeToken $ TokenTag tag) <| rest
118 -- NOTE: use bp (not bt) to include the '#'
119 _ -> pure open `unionTokens` body
120 where
121 body = toks `unionTokens` acc
122 open = Tree0 $ Cell bp ep $ NodeToken $ TokenText $ fst $ pairBordersDouble pair
123
124 -- | Close remaining 'Pair's at end of parsing.
125 closePairs :: Pairs -> Tokens
126 closePairs (t0,ps) = -- debug0 "closePairs" $
127 t0 `unionTokens` foldl' closeImpaired mempty ps
128
129 appendLexeme :: Lexeme -> Pairs -> Pairs
130 appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc ->
131 case lex of
132 LexemePairOpen ps -> foldl' open acc ps
133 where
134 -- NOTE: insert an empty node to encode <elem></elem>, not <elem/>
135 open a p@(Cell _bp ep (PairElem{})) = openPair a p `appendPairsText` Cell ep ep ""
136 open a p = openPair a p
137 LexemePairClose ps -> foldl' closePair acc ps
138 LexemePairAny ps -> foldl' openPair acc ps
139 {-
140 LexemePairAny ps ->
141 appendPairsText acc $ sconcat $
142 ((fst . pairBordersSingle) <$>) <$> ps
143 -}
144 LexemePairBoth ps -> appendPairsTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps
145 LexemeEscape c -> appendPairsToken acc $ Tree0 $ NodeToken . TokenEscape <$> c
146 LexemeLink t -> appendPairsToken acc $ Tree0 $ NodeToken . TokenLink <$> t
147 {-LexemeWhite (unCell -> "") -> acc-}
148 -- LexemeWhite (unCell -> Text.all (==' ') -> True) -> acc
149 LexemeWhite t -> appendPairsText acc t
150 LexemeAlphaNum t -> appendPairsText acc t
151 LexemeOther t -> appendPairsText acc t
152 LexemeTree t -> appendPairsToken acc t
153 LexemeEnd -> acc
154
155 appendLexemes :: Pairs -> [Lexeme] -> Pairs
156 appendLexemes = foldr appendLexeme
157
158 -- * Type 'Lexeme'
159 -- | 'Lexeme's cut the input in the longest chunks of common semantic,
160 -- this enables 'orientLexemePairAny' to work with a more meaningful context.
161 data Lexeme
162 = LexemePairOpen !(NonEmpty (Cell Pair))
163 | LexemePairClose !(NonEmpty (Cell Pair))
164 | LexemePairAny !(NonEmpty (Cell Pair))
165 -- ^ orientation depending on the surrounding 'Lexeme's,
166 -- see 'orientLexemePairAny'
167 | LexemePairBoth !(NonEmpty (Cell Pair))
168 | LexemeEscape !(Cell Char)
169 | LexemeLink !(Cell TL.Text)
170 | LexemeWhite !(Cell TL.Text)
171 | LexemeAlphaNum !(Cell TL.Text)
172 | LexemeOther !(Cell TL.Text)
173 | LexemeTree !(Tree (Cell Node))
174 | LexemeEnd
175 deriving (Eq, Show)
176 instance Pretty Lexeme
177
178 parseTokens :: [Lexeme] -> Tokens
179 parseTokens ps =
180 closePairs $
181 appendLexemes mempty $
182 -- debug0 "Lexemes (post orient)" $
183 orientLexemePairAny $ LexemeEnd :
184 ps
185
186 parseLexemes ::
187 String ->
188 Cell TL.Text ->
189 Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme]
190 parseLexemes inp = runParserOnCell inp (p_Lexemes <* P.eof)
191
192 -- | Parse 'Lexeme's, returning them in reverse order
193 -- to apply 'orientLexemePairAny'.
194 p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme]
195 p_Lexemes = debugParser "Lexemes" $ go []
196 where
197 go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme]
198 go acc =
199 (P.eof $> acc) <|>
200 (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
201
202 -- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme',
203 -- so that it can try to orient nearby 'LexemePairAny'
204 -- to 'LexemePairOpen' or 'LexemePairClose'.
205 orientLexemePairAny :: [Lexeme] -> [Lexeme]
206 orientLexemePairAny = \case
207 -- LexemeOther (Cell _bx ex x):LexemeOther (Cell by _ey y):acc -> LexemeOther (Cell by ex (x<>y)):acc
208
209 -- "   
210 t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc
211 w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
212 LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc
213 --    "
214 LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc
215 LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
216 LexemePairAny p:[] -> LexemePairOpen p:[]
217
218 --    ,,,"
219 LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
220 LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[]
221 -- ",,,   
222 w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
223 LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc
224
225 -- ",,,AAA
226 an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
227 -- ,,,"AAA
228 an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
229
230 -- ")
231 c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
232 -- ("
233 LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
234
235 -- "(
236 o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
237 -- )"
238 LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairClose p:c:acc
239
240 acc -> acc
241
242 p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme
243 p_Lexeme = debugParser "Lexeme" $
244 P.choice
245 [ P.try $ LexemeWhite <$> p_Cell p_Spaces1
246 , P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny)
247 , P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle)
248 , P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen)
249 , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
250 , P.try $ LexemeEscape <$> p_Cell p_Escape
251 , P.try $ LexemeLink <$> p_Cell p_Link
252 , P.try $ LexemeAlphaNum <$> p_Cell (P.takeWhile1P (Just "AlphaNum") Char.isAlphaNum)
253 , LexemeOther <$> p_Cell (TL.singleton <$> P.anyChar)
254 ]
255
256 p_some :: Parser e s a -> Parser e s (NonEmpty a)
257 p_some p = NonEmpty.fromList <$> P.some p
258
259 pairAny :: Char -> Maybe Pair
260 pairAny = \case
261 '-' -> Just PairDash
262 '/' -> Just PairSlash
263 '"' -> Just PairDoublequote
264 '\'' -> Just PairSinglequote
265 '`' -> Just PairBackquote
266 '_' -> Just PairUnderscore
267 '*' -> Just PairStar
268 '#' -> Just PairHash
269 _ -> Nothing
270
271 pairOpen :: Char -> Maybe Pair
272 pairOpen = \case
273 '(' -> Just PairParen
274 '[' -> Just PairBracket
275 '{' -> Just PairBrace
276 '«' -> Just PairFrenchquote
277 _ -> Nothing
278
279 pairClose :: Char -> Maybe Pair
280 pairClose = \case
281 ')' -> Just PairParen
282 ']' -> Just PairBracket
283 '}' -> Just PairBrace
284 '»' -> Just PairFrenchquote
285 _ -> Nothing
286
287 p_Escape :: Parser e s Char
288 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
289
290 p_Link :: P.Tokens s ~ TL.Text => Parser e s TL.Text
291 p_Link =
292 P.try (P.char '<' *> p <* P.char '>') <|>
293 p
294 where
295 p :: P.Tokens s ~ TL.Text => Parser e s TL.Text
296 p =
297 (\scheme addr -> scheme <> "//" <> addr)
298 <$> P.option "" (P.try p_scheme)
299 <* P.string "//"
300 <*> p_addr
301 p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text
302 p_scheme =
303 (<> ":")
304 <$> (P.takeWhile1P (Just "scheme") $ \c ->
305 Char.isAlphaNum c
306 || c=='_'
307 || c=='-'
308 || c=='+')
309 <* P.char ':'
310 p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text
311 p_addr =
312 P.takeWhileP (Just "addr") $ \c ->
313 Char.isAlphaNum c
314 || c=='%'
315 || c=='/'
316 || c=='('
317 || c==')'
318 || c=='-'
319 || c=='_'
320 || c=='.'
321 || c=='#'
322 || c=='?'
323 || c=='='
324
325 pairBorders :: Foldable f => Pair -> f a -> (TL.Text,TL.Text)
326 pairBorders p ts | null ts = pairBordersSingle p
327 | otherwise = pairBordersDouble p
328
329 pairBordersSingle :: Pair -> (TL.Text,TL.Text)
330 pairBordersSingle = \case
331 PairElem n as ->
332 ("<"<>n<>foldMap f as<>"/>","")
333 where f (elemAttr_white,ElemAttr{..}) =
334 elemAttr_white <>
335 elemAttr_name <>
336 elemAttr_open <>
337 elemAttr_value <>
338 elemAttr_close
339 p -> pairBordersDouble p
340
341 pairBordersDouble :: Pair -> (TL.Text,TL.Text)
342 pairBordersDouble = \case
343 PairElem n as -> ("<"<>n<>foldMap f as<>">","</"<>n<>">")
344 where f (elemAttr_white,ElemAttr{..}) =
345 elemAttr_white <>
346 elemAttr_name <>
347 elemAttr_open <>
348 elemAttr_value <>
349 elemAttr_close
350 PairHash -> ("#","#")
351 PairStar -> ("*","*")
352 PairSlash -> ("/","/")
353 PairUnderscore -> ("_","_")
354 PairDash -> ("-","-")
355 PairBackquote -> ("`","`")
356 PairSinglequote -> ("'","'")
357 PairDoublequote -> ("\"","\"")
358 PairFrenchquote -> ("«","»")
359 PairParen -> ("(",")")
360 PairBrace -> ("{","}")
361 PairBracket -> ("[","]")
362
363 -- * Class 'TagFrom'
364 class TagFrom a where
365 tagFrom :: a -> Maybe (Cell Tag, a)
366 instance TagFrom Tokens where
367 tagFrom ts =
368 case Seq.viewl ts of
369 EmptyL -> Nothing
370 Tree0 (Cell b0 e0 n) :< ns ->
371 case n of
372 NodeToken (TokenText t) ->
373 case tagFrom $ Cell b0 e0 t of
374 Nothing -> Nothing
375 Just (t0,r0) ->
376 if TL.null $ unCell r0
377 then
378 case tagFrom ns of
379 Just (t1@(Cell b1 _e1 _), r1) | e0 == b1 ->
380 Just (t0<>t1, r1)
381 _ -> Just (t0, ns)
382 else Just (t0, pure n0 `unionTokens` ns)
383 where n0 = Tree0 $ NodeToken . TokenText <$> r0
384 _ -> Nothing
385 _ -> Nothing
386 instance TagFrom (Cell TL.Text) where
387 tagFrom (Cell bp ep t)
388 | (w,r) <- TL.span isTagChar t
389 , not $ TL.null w
390 , ew <- pos_column bp + sum (Text.length <$> (TL.toChunks w)) =
391 Just
392 ( Cell bp bp{pos_column=ew} w
393 , Cell bp{pos_column=ew} ep r )
394 tagFrom _ = Nothing
395
396 isTagChar :: Char -> Bool
397 isTagChar c =
398 Char.isAlphaNum c ||
399 c=='·' ||
400 case Char.generalCategory c of
401 Char.DashPunctuation -> True
402 Char.ConnectorPunctuation -> True
403 _ -> False
404
405 {-
406 -- | Build 'Tokens' from many 'Token's.
407 tokens :: [Cell Token] -> Tokens
408 tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts
409
410 -- | Build 'Tokens' from one 'Token'.
411 tokens1 :: Tree (Cell Node) -> Tokens
412 tokens1 = Seq.singleton
413
414 unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens))
415 unTokenElem toks =
416 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
417 [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts))
418 _ -> Nothing
419
420 isTokenElem :: Tokens -> Bool
421 isTokenElem toks =
422 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
423 [Tree (unCell -> NodePair PairElem{}) _] -> True
424 _ -> False
425 -}