]> Git — Sourcephile - doclang.git/blob - Language/TCT/Read/Token.hs
Add golden tests.
[doclang.git] / Language / TCT / Read / Token.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# OPTIONS_GHC -fno-warn-orphans #-}
8 module Language.TCT.Read.Token where
9
10 import Control.Applicative (Applicative(..), Alternative(..))
11 import Control.Monad (Monad(..))
12 import Data.Bool
13 import Data.Char (Char)
14 import Data.Either (Either(..))
15 import Data.Eq (Eq(..))
16 import Data.Foldable (Foldable(..))
17 import Data.Function (($), (.))
18 import Data.Functor ((<$>), ($>))
19 import Data.List.NonEmpty (NonEmpty(..))
20 import Data.Maybe (Maybe(..))
21 import Data.Monoid (Monoid(..))
22 import Data.Semigroup (Semigroup(..))
23 import Data.Sequence (ViewL(..), ViewR(..), (<|))
24 import Data.String (String)
25 import Data.TreeSeq.Strict (Tree(..), Trees)
26 import Data.Tuple (fst,snd)
27 import Data.Void (Void)
28 import Prelude (Num(..))
29 import Text.Show (Show(..))
30 import qualified Data.Char as Char
31 import qualified Data.List.NonEmpty as NonEmpty
32 import qualified Data.Sequence as Seq
33 import qualified Data.Text as Text
34 import qualified Data.Text.Lazy as TL
35 import qualified Text.Megaparsec as P
36 import qualified Text.Megaparsec.Char as P
37
38 import Language.TCT.Debug
39 import Language.TCT.Cell
40 import Language.TCT.Elem
41 import Language.TCT.Tree
42 import Language.TCT.Read.Elem
43 import Language.TCT.Read.Cell
44
45 -- * Type 'Pairs'
46 -- | Right-only Dyck language,
47 -- to keep track of opened 'Pair's.
48 type Pairs = (Tokens,[Opening])
49 type Tokens = Trees (Cell Node)
50
51 -- ** Type 'Opening'
52 -- | An opened 'Pair' and its content so far.
53 type Opening = (Cell Pair,Tokens)
54
55 appendPairsToken :: Pairs -> Tree (Cell Node) -> Pairs
56 appendPairsToken ps t = appendPairsTokens ps (pure t)
57
58 appendPairsText :: Pairs -> Cell TL.Text -> Pairs
59 appendPairsText ps (Cell bp ep t) =
60 appendPairsToken ps $
61 Tree0 $ Cell bp ep $
62 NodeToken $ TokenText t
63
64 appendPairsTokens :: Pairs -> Tokens -> Pairs
65 appendPairsTokens (ts,[]) toks = (ts`unionTokens`toks,[])
66 appendPairsTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0`unionTokens`toks):ps)
67
68 -- | Unify two 'Tokens', merging border 'TokenText's if any.
69 unionTokens :: Tokens -> Tokens -> Tokens
70 unionTokens x y =
71 case (Seq.viewr x, Seq.viewl y) of
72 (xs :> x0, y0 :< ys) ->
73 case (x0,y0) of
74 ( Tree (Cell bx _ex (NodeToken (TokenText tx))) sx
75 , Tree (Cell _by ey (NodeToken (TokenText ty))) sy ) ->
76 xs `unionTokens`
77 pure (Tree (Cell bx ey $ NodeToken $ TokenText $ tx <> ty) (sx<>sy)) `unionTokens`
78 ys
79 _ -> x <> y
80 (EmptyR, _) -> y
81 (_, EmptyL) -> x
82
83 unionsTokens :: Foldable f => f Tokens -> Tokens
84 unionsTokens = foldl' unionTokens mempty
85
86 openPair :: Pairs -> Cell Pair -> Pairs
87 openPair (t,ps) p = (t,(p,mempty):ps)
88
89 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
90 closePair :: Pairs -> Cell Pair -> Pairs
91 closePair ps@(_,[]) (Cell bp ep p) = -- debug0 "closePair" $
92 appendPairsText ps $ Cell bp ep $
93 snd $ pairBordersDouble p
94 closePair (t,(p1,t1):ts) p = -- debug0 "closePair" $
95 case (p1,p) of
96 (Cell bx _ex (PairElem nx ax), Cell _by ey (PairElem ny ay)) | nx == ny ->
97 appendPairsToken (t,ts) $
98 Tree (Cell bx ey $ NodePair $ PairElem nx as) t1
99 where as | null ay = ax
100 | otherwise = ax<>ay
101 (Cell bx _ex x, Cell _by ey y) | x == y ->
102 appendPairsToken (t,ts) $
103 Tree (Cell bx ey $ NodePair x) t1
104 _ ->
105 (`closePair` p) $
106 appendPairsTokens
107 (t,ts)
108 (closeImpaired mempty (p1,t1))
109
110 -- | Close a 'Pair' when there is no matching 'LexemePairClose'.
111 closeImpaired :: Tokens -> (Cell Pair, Tokens) -> Tokens
112 closeImpaired acc (Cell bp ep pair, toks) = -- debug0 "closeImpaired" $
113 case pair of
114 -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'.
115 PairHash | Just (Cell _bt et tag, rest) <- tagFrom body ->
116 Tree0 (Cell bp et $ NodeToken $ TokenTag tag) <| rest
117 -- NOTE: use bp (not bt) to include the '#'
118 _ -> pure open `unionTokens` body
119 where
120 body = toks `unionTokens` acc
121 open = Tree0 $ Cell bp ep $ NodeToken $ TokenText $ fst $ pairBordersDouble pair
122
123 -- | Close remaining 'Pair's at end of parsing.
124 closePairs :: Pairs -> Tokens
125 closePairs (t0,ps) = -- debug0 "closePairs" $
126 t0 `unionTokens` foldl' closeImpaired mempty ps
127
128 appendLexeme :: Lexeme -> Pairs -> Pairs
129 appendLexeme lex acc = -- debug2 "appendLexeme" "lex" "acc" $ \lex acc ->
130 case lex of
131 LexemePairOpen ps -> foldl' open acc ps
132 where
133 -- NOTE: insert an empty node to encode <elem></elem>, not <elem/>
134 open a p@(Cell _bp ep PairElem{}) = openPair a p `appendPairsText` Cell ep ep ""
135 open a p = openPair a p
136 LexemePairClose ps -> foldl' closePair acc ps
137 LexemePairAny ps -> foldl' openPair acc ps
138 {-
139 LexemePairAny ps ->
140 appendPairsText acc $ sconcat $
141 ((fst . pairBordersSingle) <$>) <$> ps
142 -}
143 LexemePairBoth ps -> appendPairsTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps
144 LexemeEscape c -> appendPairsToken acc $ Tree0 $ NodeToken . TokenEscape <$> c
145 LexemeLink t -> appendPairsToken acc $ Tree0 $ NodeToken . TokenLink <$> t
146 {-LexemeWhite (unCell -> "") -> acc-}
147 -- LexemeWhite (unCell -> Text.all (==' ') -> True) -> acc
148 LexemeWhite t -> appendPairsText acc t
149 LexemeAlphaNum t -> appendPairsText acc t
150 LexemeOther t -> appendPairsText acc t
151 LexemeTree t -> appendPairsToken acc t
152 LexemeEnd -> acc
153
154 appendLexemes :: Pairs -> [Lexeme] -> Pairs
155 appendLexemes = foldr appendLexeme
156
157 -- * Type 'Lexeme'
158 -- | 'Lexeme's cut the input in the longest chunks of common semantic,
159 -- this enables 'orientLexemePairAny' to work with a more meaningful context.
160 data Lexeme
161 = LexemePairOpen !(NonEmpty (Cell Pair))
162 | LexemePairClose !(NonEmpty (Cell Pair))
163 | LexemePairAny !(NonEmpty (Cell Pair))
164 -- ^ orientation depending on the surrounding 'Lexeme's,
165 -- see 'orientLexemePairAny'
166 | LexemePairBoth !(NonEmpty (Cell Pair))
167 | LexemeEscape !(Cell Char)
168 | LexemeLink !(Cell TL.Text)
169 | LexemeWhite !(Cell TL.Text)
170 | LexemeAlphaNum !(Cell TL.Text)
171 | LexemeOther !(Cell TL.Text)
172 | LexemeTree !(Tree (Cell Node))
173 | LexemeEnd
174 deriving (Eq, Show)
175 instance Pretty Lexeme
176
177 parseTokens :: [Lexeme] -> Tokens
178 parseTokens ps =
179 closePairs $
180 appendLexemes mempty $
181 -- debug0 "Lexemes (post orient)" $
182 orientLexemePairAny $ LexemeEnd :
183 ps
184
185 parseLexemes ::
186 String ->
187 Cell TL.Text ->
188 Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme]
189 parseLexemes inp = runParserOnCell inp (p_Lexemes <* P.eof)
190
191 -- | Parse 'Lexeme's, returning them in reverse order
192 -- to apply 'orientLexemePairAny'.
193 p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme]
194 p_Lexemes = debugParser "Lexemes" $ go []
195 where
196 go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme]
197 go acc =
198 (P.eof $> acc) <|>
199 (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
200
201 -- | Must be called on parsed 'Lexeme's after the prepending of any new 'Lexeme',
202 -- so that it can try to orient nearby 'LexemePairAny'
203 -- to 'LexemePairOpen' or 'LexemePairClose'.
204 orientLexemePairAny :: [Lexeme] -> [Lexeme]
205 orientLexemePairAny = \case
206 -- LexemeOther (Cell _bx ex x):LexemeOther (Cell by _ey y):acc -> LexemeOther (Cell by ex (x<>y)):acc
207
208 -- "   
209 t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc
210 w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
211 LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc
212 --    "
213 LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc
214 LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
215 LexemePairAny p:[] -> LexemePairOpen p:[]
216
217 --    ,,,"
218 LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
219 LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[]
220 -- ",,,   
221 w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
222 LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc
223
224 -- ",,,AAA
225 an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
226 -- ,,,"AAA
227 an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
228
229 -- ")
230 c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
231 -- ("
232 LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
233
234 -- "(
235 o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
236 -- )"
237 LexemePairAny p:c@LexemePairClose{}:acc -> LexemePairClose p:c:acc
238
239 acc -> acc
240
241 p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme
242 p_Lexeme = debugParser "Lexeme" $
243 P.choice
244 [ P.try $ LexemeWhite <$> p_Cell p_Spaces1
245 , P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny)
246 , P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle)
247 , P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen)
248 , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
249 , P.try $ LexemeEscape <$> p_Cell p_Escape
250 , P.try $ LexemeLink <$> p_Cell p_Link
251 , P.try $ LexemeAlphaNum <$> p_Cell (P.takeWhile1P (Just "AlphaNum") Char.isAlphaNum)
252 , LexemeOther <$> p_Cell (TL.singleton <$> P.anyChar)
253 ]
254
255 p_some :: Parser e s a -> Parser e s (NonEmpty a)
256 p_some p = NonEmpty.fromList <$> P.some p
257
258 pairAny :: Char -> Maybe Pair
259 pairAny = \case
260 '-' -> Just PairDash
261 '/' -> Just PairSlash
262 '"' -> Just PairDoublequote
263 '\'' -> Just PairSinglequote
264 '`' -> Just PairBackquote
265 '_' -> Just PairUnderscore
266 '*' -> Just PairStar
267 '#' -> Just PairHash
268 _ -> Nothing
269
270 pairOpen :: Char -> Maybe Pair
271 pairOpen = \case
272 '(' -> Just PairParen
273 '[' -> Just PairBracket
274 '{' -> Just PairBrace
275 '«' -> Just PairFrenchquote
276 _ -> Nothing
277
278 pairClose :: Char -> Maybe Pair
279 pairClose = \case
280 ')' -> Just PairParen
281 ']' -> Just PairBracket
282 '}' -> Just PairBrace
283 '»' -> Just PairFrenchquote
284 _ -> Nothing
285
286 p_Escape :: Parser e s Char
287 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
288
289 p_Link :: P.Tokens s ~ TL.Text => Parser e s TL.Text
290 p_Link =
291 P.try (P.char '<' *> p <* P.char '>') <|>
292 p
293 where
294 p :: P.Tokens s ~ TL.Text => Parser e s TL.Text
295 p =
296 (\scheme addr -> scheme <> "//" <> addr)
297 <$> P.option "" (P.try p_scheme)
298 <* P.string "//"
299 <*> p_addr
300 p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text
301 p_scheme =
302 (<> ":")
303 <$> (P.takeWhile1P (Just "scheme") $ \c ->
304 Char.isAlphaNum c
305 || c=='_'
306 || c=='-'
307 || c=='+')
308 <* P.char ':'
309 p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text
310 p_addr =
311 P.takeWhileP (Just "addr") $ \c ->
312 Char.isAlphaNum c
313 || c=='%'
314 || c=='/'
315 || c=='('
316 || c==')'
317 || c=='-'
318 || c=='_'
319 || c=='.'
320 || c=='#'
321 || c=='?'
322 || c=='='
323
324 pairBorders :: Foldable f => Pair -> f a -> (TL.Text,TL.Text)
325 pairBorders p ts | null ts = pairBordersSingle p
326 | otherwise = pairBordersDouble p
327
328 pairBordersSingle :: Pair -> (TL.Text,TL.Text)
329 pairBordersSingle = \case
330 PairElem n as ->
331 ("<"<>n<>foldMap f as<>"/>","")
332 where f (elemAttr_white,ElemAttr{..}) =
333 elemAttr_white <>
334 elemAttr_name <>
335 elemAttr_open <>
336 elemAttr_value <>
337 elemAttr_close
338 p -> pairBordersDouble p
339
340 pairBordersDouble :: Pair -> (TL.Text,TL.Text)
341 pairBordersDouble = \case
342 PairElem n as -> ("<"<>n<>foldMap f as<>">","</"<>n<>">")
343 where f (elemAttr_white,ElemAttr{..}) =
344 elemAttr_white <>
345 elemAttr_name <>
346 elemAttr_open <>
347 elemAttr_value <>
348 elemAttr_close
349 PairHash -> ("#","#")
350 PairStar -> ("*","*")
351 PairSlash -> ("/","/")
352 PairUnderscore -> ("_","_")
353 PairDash -> ("-","-")
354 PairBackquote -> ("`","`")
355 PairSinglequote -> ("'","'")
356 PairDoublequote -> ("\"","\"")
357 PairFrenchquote -> ("«","»")
358 PairParen -> ("(",")")
359 PairBrace -> ("{","}")
360 PairBracket -> ("[","]")
361
362 -- * Class 'TagFrom'
363 class TagFrom a where
364 tagFrom :: a -> Maybe (Cell Tag, a)
365 instance TagFrom Tokens where
366 tagFrom ts =
367 case Seq.viewl ts of
368 EmptyL -> Nothing
369 Tree0 (Cell b0 e0 n) :< ns ->
370 case n of
371 NodeToken (TokenText t) ->
372 case tagFrom $ Cell b0 e0 t of
373 Nothing -> Nothing
374 Just (t0,r0) ->
375 if TL.null $ unCell r0
376 then
377 case tagFrom ns of
378 Just (t1@(Cell b1 _e1 _), r1) | e0 == b1 ->
379 Just (t0<>t1, r1)
380 _ -> Just (t0, ns)
381 else Just (t0, pure n0 `unionTokens` ns)
382 where n0 = Tree0 $ NodeToken . TokenText <$> r0
383 _ -> Nothing
384 _ -> Nothing
385 instance TagFrom (Cell TL.Text) where
386 tagFrom (Cell bp ep t)
387 | (w,r) <- TL.span isTagChar t
388 , not $ TL.null w
389 , ew <- pos_column bp + sum (Text.length <$> TL.toChunks w) =
390 Just
391 ( Cell bp bp{pos_column=ew} w
392 , Cell bp{pos_column=ew} ep r )
393 tagFrom _ = Nothing
394
395 isTagChar :: Char -> Bool
396 isTagChar c =
397 Char.isAlphaNum c ||
398 c=='·' ||
399 case Char.generalCategory c of
400 Char.DashPunctuation -> True
401 Char.ConnectorPunctuation -> True
402 _ -> False
403
404 {-
405 -- | Build 'Tokens' from many 'Token's.
406 tokens :: [Cell Token] -> Tokens
407 tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts
408
409 -- | Build 'Tokens' from one 'Token'.
410 tokens1 :: Tree (Cell Node) -> Tokens
411 tokens1 = Seq.singleton
412
413 unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens))
414 unTokenElem toks =
415 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
416 [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts))
417 _ -> Nothing
418
419 isTokenElem :: Tokens -> Bool
420 isTokenElem toks =
421 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
422 [Tree (unCell -> NodePair PairElem{}) _] -> True
423 _ -> False
424 -}