]> Git — Sourcephile - doclang.git/blob - Language/TCT/Read/Token.hs
Add NodePara and NodeGroup.
[doclang.git] / Language / TCT / Read / Token.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE OverloadedStrings #-}
4 {-# LANGUAGE Rank2Types #-}
5 {-# LANGUAGE ScopedTypeVariables #-}
6 {-# LANGUAGE TypeFamilies #-}
7 {-# LANGUAGE ViewPatterns #-}
8 module Language.TCT.Read.Token where
9
10 import Control.Applicative (Applicative(..), Alternative(..))
11 import Control.Monad (Monad(..))
12 import Data.Bool
13 import Data.Char (Char)
14 import Data.Eq (Eq(..))
15 import Data.Either (Either(..))
16 import Data.Foldable (Foldable(..))
17 import Data.Function (($), (.))
18 import Data.Functor ((<$>), ($>))
19 import Data.List.NonEmpty (NonEmpty(..))
20 import Data.Maybe (Maybe(..))
21 import Data.Monoid (Monoid(..))
22 import Data.Semigroup (Semigroup(..))
23 import Data.Sequence (Seq, ViewL(..), ViewR(..), (<|), (|>))
24 import Data.String (String)
25 import Data.TreeSeq.Strict (Tree(..), Trees)
26 import Data.Tuple (fst,snd)
27 import Data.Void (Void)
28 import Prelude (Num(..))
29 import Text.Show (Show(..))
30 import qualified Data.Char as Char
31 import qualified Data.List.NonEmpty as NonEmpty
32 import qualified Data.Sequence as Seq
33 import qualified Data.Text as Text
34 import qualified Data.Text.Lazy as TL
35 import qualified Text.Megaparsec as P
36 import qualified Text.Megaparsec.Char as P
37
38 import Language.TCT.Cell
39 import Language.TCT.Elem
40 -- import Language.TCT.Token
41 import Language.TCT.Tree
42 import Language.TCT.Read.Elem
43 import Language.TCT.Read.Cell
44
45 instance Pretty Pair where
46 pretty = return . show
47 instance Pretty a => Pretty (Cell a) where
48 pretty (Cell bp ep m) = do
49 s <- pretty m
50 return $ "Cell "<>show bp<>":"<>show ep<>" "<>s
51 instance Pretty Lexeme where
52 pretty = return . show
53
54 -- * Type 'Pairs'
55 -- | Right-only Dyck language
56 type Pairs = (Tokens,[Opening])
57 type Tokens = Trees (Cell Node)
58
59 -- ** Type 'Opening'
60 type Opening = (Cell Pair,Tokens)
61
62 appendToken :: Pairs -> Tree (Cell Node) -> Pairs
63 appendToken (ts,[]) tok = (ts|>tok,[])
64 appendToken (ts,(p0,t0):ps) tok = (ts,(p0,t0|>tok):ps)
65
66 appendTokens :: Pairs -> Tokens -> Pairs
67 appendTokens (ts,[]) toks = (ts<>toks,[])
68 appendTokens (ts,(p0,t0):ps) toks = (ts,(p0,t0<>toks):ps)
69
70 appendText :: Pairs -> Cell TL.Text -> Pairs
71 appendText ps tok =
72 case ps of
73 (ts,[]) -> (appendTokenText ts tok,[])
74 (ts,(p0,ts0):pss) -> (ts,(p0,appendTokenText ts0 tok):pss)
75
76 appendTokenText :: Tokens -> Cell TL.Text -> Tokens
77 appendTokenText ts (Cell bn en n)
78 {-
79 | TL.null n = ts
80 | otherwise-} =
81 case Seq.viewr ts of
82 EmptyR -> pure $ Tree0 $ Cell bn en $ NodeToken $ TokenText n
83 is :> Tree (Cell bo _eo nod) st ->
84 case nod of
85 NodeToken (TokenText o) -> is |> i
86 where i = Tree (Cell bo en (NodeToken $ TokenText (o <> n))) st
87 _ -> ts |> Tree0 (Cell bn en $ NodeToken $ TokenText n)
88
89 prependTokenText :: Tokens -> Cell TL.Text -> Tokens
90 prependTokenText ts (Cell bn en n)
91 {-
92 | TL.null n = ts
93 | otherwise-} =
94 case Seq.viewl ts of
95 EmptyL -> pure $ Tree0 $ Cell bn en $ NodeToken $ TokenText n
96 Tree (Cell _bo eo nod) st :< is ->
97 case nod of
98 NodeToken (TokenText o) -> i <| is
99 where i = Tree (Cell bn eo (NodeToken $ TokenText (n <> o))) st
100 _ -> Tree0 (Cell bn en $ NodeToken $ TokenText n) <| ts
101
102 openPair :: Pairs -> Cell Pair -> Pairs
103 openPair (t,ps) p = (t,(p,mempty):ps)
104
105 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
106 closePair :: Pairs -> Cell Pair -> Pairs
107 closePair ps@(_,[]) (Cell bp ep p) = -- dbg "closePair" $
108 appendText ps $ Cell bp ep $ snd $ pairBorders p
109 closePair (t,(p1,t1):ts) p = -- dbg "closePair" $
110 case (p1,p) of
111 (Cell bx _ex (PairElem nx ax), Cell _by ey (PairElem ny ay)) | nx == ny ->
112 appendToken (t,ts) $
113 Tree (Cell bx ey $ NodePair $ PairElem nx as) t1
114 where as | null ay = ax
115 | otherwise = ax<>ay
116 (Cell bx _ex x, Cell _by ey y) | x == y ->
117 appendToken (t,ts) $
118 Tree (Cell bx ey $ NodePair x) t1
119 _ ->
120 (`closePair` p) $
121 appendTokens
122 (t,ts)
123 (closeImpaired mempty (p1,t1))
124
125 -- | Close a 'Pair' when there is no matching 'LexemePairClose'.
126 closeImpaired :: Tokens -> (Cell Pair,Tokens) -> Tokens
127 closeImpaired acc (Cell bp ep p,toks) = dbg "closeImpaired" $
128 case p of
129 -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'.
130 PairHash | Just (Cell bt et t, ts) <- tagFrom $ toks <> acc ->
131 Tree0 (Cell bt et $ NodeToken $ TokenTag t) <| ts
132 {-
133 PairHash | (Tree0 (Cell bt et (TokenPlain t))) :< ts <- Seq.viewl $ toks <> acc ->
134 case Text.span isTagChar t of
135 ("",_) | Text.null t -> toksHash mempty <> toks <> acc
136 | otherwise -> Tree0 (Cell bp et (TokenTag t)) <| ts
137 (tag,t') ->
138 let len = Text.length tag in
139 Tree0 (Cell bp bt{columnPos = columnPos bt + len} (TokenTag tag)) <|
140 Tree0 (Cell bt{columnPos = columnPos bt + len + 1} et (TokenPlain t')) <|
141 ts
142 -}
143 _ -> prependTokenText (toks <> acc) toksHash
144 where
145 toksHash :: Cell TL.Text
146 toksHash = Cell bp ep $ fst $ pairBorders p
147
148 isTagChar :: Char -> Bool
149 isTagChar c =
150 Char.isAlphaNum c ||
151 c=='·' ||
152 case Char.generalCategory c of
153 Char.DashPunctuation -> True
154 Char.ConnectorPunctuation -> True
155 _ -> False
156
157 -- * Class 'TagFrom'
158 class TagFrom a where
159 tagFrom :: a -> Maybe (Cell Tag, a)
160 instance TagFrom Tokens where
161 tagFrom ts =
162 case Seq.viewl ts of
163 EmptyL -> Nothing
164 Tree0 (Cell b0 e0 n) :< ns ->
165 case n of
166 NodeToken (TokenText t) ->
167 case tagFrom $ Cell b0 e0 t of
168 Nothing -> Nothing
169 Just (t0,r0) ->
170 if TL.null (unCell r0)
171 then
172 case tagFrom ns of
173 Just (t1@(Cell b1 _e1 _), r1) | e0 == b1 ->
174 Just (t0<>t1, r1)
175 _ -> Just (t0, n0 <| ns)
176 else Just (t0, n0 <| ns)
177 where n0 = (Tree0 $ NodeToken . TokenText <$> r0)
178 _ -> Nothing
179 _ -> Nothing
180 instance TagFrom (Cell TL.Text) where
181 tagFrom (Cell bp ep t)
182 | (w,r) <- TL.span isTagChar t
183 , not $ TL.null w
184 , ew <- pos_column bp + sum (Text.length <$> (TL.toChunks w)) =
185 Just
186 ( Cell bp bp{pos_column=ew} w
187 , Cell bp{pos_column=ew} ep r )
188 tagFrom _ = Nothing
189
190 -- | Close remaining 'Pair's at end of parsing.
191 closePairs :: Pairs -> Tokens
192 closePairs (t0,ps) = -- dbg "closePairs" $
193 t0 <> foldl' closeImpaired mempty ps
194
195 appendLexeme :: Lexeme -> Pairs -> Pairs
196 appendLexeme lex acc =
197 -- dbg "appendLexeme" $
198 case lex of
199 LexemePairOpen ps -> foldl' openPair acc ps
200 {-
201 where
202 open a p@(Cell _bp ep (PairElem{})) = openPair a p `appendToken` (Tree0 $ Cell ep ep $ TokenPhrase $ PhraseWhite "")
203 open a p = openPair a p
204 -}
205 LexemePairClose ps -> foldl' closePair acc ps
206 LexemePairAny ps ->
207 appendText acc $ sconcat $
208 ((fst . pairBordersWithoutContent) <$>) <$> ps
209 LexemePairBoth ps -> appendTokens acc $ Seq.fromList $ toList $ Tree0 . (NodePair <$>) <$> ps
210 LexemeEscape c -> appendToken acc $ Tree0 $ NodeToken . TokenEscape <$> c
211 LexemeLink t -> appendToken acc $ Tree0 $ NodeToken . TokenLink <$> t
212 {-LexemeWhite (unCell -> "") -> acc-}
213 -- LexemeWhite (unCell -> Text.all (==' ') -> True) -> acc
214 LexemeWhite t -> appendText acc t
215 LexemeAlphaNum t -> appendText acc t
216 LexemeOther t -> appendText acc t
217 LexemeTree t -> appendToken acc t
218 LexemeEnd -> acc
219
220 {- TODEL
221 appendTokenChild :: Pairs -> Tree (Cell Node) -> Pairs
222 appendTokenChild pairs tree =
223 debug "appendTokenChild" "pairs" pairs $
224 debug "appendTokenChild" "tree" tree $
225 dbg "appendTokenChild" $
226 go pairs tree
227 where
228 go (ts@(toList -> [unTree -> Cell bo _eo NodeText{}]),[])
229 tok@(Tree (Cell _bn en _n) _ns) =
230 (pure $ Tree (Cell bo en NodePara) (ts |> tok),[])
231 go (ts,[]) tok = (ts |> tok,[])
232 go (ts,(p0,t0):ps) tok = (ts,(p0,t0|>tok):ps)
233 -}
234
235 appendLexemes :: Pairs -> [Lexeme] -> Pairs
236 appendLexemes = foldr appendLexeme
237
238 -- * Type 'Lexeme'
239 data Lexeme
240 = LexemePairOpen !(NonEmpty (Cell Pair))
241 | LexemePairClose !(NonEmpty (Cell Pair))
242 | LexemePairAny !(NonEmpty (Cell Pair))
243 | LexemePairBoth !(NonEmpty (Cell Pair))
244 | LexemeEscape !(Cell Char)
245 | LexemeLink !(Cell TL.Text)
246 | LexemeWhite !(Cell TL.Text)
247 | LexemeAlphaNum !(Cell TL.Text)
248 | LexemeOther !(Cell TL.Text)
249 | LexemeTree !(Tree (Cell Node))
250 | LexemeEnd
251 deriving (Eq, Show)
252
253 -- ** Type 'Lexemes'
254 type Lexemes = Seq Lexeme
255
256 parseTokens :: [Lexeme] -> Tokens
257 parseTokens ps =
258 closePairs $
259 appendLexemes mempty $
260 -- dbg "Lexemes (post orient)" $
261 orientLexemePairAny $ LexemeEnd :
262 ps
263
264 parseLexemes ::
265 String ->
266 Cell TL.Text ->
267 Either (P.ParseError (P.Token TL.Text) (P.ErrorFancy Void)) [Lexeme]
268 parseLexemes inp = runParserOnCell inp (p_Lexemes <* P.eof)
269
270 -- | Parse 'Lexeme's, returning them in reverse order to apply 'orientLexemePairAny'.
271 p_Lexemes :: P.Tokens s ~ TL.Text => Parser e s [Lexeme]
272 p_Lexemes = pdbg "Lexemes" $ go []
273 where
274 go :: P.Tokens s ~ TL.Text => [Lexeme] -> Parser e s [Lexeme]
275 go acc =
276 (P.eof $> acc) <|>
277 (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
278
279 orientLexemePairAny :: [Lexeme] -> [Lexeme]
280 orientLexemePairAny = \case
281 -- LexemeOther (Cell _bx ex x):LexemeOther (Cell by _ey y):acc -> LexemeOther (Cell by ex (x<>y)):acc
282
283 -- "   
284 t@LexemeTree{}:LexemePairAny p:acc -> t:LexemePairClose p:acc
285 w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
286 LexemeEnd:LexemePairAny p:acc -> LexemePairClose p:acc
287 --    "
288 LexemePairAny p:t@LexemeTree{}:acc -> LexemePairOpen p:t:acc
289 LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
290 LexemePairAny p:[] -> LexemePairOpen p:[]
291
292 --    ,,,"
293 LexemePairAny p:a@LexemeOther{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
294 LexemePairAny p:a@LexemeOther{}:[] -> LexemePairOpen p:a:[]
295 -- ",,,   
296 w@LexemeWhite{}:a@LexemeOther{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
297 LexemeEnd:a@LexemeOther{}:LexemePairAny p:acc -> a:LexemePairClose p:acc
298
299 -- ",,,AAA
300 an@LexemeAlphaNum{}:a@LexemeOther{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
301 -- ,,,"AAA
302 an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeOther{}:acc -> an:LexemePairOpen p:a:acc
303
304 -- ")
305 c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
306 -- ("
307 LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
308
309 -- "(
310 o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
311 -- )"
312 LexemePairAny p:c@LexemePairClose{}:acc -> c:LexemePairClose p:acc
313
314 acc -> acc
315
316 p_some :: Parser e s a -> Parser e s (NonEmpty a)
317 p_some p = NonEmpty.fromList <$> P.some p
318
319 p_Lexeme :: P.Tokens s ~ TL.Text => Parser e s Lexeme
320 p_Lexeme = pdbg "Lexeme" $
321 P.choice
322 [ P.try $ LexemeWhite <$> p_Cell p_Spaces1
323 , P.try $ LexemePairAny <$> p_some (p_Cell $ p_satisfyMaybe pairAny)
324 , P.try $ LexemePairBoth <$> p_some (P.try $ p_Cell p_ElemSingle)
325 , P.try $ LexemePairOpen <$> p_some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen)
326 , P.try $ LexemePairClose <$> p_some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
327 , P.try $ LexemeEscape <$> p_Cell p_Escape
328 , P.try $ LexemeLink <$> p_Cell p_Link
329 , P.try $ LexemeAlphaNum <$> p_Cell (P.takeWhile1P (Just "AlphaNum") Char.isAlphaNum)
330 , LexemeOther <$> p_Cell (TL.singleton <$> P.anyChar)
331 ]
332
333 pairAny :: Char -> Maybe Pair
334 pairAny = \case
335 '-' -> Just PairDash
336 '/' -> Just PairSlash
337 '"' -> Just PairDoublequote
338 '\'' -> Just PairSinglequote
339 '`' -> Just PairBackquote
340 '_' -> Just PairUnderscore
341 '*' -> Just PairStar
342 '#' -> Just PairHash
343 _ -> Nothing
344
345 pairOpen :: Char -> Maybe Pair
346 pairOpen = \case
347 '(' -> Just PairParen
348 '[' -> Just PairBracket
349 '{' -> Just PairBrace
350 '«' -> Just PairFrenchquote
351 _ -> Nothing
352
353 pairClose :: Char -> Maybe Pair
354 pairClose = \case
355 ')' -> Just PairParen
356 ']' -> Just PairBracket
357 '}' -> Just PairBrace
358 '»' -> Just PairFrenchquote
359 _ -> Nothing
360
361 p_Escape :: Parser e s Char
362 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
363
364 p_Link :: P.Tokens s ~ TL.Text => Parser e s TL.Text
365 p_Link =
366 P.try (P.char '<' *> p <* P.char '>') <|>
367 p
368 where
369 p :: P.Tokens s ~ TL.Text => Parser e s TL.Text
370 p =
371 (\scheme addr -> scheme <> "//" <> addr)
372 <$> P.option "" (P.try p_scheme)
373 <* P.string "//"
374 <*> p_addr
375 p_scheme :: P.Tokens s ~ TL.Text => Parser e s TL.Text
376 p_scheme =
377 (<> ":")
378 <$> (P.takeWhile1P (Just "scheme") $ \c ->
379 Char.isAlphaNum c
380 || c=='_'
381 || c=='-'
382 || c=='+')
383 <* P.char ':'
384 p_addr :: P.Tokens s ~ TL.Text => Parser e s TL.Text
385 p_addr =
386 P.takeWhileP (Just "addr") $ \c ->
387 Char.isAlphaNum c
388 || c=='%'
389 || c=='/'
390 || c=='('
391 || c==')'
392 || c=='-'
393 || c=='_'
394 || c=='.'
395 || c=='#'
396 || c=='?'
397 || c=='='
398
399 -- | Build 'Tokens' from many 'Token's.
400 tokens :: [Cell Token] -> Tokens
401 tokens ts = Seq.fromList $ Tree0 . (NodeToken <$>) <$> ts
402
403 -- | Build 'Tokens' from one 'Token'.
404 tokens1 :: Tree (Cell Node) -> Tokens
405 tokens1 = Seq.singleton
406
407 unTokenElem :: Tokens -> Maybe (Cell (ElemName,ElemAttrs,Tokens))
408 unTokenElem toks =
409 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
410 [Tree (Cell bp ep (NodePair (PairElem e as))) ts] -> Just (Cell bp ep (e,as,ts))
411 _ -> Nothing
412
413 isTokenElem :: Tokens -> Bool
414 isTokenElem toks =
415 case toList $ {-Seq.dropWhileR isTokenWhite-} toks of
416 [Tree (unCell -> NodePair PairElem{}) _] -> True
417 _ -> False
418
419 pairBordersWithoutContent :: Pair -> (TL.Text,TL.Text)
420 pairBordersWithoutContent = \case
421 PairElem n as ->
422 ("<"<>n<>foldMap f as<>"/>","")
423 where f (elemAttr_white,ElemAttr{..}) =
424 elemAttr_white <>
425 elemAttr_name <>
426 elemAttr_open <>
427 elemAttr_value <>
428 elemAttr_close
429 p -> pairBorders p
430
431 pairBorders :: Pair -> (TL.Text,TL.Text)
432 pairBorders = \case
433 PairElem n as -> ("<"<>n<>foldMap f as<>">","</"<>n<>">")
434 where f (elemAttr_white,ElemAttr{..}) =
435 elemAttr_white <>
436 elemAttr_name <>
437 elemAttr_open <>
438 elemAttr_value <>
439 elemAttr_close
440 PairHash -> ("#","#")
441 PairStar -> ("*","*")
442 PairSlash -> ("/","/")
443 PairUnderscore -> ("_","_")
444 PairDash -> ("-","-")
445 PairBackquote -> ("`","`")
446 PairSinglequote -> ("'","'")
447 PairDoublequote -> ("\"","\"")
448 PairFrenchquote -> ("«","»")
449 PairParen -> ("(",")")
450 PairBrace -> ("{","}")
451 PairBracket -> ("[","]")