]> Git — Sourcephile - doclang.git/blob - Language/TCT/Read/Token.hs
WIP add paragraph recognition, enabling footnote with note: instead of only <note>.
[doclang.git] / Language / TCT / Read / Token.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE OverloadedStrings #-}
3 {-# LANGUAGE Rank2Types #-}
4 {-# LANGUAGE TypeFamilies #-}
5 {-# LANGUAGE ViewPatterns #-}
6 module Language.TCT.Read.Token where
7
8 -- import Data.Text.Buildable (Buildable(..))
9 -- import qualified Data.Text.Lazy as TL
10 -- import qualified Data.Text.Lazy.Builder as Builder
11 import Control.Applicative (Applicative(..), Alternative(..))
12 import Control.Monad (Monad(..))
13 import Data.Bool
14 import Data.Char (Char)
15 import Data.Int (Int)
16 import Data.Eq (Eq(..))
17 import Data.Ord (Ord(..))
18 import Data.Foldable (Foldable(..))
19 import Data.Sequence (Seq)
20 import Data.Function (($), (.))
21 import Data.Functor ((<$>), ($>), (<$))
22 import Data.Maybe (Maybe(..))
23 import Data.Monoid (Monoid(..))
24 import Data.Semigroup (Semigroup(..))
25 import Data.Sequence (ViewL(..), (<|))
26 import Data.Text (Text)
27 import Data.TreeSeq.Strict (Tree(..))
28 import Data.Tuple (fst,snd)
29 import Prelude (Num(..))
30 import Text.Show (Show(..))
31 import qualified Data.Char as Char
32 import qualified Data.Sequence as Seq
33 import qualified Data.Text as Text
34 import qualified Text.Megaparsec as P
35 import qualified Text.Megaparsec.Char as P
36 import qualified System.FilePath as FP
37
38 import Language.TCT.Cell
39 import Language.TCT.Elem
40 import Language.TCT.Read.Elem
41 import Language.TCT.Read.Cell
42
43 -- * Type 'Row'
44 -- | In normal order: a list of 'Key's, maybe ended by 'Value', all read on the same line.
45 type Row = [Tree (Cell Key) (Cell Value)]
46
47 -- * Type 'Key'
48 data Key
49 = KeyColon !Name !White -- ^ @name: @
50 | KeyEqual !Name !White -- ^ @name=@
51 | KeyBar !Name !White -- ^ @name|@
52 | KeyGreat !Name !White -- ^ @name>@
53 | KeyLower !Name !Attrs -- ^ @<name a=b@
54 | KeyDot !Name -- ^ @1. @
55 | KeyDash -- ^ @- @
56 | KeyDashDash -- ^ @-- @
57 | KeySection !LevelSection -- ^ @# @
58 | KeyBrackets !Name -- ^ @[name]@
59 | KeyDotSlash !PathFile -- ^ @./file @
60 | KeyPara
61 deriving (Eq, Ord, Show)
62
63 -- ** Type 'Name'
64 type Name = Text
65
66 -- ** Type 'Value'
67 type Value = Text
68
69 -- ** Type 'PathFile'
70 type PathFile = FP.FilePath
71
72 -- ** Type 'LevelSection'
73 type LevelSection = Int
74
75 -- * Type 'Rows'
76 -- | In reverse order: a list of nodes in scope
77 -- (hence to which the next line can append to).
78 type Rows = [Tree (Cell Key) (Cell Value)]
79
80 -- * Type 'Token'
81 type Token = Tree (Cell TokenKey) (Cell TokenValue)
82
83 -- ** Type 'Tokens'
84 type Tokens = Seq Token
85
86 -- ** Type 'TokenKey'
87 type TokenKey = Pair
88 data Pair
89 = PairHash -- ^ @#value#@
90 | PairElem !Elem !Attrs -- ^ @<elem n0=v0 n1=v1>value</elem>@
91 | PairStar -- ^ @*value*@
92 | PairSlash -- ^ @/value/@
93 | PairUnderscore -- ^ @_value_@
94 | PairDash -- ^ @-value-@
95 | PairBackquote -- ^ @`value`@
96 | PairSinglequote -- ^ @'value'@
97 | PairDoublequote -- ^ @"value"@
98 | PairFrenchquote -- ^ @«value»@
99 | PairParen -- ^ @(value)@
100 | PairBrace -- ^ @{value}@
101 | PairBracket -- ^ @[value]@
102 deriving (Eq,Ord,Show)
103
104 -- ** Type 'TokenValue'
105 data TokenValue
106 = TokenPlain !Text
107 | TokenTag !Tag
108 | TokenEscape !Char
109 | TokenLink !Text
110 | TokenTree (Tree (Cell Key) (Cell Value))
111 deriving (Eq,Ord,Show)
112
113 -- ** Type 'Tag'
114 type Tag = Text
115
116 -- * Type 'Pairs'
117 -- | Right-only Dyck language
118 type Pairs = (Tokens,[Opening])
119
120 -- ** Type 'Opening'
121 type Opening = (Cell Pair,Tokens)
122
123 appendToken :: Pairs -> Token -> Pairs
124 appendToken ps = appendTokens ps . Seq.singleton
125
126 appendTokens :: Pairs -> Tokens -> Pairs
127 appendTokens (t,[]) toks = (t<>toks,[])
128 appendTokens (t,(p0,t0):ps) toks = (t,(p0,t0<>toks):ps)
129
130 openPair :: Pairs -> Cell Pair -> Pairs
131 openPair (t,ms) p = (t,(p,mempty):ms)
132
133 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
134 closePair :: Pairs -> Cell Pair -> Pairs
135 closePair ps@(_,[]) (Cell bp ep p) = dbg "closePair" $
136 appendToken ps $
137 Tree0 $ Cell bp ep $
138 TokenPlain $ snd $ pairBorders p tokensPlainEmpty
139 closePair (t,(p1,t1):ts) p = dbg "closePair" $
140 case (p1,p) of
141 (Cell bx _ex (PairElem x ax), Cell _by ey (PairElem y ay)) | x == y ->
142 appendToken (t,ts) $
143 TreeN (Cell bx ey $ PairElem x (ax<>ay)) t1
144 (Cell bx _ex x, Cell _by ey y) | x == y ->
145 appendToken (t,ts) $
146 TreeN (Cell bx ey x) t1
147 _ ->
148 (`closePair` p) $
149 appendTokens
150 (t,ts)
151 (closeImpaired mempty (p1,t1))
152
153 -- | Close a 'Pair' when there is no matching 'LexemePairClose'.
154 closeImpaired :: Tokens -> (Cell Pair,Tokens) -> Tokens
155 closeImpaired acc (Cell bp ep p,toks) = dbg "closeImpaired" $
156 case p of
157 -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'.
158 PairHash | (Tree0 (Cell bt et (TokenPlain t))) :< ts <- Seq.viewl $ toks <> acc ->
159 case Text.span isTagChar t of
160 ("",_) | Text.null t -> toksHash mempty <> toks <> acc
161 | otherwise -> Tree0 (Cell bp et (TokenTag t)) <| ts
162 (tag,t') ->
163 let len = Text.length tag in
164 Tree0 (Cell bp bt{columnPos = columnPos bt + len} (TokenTag tag)) <|
165 Tree0 (Cell bt{columnPos = columnPos bt + len + 1} et (TokenPlain t')) <|
166 ts
167 _ -> toksHash tokensPlainEmpty <> toks <> acc
168 where
169 toksHash = tokens1 . Tree0 . Cell bp ep . TokenPlain . fst . pairBorders p
170 isTagChar c =
171 Char.isAlphaNum c ||
172 c=='·' ||
173 case Char.generalCategory c of
174 Char.DashPunctuation -> True
175 Char.ConnectorPunctuation -> True
176 _ -> False
177
178 -- | Close remaining 'Pair's at end of parsing.
179 closePairs :: Pairs -> Tokens
180 closePairs (t0,ps) = dbg "closePairs" $
181 t0 <> foldl' closeImpaired mempty ps
182
183 appendLexeme :: Lexeme -> Pairs -> Pairs
184 appendLexeme lex acc =
185 dbg "appendLexeme" $
186 case lex of
187 LexemePairOpen ps -> foldl' open acc ps
188 where
189 open a p@(Cell _bp ep (PairElem{})) = openPair a p `appendToken` (Tree0 $ Cell ep ep $ TokenPlain "")
190 open a p = openPair a p
191 LexemePairClose ps -> foldl' closePair acc ps
192 LexemePairAny ps -> appendTokens acc $ tokens $ Tree0 . ((\p -> TokenPlain $ fst $ pairBorders p mempty) <$>) <$> ps
193 LexemePairBoth ps -> appendTokens acc $ tokens $ (`TreeN`mempty) <$> ps
194 LexemeEscape c -> appendToken acc $ Tree0 $ TokenEscape <$> c
195 LexemeLink t -> appendToken acc $ Tree0 $ TokenLink <$> t
196 {-LexemeWhite (unCell -> "") -> acc-}
197 LexemeWhite (unCell -> Text.all (==' ') -> True) -> acc
198 LexemeWhite cs -> appendToken acc $ Tree0 $ TokenPlain <$> cs
199 LexemeAlphaNum cs -> appendToken acc $ Tree0 $ TokenPlain . Text.pack <$> cs
200 LexemeAny cs -> appendToken acc $ Tree0 $ TokenPlain . Text.pack <$> cs
201 -- LexemeToken ts -> appendTokens acc ts
202
203 appendLexemes :: Pairs -> [Lexeme] -> Pairs
204 appendLexemes = foldr appendLexeme
205
206 -- * Type 'Lexeme'
207 data Lexeme
208 = LexemePairOpen ![Cell Pair]
209 | LexemePairClose ![Cell Pair]
210 | LexemePairAny ![Cell Pair]
211 | LexemePairBoth ![Cell Pair]
212 | LexemeEscape !(Cell Char)
213 | LexemeLink !(Cell Text)
214 | LexemeWhite !(Cell White)
215 | LexemeAlphaNum !(Cell [Char])
216 | LexemeAny !(Cell [Char])
217 | LexemeTree !(Tree (Cell Key) Tokens)
218 deriving (Eq, Ord, Show)
219
220 -- ** Type 'Lexemes'
221 type Lexemes = Seq Lexeme
222
223 parseTokens :: [Lexeme] -> Tokens
224 parseTokens ps =
225 closePairs $
226 appendLexemes mempty $
227 dbg "Lexemes" $
228 orientLexemePairAny $ LexemeWhite (cell0 "") :
229 ps
230
231 -- | Parse 'Lexeme's, returning them in reverse order to apply 'orientLexemePairAny'.
232 p_Lexemes :: Parser e s [Lexeme]
233 p_Lexemes = pdbg "Lexemes" $ go []
234 where
235 go :: [Lexeme] -> Parser e s [Lexeme]
236 go acc =
237 (P.eof $> acc) <|>
238 (p_Lexeme >>= \next -> go $ orientLexemePairAny $ next:acc)
239
240 orientLexemePairAny :: [Lexeme] -> [Lexeme]
241 orientLexemePairAny = \case
242 LexemeAny (Cell _bx ex x):LexemeAny (Cell by _ey y):acc -> LexemeAny (Cell by ex (x<>y)):acc
243
244 -- "   
245 w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
246 --    "
247 LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
248 LexemePairAny p:[] -> LexemePairOpen p:[]
249
250 --    ,,,"
251 LexemePairAny p:a@LexemeAny{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
252 LexemePairAny p:a@LexemeAny{}:[] -> LexemePairOpen p:a:[]
253 -- ",,,   
254 w@LexemeWhite{}:a@LexemeAny{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
255
256 -- ",,,AAA
257 an@LexemeAlphaNum{}:a@LexemeAny{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
258 -- ,,,"AAA
259 an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeAny{}:acc -> an:LexemePairOpen p:a:acc
260
261 -- ")
262 c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
263 -- ("
264 LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
265
266 -- "(
267 o@LexemePairOpen{}:LexemePairAny p:acc -> o:LexemePairClose p:acc
268 -- )"
269 LexemePairAny p:c@LexemePairClose{}:acc -> c:LexemePairClose p:acc
270
271 acc -> acc
272
273 p_Lexeme :: Parser e s Lexeme
274 p_Lexeme = pdbg "Lexeme" $
275 P.choice
276 [ P.try $ LexemeWhite <$> p_Cell p_Spaces
277 , P.try $ LexemePairAny <$> P.some (p_Cell $ p_satisfyMaybe pairAny)
278 , P.try $ LexemePairBoth <$> P.some (P.try $ p_Cell p_ElemSingle)
279 , P.try $ LexemePairOpen <$> P.some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen)
280 , P.try $ LexemePairClose <$> P.some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
281 , P.try $ LexemeEscape <$> p_Cell p_Escape
282 , P.try $ LexemeLink <$> p_Cell p_Link
283 , P.try $ LexemeAlphaNum <$> p_Cell (P.some p_AlphaNum)
284 , LexemeAny <$> p_Cell (pure <$> P.anyChar)
285 ]
286
287 p_Cell :: Parser e s a -> Parser e s (Cell a)
288 p_Cell pa = do
289 bp <- p_Position
290 a <- pa
291 ep <- p_Position
292 return $ Cell bp ep a
293
294 pairAny :: Char -> Maybe Pair
295 pairAny = \case
296 '-' -> Just PairDash
297 '/' -> Just PairSlash
298 '"' -> Just PairDoublequote
299 '\'' -> Just PairSinglequote
300 '`' -> Just PairBackquote
301 '_' -> Just PairUnderscore
302 '*' -> Just PairStar
303 '#' -> Just PairHash
304 _ -> Nothing
305
306 pairOpen :: Char -> Maybe Pair
307 pairOpen = \case
308 '(' -> Just PairParen
309 '[' -> Just PairBracket
310 '{' -> Just PairBrace
311 '«' -> Just PairFrenchquote
312 _ -> Nothing
313
314 pairClose :: Char -> Maybe Pair
315 pairClose = \case
316 ')' -> Just PairParen
317 ']' -> Just PairBracket
318 '}' -> Just PairBrace
319 '»' -> Just PairFrenchquote
320 _ -> Nothing
321
322 p_AlphaNum :: Parser e s Char
323 p_AlphaNum = P.satisfy Char.isAlphaNum
324
325 p_Escape :: Parser e s Char
326 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
327
328 p_Link :: Parser e s Text
329 p_Link =
330 P.try (P.char '<' *> p <* P.char '>') <|>
331 p
332 where
333 p =
334 (\scheme addr -> Text.pack $ scheme <> "//" <> addr)
335 <$> P.option "" (P.try p_scheme)
336 <* P.string "//"
337 <*> p_addr
338 p_scheme =
339 (<> ":")
340 <$> P.some (P.satisfy $ \c ->
341 Char.isAlphaNum c
342 || c=='_'
343 || c=='-'
344 || c=='+')
345 <* P.char ':'
346 p_addr =
347 P.many $
348 P.satisfy $ \c ->
349 Char.isAlphaNum c
350 || c=='%'
351 || c=='/'
352 || c=='('
353 || c==')'
354 || c=='-'
355 || c=='_'
356 || c=='.'
357 || c=='#'
358 || c=='?'
359 || c=='='
360
361 p_ElemSingle :: Parser e s Pair
362 p_ElemSingle = pdbg "ElemSingle" $
363 PairElem
364 <$ P.char '<'
365 <*> p_Word
366 <*> p_Attrs
367 <* P.string "/>"
368
369 p_ElemOpen :: Parser e s Pair
370 p_ElemOpen = pdbg "ElemOpen" $
371 PairElem
372 <$ P.char '<'
373 <*> p_Word
374 <*> p_Attrs
375 <* P.char '>'
376
377 p_ElemClose :: Parser e s Pair
378 p_ElemClose = pdbg "ElemClose" $
379 (`PairElem` [])
380 <$ P.string "</"
381 <*> p_Word
382 <* P.char '>'
383
384 {-
385 p_ElemOpenOrSingle :: Parser e s Pair
386 p_ElemOpenOrSingle =
387 p_ElemOpen >>= \p ->
388 P.char '>' $> LexemePairOpen p <|>
389 P.string "/>" $> LexemePairAny p
390 -}
391
392
393
394
395
396
397
398
399
400 -- | Build 'Tokens' from many 'Token's.
401 tokens :: [Token] -> Tokens
402 tokens = Seq.fromList
403
404 -- | Build 'Tokens' from one 'Token'.
405 tokens1 :: Token -> Tokens
406 tokens1 = Seq.singleton
407
408 tokensPlainEmpty :: Tokens
409 tokensPlainEmpty = tokens1 $ Tree0 $ cell1 $ TokenPlain ""
410
411 isTokenWhite :: Token -> Bool
412 isTokenWhite (Tree0 (unCell -> TokenPlain t)) = Text.all Char.isSpace t
413 isTokenWhite _ = False
414
415 unTokenElem :: Tokens -> Maybe (Cell (Elem,Attrs,Tokens))
416 unTokenElem toks =
417 case toList $ Seq.dropWhileR isTokenWhite toks of
418 [TreeN (Cell bp ep (PairElem e as)) ts] -> Just (Cell bp ep (e,as,ts))
419 _ -> Nothing
420
421 isTokenElem :: Tokens -> Bool
422 isTokenElem toks =
423 case toList $ Seq.dropWhileR isTokenWhite toks of
424 [TreeN (unCell -> PairElem{}) _] -> True
425 _ -> False
426
427 pairBorders :: TokenKey -> Tokens -> (Text,Text)
428 pairBorders p ts =
429 case p of
430 PairElem e attrs ->
431 if Seq.null ts
432 then ("<"<>e<>foldMap f attrs<>"/>","")
433 else ("<"<>e<>foldMap f attrs<>">","</"<>e<>">")
434 where f (attr_white,Attr{..}) =
435 attr_white <>
436 attr_name <>
437 attr_open <>
438 attr_value <>
439 attr_close
440 PairHash -> ("#","#")
441 PairStar -> ("*","*")
442 PairSlash -> ("/","/")
443 PairUnderscore -> ("_","_")
444 PairDash -> ("-","-")
445 PairBackquote -> ("`","`")
446 PairSinglequote -> ("'","'")
447 PairDoublequote -> ("\"","\"")
448 PairFrenchquote -> ("«","»")
449 PairParen -> ("(",")")
450 PairBrace -> ("{","}")
451 PairBracket -> ("[","]")