]> Git — Sourcephile - doclang.git/blob - Language/TCT/Read/Token.hs
Factorize XML utilities.
[doclang.git] / Language / TCT / Read / Token.hs
1 {-# LANGUAGE FlexibleContexts #-}
2 {-# LANGUAGE OverloadedStrings #-}
3 {-# LANGUAGE Rank2Types #-}
4 {-# LANGUAGE TypeFamilies #-}
5 module Language.TCT.Read.Token where
6
7 import Control.Applicative (Applicative(..), Alternative(..))
8 import Control.Monad (Monad(..))
9 import Data.Bool
10 import Data.Char (Char)
11 import Data.Eq (Eq(..))
12 import Data.Foldable (Foldable(..))
13 import Data.Function (($), (.))
14 import Data.Functor ((<$>), ($>), (<$))
15 import Data.Maybe (Maybe(..))
16 import Data.Monoid (Monoid(..))
17 import Data.Semigroup (Semigroup(..))
18 import Data.Sequence (ViewL(..), (<|))
19 import Data.Text (Text)
20 import Data.Text.Buildable (Buildable(..))
21 import Data.Tuple (fst,snd)
22 import Prelude (Num(..))
23 import Text.Show (Show(..))
24 import qualified Data.Char as Char
25 import qualified Data.Sequence as Seq
26 import qualified Data.Text as Text
27 import qualified Data.Text.Lazy as TL
28 import qualified Data.Text.Lazy.Builder as Builder
29 import qualified Text.Megaparsec as P
30 import qualified Text.Megaparsec.Char as P
31
32 import Language.TCT.Token
33 import Language.TCT.Cell
34 import Language.TCT.Elem
35 import Language.TCT.Read.Elem
36 import Language.TCT.Read.Cell
37
38 textOf :: Buildable a => a -> Text
39 textOf = TL.toStrict . Builder.toLazyText . build
40
41 -- * Type 'Pairs'
42 type Pairs = (Tokens,[(Cell Pair,Tokens)])
43
44 appendToken :: Pairs -> Cell Token -> Pairs
45 appendToken ps = appendTokens ps . Seq.singleton
46
47 appendTokens :: Pairs -> Tokens -> Pairs
48 appendTokens (t,[]) toks = (t<>toks,[])
49 appendTokens (t,(p0,t0):ps) toks = (t,(p0,t0<>toks):ps)
50
51 openPair :: Pairs -> Cell Pair -> Pairs
52 openPair (t,ms) p = (t,(p,mempty):ms)
53
54 -- | Close a 'Pair' when there is a matching 'LexemePairClose'.
55 closePair :: Pairs -> Cell Pair -> Pairs
56 closePair ps@(_,[]) (Cell bp ep p) = dbg "closePair" $
57 appendToken ps $
58 Cell bp ep $
59 TokenPlain $ snd $ pairBorders p tokensPlainEmpty
60 closePair (t,(p1,t1):ts) p = dbg "closePair" $
61 case (p1,p) of
62 (Cell bx _ex (PairElem x ax), Cell _by ey (PairElem y ay)) | x == y ->
63 appendToken (t,ts) $
64 Cell bx ey $
65 TokenPair (PairElem x (ax<>ay)) t1
66 (Cell bx _ex x, Cell _by ey y) | x == y ->
67 appendToken (t,ts) $
68 Cell bx ey $
69 TokenPair x t1
70 _ ->
71 (`closePair` p) $
72 appendTokens
73 (t,ts)
74 (closeUnpaired mempty (p1,t1))
75
76 -- | Close a 'Pair' when there is not a matching 'LexemePairClose'.
77 closeUnpaired :: Tokens -> (Cell Pair,Tokens) -> Tokens
78 closeUnpaired acc (Cell bp ep p,toks) = dbg "closeUnpaired" $
79 case p of
80 -- NOTE: try to close 'PairHash' as 'TokenTag' instead of 'TokenPlain'.
81 PairHash | (Cell bt et (TokenPlain t)) :< ts <- Seq.viewl $ toks <> acc ->
82 case Text.findIndex (not . isTagChar) t of
83 -- Just 0 -> toksHash mempty <> toks <> acc
84 Just i ->
85 Cell bp bt{columnPos = columnPos bt + i} (TokenTag tag)
86 <| Cell bt{columnPos = columnPos bt + i + 1} et (TokenPlain t')
87 <| ts
88 where (tag,t') = Text.splitAt i t
89 Nothing | Text.null t -> toksHash mempty <> toks <> acc
90 Nothing -> Cell bp et (TokenTag t) <| ts
91 _ -> toksHash tokensPlainEmpty <> toks <> acc
92 where
93 toksHash = tokens1 . Cell bp ep . TokenPlain . fst . pairBorders p
94 isTagChar c =
95 Char.isAlphaNum c ||
96 c=='·' ||
97 case Char.generalCategory c of
98 Char.DashPunctuation -> True
99 Char.ConnectorPunctuation -> True
100 _ -> False
101
102 -- | Close remaining 'Pair's at end of parsing.
103 closePairs :: Pairs -> Tokens
104 closePairs (t0,ps) = dbg "closePairs" $
105 t0 <> foldl' closeUnpaired mempty ps
106
107 appendLexeme :: Lexeme -> Pairs -> Pairs
108 appendLexeme lex acc =
109 dbg "appendLexeme" $
110 case dbg "appendLexeme" lex of
111 LexemePairOpen ps -> foldl' open acc ps
112 where
113 open a p@(Cell _bp ep (PairElem{})) = openPair a p `appendToken` (Cell ep ep $ TokenPlain "")
114 open a p = openPair a p
115 LexemePairClose ps -> foldl' closePair acc ps
116 LexemePairAny ps -> appendTokens acc $ tokens $ ((\p -> TokenPlain $ fst $ pairBorders p mempty) <$>) <$> ps
117 LexemePairBoth ps -> appendTokens acc $ tokens $ ((`TokenPair`mempty) <$>) <$> ps
118 LexemeEscape c -> appendToken acc $ TokenEscape <$> c
119 LexemeLink t -> appendToken acc $ TokenLink <$> t
120 LexemeWhite (unCell -> "") -> acc
121 LexemeWhite cs -> appendToken acc $ TokenPlain <$> cs
122 LexemeAlphaNum cs -> appendToken acc $ TokenPlain . Text.pack <$> cs
123 LexemeAny cs -> appendToken acc $ TokenPlain . Text.pack <$> cs
124 LexemeToken ts -> appendTokens acc ts
125
126 -- * Type 'Lexeme'
127 data Lexeme
128 = LexemePairOpen ![Cell Pair]
129 | LexemePairClose ![Cell Pair]
130 | LexemePairAny ![Cell Pair]
131 | LexemePairBoth ![Cell Pair]
132 | LexemeEscape !(Cell Char)
133 | LexemeLink !(Cell Text)
134 | LexemeWhite !(Cell White)
135 | LexemeAlphaNum !(Cell [Char])
136 | LexemeAny !(Cell [Char])
137 | LexemeToken !Tokens
138 deriving (Eq, Show)
139
140 p_Tokens :: Parser e s Tokens
141 p_Tokens = pdbg "Tokens" $
142 closePairs .
143 foldr appendLexeme mempty .
144 dbg "Lexemes" .
145 mangleLexemes .
146 (LexemeWhite (cell0 "") :) <$>
147 go [LexemeWhite (cell0 "")]
148 where
149 go :: [Lexeme] -> Parser e s [Lexeme]
150 go acc =
151 (P.eof $> acc) <|>
152 (p_Lexeme >>= \next -> go $ mangleLexemes $ next:acc)
153
154 mangleLexemes = \case
155 LexemeAny (Cell _bx ex x):LexemeAny (Cell by _ey y):acc -> LexemeAny (Cell by ex (x<>y)):acc
156
157 -- "   
158 w@LexemeWhite{}:LexemePairAny p:acc -> w:LexemePairClose p:acc
159 --    "
160 LexemePairAny p:w@LexemeWhite{}:acc -> LexemePairOpen p:w:acc
161
162 --    ,,,"
163 LexemePairAny p:a@LexemeAny{}:w@LexemeWhite{}:acc -> LexemePairOpen p:a:w:acc
164 -- ",,,   
165 w@LexemeWhite{}:a@LexemeAny{}:LexemePairAny p:acc -> w:a:LexemePairClose p:acc
166
167 -- ",,,AAA
168 an@LexemeAlphaNum{}:a@LexemeAny{}:LexemePairAny p:acc -> an:a:LexemePairClose p:acc
169 -- ,,,"AAA
170 an@LexemeAlphaNum{}:LexemePairAny p:a@LexemeAny{}:acc -> an:LexemePairOpen p:a:acc
171
172 -- ")
173 c@LexemePairClose{}:LexemePairAny p:acc -> c:LexemePairClose p:acc
174 -- ("
175 LexemePairAny p:o@LexemePairOpen{}:acc -> LexemePairOpen p:o:acc
176
177 acc -> acc
178
179 pairAny :: Char -> Maybe Pair
180 pairAny = \case
181 '-' -> Just PairDash
182 '/' -> Just PairSlash
183 '"' -> Just PairDoublequote
184 '\'' -> Just PairSinglequote
185 '`' -> Just PairBackquote
186 '_' -> Just PairUnderscore
187 '*' -> Just PairStar
188 '#' -> Just PairHash
189 _ -> Nothing
190
191 pairOpen :: Char -> Maybe Pair
192 pairOpen = \case
193 '(' -> Just PairParen
194 '[' -> Just PairBracket
195 '{' -> Just PairBrace
196 '«' -> Just PairFrenchquote
197 _ -> Nothing
198
199 pairClose :: Char -> Maybe Pair
200 pairClose = \case
201 ')' -> Just PairParen
202 ']' -> Just PairBracket
203 '}' -> Just PairBrace
204 '»' -> Just PairFrenchquote
205 _ -> Nothing
206
207 p_Cell :: Parser e s a -> Parser e s (Cell a)
208 p_Cell pa = do
209 bp <- p_Position
210 a <- pa
211 ep <- p_Position
212 return $ Cell bp ep a
213
214 p_Lexeme :: Parser e s Lexeme
215 p_Lexeme = pdbg "Lexeme" $
216 P.choice
217 [ P.try $ LexemeWhite <$> p_Cell p_Spaces
218 , P.try $ LexemePairAny <$> P.some (p_Cell $ p_satisfyMaybe pairAny)
219 , P.try $ LexemePairBoth <$> P.some (P.try $ p_Cell p_ElemSingle)
220 , P.try $ LexemePairOpen <$> P.some (p_Cell $ p_satisfyMaybe pairOpen <|> P.try p_ElemOpen)
221 , P.try $ LexemePairClose <$> P.some (p_Cell $ p_satisfyMaybe pairClose <|> P.try p_ElemClose)
222 , P.try $ LexemeEscape <$> p_Cell p_Escape
223 , P.try $ LexemeLink <$> p_Cell p_Link
224 , P.try $ LexemeAlphaNum <$> p_Cell (P.some p_AlphaNum)
225 , LexemeAny <$> p_Cell (pure <$> P.anyChar)
226 ]
227
228 p_AlphaNum :: Parser e s Char
229 p_AlphaNum = P.satisfy Char.isAlphaNum
230
231 p_Escape :: Parser e s Char
232 p_Escape = P.char '\\' *> P.satisfy Char.isPrint
233
234 p_Link :: Parser e s Text
235 p_Link =
236 (\scheme addr -> Text.pack $ scheme <> "//" <> addr)
237 <$> P.option "" (P.try p_scheme)
238 <* P.string "//"
239 <*> p_addr
240 where
241 p_scheme =
242 (<> ":")
243 <$> P.some (P.satisfy $ \c ->
244 Char.isAlphaNum c
245 || c=='_'
246 || c=='-'
247 || c=='+')
248 <* P.char ':'
249 p_addr =
250 P.many $
251 P.satisfy $ \c ->
252 Char.isAlphaNum c
253 || c=='%'
254 || c=='/'
255 || c=='('
256 || c==')'
257 || c=='-'
258 || c=='_'
259 || c=='.'
260 || c=='#'
261 || c=='?'
262 || c=='='
263
264 p_ElemSingle :: Parser e s Pair
265 p_ElemSingle = pdbg "ElemSingle" $
266 PairElem
267 <$ P.char '<'
268 <*> p_Word
269 <*> p_Attrs
270 <* P.string "/>"
271
272 p_ElemOpen :: Parser e s Pair
273 p_ElemOpen = pdbg "ElemOpen" $
274 PairElem
275 <$ P.char '<'
276 <*> p_Word
277 <*> p_Attrs
278 <* P.char '>'
279
280 p_ElemClose :: Parser e s Pair
281 p_ElemClose = pdbg "ElemClose" $
282 (`PairElem` [])
283 <$ P.string "</"
284 <*> p_Word
285 <* P.char '>'
286
287 {-
288 p_ElemOpenOrSingle :: Parser e s Pair
289 p_ElemOpenOrSingle =
290 p_ElemOpen >>= \p ->
291 P.char '>' $> LexemePairOpen p <|>
292 P.string "/>" $> LexemePairAny p
293 -}