]> Git — Sourcephile - julm/worksheets.git/blob - src/Worksheets/Utils/Char.hs
update
[julm/worksheets.git] / src / Worksheets / Utils / Char.hs
1 module Worksheets.Utils.Char (
2 module Worksheets.Utils.Char,
3 module Data.Char,
4 ) where
5
6 import Data.Char
7 import Data.List qualified as List
8 import Data.Set qualified as Set
9 import Worksheets.Utils.Prelude
10
-- | A Unicode block, as returned by 'unicodeBlock'.
--
-- The Latin and CJK families are each represented by a single constructor
-- carrying the precise sub-block ('UnicodeBlockLatin', 'UnicodeBlockCJK').
--
-- The constructor order is significant: the derived 'Ord' instance follows it,
-- so do not re-sort this list.
data UnicodeBlock
  = UnicodeBlockAlphabetic_Presentation_Forms
  | UnicodeBlockArabic
  | UnicodeBlockArabic_Presentation_FormsA
  | UnicodeBlockArabic_Presentation_FormsB
  | UnicodeBlockArmenian
  | UnicodeBlockArrows
  | -- | Any of the Latin sub-blocks.
    UnicodeBlockLatin UnicodeBlockLatin
  | UnicodeBlockBengali
  | UnicodeBlockBlock_Elements
  | UnicodeBlockBopomofo
  | UnicodeBlockBopomofo_Extended
  | UnicodeBlockBox_Drawing
  | UnicodeBlockBraille_Patterns
  | UnicodeBlockBuhid
  | -- | Any of the CJK sub-blocks.
    UnicodeBlockCJK UnicodeBlockCJK
  | UnicodeBlockCherokee
  | UnicodeBlockCombining_Diacritical_Marks
  | UnicodeBlockCombining_Diacritical_Marks_for_Symbols
  | UnicodeBlockCombining_Half_Marks
  | UnicodeBlockControl_Pictures
  | UnicodeBlockCurrency_Symbols
  | UnicodeBlockCyrillic
  | UnicodeBlockCyrillic_Supplementary
  | UnicodeBlockDevanagari
  | UnicodeBlockDingbats
  | UnicodeBlockEnclosed_Alphanumerics
  | UnicodeBlockEthiopic
  | UnicodeBlockGeneral_Punctuation
  | UnicodeBlockGeometric_Shapes
  | UnicodeBlockGeorgian
  | UnicodeBlockGreek_Extended
  | UnicodeBlockGreek_and_Coptic
  | UnicodeBlockGujarati
  | UnicodeBlockGurmukhi
  | UnicodeBlockHalfwidth_and_Fullwidth_Forms
  | UnicodeBlockHangul_Compatibility_Jamo
  | UnicodeBlockHangul_Jamo
  | UnicodeBlockHangul_Syllables
  | UnicodeBlockHanunoo
  | UnicodeBlockHebrew
  | UnicodeBlockHigh_Private_Use_Surrogates
  | UnicodeBlockHigh_Surrogates
  | UnicodeBlockHiragana
  | UnicodeBlockIPA_Extensions
  | UnicodeBlockIdeographic_Description_Characters
  | UnicodeBlockKanbun
  | UnicodeBlockKangxi_Radicals
  | UnicodeBlockKannada
  | UnicodeBlockKatakana
  | UnicodeBlockKatakana_Phonetic_Extensions
  | UnicodeBlockKhmer
  | UnicodeBlockKhmer_Symbols
  | UnicodeBlockLao
  | UnicodeBlockLetterlike_Symbols
  | UnicodeBlockLimbu
  | UnicodeBlockLow_Surrogates
  | UnicodeBlockMalayalam
  | UnicodeBlockMathematical_Operators
  | UnicodeBlockMiscellaneous_Mathematical_SymbolsA
  | UnicodeBlockMiscellaneous_Mathematical_SymbolsB
  | UnicodeBlockMiscellaneous_Symbols
  | UnicodeBlockMiscellaneous_Symbols_and_Arrows
  | UnicodeBlockMiscellaneous_Technical
  | UnicodeBlockMongolian
  | UnicodeBlockMyanmar
  | UnicodeBlockNumber_Forms
  | UnicodeBlockOgham
  | UnicodeBlockOptical_Character_Recognition
  | UnicodeBlockOriya
  | UnicodeBlockPhonetic_Extensions
  | UnicodeBlockPrivate_Use_Area
  | UnicodeBlockRunic
  | UnicodeBlockSinhala
  | UnicodeBlockSmall_Form_Variants
  | UnicodeBlockSpacing_Modifier_Letters
  | UnicodeBlockSpecials
  | UnicodeBlockSuperscripts_and_Subscripts
  | UnicodeBlockSupplemental_ArrowsA
  | UnicodeBlockSupplemental_ArrowsB
  | UnicodeBlockSupplemental_Mathematical_Operators
  | UnicodeBlockSyriac
  | UnicodeBlockTagalog
  | UnicodeBlockTagbanwa
  | UnicodeBlockTai_Le
  | UnicodeBlockTamil
  | UnicodeBlockTelugu
  | UnicodeBlockThaana
  | UnicodeBlockThai
  | UnicodeBlockTibetan
  | UnicodeBlockUnified_Canadian_Aboriginal_Syllabics
  | UnicodeBlockVariation_Selectors
  | UnicodeBlockYi_Radicals
  | UnicodeBlockYi_Syllables
  | UnicodeBlockYijing_Hexagram_Symbols
  deriving (Eq, Ord, Show)
107
-- | The union of 'unicodeBlockCJK', 'unicodeBlockLatin' and the two
-- stand-alone blocks 'UnicodeBlockHalfwidth_and_Fullwidth_Forms' and
-- 'UnicodeBlockMiscellaneous_Technical'.
--
-- This is a selection of blocks, not every 'UnicodeBlock' constructor.
unicodeBlocks :: Set.Set UnicodeBlock
unicodeBlocks =
  [ unicodeBlockCJK
  , unicodeBlockLatin
  , [ UnicodeBlockHalfwidth_and_Fullwidth_Forms
    , UnicodeBlockMiscellaneous_Technical
    ]
      & Set.fromList
  ]
    & mconcat
117
-- | The set of 'UnicodeBlockCJK' sub-blocks yielded by @enumAll@, each wrapped
-- in the 'UnicodeBlockCJK' constructor of 'UnicodeBlock'.
--
-- NOTE(review): @enumAll@ is not defined in this file; presumably it
-- enumerates every constructor of the type — confirm against its definition.
unicodeBlockCJK :: Set.Set UnicodeBlock
unicodeBlockCJK = UnicodeBlockCJK <$> enumAll & Set.fromList
-- | The set of 'UnicodeBlockLatin' sub-blocks yielded by @enumAll@, each
-- wrapped in the 'UnicodeBlockLatin' constructor of 'UnicodeBlock'.
--
-- NOTE(review): @enumAll@ is not defined in this file; presumably it
-- enumerates every constructor of the type — confirm against its definition.
unicodeBlockLatin :: Set.Set UnicodeBlock
unicodeBlockLatin = UnicodeBlockLatin <$> enumAll & Set.fromList
120
-- | The Latin sub-blocks distinguished by 'unicodeBlock'.
--
-- Constructor order determines the derived 'Ord' and 'Enum' instances.
data UnicodeBlockLatin
  = -- | Basic Latin: U+0000 to U+007F.
    UnicodeBlockLatin_Basic
  | -- | Latin-1 Supplement: U+0080 to U+00FF.
    UnicodeBlockLatin1_Supplement
  | -- | Latin Extended-A: U+0100 to U+017F.
    UnicodeBlockLatin_ExtendedA
  | -- | Latin Extended-B: U+0180 to U+024F.
    UnicodeBlockLatin_ExtendedB
  | -- | Latin Extended Additional: U+1E00 to U+1EFF.
    UnicodeBlockLatin_Extended_Additional
  deriving (Eq, Ord, Show, Enum)
128
-- | The CJK sub-blocks distinguished by 'unicodeBlock'.
--
-- Constructor order determines the derived 'Ord' and 'Enum' instances.
data UnicodeBlockCJK
  = -- | CJK Compatibility: U+3300 to U+33FF.
    UnicodeBlockCJK_Compatibility
  | -- | CJK Compatibility Forms: U+FE30 to U+FE4F.
    UnicodeBlockCJK_Compatibility_Forms
  | -- | CJK Compatibility Ideographs: U+F900 to U+FAFF.
    UnicodeBlockCJK_Compatibility_Ideographs
  | -- | CJK Radicals Supplement: U+2E80 to U+2EFF.
    UnicodeBlockCJK_Radicals_Supplement
  | -- | CJK Symbols and Punctuation: U+3000 to U+303F.
    UnicodeBlockCJK_Symbols_and_Punctuation
  | -- | CJK Unified Ideographs: U+4E00 to U+9FFF.
    -- This block contains the most commonly used Chinese characters in modern writing.
    UnicodeBlockCJK_Unified_Ideographs
  | -- | CJK Unified Ideographs Extension A: U+3400 to U+4DBF.
    -- Includes rare and historical characters.
    UnicodeBlockCJK_Unified_Ideographs_Extension_A
  | -- | CJK Unified Ideographs Extension B: U+20000 to U+2A6DF.
    -- Contains rare and historic characters, often used in academic or specialized contexts.
    UnicodeBlockCJK_Unified_Ideographs_Extension_B
  | -- | CJK Unified Ideographs Extension C: U+2A700 to U+2B73F.
    -- Includes additional rare and historic characters.
    UnicodeBlockCJK_Unified_Ideographs_Extension_C
  | -- | CJK Unified Ideographs Extension D: U+2B740 to U+2B81F.
    -- Includes additional rare and historic characters.
    UnicodeBlockCJK_Unified_Ideographs_Extension_D
  | -- | CJK Unified Ideographs Extension E: U+2B820 to U+2CEAF.
    -- Includes additional rare and historic characters.
    UnicodeBlockCJK_Unified_Ideographs_Extension_E
  | -- | CJK Unified Ideographs Extension F: U+2CEB0 to U+2EBEF.
    -- Includes additional rare and historic characters.
    UnicodeBlockCJK_Unified_Ideographs_Extension_F
  | -- | CJK Unified Ideographs Extension G: U+30000 to U+3134F.
    -- Includes additional rare and historic characters.
    UnicodeBlockCJK_Unified_Ideographs_Extension_G
  | -- | CJK Unified Ideographs Extension H: U+31350 to U+323AF.
    -- Includes additional rare and historic characters.
    UnicodeBlockCJK_Unified_Ideographs_Extension_H
  | -- | Enclosed CJK Letters and Months: U+3200 to U+32FF.
    UnicodeBlockCJK_Enclosed_Letters_and_Months
  deriving (Eq, Ord, Show, Enum)
164
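-- TODO:
-- CJK Compatibility Ideographs: U+F900 to U+FAFF
-- Contains duplicate characters and variants for compatibility with older encodings.
-- NOTE: 'unicodeBlock' already maps this range to 'UnicodeBlockCJK_Compatibility_Ideographs'.

-- CJK Compatibility Ideographs Supplement: U+2F800 to U+2FA1F
-- Includes unifiable variants of ideographs.
-- NOTE: this block has no constructor and no range in 'unicodeBlock' yet.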
171
-- | Find the 'UnicodeBlock' whose code point range contains the given
-- character.
--
-- Ranges are listed in ascending code point order and are inclusive on both
-- ends. Returns 'Nothing' when the character falls in none of them (for
-- example in a gap between two listed blocks).
unicodeBlock :: Char -> Maybe UnicodeBlock
unicodeBlock c
  | within '\x0000' '\x007F' = latin UnicodeBlockLatin_Basic
  | within '\x0080' '\x00FF' = latin UnicodeBlockLatin1_Supplement
  | within '\x0100' '\x017F' = latin UnicodeBlockLatin_ExtendedA
  | within '\x0180' '\x024F' = latin UnicodeBlockLatin_ExtendedB
  | within '\x0250' '\x02AF' = Just UnicodeBlockIPA_Extensions
  | within '\x02B0' '\x02FF' = Just UnicodeBlockSpacing_Modifier_Letters
  | within '\x0300' '\x036F' = Just UnicodeBlockCombining_Diacritical_Marks
  | within '\x0370' '\x03FF' = Just UnicodeBlockGreek_and_Coptic
  | within '\x0400' '\x04FF' = Just UnicodeBlockCyrillic
  | within '\x0500' '\x052F' = Just UnicodeBlockCyrillic_Supplementary
  | within '\x0530' '\x058F' = Just UnicodeBlockArmenian
  | within '\x0590' '\x05FF' = Just UnicodeBlockHebrew
  | within '\x0600' '\x06FF' = Just UnicodeBlockArabic
  | within '\x0700' '\x074F' = Just UnicodeBlockSyriac
  | within '\x0780' '\x07BF' = Just UnicodeBlockThaana
  | within '\x0900' '\x097F' = Just UnicodeBlockDevanagari
  | within '\x0980' '\x09FF' = Just UnicodeBlockBengali
  | within '\x0A00' '\x0A7F' = Just UnicodeBlockGurmukhi
  | within '\x0A80' '\x0AFF' = Just UnicodeBlockGujarati
  | within '\x0B00' '\x0B7F' = Just UnicodeBlockOriya
  | within '\x0B80' '\x0BFF' = Just UnicodeBlockTamil
  | within '\x0C00' '\x0C7F' = Just UnicodeBlockTelugu
  | within '\x0C80' '\x0CFF' = Just UnicodeBlockKannada
  | within '\x0D00' '\x0D7F' = Just UnicodeBlockMalayalam
  | within '\x0D80' '\x0DFF' = Just UnicodeBlockSinhala
  | within '\x0E00' '\x0E7F' = Just UnicodeBlockThai
  | within '\x0E80' '\x0EFF' = Just UnicodeBlockLao
  | within '\x0F00' '\x0FFF' = Just UnicodeBlockTibetan
  | within '\x1000' '\x109F' = Just UnicodeBlockMyanmar
  | within '\x10A0' '\x10FF' = Just UnicodeBlockGeorgian
  | within '\x1100' '\x11FF' = Just UnicodeBlockHangul_Jamo
  | within '\x1200' '\x137F' = Just UnicodeBlockEthiopic
  | within '\x13A0' '\x13FF' = Just UnicodeBlockCherokee
  | within '\x1400' '\x167F' = Just UnicodeBlockUnified_Canadian_Aboriginal_Syllabics
  | within '\x1680' '\x169F' = Just UnicodeBlockOgham
  | within '\x16A0' '\x16FF' = Just UnicodeBlockRunic
  | within '\x1700' '\x171F' = Just UnicodeBlockTagalog
  | within '\x1720' '\x173F' = Just UnicodeBlockHanunoo
  | within '\x1740' '\x175F' = Just UnicodeBlockBuhid
  | within '\x1760' '\x177F' = Just UnicodeBlockTagbanwa
  | within '\x1780' '\x17FF' = Just UnicodeBlockKhmer
  | within '\x1800' '\x18AF' = Just UnicodeBlockMongolian
  | within '\x1900' '\x194F' = Just UnicodeBlockLimbu
  | within '\x1950' '\x197F' = Just UnicodeBlockTai_Le
  | within '\x19E0' '\x19FF' = Just UnicodeBlockKhmer_Symbols
  | within '\x1D00' '\x1D7F' = Just UnicodeBlockPhonetic_Extensions
  | within '\x1E00' '\x1EFF' = latin UnicodeBlockLatin_Extended_Additional
  | within '\x1F00' '\x1FFF' = Just UnicodeBlockGreek_Extended
  | within '\x2000' '\x206F' = Just UnicodeBlockGeneral_Punctuation
  | within '\x2070' '\x209F' = Just UnicodeBlockSuperscripts_and_Subscripts
  | within '\x20A0' '\x20CF' = Just UnicodeBlockCurrency_Symbols
  | within '\x20D0' '\x20FF' = Just UnicodeBlockCombining_Diacritical_Marks_for_Symbols
  | within '\x2100' '\x214F' = Just UnicodeBlockLetterlike_Symbols
  | within '\x2150' '\x218F' = Just UnicodeBlockNumber_Forms
  | within '\x2190' '\x21FF' = Just UnicodeBlockArrows
  | within '\x2200' '\x22FF' = Just UnicodeBlockMathematical_Operators
  | within '\x2300' '\x23FF' = Just UnicodeBlockMiscellaneous_Technical
  | within '\x2400' '\x243F' = Just UnicodeBlockControl_Pictures
  | within '\x2440' '\x245F' = Just UnicodeBlockOptical_Character_Recognition
  | within '\x2460' '\x24FF' = Just UnicodeBlockEnclosed_Alphanumerics
  | within '\x2500' '\x257F' = Just UnicodeBlockBox_Drawing
  | within '\x2580' '\x259F' = Just UnicodeBlockBlock_Elements
  | within '\x25A0' '\x25FF' = Just UnicodeBlockGeometric_Shapes
  | within '\x2600' '\x26FF' = Just UnicodeBlockMiscellaneous_Symbols
  | within '\x2700' '\x27BF' = Just UnicodeBlockDingbats
  | within '\x27C0' '\x27EF' = Just UnicodeBlockMiscellaneous_Mathematical_SymbolsA
  | within '\x27F0' '\x27FF' = Just UnicodeBlockSupplemental_ArrowsA
  | within '\x2800' '\x28FF' = Just UnicodeBlockBraille_Patterns
  | within '\x2900' '\x297F' = Just UnicodeBlockSupplemental_ArrowsB
  | within '\x2980' '\x29FF' = Just UnicodeBlockMiscellaneous_Mathematical_SymbolsB
  | within '\x2A00' '\x2AFF' = Just UnicodeBlockSupplemental_Mathematical_Operators
  | within '\x2B00' '\x2BFF' = Just UnicodeBlockMiscellaneous_Symbols_and_Arrows
  | within '\x2E80' '\x2EFF' = cjk UnicodeBlockCJK_Radicals_Supplement
  | within '\x2F00' '\x2FDF' = Just UnicodeBlockKangxi_Radicals
  | within '\x2FF0' '\x2FFF' = Just UnicodeBlockIdeographic_Description_Characters
  | within '\x3000' '\x303F' = cjk UnicodeBlockCJK_Symbols_and_Punctuation
  | within '\x3040' '\x309F' = Just UnicodeBlockHiragana
  | within '\x30A0' '\x30FF' = Just UnicodeBlockKatakana
  | within '\x3100' '\x312F' = Just UnicodeBlockBopomofo
  | within '\x3130' '\x318F' = Just UnicodeBlockHangul_Compatibility_Jamo
  | within '\x3190' '\x319F' = Just UnicodeBlockKanbun
  | within '\x31A0' '\x31BF' = Just UnicodeBlockBopomofo_Extended
  | within '\x31F0' '\x31FF' = Just UnicodeBlockKatakana_Phonetic_Extensions
  | within '\x3200' '\x32FF' = cjk UnicodeBlockCJK_Enclosed_Letters_and_Months
  | within '\x3300' '\x33FF' = cjk UnicodeBlockCJK_Compatibility
  | within '\x3400' '\x4DBF' = cjk UnicodeBlockCJK_Unified_Ideographs_Extension_A
  | within '\x4DC0' '\x4DFF' = Just UnicodeBlockYijing_Hexagram_Symbols
  | within '\x4E00' '\x9FFF' = cjk UnicodeBlockCJK_Unified_Ideographs
  | within '\xA000' '\xA48F' = Just UnicodeBlockYi_Syllables
  | within '\xA490' '\xA4CF' = Just UnicodeBlockYi_Radicals
  | within '\xAC00' '\xD7AF' = Just UnicodeBlockHangul_Syllables
  | within '\xD800' '\xDB7F' = Just UnicodeBlockHigh_Surrogates
  | within '\xDB80' '\xDBFF' = Just UnicodeBlockHigh_Private_Use_Surrogates
  | within '\xDC00' '\xDFFF' = Just UnicodeBlockLow_Surrogates
  | within '\xE000' '\xF8FF' = Just UnicodeBlockPrivate_Use_Area
  | within '\xF900' '\xFAFF' = cjk UnicodeBlockCJK_Compatibility_Ideographs
  | within '\xFB00' '\xFB4F' = Just UnicodeBlockAlphabetic_Presentation_Forms
  | within '\xFB50' '\xFDFF' = Just UnicodeBlockArabic_Presentation_FormsA
  | within '\xFE00' '\xFE0F' = Just UnicodeBlockVariation_Selectors
  | within '\xFE20' '\xFE2F' = Just UnicodeBlockCombining_Half_Marks
  | within '\xFE30' '\xFE4F' = cjk UnicodeBlockCJK_Compatibility_Forms
  | within '\xFE50' '\xFE6F' = Just UnicodeBlockSmall_Form_Variants
  | within '\xFE70' '\xFEFF' = Just UnicodeBlockArabic_Presentation_FormsB
  | within '\xFF00' '\xFFEF' = Just UnicodeBlockHalfwidth_and_Fullwidth_Forms
  | within '\xFFF0' '\xFFFF' = Just UnicodeBlockSpecials
  | within '\x20000' '\x2A6DF' = cjk UnicodeBlockCJK_Unified_Ideographs_Extension_B
  | within '\x2A700' '\x2B73F' = cjk UnicodeBlockCJK_Unified_Ideographs_Extension_C
  | within '\x2B740' '\x2B81F' = cjk UnicodeBlockCJK_Unified_Ideographs_Extension_D
  | within '\x2B820' '\x2CEAF' = cjk UnicodeBlockCJK_Unified_Ideographs_Extension_E
  | within '\x2CEB0' '\x2EBEF' = cjk UnicodeBlockCJK_Unified_Ideographs_Extension_F
  | within '\x30000' '\x3134F' = cjk UnicodeBlockCJK_Unified_Ideographs_Extension_G
  | within '\x31350' '\x323AF' = cjk UnicodeBlockCJK_Unified_Ideographs_Extension_H
  | otherwise = Nothing
  where
    -- Inclusive range test on the character being classified.
    within lo hi = lo <= c && c <= hi
    -- Wrap a Latin sub-block as a successful result.
    latin sub = Just (UnicodeBlockLatin sub)
    -- Wrap a CJK sub-block as a successful result.
    cjk sub = Just (UnicodeBlockCJK sub)
286
-- | Merge adjacent pairs that share the same key.
--
-- Each maximal run of consecutive pairs with equal keys becomes one entry
-- holding that key and the run's values in their original order. Equal keys
-- that are not adjacent end up in separate entries.
--
-- >>> consecutiveGroups [(1, 'a'), (1, 'b'), (2, 'c'), (1, 'd')]
-- [(1,"ab"),(2,"c"),(1,"d")]
consecutiveGroups :: Eq k => [(k, v)] -> [(k, [v])]
consecutiveGroups pairs = case pairs of
  [] -> []
  (key, firstVal) : rest ->
    let (run, remainder) = List.span (\(key', _) -> key == key') rest
     in (key, firstVal : [val | (_, val) <- run]) : consecutiveGroups remainder