module Worksheets.Utils.Char (
  module Worksheets.Utils.Char,
  module Data.Char,
) where

import Data.Char
import Data.List qualified as List
import Data.Set qualified as Set
import Worksheets.Utils.Prelude

-- | Unicode blocks distinguished by this module.
--
-- The Latin and CJK families are grouped under the 'UnicodeBlockLatin' and
-- 'UnicodeBlockCJK' wrapper constructors, whose payload selects the precise
-- sub-block.
--
-- The constructor order defines the derived 'Ord' instance (and therefore
-- the ordering inside any 'Set.Set' of blocks), so do not reorder
-- constructors casually.
data UnicodeBlock
  = UnicodeBlockAlphabetic_Presentation_Forms
  | UnicodeBlockArabic
  | UnicodeBlockArabic_Presentation_FormsA
  | UnicodeBlockArabic_Presentation_FormsB
  | UnicodeBlockArmenian
  | UnicodeBlockArrows
  | UnicodeBlockLatin UnicodeBlockLatin
  | UnicodeBlockBengali
  | UnicodeBlockBlock_Elements
  | UnicodeBlockBopomofo
  | UnicodeBlockBopomofo_Extended
  | UnicodeBlockBox_Drawing
  | UnicodeBlockBraille_Patterns
  | UnicodeBlockBuhid
  | UnicodeBlockCJK UnicodeBlockCJK
  | UnicodeBlockCherokee
  | UnicodeBlockCombining_Diacritical_Marks
  | UnicodeBlockCombining_Diacritical_Marks_for_Symbols
  | UnicodeBlockCombining_Half_Marks
  | UnicodeBlockControl_Pictures
  | UnicodeBlockCurrency_Symbols
  | UnicodeBlockCyrillic
  | UnicodeBlockCyrillic_Supplementary
  | UnicodeBlockDevanagari
  | UnicodeBlockDingbats
  | UnicodeBlockEnclosed_Alphanumerics
  | UnicodeBlockEthiopic
  | UnicodeBlockGeneral_Punctuation
  | UnicodeBlockGeometric_Shapes
  | UnicodeBlockGeorgian
  | UnicodeBlockGreek_Extended
  | UnicodeBlockGreek_and_Coptic
  | UnicodeBlockGujarati
  | UnicodeBlockGurmukhi
  | UnicodeBlockHalfwidth_and_Fullwidth_Forms
  | UnicodeBlockHangul_Compatibility_Jamo
  | UnicodeBlockHangul_Jamo
  | UnicodeBlockHangul_Syllables
  | UnicodeBlockHanunoo
  | UnicodeBlockHebrew
  | UnicodeBlockHigh_Private_Use_Surrogates
  | UnicodeBlockHigh_Surrogates
  | UnicodeBlockHiragana
  | UnicodeBlockIPA_Extensions
  | UnicodeBlockIdeographic_Description_Characters
  | UnicodeBlockKanbun
  | UnicodeBlockKangxi_Radicals
  | UnicodeBlockKannada
  | UnicodeBlockKatakana
  | UnicodeBlockKatakana_Phonetic_Extensions
  | UnicodeBlockKhmer
  | UnicodeBlockKhmer_Symbols
  | UnicodeBlockLao
  | UnicodeBlockLetterlike_Symbols
  | UnicodeBlockLimbu
  | UnicodeBlockLow_Surrogates
  | UnicodeBlockMalayalam
  | UnicodeBlockMathematical_Operators
  | UnicodeBlockMiscellaneous_Mathematical_SymbolsA
  | UnicodeBlockMiscellaneous_Mathematical_SymbolsB
  | UnicodeBlockMiscellaneous_Symbols
  | UnicodeBlockMiscellaneous_Symbols_and_Arrows
  | UnicodeBlockMiscellaneous_Technical
  | UnicodeBlockMongolian
  | UnicodeBlockMyanmar
  | UnicodeBlockNumber_Forms
  | UnicodeBlockOgham
  | UnicodeBlockOptical_Character_Recognition
  | UnicodeBlockOriya
  | UnicodeBlockPhonetic_Extensions
  | UnicodeBlockPrivate_Use_Area
  | UnicodeBlockRunic
  | UnicodeBlockSinhala
  | UnicodeBlockSmall_Form_Variants
  | UnicodeBlockSpacing_Modifier_Letters
  | UnicodeBlockSpecials
  | UnicodeBlockSuperscripts_and_Subscripts
  | UnicodeBlockSupplemental_ArrowsA
  | UnicodeBlockSupplemental_ArrowsB
  | UnicodeBlockSupplemental_Mathematical_Operators
  | UnicodeBlockSyriac
  | UnicodeBlockTagalog
  | UnicodeBlockTagbanwa
  | UnicodeBlockTai_Le
  | UnicodeBlockTamil
  | UnicodeBlockTelugu
  | UnicodeBlockThaana
  | UnicodeBlockThai
  | UnicodeBlockTibetan
  | UnicodeBlockUnified_Canadian_Aboriginal_Syllabics
  | UnicodeBlockVariation_Selectors
  | UnicodeBlockYi_Radicals
  | UnicodeBlockYi_Syllables
  | UnicodeBlockYijing_Hexagram_Symbols
  deriving (Eq, Ord, Show)

-- | The union of 'unicodeBlockCJK', 'unicodeBlockLatin', and the two extra
-- blocks 'UnicodeBlockHalfwidth_and_Fullwidth_Forms' and
-- 'UnicodeBlockMiscellaneous_Technical'.
--
-- Despite the name, this is /not/ the set of every 'UnicodeBlock'.
unicodeBlocks :: Set.Set UnicodeBlock
unicodeBlocks =
  [ unicodeBlockCJK
  , unicodeBlockLatin
  , [ UnicodeBlockHalfwidth_and_Fullwidth_Forms
    , UnicodeBlockMiscellaneous_Technical
    ]
      & Set.fromList
  ]
    & mconcat

-- | The 'UnicodeBlockCJK' sub-blocks produced by @enumAll@, each wrapped in
-- the 'UnicodeBlockCJK' constructor.
--
-- NOTE(review): @enumAll@ comes from "Worksheets.Utils.Prelude"; presumably
-- it enumerates every constructor of the type -- confirm.
unicodeBlockCJK :: Set.Set UnicodeBlock
unicodeBlockCJK = UnicodeBlockCJK <$> enumAll & Set.fromList

-- | The 'UnicodeBlockLatin' sub-blocks produced by @enumAll@, each wrapped in
-- the 'UnicodeBlockLatin' constructor.
--
-- NOTE(review): @enumAll@ comes from "Worksheets.Utils.Prelude"; presumably
-- it enumerates every constructor of the type -- confirm.
unicodeBlockLatin :: Set.Set UnicodeBlock
unicodeBlockLatin = UnicodeBlockLatin <$> enumAll & Set.fromList

-- | Latin sub-blocks carried by the 'UnicodeBlockLatin' constructor of
-- 'UnicodeBlock'.
data UnicodeBlockLatin
  = UnicodeBlockLatin_Basic
  | UnicodeBlockLatin1_Supplement
  | UnicodeBlockLatin_ExtendedA
  | UnicodeBlockLatin_ExtendedB
  | UnicodeBlockLatin_Extended_Additional
  deriving (Eq, Ord, Show, Enum)

-- | CJK sub-blocks carried by the 'UnicodeBlockCJK' constructor of
-- 'UnicodeBlock'.
data UnicodeBlockCJK
  = UnicodeBlockCJK_Compatibility
  | UnicodeBlockCJK_Compatibility_Forms
  | -- | CJK Compatibility Ideographs: U+F900 to U+FAFF.
    -- Contains duplicate characters and variants for compatibility with older encodings.
    UnicodeBlockCJK_Compatibility_Ideographs
  | UnicodeBlockCJK_Radicals_Supplement
  | UnicodeBlockCJK_Symbols_and_Punctuation
  | -- | CJK Unified Ideographs: U+4E00 to U+9FFF.
    -- This block contains the most commonly used Chinese characters in modern writing.
    UnicodeBlockCJK_Unified_Ideographs
  | -- | CJK Unified Ideographs Extension A: U+3400 to U+4DBF.
    -- Includes rare and historical characters.
    UnicodeBlockCJK_Unified_Ideographs_Extension_A
  | -- | CJK Unified Ideographs Extension B: U+20000 to U+2A6DF.
    -- Contains rare and historic characters, often used in academic or specialized contexts.
    UnicodeBlockCJK_Unified_Ideographs_Extension_B
  | -- | CJK Unified Ideographs Extension C: U+2A700 to U+2B73F.
    -- Includes additional rare and historic characters.
    UnicodeBlockCJK_Unified_Ideographs_Extension_C
  | -- | CJK Unified Ideographs Extension D: U+2B740 to U+2B81F.
    -- Includes additional rare and historic characters.
    UnicodeBlockCJK_Unified_Ideographs_Extension_D
  | -- | CJK Unified Ideographs Extension E: U+2B820 to U+2CEAF.
    -- Includes additional rare and historic characters.
    UnicodeBlockCJK_Unified_Ideographs_Extension_E
  | -- | CJK Unified Ideographs Extension F: U+2CEB0 to U+2EBEF.
    -- Includes additional rare and historic characters.
    UnicodeBlockCJK_Unified_Ideographs_Extension_F
  | -- | CJK Unified Ideographs Extension G: U+30000 to U+3134F.
    -- Includes additional rare and historic characters.
    UnicodeBlockCJK_Unified_Ideographs_Extension_G
  | -- | CJK Unified Ideographs Extension H: U+31350 to U+323AF.
    -- Includes additional rare and historic characters.
    UnicodeBlockCJK_Unified_Ideographs_Extension_H
  | UnicodeBlockCJK_Enclosed_Letters_and_Months
  deriving (Eq, Ord, Show, Enum)

-- TODO: add a constructor (and a range in 'unicodeBlock') for:
-- CJK Compatibility Ideographs Supplement: U+2F800 to U+2FA1F
-- Includes unifiable variants of ideographs.
-- | Look up the 'UnicodeBlock' whose code point range contains the given
-- character.
--
-- Returns 'Nothing' for any code point that falls outside every range listed
-- below, e.g. the gap U+0750..U+077F between Syriac and Thaana, or
-- U+2F800..U+2FA1F (CJK Compatibility Ideographs Supplement, which has no
-- constructor yet).
--
-- The ranges are mutually disjoint and listed in ascending order; guards are
-- tried top to bottom.
unicodeBlock :: Char -> Maybe UnicodeBlock
unicodeBlock c
  | between '\x0000' '\x007F' = Just (UnicodeBlockLatin UnicodeBlockLatin_Basic)
  | between '\x0080' '\x00FF' = Just (UnicodeBlockLatin UnicodeBlockLatin1_Supplement)
  | between '\x0100' '\x017F' = Just (UnicodeBlockLatin UnicodeBlockLatin_ExtendedA)
  | between '\x0180' '\x024F' = Just (UnicodeBlockLatin UnicodeBlockLatin_ExtendedB)
  | between '\x0250' '\x02AF' = Just UnicodeBlockIPA_Extensions
  | between '\x02B0' '\x02FF' = Just UnicodeBlockSpacing_Modifier_Letters
  | between '\x0300' '\x036F' = Just UnicodeBlockCombining_Diacritical_Marks
  | between '\x0370' '\x03FF' = Just UnicodeBlockGreek_and_Coptic
  | between '\x0400' '\x04FF' = Just UnicodeBlockCyrillic
  | between '\x0500' '\x052F' = Just UnicodeBlockCyrillic_Supplementary
  | between '\x0530' '\x058F' = Just UnicodeBlockArmenian
  | between '\x0590' '\x05FF' = Just UnicodeBlockHebrew
  | between '\x0600' '\x06FF' = Just UnicodeBlockArabic
  | between '\x0700' '\x074F' = Just UnicodeBlockSyriac
  | between '\x0780' '\x07BF' = Just UnicodeBlockThaana
  | between '\x0900' '\x097F' = Just UnicodeBlockDevanagari
  | between '\x0980' '\x09FF' = Just UnicodeBlockBengali
  | between '\x0A00' '\x0A7F' = Just UnicodeBlockGurmukhi
  | between '\x0A80' '\x0AFF' = Just UnicodeBlockGujarati
  | between '\x0B00' '\x0B7F' = Just UnicodeBlockOriya
  | between '\x0B80' '\x0BFF' = Just UnicodeBlockTamil
  | between '\x0C00' '\x0C7F' = Just UnicodeBlockTelugu
  | between '\x0C80' '\x0CFF' = Just UnicodeBlockKannada
  | between '\x0D00' '\x0D7F' = Just UnicodeBlockMalayalam
  | between '\x0D80' '\x0DFF' = Just UnicodeBlockSinhala
  | between '\x0E00' '\x0E7F' = Just UnicodeBlockThai
  | between '\x0E80' '\x0EFF' = Just UnicodeBlockLao
  | between '\x0F00' '\x0FFF' = Just UnicodeBlockTibetan
  | between '\x1000' '\x109F' = Just UnicodeBlockMyanmar
  | between '\x10A0' '\x10FF' = Just UnicodeBlockGeorgian
  | between '\x1100' '\x11FF' = Just UnicodeBlockHangul_Jamo
  | between '\x1200' '\x137F' = Just UnicodeBlockEthiopic
  | between '\x13A0' '\x13FF' = Just UnicodeBlockCherokee
  | between '\x1400' '\x167F' = Just UnicodeBlockUnified_Canadian_Aboriginal_Syllabics
  | between '\x1680' '\x169F' = Just UnicodeBlockOgham
  | between '\x16A0' '\x16FF' = Just UnicodeBlockRunic
  | between '\x1700' '\x171F' = Just UnicodeBlockTagalog
  | between '\x1720' '\x173F' = Just UnicodeBlockHanunoo
  | between '\x1740' '\x175F' = Just UnicodeBlockBuhid
  | between '\x1760' '\x177F' = Just UnicodeBlockTagbanwa
  | between '\x1780' '\x17FF' = Just UnicodeBlockKhmer
  | between '\x1800' '\x18AF' = Just UnicodeBlockMongolian
  | between '\x1900' '\x194F' = Just UnicodeBlockLimbu
  | between '\x1950' '\x197F' = Just UnicodeBlockTai_Le
  | between '\x19E0' '\x19FF' = Just UnicodeBlockKhmer_Symbols
  | between '\x1D00' '\x1D7F' = Just UnicodeBlockPhonetic_Extensions
  | between '\x1E00' '\x1EFF' = Just (UnicodeBlockLatin UnicodeBlockLatin_Extended_Additional)
  | between '\x1F00' '\x1FFF' = Just UnicodeBlockGreek_Extended
  | between '\x2000' '\x206F' = Just UnicodeBlockGeneral_Punctuation
  | between '\x2070' '\x209F' = Just UnicodeBlockSuperscripts_and_Subscripts
  | between '\x20A0' '\x20CF' = Just UnicodeBlockCurrency_Symbols
  | between '\x20D0' '\x20FF' = Just UnicodeBlockCombining_Diacritical_Marks_for_Symbols
  | between '\x2100' '\x214F' = Just UnicodeBlockLetterlike_Symbols
  | between '\x2150' '\x218F' = Just UnicodeBlockNumber_Forms
  | between '\x2190' '\x21FF' = Just UnicodeBlockArrows
  | between '\x2200' '\x22FF' = Just UnicodeBlockMathematical_Operators
  | between '\x2300' '\x23FF' = Just UnicodeBlockMiscellaneous_Technical
  | between '\x2400' '\x243F' = Just UnicodeBlockControl_Pictures
  | between '\x2440' '\x245F' = Just UnicodeBlockOptical_Character_Recognition
  | between '\x2460' '\x24FF' = Just UnicodeBlockEnclosed_Alphanumerics
  | between '\x2500' '\x257F' = Just UnicodeBlockBox_Drawing
  | between '\x2580' '\x259F' = Just UnicodeBlockBlock_Elements
  | between '\x25A0' '\x25FF' = Just UnicodeBlockGeometric_Shapes
  | between '\x2600' '\x26FF' = Just UnicodeBlockMiscellaneous_Symbols
  | between '\x2700' '\x27BF' = Just UnicodeBlockDingbats
  | between '\x27C0' '\x27EF' = Just UnicodeBlockMiscellaneous_Mathematical_SymbolsA
  | between '\x27F0' '\x27FF' = Just UnicodeBlockSupplemental_ArrowsA
  | between '\x2800' '\x28FF' = Just UnicodeBlockBraille_Patterns
  | between '\x2900' '\x297F' = Just UnicodeBlockSupplemental_ArrowsB
  | between '\x2980' '\x29FF' = Just UnicodeBlockMiscellaneous_Mathematical_SymbolsB
  | between '\x2A00' '\x2AFF' = Just UnicodeBlockSupplemental_Mathematical_Operators
  | between '\x2B00' '\x2BFF' = Just UnicodeBlockMiscellaneous_Symbols_and_Arrows
  | between '\x2E80' '\x2EFF' = Just (UnicodeBlockCJK UnicodeBlockCJK_Radicals_Supplement)
  | between '\x2F00' '\x2FDF' = Just UnicodeBlockKangxi_Radicals
  | between '\x2FF0' '\x2FFF' = Just UnicodeBlockIdeographic_Description_Characters
  | between '\x3000' '\x303F' = Just (UnicodeBlockCJK UnicodeBlockCJK_Symbols_and_Punctuation)
  | between '\x3040' '\x309F' = Just UnicodeBlockHiragana
  | between '\x30A0' '\x30FF' = Just UnicodeBlockKatakana
  | between '\x3100' '\x312F' = Just UnicodeBlockBopomofo
  | between '\x3130' '\x318F' = Just UnicodeBlockHangul_Compatibility_Jamo
  | between '\x3190' '\x319F' = Just UnicodeBlockKanbun
  | between '\x31A0' '\x31BF' = Just UnicodeBlockBopomofo_Extended
  | between '\x31F0' '\x31FF' = Just UnicodeBlockKatakana_Phonetic_Extensions
  | between '\x3200' '\x32FF' = Just (UnicodeBlockCJK UnicodeBlockCJK_Enclosed_Letters_and_Months)
  | between '\x3300' '\x33FF' = Just (UnicodeBlockCJK UnicodeBlockCJK_Compatibility)
  | between '\x3400' '\x4DBF' = Just (UnicodeBlockCJK UnicodeBlockCJK_Unified_Ideographs_Extension_A)
  | between '\x4DC0' '\x4DFF' = Just UnicodeBlockYijing_Hexagram_Symbols
  | between '\x4E00' '\x9FFF' = Just (UnicodeBlockCJK UnicodeBlockCJK_Unified_Ideographs)
  | between '\xA000' '\xA48F' = Just UnicodeBlockYi_Syllables
  | between '\xA490' '\xA4CF' = Just UnicodeBlockYi_Radicals
  | between '\xAC00' '\xD7AF' = Just UnicodeBlockHangul_Syllables
  | between '\xD800' '\xDB7F' = Just UnicodeBlockHigh_Surrogates
  | between '\xDB80' '\xDBFF' = Just UnicodeBlockHigh_Private_Use_Surrogates
  | between '\xDC00' '\xDFFF' = Just UnicodeBlockLow_Surrogates
  | between '\xE000' '\xF8FF' = Just UnicodeBlockPrivate_Use_Area
  | between '\xF900' '\xFAFF' = Just (UnicodeBlockCJK UnicodeBlockCJK_Compatibility_Ideographs)
  | between '\xFB00' '\xFB4F' = Just UnicodeBlockAlphabetic_Presentation_Forms
  | between '\xFB50' '\xFDFF' = Just UnicodeBlockArabic_Presentation_FormsA
  | between '\xFE00' '\xFE0F' = Just UnicodeBlockVariation_Selectors
  | between '\xFE20' '\xFE2F' = Just UnicodeBlockCombining_Half_Marks
  | between '\xFE30' '\xFE4F' = Just (UnicodeBlockCJK UnicodeBlockCJK_Compatibility_Forms)
  | between '\xFE50' '\xFE6F' = Just UnicodeBlockSmall_Form_Variants
  | between '\xFE70' '\xFEFF' = Just UnicodeBlockArabic_Presentation_FormsB
  | between '\xFF00' '\xFFEF' = Just UnicodeBlockHalfwidth_and_Fullwidth_Forms
  | between '\xFFF0' '\xFFFF' = Just UnicodeBlockSpecials
  | between '\x20000' '\x2A6DF' = Just (UnicodeBlockCJK UnicodeBlockCJK_Unified_Ideographs_Extension_B)
  | between '\x2A700' '\x2B73F' = Just (UnicodeBlockCJK UnicodeBlockCJK_Unified_Ideographs_Extension_C)
  | between '\x2B740' '\x2B81F' = Just (UnicodeBlockCJK UnicodeBlockCJK_Unified_Ideographs_Extension_D)
  | between '\x2B820' '\x2CEAF' = Just (UnicodeBlockCJK UnicodeBlockCJK_Unified_Ideographs_Extension_E)
  | between '\x2CEB0' '\x2EBEF' = Just (UnicodeBlockCJK UnicodeBlockCJK_Unified_Ideographs_Extension_F)
  | between '\x30000' '\x3134F' = Just (UnicodeBlockCJK UnicodeBlockCJK_Unified_Ideographs_Extension_G)
  | between '\x31350' '\x323AF' = Just (UnicodeBlockCJK UnicodeBlockCJK_Unified_Ideographs_Extension_H)
  | otherwise = Nothing
  where
    -- Inclusive range test on the character being classified.
    between lo hi = lo <= c && c <= hi

-- | Group /adjacent/ pairs that share a key, keeping the values in their
-- original order.
--
-- Only consecutive runs are merged; a key that reappears later starts a new
-- group:
--
-- > consecutiveGroups [(1, 'a'), (1, 'b'), (2, 'c'), (1, 'd')]
-- >   == [(1, "ab"), (2, "c"), (1, "d")]
--
-- Every value list in the result is non-empty, and an empty input yields an
-- empty output.
consecutiveGroups :: Eq k => [(k, v)] -> [(k, [v])]
consecutiveGroups [] = []
consecutiveGroups ((k, v) : nexts) =
  (k, v : List.map snd ks) : consecutiveGroups notKs
  where
    -- Split off the leading run of pairs whose key equals the current one.
    (ks, notKs) = List.span (\(k', _v) -> k == k') nexts