1 module Worksheets.Text where
3 import Control.Monad (when)
4 import Data.ByteString.Builder (Builder)
5 import Data.Char qualified as Char
6 import Data.Foldable (foldMap)
7 import Data.List qualified as List
8 import Data.Map.Strict qualified as Map
9 import Data.Maybe (Maybe (..))
10 import Data.Text (Text)
11 import Data.Text qualified as Text
12 import Paths_worksheets qualified as Self
13 import System.FilePath.Posix ((</>))
14 import System.FilePath.Posix qualified as File
16 import Text.Blaze.Html5 qualified as H
17 import Text.Blaze.Html5.Attributes qualified as HA
18 import Text.Blaze.Renderer.Utf8 qualified as Blaze
19 import Text.Show (Show (..))
22 type Texte = [TexteAtome]
23 data TexteAtome = TexteAtome
24 { texteAtomeLangue :: Langue
25 , texteAtomeGlyphs :: Text
26 , texteAtomePhonetic :: Maybe Text
28 deriving (Eq, Ord, Show)
34 | LangueMandarinPinyin
36 deriving (Eq, Ord, Show)
38 splitOnWords :: Text -> [Text]
39 splitOnWords = splitOnChar splitterOnWords
41 splitOnChar :: (Char -> Bool) -> Text -> [Text]
45 case f `Text.span` t of
47 | Text.null l && Text.null r -> []
48 | Text.null l -> Text.singleton (Text.head t) : go (Text.tail t)
50 | otherwise -> l : go r
52 splitterOnWords :: Char -> Bool
53 splitterOnWords c = case Char.generalCategory c of
54 Char.UppercaseLetter -> False
55 Char.LowercaseLetter -> False
56 Char.TitlecaseLetter -> False
57 Char.ModifierLetter -> False
58 Char.OtherLetter -> False
59 Char.NonSpacingMark -> True
60 Char.SpacingCombiningMark -> True
61 Char.EnclosingMark -> True
62 Char.DecimalNumber -> False
63 Char.LetterNumber -> False
64 Char.OtherNumber -> False
65 Char.ConnectorPunctuation -> True
66 Char.DashPunctuation -> True
67 Char.OpenPunctuation -> True
68 Char.ClosePunctuation -> True
69 Char.InitialQuote -> True
70 Char.FinalQuote -> True
71 Char.OtherPunctuation -> True
72 Char.MathSymbol -> True
73 Char.CurrencySymbol -> True
74 Char.ModifierSymbol -> True
75 Char.OtherSymbol -> True
77 Char.LineSeparator -> True
78 Char.ParagraphSeparator -> True
81 Char.Surrogate -> True
82 Char.PrivateUse -> True
83 Char.NotAssigned -> True
89 | CharClassPunctuation
94 deriving (Eq, Ord, Show)
97 [ (CharClassLetter, isLetter)
98 , (CharClassMark, isMark)
99 , (CharClassNumber, isNumber)
100 , (CharClassPunctuation, isPunctuation)
101 , (CharClassQuote, isQuote)
102 , (CharClassSymbol, isSymbol)
103 , (CharClassSeparator, isSeparator)
104 , (CharClassOther, isOther)
107 isLetter Char.UppercaseLetter = True
108 isLetter Char.LowercaseLetter = True
109 isLetter Char.TitlecaseLetter = True
110 isLetter Char.ModifierLetter = True
111 isLetter Char.OtherLetter = True
113 isMark Char.NonSpacingMark = True
114 isMark Char.SpacingCombiningMark = True
115 isMark Char.EnclosingMark = True
117 isNumber Char.DecimalNumber = True
118 isNumber Char.LetterNumber = True
119 isNumber Char.OtherNumber = True
121 isPunctuation Char.ConnectorPunctuation = True
122 isPunctuation Char.DashPunctuation = True
123 isPunctuation Char.OpenPunctuation = True
124 isPunctuation Char.ClosePunctuation = True
125 isPunctuation Char.OtherPunctuation = True
126 isPunctuation _ = False
127 isQuote Char.InitialQuote = True
128 isQuote Char.FinalQuote = True
130 isSymbol Char.MathSymbol = True
131 isSymbol Char.CurrencySymbol = True
132 isSymbol Char.ModifierSymbol = True
133 isSymbol Char.OtherSymbol = True
135 isSeparator Char.Space = True
136 isSeparator Char.LineSeparator = True
137 isSeparator Char.ParagraphSeparator = True
138 isSeparator _ = False
139 isOther Char.Control = True
140 isOther Char.Format = True
141 isOther Char.Surrogate = True
142 isOther Char.PrivateUse = True
143 isOther Char.NotAssigned = True
146 classifyText = go charClassifiers
148 go _ t | Text.null t = []
149 go [] t = (CharClassOther, Text.singleton (Text.head t)) : go charClassifiers (Text.tail t)
150 go ((clsD, clsF) : fs) t =
151 case Text.span (clsF . Char.generalCategory) t of
153 | Text.null l -> go fs t
154 | otherwise -> (clsD, l) : go charClassifiers r
156 isClassUnderscore (CharClassPunctuation, Text.all (== '_') -> True) = True
157 isClassUnderscore _ = False
158 isClassSeparator = \case
159 (CharClassSeparator, _) -> True
161 splitOnUnderscore = splitCharClass isClassUnderscore
162 splitOnSeparator = splitCharClass isClassSeparator
164 splitCharClass :: ((CharClass, Text) -> Bool) -> [(CharClass, Text)] -> [[(CharClass, Text)]]
165 splitCharClass isSep = go
169 case ts & List.span isSep of
171 case ts & List.break isSep of
175 zipTexte :: Texte -> Texte -> [(TexteAtome, TexteAtome)]
178 -- m0 = "开心果 冰淇淋 很 好吃"
179 -- p0 = "kāi_xīn_guǒ bīng_qí_lín hěn hào_chī"
184 & splitCharClass isClassSeparator
185 <&> foldMap \(_cls, c) -> c & Text.unpack
189 & splitCharClass isClassSeparator
190 <&> foldMap \ct@(_cls, c) -> [c | not (isClassUnderscore ct)]
192 -- z0 = List.zipWith (List.zip) m0 p0