]> Git — Sourcephile - julm/worksheets.git/blob - src/Language/Chinese.py
update
[julm/worksheets.git] / src / Language / Chinese.py
1 import itertools
2 from typing import Dict, Generator, List, Optional, Tuple
3
4 from ordered_set import OrderedSet
5 from pypinyin.contrib.tone_convert import to_finals, to_initials, to_normal, to_tone3
6
7 # References:
8 # https://en.wikipedia.org/wiki/Help:IPA/Mandarin
9 # https://en.wikipedia.org/wiki/Standard_Chinese_phonology
10 # https://en.wikipedia.org/wiki/Pinyin
11 # https://de.wikipedia.org/wiki/Pinyin
12 # - Duanmu, San. 2007. The Phonology of Standard Chinese. 2nd ed. Oxford ;
13 # New York: Oxford University Press.
14 # - Lin, Yen-Hwei. 2007. The Sounds of Chinese. Cambridge, UK ;
15 # New York: Cambridge University Press.
16
17
# IPA transcriptions for the strict pinyin initials.
# Every key maps to a list of alternative transcriptions, and each alternative
# is a tuple of phonemes.
INITIAL_MAPPING: Dict[str, List[Tuple[str, ...]]] = {
    "b": [("p",)],
    "c": [("tsʰ",)],  # tsʰ
    "ch": [("ʈʂʰ",)],  # tʂʰ
    "d": [("t",)],
    "f": [("f",)],
    "g": [("k",)],
    "h": [("x",), ("h",)],
    "j": [("tɕ",)],
    "k": [("kʰ",)],
    "l": [("l",)],
    "m": [("m",)],
    "n": [("n",)],
    "p": [("pʰ",)],
    "q": [("tɕʰ",)],
    "r": [("ɻ",), ("ʐ",)],
    "s": [("s",)],
    "sh": [("ʂ",)],
    "t": [("tʰ",)],
    "x": [("ɕ",)],
    "z": [("ts",)],
    "zh": [("ʈʂ",)],  # tʂ
    # "w" and "y" are left out on purpose: they only occur as non-strict initials.
    # "w": [("w",)],
    # "y": [("j",), ("ɥ",)],
}

# Key view of the table above, used for membership checks.
INITIALS = INITIAL_MAPPING.keys()
46
# Syllables that consist only of (syllabic) consonants.
# Note: syllabic consonants may also arise from weak syllable reduction, and
# syllabic nasals are heard in certain interjections; pronunciations of such
# words include [m], [n], [ŋ], [hm], [hŋ].
# The "0" inside a phoneme is the placeholder that apply_tone() replaces with
# the tone letters.
SYLLABIC_CONSONANT_MAPPINGS: Dict[str, List[Tuple[str, ...]]] = {
    "hm": [("h", "m0")],
    "hng": [("h", "ŋ0")],
    "m": [("m0",)],
    "n": [("n0",)],
    "ng": [("ŋ0",)],
}

# Key view of the table above, used for membership checks.
SYLLABIC_CONSONANTS = SYLLABIC_CONSONANT_MAPPINGS.keys()
59
# Whole-syllable transcriptions that are looked up directly instead of being
# split into an initial and a final.  The "0" inside a phoneme is the
# placeholder that apply_tone() replaces with the tone letters.
INTERJECTION_MAPPINGS: Dict[str, List[Tuple[str, ...]]] = {
    "io": [("j", "ɔ0")],  # /
    "ê": [("ɛ0",)],  # /
    # Note on "er": a small number of independent words or morphemes are
    # pronounced [ɚ] or [aɚ̯] and written in pinyin as er (with some tone),
    # e.g. 二 èr "two", 耳 ěr "ear" and 儿 (traditional 兒) ér "son".
    # The sound is similar to the one in English "bar"; depending on the
    # speaker it can also be pronounced [ɚ].
    # Duanmu (2007, p. 40)
    "er": [("ɚ0",), ("aɚ̯0",)],  # ɑɻ
    "o": [("ɔ0",)],  # ɔ
}

# Key view of the table above, used for membership checks.
INTERJECTIONS = INTERJECTION_MAPPINGS.keys()
73
74
# IPA transcriptions for the strict pinyin finals.
# Every key maps to a list of alternative transcriptions, each one a tuple of
# phonemes; the "0" inside a phoneme is the placeholder that apply_tone()
# replaces with the tone letters.
# Sources: Duanmu (2000, p. 37) and Lin (2007, p. 68f)
# Diphthongs from Duanmu (2007, p. 40): au, əu, əi, ai
# Diphthongs from Lin (2007, p. 68f): au̯, ou̯, ei̯, ai̯
FINAL_MAPPING: Dict[str, List[Tuple[str, ...]]] = {
    "a": [("a0",)],  # /
    "ai": [("ai̯0",)],  # aɪ̯
    "an": [("a0", "n")],  # an
    "ang": [("a0", "ŋ")],  # ɑŋ
    "ao": [("au̯0",)],  # ɑʊ̯
    "e": [("ɤ0",)],  # ɯ̯ʌ
    "ei": [("ei̯0",)],  # eɪ̯
    "en": [("ə0", "n")],  # ən
    "eng": [("ə0", "ŋ")],  # əŋ
    "i": [("i0",)],  # i
    "ia": [("j", "a0")],  # i̯ɑ
    "ian": [("j", "ɛ0", "n")],  # iɛn
    "iang": [("j", "a0", "ŋ")],  # i̯ɑŋ
    "iao": [("j", "au̯0")],  # i̯ɑʊ̯
    "ie": [("j", "e0")],  # iɛ
    "in": [("i0", "n")],  # in
    # "iu": [("j", "ou̯0")],  # i̯ɤʊ̯
    "iou": [("j", "ou̯0")],  # i̯ɤʊ̯
    "ing": [("i0", "ŋ")],  # iŋ
    "iong": [("j", "ʊ0", "ŋ")],  # i̯ʊŋ
    "ong": [("ʊ0", "ŋ")],  # ʊŋ
    "ou": [("ou̯0",)],  # ɤʊ̯
    "u": [("u0",)],  # u
    # "ui": [("w", "ei̯0")],  # u̯eɪ̯
    "uei": [("w", "ei̯0")],  # u̯eɪ̯
    "ua": [("w", "a0")],  # u̯ɑ
    "uai": [("w", "ai̯0")],  # u̯aɪ̯
    "uan": [("w", "a0", "n")],  # u̯an
    # "un": [("w", "ə0", "n")],  # u̯ən
    "uen": [("w", "ə0", "n")],  # u̯ən
    "uang": [("w", "a0", "ŋ")],  # u̯ɑŋ
    "ueng": [("w", "ə0", "ŋ")],  # /
    # see: https://en.wikipedia.org/wiki/Pinyin "Finals beginning with u- (w-)"
    "uo": [("w", "o0")],  # u̯ɔ
    # Normally uo is written as o after b, p, m, or f;
    # the other cases (lo, yo) are also treated as [wo].
    "o": [("w", "o0")],  # u̯ɔ
    # Note: normally ü is written as u after j, q, or x
    # (the /u/ phoneme never occurs in these positions);
    # pypinyin returns u as ü after (y), j, q, or x.
    "ü": [("y0",)],  # u after y, j, q, or x ; # y
    "üe": [("ɥ", "e0")],  # ue after y, j, q, or x ; # y̯œ
    "üan": [("ɥ", "ɛ0", "n")],  # uan after y, j, q, or x ; # /
    "ün": [("y0", "n")],  # un after y, j, q, or x
}

# Key view of the table above, used for membership checks.
FINALS = FINAL_MAPPING.keys()
126
# Replacement for the final "i" when it follows zh, ch, sh or r.
# Note: [ɻ̩ ~ ʐ̩] is an apical retroflex voiced continuant,
# as in zhi, chi, shi, ri ([ʈʂɻ̩ ʈʂʰɻ̩ ʂɻ̩ ɻɻ̩]).
# Duanmu (2007, p. 34f)
# Lin (2007, p. 72)
FINAL_MAPPING_AFTER_ZH_CH_SH_R: Dict[str, List[Tuple[str, ...]]] = {
    "i": [("ɻ̩0",), ("ʐ̩0",)],  # ʅ
}
134
# Replacement for the final "i" when it follows z, c or s.
# Note: [ɹ̩ ~ z̩] is a laminal denti-alveolar voiced continuant,
# as in zi, ci, si ([tsɹ̩ tsʰɹ̩ sɹ̩]).
# Duanmu (2007, p. 34f)
# Lin (2007, p. 72)
FINAL_MAPPING_AFTER_Z_C_S: Dict[str, List[Tuple[str, ...]]] = {
    "i": [("ɹ̩0",), ("z̩0",)],  # ɿ
}
142
143 # Note: Normally ü is written as u after j, q, or x
144 # (the /u/ phoneme never occurs in these positions)
145 # but in pypinyin this is not the case, e.g. it returns ü for ju
146 # FINAL_MAPPING_AFTER_J_Q_X = {
147 # "u": FINAL_MAPPING["ü"],
148 # "ue": FINAL_MAPPING["üe"],
149 # "uan": FINAL_MAPPING["üan"],
150 # "un": FINAL_MAPPING["ün"],
151 # }
152
153 # Note: uo is written as o after b, p, m, or f.
154 # FINAL_MAPPING_AFTER_B_P_M_F = {
155 # "o": FINAL_MAPPING["uo"]
156 # }
157
# Tone number -> IPA tone letters.
# Annotated explicitly, like the other lookup tables in this module.
# The neutral tone (5) maps to the empty string, so the "0" placeholder in a
# phoneme is simply removed when apply_tone() substitutes it.
TONE_MAPPING: Dict[int, str] = {
    1: "˥",  # ā
    2: "˧˥",  # á
    3: "˧˩˧",  # ǎ
    4: "˥˩",  # à
    5: "",  # a
}
165
166
def get_tone(pinyin: str) -> int:
    """
    Return the tone number of a pinyin syllable.

    The syllable is converted with ``to_tone3`` (called with
    ``neutral_tone_with_five=True``) and the last character of the converted
    string is read as the tone digit.

    Raises
    ------
    ValueError
        If the converted string is empty, if its last character cannot be
        parsed as an integer, or if the integer is not a key of
        ``TONE_MAPPING``.
    """
    tone3_form = to_tone3(pinyin, neutral_tone_with_five=True, v_to_u=True)
    if not tone3_form:
        raise ValueError("Parameter 'pinyin': Tone couldn't be detected!")

    last_char = tone3_form[-1]
    failure_message = f"Parameter 'pinyin': Tone '{last_char}' couldn't be detected!"

    try:
        tone = int(last_char)
    except ValueError as error:
        raise ValueError(failure_message) from error

    # Guard against to_tone3 producing a digit outside the known tones.
    if tone in TONE_MAPPING:
        return tone
    raise ValueError(failure_message)
186
187
def get_syllabic_consonant(normal_pinyin: str) -> Optional[str]:
    """Return ``normal_pinyin`` if it is a key of the syllabic-consonant table, else ``None``."""
    return normal_pinyin if normal_pinyin in SYLLABIC_CONSONANTS else None
192
193
def get_interjection(normal_pinyin: str) -> Optional[str]:
    """Return ``normal_pinyin`` if it is a key of the interjection table, else ``None``."""
    return normal_pinyin if normal_pinyin in INTERJECTIONS else None
198
199
def get_initials(normal_pinyin: str) -> Optional[str]:
    """
    Return the strict pinyin initial of a tone-less syllable.

    Returns
    -------
    Optional[str]
        ``None`` when the syllable is a syllabic consonant or an interjection,
        or when ``to_initials`` yields an empty string; otherwise the initial.

    Raises
    ------
    ValueError
        If the detected initial is not a key of ``INITIAL_MAPPING``.
    """
    # Whole-syllable special cases are not split into initial and final.
    if normal_pinyin in SYLLABIC_CONSONANTS or normal_pinyin in INTERJECTIONS:
        return None

    initial = to_initials(normal_pinyin, strict=True)
    if initial == "":
        return None

    # Guard against pypinyin returning something unexpected.
    if initial in INITIAL_MAPPING:
        return initial
    raise ValueError(
        f"Parameter 'normal_pinyin': Initial '{initial}' couldn't be detected!"
    )
219
220
def get_finals(normal_pinyin: str) -> Optional[str]:
    """
    Return the strict pinyin final of a tone-less syllable.

    Returns
    -------
    Optional[str]
        ``None`` when the syllable is a syllabic consonant or an interjection;
        otherwise the final as reported by ``to_finals``.

    Raises
    ------
    ValueError
        If ``to_finals`` yields an empty string or a final that is not a key
        of ``FINAL_MAPPING``.
    """
    # Whole-syllable special cases are not split into initial and final.
    if normal_pinyin in SYLLABIC_CONSONANTS or normal_pinyin in INTERJECTIONS:
        return None

    final = to_finals(normal_pinyin, strict=True, v_to_u=True)
    if final == "":
        raise ValueError("Parameter 'normal_pinyin': Final couldn't be detected!")

    # Guard against pypinyin returning something unexpected.
    if final in FINAL_MAPPING:
        return final
    raise ValueError(
        f"Parameter 'normal_pinyin': Final '{final}' couldn't be detected!"
    )
240
241
def apply_tone(
    variants: List[Tuple[str, ...]], tone: int
) -> Generator[Tuple[str, ...], None, None]:
    """
    Yield each variant with the tone placeholder replaced by IPA tone letters.

    Every ``"0"`` occurring in a phoneme is replaced with ``TONE_MAPPING[tone]``.
    Because this is a generator, the ``KeyError`` for an unknown ``tone`` is
    raised when iteration starts, not when the function is called.
    """
    tone_letters = TONE_MAPPING[tone]
    for variant in variants:
        yield tuple(segment.replace("0", tone_letters) for segment in variant)
249
250
def pinyin_to_ipa(pinyin: str) -> OrderedSet[Tuple[str, ...]]:
    """
    Transcribe a single Pinyin syllable into the International Phonetic
    Alphabet (IPA).

    Parameters
    ----------
    pinyin : str
        One Pinyin syllable. Tone markers may be included
        (e.g., "zhong", "zhōng", "zho1ng", "zhong1").

    Returns
    -------
    OrderedSet[Tuple[str, ...]]
        All possible IPA transcriptions of the syllable, in table order.
        Each transcription is a tuple of phoneme strings.

    Raises
    ------
    ValueError
        If the tone cannot be detected, or if the initial or the final of the
        syllable cannot be mapped to IPA.

    Notes
    -----
    - Interjections and syllabic consonants are looked up as whole syllables,
      since they do not follow the initial-final structure.
    - The tone letters are inserted into the vowel or syllabic consonant.
    - `pypinyin` is used to split the syllable into initial and final.

    Examples
    --------
    Convert a Pinyin string with a tone:

    >>> result = pinyin_to_ipa("zhong4")
    >>> print(result)
    OrderedSet([('ʈʂ', 'ʊ˥˩', 'ŋ')])

    Handle a syllabic consonant:

    >>> result = pinyin_to_ipa("ng")
    >>> print(result)
    OrderedSet([('ŋ',)])

    Process an interjection:

    >>> result = pinyin_to_ipa("er")
    >>> print(result)
    OrderedSet([('ɚ',), ('aɚ̯',)])

    Pinyin with multiple possible IPA transcriptions:

    >>> result = pinyin_to_ipa("zhi")
    >>> print(result)
    OrderedSet([('ʈʂ', 'ɻ̩'), ('ʈʂ', 'ʐ̩')])
    """
    tone_nr = get_tone(pinyin)
    pinyin_normal = to_normal(pinyin)

    # Whole-syllable special cases come first: interjections, then syllabic
    # consonants. Neither is split into initial and final.
    interjection = get_interjection(pinyin_normal)
    if interjection is not None:
        return OrderedSet(apply_tone(INTERJECTION_MAPPINGS[pinyin_normal], tone_nr))

    syllabic_consonant = get_syllabic_consonant(pinyin_normal)
    if syllabic_consonant is not None:
        return OrderedSet(
            apply_tone(SYLLABIC_CONSONANT_MAPPINGS[syllabic_consonant], tone_nr)
        )

    onset = get_initials(pinyin_normal)
    rime = get_finals(pinyin_normal)
    # get_finals only returns None for the special cases handled above.
    assert rime is not None

    # Pick the table of context-dependent finals that applies to this initial.
    overrides: Dict[str, List[Tuple[str, ...]]]
    if onset in {"zh", "ch", "sh", "r"}:
        overrides = FINAL_MAPPING_AFTER_ZH_CH_SH_R
    elif onset in {"z", "c", "s"}:
        overrides = FINAL_MAPPING_AFTER_Z_C_S
    else:
        overrides = {}

    untoned_rimes = overrides[rime] if rime in overrides else FINAL_MAPPING[rime]
    rime_variants = list(apply_tone(untoned_rimes, tone_nr))

    # A syllable without an initial contributes a single empty onset, so the
    # combinations below are just the final variants themselves.
    onset_variants: List[Tuple[str, ...]] = (
        [()] if onset is None else INITIAL_MAPPING[onset]
    )

    # Every initial variant combined with every final variant, initials outermost.
    return OrderedSet(
        onset_ipa + rime_ipa
        for onset_ipa in onset_variants
        for rime_ipa in rime_variants
    )
355