2 from typing
import Dict
, Generator
, List
, Optional
, Tuple
4 from ordered_set
import OrderedSet
5 from pypinyin
.contrib
.tone_convert
import to_finals
, to_initials
, to_normal
, to_tone3
8 # https://en.wikipedia.org/wiki/Help:IPA/Mandarin
9 # https://en.wikipedia.org/wiki/Standard_Chinese_phonology
10 # https://en.wikipedia.org/wiki/Pinyin
11 # https://de.wikipedia.org/wiki/Pinyin
12 # - Duanmu, San. 2007. The Phonology of Standard Chinese. 2nd ed. Oxford ;
13 # New York: Oxford University Press.
14 # - Lin, Yen-Hwei. 2007. The Sounds of Chinese. Cambridge, UK ;
15 # New York: Cambridge University Press.
18 INITIAL_MAPPING
: Dict
[str, List
[Tuple
[str, ...]]] = {
20 "c": [("tsʰ",)], # tsʰ
21 "ch": [("ʈʂʰ",)], # tʂʰ
25 "h": [("x",), ("h",)],
33 "r": [("ɻ",), ("ʐ",)],
40 # w and y only occur in non-strict initials
42 # "y": [("j",), ("ɥ",)],
# View over the pinyin initials that have an IPA mapping (the keys of INITIAL_MAPPING).
INITIALS = INITIAL_MAPPING.keys()
47 # Note: Syllabic consonants may also arise as a result of weak syllable reduction.
48 # Syllabic nasal consonants are also heard in certain interjections;
49 # pronunciations of such words include [m], [n], [ŋ], [hm], [hŋ].
50 SYLLABIC_CONSONANT_MAPPINGS
: Dict
[str, List
[Tuple
[str, ...]]] = {
# View over the pinyin spellings treated as syllabic consonants
# (the keys of SYLLABIC_CONSONANT_MAPPINGS).
SYLLABIC_CONSONANTS = SYLLABIC_CONSONANT_MAPPINGS.keys()
60 INTERJECTION_MAPPINGS
: Dict
[str, List
[Tuple
[str, ...]]] = {
61 "io": [("j", "ɔ0")], # /
63 # Note: In a small number of independent words or morphemes pronounced [ɚ] or [aɚ̯],
64 # written in pinyin as er (with some tone), such as 二 èr "two", 耳 ěr "ear",
65 # and 儿 (traditional 兒) ér "son". Similar to the sound in bar in English.
66 # Can also be pronounced [ɚ] depending on the speaker.
67 # Duanmu (2007, p. 40)
68 "er": [("ɚ0",), ("aɚ̯0",)], # ɑɻ
# View over the pinyin spellings treated as interjections
# (the keys of INTERJECTION_MAPPINGS).
INTERJECTIONS = INTERJECTION_MAPPINGS.keys()
75 # Duanmu (2000, p. 37) and Lin (2007, p. 68f)
# Diphthongs from Duanmu (2007, p. 40): au, əu, əi, ai
77 # Diphthongs from Lin (2007, p. 68f): au̯, ou̯, ei̯, ai̯
78 FINAL_MAPPING
: Dict
[str, List
[Tuple
[str, ...]]] = {
80 "ai": [("ai̯0",)], # aɪ̯
81 "an": [("a0", "n")], # an
82 "ang": [("a0", "ŋ")], # ɑŋ
83 "ao": [("au̯0",)], # ɑʊ̯
85 "ei": [("ei̯0",)], # eɪ̯
86 "en": [("ə0", "n")], # ən
87 "eng": [("ə0", "ŋ")], # əŋ
89 "ia": [("j", "a0")], # i̯ɑ
90 "ian": [("j", "ɛ0", "n")], # iɛn
91 "iang": [("j", "a0", "ŋ")], # i̯ɑŋ
92 "iao": [("j", "au̯0")], # i̯ɑʊ̯
93 "ie": [("j", "e0")], # iɛ
94 "in": [("i0", "n")], # in
95 # "iu": [("j", "ou̯0")], # i̯ɤʊ̯
96 "iou": [("j", "ou̯0")], # i̯ɤʊ̯
97 "ing": [("i0", "ŋ")], # iŋ
98 "iong": [("j", "ʊ0", "ŋ")], # i̯ʊŋ
99 "ong": [("ʊ0", "ŋ")], # ʊŋ
100 "ou": [("ou̯0",)], # ɤʊ̯
102 # "ui": [("w", "ei̯0")], # u̯eɪ̯
103 "uei": [("w", "ei̯0")], # u̯eɪ̯
104 "ua": [("w", "a0")], # u̯ɑ
105 "uai": [("w", "ai̯0")], # u̯aɪ̯
106 "uan": [("w", "a0", "n")], # u̯an
107 # "un": [("w", "ə0", "n")], # u̯ən
108 "uen": [("w", "ə0", "n")], # u̯ən
109 "uang": [("w", "a0", "ŋ")], # u̯ɑŋ
110 "ueng": [("w", "ə0", "ŋ")], # /
111 # see: https://en.wikipedia.org/wiki/Pinyin "Finals beginning with u- (w-)"
112 "uo": [("w", "o0")], # u̯ɔ
113 # Normally uo is written as o after b, p, m, or f
114 # other cases (lo, yo) also considered as [wo]
115 "o": [("w", "o0")], # u̯ɔ
116 # Note: Normally ü is written as u after j, q, or x
117 # (the /u/ phoneme never occurs in these positions)
118 # pypinyin returns u as ü after (y), j, q, or x
119 "ü": [("y0",)], # u after y, j, q, or x ; # y
120 "üe": [("ɥ", "e0")], # ue after y, j, q, or x ; # y̯œ
121 "üan": [("ɥ", "ɛ0", "n")], # uan after y, j, q, or x ; # /
122 "ün": [("y0", "n")], # un after y, j, q, or x
# View over the pinyin finals that have an IPA mapping (the keys of FINAL_MAPPING).
FINALS = FINAL_MAPPING.keys()
127 # Note: [ɻ̩ ~ ʐ̩], an apical retroflex voiced continuant
128 # in zhi, chi, shi, ri ([ʈʂɻ̩ ʈʂʰɻ̩ ʂɻ̩ ɻɻ̩]).
129 # Duanmu (2007, p. 34f)
# Final "i" as realised after the initials zh, ch, sh and r.
# Each value is a list of alternative transcriptions; the trailing "0" in a
# phoneme is the placeholder that is later replaced by the tone mark.
FINAL_MAPPING_AFTER_ZH_CH_SH_R: Dict[str, List[Tuple[str, ...]]] = {
    "i": [("ɻ̩0",), ("ʐ̩0",)],  # ʅ
}
135 # Note: [ɹ̩ ~ z̩], a laminal denti-alveolar voiced continuant,
136 # in zi, ci, si ([tsɹ̩ tsʰɹ̩ sɹ̩]);
137 # Duanmu (2007, p. 34f)
# Final "i" as realised after the initials z, c and s.
# Each value is a list of alternative transcriptions; the trailing "0" in a
# phoneme is the placeholder that is later replaced by the tone mark.
FINAL_MAPPING_AFTER_Z_C_S: Dict[str, List[Tuple[str, ...]]] = {
    "i": [("ɹ̩0",), ("z̩0",)],  # ɿ
}
143 # Note: Normally ü is written as u after j, q, or x
144 # (the /u/ phoneme never occurs in these positions)
145 # but in pypinyin this is not the case, e.g. it returns ü for ju
146 # FINAL_MAPPING_AFTER_J_Q_X = {
147 # "u": FINAL_MAPPING["ü"],
148 # "ue": FINAL_MAPPING["üe"],
149 # "uan": FINAL_MAPPING["üan"],
150 # "un": FINAL_MAPPING["ün"],
153 # Note: uo is written as o after b, p, m, or f.
154 # FINAL_MAPPING_AFTER_B_P_M_F = {
155 # "o": FINAL_MAPPING["uo"]
def get_tone(pinyin: str) -> int:
    """
    Determine the tone number of a Pinyin syllable.

    The syllable is converted with ``to_tone3`` (neutral tone written as five,
    ``v`` converted to ``ü``) and the last character of the result is
    interpreted as the tone number.

    Parameters
    ----------
    pinyin : str
        The Pinyin syllable, with or without a tone marker.

    Returns
    -------
    int
        The tone number; always a key of ``TONE_MAPPING``.

    Raises
    ------
    ValueError
        If the conversion yields an empty string, if the last character is not
        a number, or if the number is not a key of ``TONE_MAPPING``.
    """
    pinyin_tone3 = to_tone3(pinyin, neutral_tone_with_five=True, v_to_u=True)
    if not pinyin_tone3:
        raise ValueError("Parameter 'pinyin': Tone couldn't be detected!")

    tone_nr_str = pinyin_tone3[-1]

    try:
        tone_nr = int(tone_nr_str)
    except ValueError as error:
        raise ValueError(
            f"Parameter 'pinyin': Tone '{tone_nr_str}' couldn't be detected!"
        ) from error

    # Note: in case to_tone3 returns other values than expected
    if tone_nr not in TONE_MAPPING:
        raise ValueError(f"Parameter 'pinyin': Tone '{tone_nr_str}' couldn't be detected!")

    return tone_nr
def get_syllabic_consonant(normal_pinyin: str) -> Optional[str]:
    """
    Return the syllable if it is a known syllabic consonant.

    Parameters
    ----------
    normal_pinyin : str
        The Pinyin syllable without tone marker.

    Returns
    -------
    Optional[str]
        ``normal_pinyin`` itself when it is contained in ``SYLLABIC_CONSONANTS``
        (so it can be used as key for ``SYLLABIC_CONSONANT_MAPPINGS``),
        otherwise ``None``.
    """
    if normal_pinyin in SYLLABIC_CONSONANTS:
        return normal_pinyin
    return None
def get_interjection(normal_pinyin: str) -> Optional[str]:
    """
    Return the syllable if it is a known interjection.

    Parameters
    ----------
    normal_pinyin : str
        The Pinyin syllable without tone marker.

    Returns
    -------
    Optional[str]
        ``normal_pinyin`` itself when it is contained in ``INTERJECTIONS``
        (i.e., it is a key of ``INTERJECTION_MAPPINGS``), otherwise ``None``.
    """
    if normal_pinyin in INTERJECTIONS:
        return normal_pinyin
    return None
def get_initials(normal_pinyin: str) -> Optional[str]:
    """
    Extract the initial of a toneless Pinyin syllable.

    Parameters
    ----------
    normal_pinyin : str
        The Pinyin syllable without tone marker.

    Returns
    -------
    Optional[str]
        The initial as returned by ``to_initials`` in strict mode, or ``None``
        if the syllable is a syllabic consonant, an interjection, or has no
        initial.

    Raises
    ------
    ValueError
        If the detected initial is not a key of ``INITIAL_MAPPING``.
    """
    # Syllabic consonants and interjections are transcribed as a whole and
    # therefore have no separate initial.
    if normal_pinyin in SYLLABIC_CONSONANTS:
        return None

    if normal_pinyin in INTERJECTIONS:
        return None

    pinyin_initial = to_initials(normal_pinyin, strict=True)

    if pinyin_initial == "":
        return None

    # in case pypinyin returns unexpected result
    if pinyin_initial not in INITIAL_MAPPING:
        raise ValueError(
            f"Parameter 'normal_pinyin': Initial '{pinyin_initial}' couldn't be detected!"
        )

    return pinyin_initial
def get_finals(normal_pinyin: str) -> Optional[str]:
    """
    Extract the final of a toneless Pinyin syllable.

    Parameters
    ----------
    normal_pinyin : str
        The Pinyin syllable without tone marker.

    Returns
    -------
    Optional[str]
        The final as returned by ``to_finals`` in strict mode (``v`` converted
        to ``ü``), or ``None`` if the syllable is a syllabic consonant or an
        interjection.

    Raises
    ------
    ValueError
        If no final is detected or the detected final is not a key of
        ``FINAL_MAPPING``.
    """
    # Syllabic consonants and interjections are transcribed as a whole and
    # therefore have no separate final.
    if normal_pinyin in SYLLABIC_CONSONANTS:
        return None

    if normal_pinyin in INTERJECTIONS:
        return None

    pinyin_final = to_finals(normal_pinyin, strict=True, v_to_u=True)

    if pinyin_final == "":
        raise ValueError("Parameter 'normal_pinyin': Final couldn't be detected!")

    # in case pypinyin returns unexpected result
    if pinyin_final not in FINAL_MAPPING:
        raise ValueError(
            f"Parameter 'normal_pinyin': Final '{pinyin_final}' couldn't be detected!"
        )

    return pinyin_final
def apply_tone(
    variants: List[Tuple[str, ...]], tone: int
) -> Generator[Tuple[str, ...], None, None]:
    """
    Insert the IPA tone mark into every transcription variant.

    Parameters
    ----------
    variants : List[Tuple[str, ...]]
        Alternative transcriptions; each one is a tuple of phonemes in which
        the character "0" marks the position of the tone.
    tone : int
        The tone number; must be a key of ``TONE_MAPPING``.

    Yields
    ------
    Tuple[str, ...]
        One tuple per variant, with every "0" in every phoneme replaced by the
        IPA tone mark looked up in ``TONE_MAPPING``.
    """
    tone_ipa = TONE_MAPPING[tone]
    for variant in variants:
        yield tuple(phoneme.replace("0", tone_ipa) for phoneme in variant)
def pinyin_to_ipa(pinyin: str) -> OrderedSet[Tuple[str, ...]]:
    """
    Convert a Pinyin syllable into its corresponding
    International Phonetic Alphabet (IPA) transcription.

    Parameters
    ----------
    pinyin : str
        A syllable representing the Pinyin input to be transcribed into IPA. The input
        can include tone markers (e.g., "zhong", "zhōng", "zho1ng", "zhong1").

    Returns
    -------
    OrderedSet[Tuple[str, ...]]
        A set of tuples, where each tuple represents a possible IPA transcription of
        the input Pinyin. Each tuple contains phonemes as strings.

    Raises
    ------
    ValueError
        If the tone cannot be detected from the input or if the initial or final part
        of the Pinyin cannot be mapped to IPA.

    Notes
    -----
    - The function supports edge cases like interjections and syllabic consonants,
      which are not strictly part of the initial-final structure.
    - Tone markers are applied to the vowel or syllabic consonant of the syllable.
    - Relies on the `pypinyin` library for splitting Pinyin into initials and finals.

    Examples
    --------
    Convert a Pinyin string with a tone:

    >>> result = pinyin_to_ipa("zhong4")
    >>> print(result)
    OrderedSet([('ʈʂ', 'ʊ˥˩', 'ŋ')])

    Handle a syllabic consonant:

    >>> result = pinyin_to_ipa("ng")

    Process an interjection:

    >>> result = pinyin_to_ipa("er")
    >>> print(result)
    OrderedSet([('ɚ',), ('aɚ̯',)])

    Pinyin with multiple possible IPA transcriptions:

    >>> result = pinyin_to_ipa("zhi")
    >>> print(result)
    OrderedSet([('ʈʂ', 'ɻ̩'), ('ʈʂ', 'ʐ̩')])
    """
    tone_nr = get_tone(pinyin)
    pinyin_normal = to_normal(pinyin)

    # Interjections are transcribed as a whole, without initial/final split.
    interjection = get_interjection(pinyin_normal)
    if interjection is not None:
        interjection_ipa_mapping = INTERJECTION_MAPPINGS[pinyin_normal]
        interjection_ipa = OrderedSet(apply_tone(interjection_ipa_mapping, tone_nr))
        return interjection_ipa

    # Syllabic consonants are transcribed as a whole as well.
    syllabic_consonant = get_syllabic_consonant(pinyin_normal)
    if syllabic_consonant is not None:
        syllabic_consonant_ipa_mapping = SYLLABIC_CONSONANT_MAPPINGS[syllabic_consonant]
        syllabic_consonant_ipa = OrderedSet(
            apply_tone(syllabic_consonant_ipa_mapping, tone_nr)
        )
        return syllabic_consonant_ipa

    # Each element of `parts` holds the alternative transcriptions of one
    # syllable part (optional initial, then final).
    parts: List[List[Tuple[str, ...]]] = []

    pinyin_initial = get_initials(pinyin_normal)
    pinyin_final = get_finals(pinyin_normal)
    # get_finals returns None only for interjections and syllabic consonants,
    # which were already handled above.
    assert pinyin_final is not None

    if pinyin_initial is not None:
        initial_phonemes = INITIAL_MAPPING[pinyin_initial]
        parts.append(initial_phonemes)

    # Some finals are realised differently after specific initials; those
    # context-dependent tables take precedence over the general one.
    final_phonemes: List[Tuple[str, ...]]
    if (
        pinyin_initial in {"zh", "ch", "sh", "r"}
        and pinyin_final in FINAL_MAPPING_AFTER_ZH_CH_SH_R
    ):
        final_phonemes = FINAL_MAPPING_AFTER_ZH_CH_SH_R[pinyin_final]
    elif pinyin_initial in {"z", "c", "s"} and pinyin_final in FINAL_MAPPING_AFTER_Z_C_S:
        final_phonemes = FINAL_MAPPING_AFTER_Z_C_S[pinyin_final]
    else:
        final_phonemes = FINAL_MAPPING[pinyin_final]

    # Only the final carries the tone placeholder, so the tone is applied here.
    final_phonemes = list(apply_tone(final_phonemes, tone_nr))
    parts.append(final_phonemes)

    assert len(parts) >= 1

    # Combine every initial variant with every final variant and flatten each
    # combination into a single tuple of phonemes.
    all_syllable_combinations = OrderedSet(
        tuple(itertools.chain.from_iterable(combination))
        for combination in itertools.product(*parts)
    )

    return all_syllable_combinations