import itertools from typing import Dict, Generator, List, Optional, Tuple from ordered_set import OrderedSet from pypinyin.contrib.tone_convert import to_finals, to_initials, to_normal, to_tone3 # References: # https://en.wikipedia.org/wiki/Help:IPA/Mandarin # https://en.wikipedia.org/wiki/Standard_Chinese_phonology # https://en.wikipedia.org/wiki/Pinyin # https://de.wikipedia.org/wiki/Pinyin # - Duanmu, San. 2007. The Phonology of Standard Chinese. 2nd ed. Oxford ; # New York: Oxford University Press. # - Lin, Yen-Hwei. 2007. The Sounds of Chinese. Cambridge, UK ; # New York: Cambridge University Press. INITIAL_MAPPING: Dict[str, List[Tuple[str, ...]]] = { "b": [("p",)], "c": [("tsʰ",)], # tsʰ "ch": [("ʈʂʰ",)], # tʂʰ "d": [("t",)], "f": [("f",)], "g": [("k",)], "h": [("x",), ("h",)], "j": [("tɕ",)], "k": [("kʰ",)], "l": [("l",)], "m": [("m",)], "n": [("n",)], "p": [("pʰ",)], "q": [("tɕʰ",)], "r": [("ɻ",), ("ʐ",)], "s": [("s",)], "sh": [("ʂ",)], "t": [("tʰ",)], "x": [("ɕ",)], "z": [("ts",)], "zh": [("ʈʂ",)], # tʂ # w and y only occur in non-strict initials # "w": [("w",)], # "y": [("j",), ("ɥ",)], } INITIALS = INITIAL_MAPPING.keys() # Note: Syllabic consonants may also arise as a result of weak syllable reduction. # Syllabic nasal consonants are also heard in certain interjections; # pronunciations of such words include [m], [n], [ŋ], [hm], [hŋ]. SYLLABIC_CONSONANT_MAPPINGS: Dict[str, List[Tuple[str, ...]]] = { "hm": [("h", "m0")], "hng": [("h", "ŋ0")], "m": [("m0",)], "n": [("n0",)], "ng": [("ŋ0",)], } SYLLABIC_CONSONANTS = SYLLABIC_CONSONANT_MAPPINGS.keys() INTERJECTION_MAPPINGS: Dict[str, List[Tuple[str, ...]]] = { "io": [("j", "ɔ0")], # / "ê": [("ɛ0",)], # / # Note: In a small number of independent words or morphemes pronounced [ɚ] or [aɚ̯], # written in pinyin as er (with some tone), such as 二 èr "two", 耳 ěr "ear", # and 儿 (traditional 兒) ér "son". Similar to the sound in bar in English. # Can also be pronounced [ɚ] depending on the speaker. # Duanmu (2007, p. 40) "er": [("ɚ0",), ("aɚ̯0",)], # ɑɻ "o": [("ɔ0",)], # ɔ } INTERJECTIONS = INTERJECTION_MAPPINGS.keys() # Duanmu (2000, p. 37) and Lin (2007, p. 68f) # Diphtongs from Duanmu (2007, p. 40): au, əu, əi, ai # Diphthongs from Lin (2007, p. 68f): au̯, ou̯, ei̯, ai̯ FINAL_MAPPING: Dict[str, List[Tuple[str, ...]]] = { "a": [("a0",)], # / "ai": [("ai̯0",)], # aɪ̯ "an": [("a0", "n")], # an "ang": [("a0", "ŋ")], # ɑŋ "ao": [("au̯0",)], # ɑʊ̯ "e": [("ɤ0",)], # ɯ̯ʌ "ei": [("ei̯0",)], # eɪ̯ "en": [("ə0", "n")], # ən "eng": [("ə0", "ŋ")], # əŋ "i": [("i0",)], # i "ia": [("j", "a0")], # i̯ɑ "ian": [("j", "ɛ0", "n")], # iɛn "iang": [("j", "a0", "ŋ")], # i̯ɑŋ "iao": [("j", "au̯0")], # i̯ɑʊ̯ "ie": [("j", "e0")], # iɛ "in": [("i0", "n")], # in # "iu": [("j", "ou̯0")], # i̯ɤʊ̯ "iou": [("j", "ou̯0")], # i̯ɤʊ̯ "ing": [("i0", "ŋ")], # iŋ "iong": [("j", "ʊ0", "ŋ")], # i̯ʊŋ "ong": [("ʊ0", "ŋ")], # ʊŋ "ou": [("ou̯0",)], # ɤʊ̯ "u": [("u0",)], # u # "ui": [("w", "ei̯0")], # u̯eɪ̯ "uei": [("w", "ei̯0")], # u̯eɪ̯ "ua": [("w", "a0")], # u̯ɑ "uai": [("w", "ai̯0")], # u̯aɪ̯ "uan": [("w", "a0", "n")], # u̯an # "un": [("w", "ə0", "n")], # u̯ən "uen": [("w", "ə0", "n")], # u̯ən "uang": [("w", "a0", "ŋ")], # u̯ɑŋ "ueng": [("w", "ə0", "ŋ")], # / # see: https://en.wikipedia.org/wiki/Pinyin "Finals beginning with u- (w-)" "uo": [("w", "o0")], # u̯ɔ # Normally uo is written as o after b, p, m, or f # other cases (lo, yo) also considered as [wo] "o": [("w", "o0")], # u̯ɔ # Note: Normally ü is written as u after j, q, or x # (the /u/ phoneme never occurs in these positions) # pypinyin returns u as ü after (y), j, q, or x "ü": [("y0",)], # u after y, j, q, or x ; # y "üe": [("ɥ", "e0")], # ue after y, j, q, or x ; # y̯œ "üan": [("ɥ", "ɛ0", "n")], # uan after y, j, q, or x ; # / "ün": [("y0", "n")], # un after y, j, q, or x } FINALS = FINAL_MAPPING.keys() # Note: [ɻ̩ ~ ʐ̩], an apical retroflex voiced continuant # in zhi, chi, shi, ri ([ʈʂɻ̩ ʈʂʰɻ̩ ʂɻ̩ ɻɻ̩]). # Duanmu (2007, p. 34f) # Lin (2007, p. 72) FINAL_MAPPING_AFTER_ZH_CH_SH_R: Dict[str, List[Tuple[str, ...]]] = { "i": [("ɻ̩0",), ("ʐ̩0",)], # ʅ } # Note: [ɹ̩ ~ z̩], a laminal denti-alveolar voiced continuant, # in zi, ci, si ([tsɹ̩ tsʰɹ̩ sɹ̩]); # Duanmu (2007, p. 34f) # Lin (2007, p. 72) FINAL_MAPPING_AFTER_Z_C_S: Dict[str, List[Tuple[str, ...]]] = { "i": [("ɹ̩0",), ("z̩0",)], # ɿ } # Note: Normally ü is written as u after j, q, or x # (the /u/ phoneme never occurs in these positions) # but in pypinyin this is not the case, e.g. it returns ü for ju # FINAL_MAPPING_AFTER_J_Q_X = { # "u": FINAL_MAPPING["ü"], # "ue": FINAL_MAPPING["üe"], # "uan": FINAL_MAPPING["üan"], # "un": FINAL_MAPPING["ün"], # } # Note: uo is written as o after b, p, m, or f. # FINAL_MAPPING_AFTER_B_P_M_F = { # "o": FINAL_MAPPING["uo"] # } TONE_MAPPING = { 1: "˥", # ā 2: "˧˥", # á 3: "˧˩˧", # ǎ 4: "˥˩", # à 5: "", # a } def get_tone(pinyin: str) -> int: pinyin_tone3 = to_tone3(pinyin, neutral_tone_with_five=True, v_to_u=True) if len(pinyin_tone3) == 0: raise ValueError("Parameter 'pinyin': Tone couldn't be detected!") tone_nr_str = pinyin_tone3[-1] try: tone_nr = int(tone_nr_str) except ValueError as error: raise ValueError( f"Parameter 'pinyin': Tone '{tone_nr_str}' couldn't be detected!" ) from error # Note: in case to_tone3 returns other values than expected if tone_nr not in TONE_MAPPING: raise ValueError(f"Parameter 'pinyin': Tone '{tone_nr_str}' couldn't be detected!") return tone_nr def get_syllabic_consonant(normal_pinyin: str) -> Optional[str]: if normal_pinyin in SYLLABIC_CONSONANTS: return normal_pinyin return None def get_interjection(normal_pinyin: str) -> Optional[str]: if normal_pinyin in INTERJECTIONS: return normal_pinyin return None def get_initials(normal_pinyin: str) -> Optional[str]: if normal_pinyin in SYLLABIC_CONSONANTS: return None if normal_pinyin in INTERJECTIONS: return None pinyin_initial = to_initials(normal_pinyin, strict=True) if pinyin_initial == "": return None # in case pypinyin returns unexpected result if pinyin_initial not in INITIAL_MAPPING: raise ValueError( f"Parameter 'normal_pinyin': Initial '{pinyin_initial}' couldn't be detected!" ) return pinyin_initial def get_finals(normal_pinyin: str) -> Optional[str]: if normal_pinyin in SYLLABIC_CONSONANTS: return None if normal_pinyin in INTERJECTIONS: return None pinyin_final = to_finals(normal_pinyin, strict=True, v_to_u=True) if pinyin_final == "": raise ValueError("Parameter 'normal_pinyin': Final couldn't be detected!") # in case pypinyin returns unexpected result if pinyin_final not in FINAL_MAPPING: raise ValueError( f"Parameter 'normal_pinyin': Final '{pinyin_final}' couldn't be detected!" ) return pinyin_final def apply_tone( variants: List[Tuple[str, ...]], tone: int ) -> Generator[Tuple[str, ...], None, None]: tone_ipa = TONE_MAPPING[tone] yield from ( tuple(phoneme.replace("0", tone_ipa) for phoneme in variant) for variant in variants ) def pinyin_to_ipa(pinyin: str) -> OrderedSet[Tuple[str, ...]]: """ Convert a Pinyin syllable into its corresponding International Phonetic Alphabet (IPA) transcription. Parameters ---------- pinyin : str A syllable representing the Pinyin input to be transcribed into IPA. The input can include tone markers (e.g., "zhong", "zhōng", "zho1ng", "zhong1"). Returns ------- OrderedSet[Tuple[str, ...]] A set of tuples, where each tuple represents a possible IPA transcription of the input Pinyin. Each tuple contains phonemes as strings. Raises ------ ValueError If the tone cannot be detected from the input or if the initial or final part of the Pinyin cannot be mapped to IPA. Notes ----- - The function supports edge cases like interjections and syllabic consonants, which are not strictly part of the initial-final structure. - Tone markers are applied to the vowel or syllabic consonant of the syllable. - Relies on the `pypinyin` library for splitting Pinyin into initials and finals. Examples -------- Convert a Pinyin string with a tone: >>> result = pinyin_to_ipa("zhong4") >>> print(result) OrderedSet([('ʈʂ', 'ʊ˥˩', 'ŋ')]) Handle a syllabic consonant: >>> result = pinyin_to_ipa("ng") >>> print(result) OrderedSet([('ŋ',)]) Process an interjection: >>> result = pinyin_to_ipa("er") >>> print(result) OrderedSet([('ɚ',), ('aɚ̯',)]) Pinyin with multiple possible IPA transcriptions: >>> result = pinyin_to_ipa("zhi") >>> print(result) OrderedSet([('ʈʂ', 'ɻ̩'), ('ʈʂ', 'ʐ̩')]) """ tone_nr = get_tone(pinyin) pinyin_normal = to_normal(pinyin) interjection = get_interjection(pinyin_normal) if interjection is not None: interjection_ipa_mapping = INTERJECTION_MAPPINGS[pinyin_normal] interjection_ipa = OrderedSet(apply_tone(interjection_ipa_mapping, tone_nr)) return interjection_ipa syllabic_consonant = get_syllabic_consonant(pinyin_normal) if syllabic_consonant is not None: syllabic_consonant_ipa_mapping = SYLLABIC_CONSONANT_MAPPINGS[syllabic_consonant] syllabic_consonant_ipa = OrderedSet( apply_tone(syllabic_consonant_ipa_mapping, tone_nr) ) return syllabic_consonant_ipa parts = [] pinyin_initial = get_initials(pinyin_normal) pinyin_final = get_finals(pinyin_normal) assert pinyin_final is not None if pinyin_initial is not None: initial_phonemes = INITIAL_MAPPING[pinyin_initial] parts.append(initial_phonemes) final_phonemes: List[Tuple[str, ...]] if ( pinyin_initial in {"zh", "ch", "sh", "r"} and pinyin_final in FINAL_MAPPING_AFTER_ZH_CH_SH_R ): final_phonemes = FINAL_MAPPING_AFTER_ZH_CH_SH_R[pinyin_final] elif pinyin_initial in {"z", "c", "s"} and pinyin_final in FINAL_MAPPING_AFTER_Z_C_S: final_phonemes = FINAL_MAPPING_AFTER_Z_C_S[pinyin_final] else: final_phonemes = FINAL_MAPPING[pinyin_final] final_phonemes = list(apply_tone(final_phonemes, tone_nr)) parts.append(final_phonemes) assert len(parts) >= 1 all_syllable_combinations = OrderedSet( tuple(itertools.chain.from_iterable(combination)) for combination in itertools.product(*parts) ) return all_syllable_combinations