# Source code for learn_to_pronounce.phonetics.arpabet_convertor

"""
Copyright 2022 Balacoon

Convertor from ARPABet to IPA or X-SAMPA
TODO: Move conversion between phoneme sets to pronunciation_generation package
"""

from typing import List


class ARPAbetConvertor:
    """
    Converts ARPAbet (phonemeset of CMUDict) to more universal IPA or X-SAMPA.
    X-SAMPA is an ASCII-friendly version of IPA (International Phonetic Alphabet).
    ARPAbet: https://en.wikipedia.org/wiki/ARPABET
    IPA: https://en.wikipedia.org/wiki/International_Phonetic_Alphabet
    X-SAMPA: https://en.wikipedia.org/wiki/X-SAMPA
    Conversion notes: https://github.com/menelik3/cmudict-ipa, https://github.com/Epicalert/xsampadict.
    One modification to IPA/XSAMPA notation: stress marks are put just before vowel, not before syllable.
    That helps to treat stressed/unstressed vowels as separate phonemes for g2p training/forced alignment purposes

    Notes on the tables used here:

    * Vowel length is written with a plain ASCII colon (e.g. ``"i:"``) and is carried over
      unchanged from IPA to X-SAMPA.
    * Diphthongs and affricates expand into two output symbols (e.g. ``AW`` -> ``a`` + ``ʊ̯``),
      so the output list may be longer than the input list.
    * In the IPA -> X-SAMPA table, non-syllabic ``ɪ̯``/``ʊ̯`` map to the same symbols as plain
      ``ɪ``/``ʊ``, and both ``ɚ`` and ``ɝ`` map to ``@```.
    """

    def __init__(self):
        """Build the ARPAbet -> IPA and IPA -> X-SAMPA lookup tables."""
        # https://github.com/menelik3/cmudict-ipa/issues/2
        # https://github.com/danmysak/multilingual-ipa-data/blob/main/cmudict/collection/data/phonemes
        # exceptions from https://github.com/menelik3/cmudict-ipa
        # Keys that include a stress digit (AH0/AH1/AH2, ER0) are stress-specific
        # overrides; every other key is the stress-less base phoneme.
        self._arpa2ipa = {
            "AA": ["ɑ:"],
            "AE": ["æ"],
            "AH0": ["ə"],
            "AH2": ["ə"],
            "AH1": ["ʌ"],
            "AO": ["ɔ"],
            "AW": ["a", "ʊ̯"],
            "AY": ["a", "ɪ̯"],
            "B": ["b"],
            "CH": ["t", "ʃ"],
            "D": ["d"],
            "DH": ["ð"],
            "EH": ["ɛ"],
            "ER0": ["ɚ"],
            "ER": ["ɝ:"],
            "EY": ["e", "ɪ̯"],
            "F": ["f"],
            "G": ["ɡ"],
            "HH": ["h"],
            "IH": ["ɪ"],
            "IY": ["i:"],
            "JH": ["d", "ʒ"],
            "K": ["k"],
            "L": ["l"],
            "M": ["m"],
            "N": ["n"],
            "NG": ["ŋ"],
            "OW": ["o", "ʊ̯"],
            "OY": ["ɔ", "ɪ̯"],
            "P": ["p"],
            "R": ["ɹ"],
            "S": ["s"],
            "SH": ["ʃ"],
            "T": ["t"],
            "TH": ["θ"],
            "UH": ["ʊ"],
            "UW": ["u:"],
            "V": ["v"],
            "W": ["w"],
            "Y": ["j"],
            "Z": ["z"],
            "ZH": ["ʒ"],
            "<eps>": ["<eps>"],
        }
        # ARPAbet stress digit -> IPA stress mark ("0" means unstressed, no mark)
        self._arpa2ipa_stress = {"0": "", "1": "ˈ", "2": "ˌ"}

        # https://en.wikipedia.org/wiki/X-SAMPA
        self._ipa2xsampa = {
            "ɑ": "A",
            "ɪ̯": "I",
            "æ": "{",
            "ʌ": "V",
            "ə": "@",
            "ɔ": "O",
            "a": "a",
            "ʊ̯": "U",
            "ɪ": "I",
            "b": "b",
            "t": "t",
            "ʃ": "S",
            "d": "d",
            "ð": "D",
            "ɛ": "E",
            "ɚ": "@`",
            "ɝ": "@`",
            "e": "e",
            "f": "f",
            "ɡ": "g",
            "h": "h",
            "i": "i",
            "ʒ": "Z",
            "k": "k",
            "l": "l",
            "m": "m",
            "n": "n",
            "ŋ": "N",
            "o": "o",
            "p": "p",
            "ɹ": "r\\",
            "s": "s",
            "θ": "T",
            "u": "u",
            "ʊ": "U",
            "v": "v",
            "w": "w",
            "j": "j",
            "z": "z",
            "<eps>": "<eps>",
        }
        # IPA stress mark -> X-SAMPA stress mark
        self._ipa2xsampa_stress = {"ˈ": '"', "ˌ": "%"}

    def arpa2ipa(self, arpa_phonemes: List[str]) -> List[str]:
        """
        Convert a sequence of ARPAbet phonemes to IPA symbols.

        Each input phoneme may end with a stress digit (0, 1 or 2). The stress is
        converted to the IPA stress mark and prepended to the first IPA symbol that
        the phoneme expands to.

        :param arpa_phonemes: ARPAbet phonemes, e.g. ``["HH", "AH0", "L", "OW1"]``
        :return: IPA symbols; may be longer than the input because some phonemes
            expand to two symbols
        :raises AssertionError: if a phoneme (including an empty string or one with
            an unsupported stress digit) is not in the ARPAbet to IPA mapping
        """
        ipa_phonemes = []
        for arpa in arpa_phonemes:
            # stress-specific entries (e.g. AH1 vs AH0) take precedence over the
            # stress-less base phoneme, so look up the full symbol first
            ipa = self._arpa2ipa.get(arpa)

            # split off the stress digit if any; slicing (not indexing) keeps an
            # empty string from raising IndexError, and only digits known to the
            # stress table are treated as stress
            ipa_stress = ""
            base = arpa
            stress_digit = arpa[-1:]
            if stress_digit in self._arpa2ipa_stress:
                ipa_stress = self._arpa2ipa_stress[stress_digit]
                base = arpa[:-1]

            # if ipa phoneme was not found, look up phoneme without stress (should be there)
            if ipa is None:
                ipa = self._arpa2ipa.get(base)
            if ipa is None:
                # raised explicitly instead of via `assert` so the check is not
                # stripped under `python -O`; the exception type is kept as-is
                raise AssertionError(
                    "Cant find [{}] in ARPAbet to IPA mapping".format(base)
                )

            # copy before modifying so the shared lookup table is never mutated
            ipa = list(ipa)
            ipa[0] = ipa_stress + ipa[0]
            ipa_phonemes.extend(ipa)
        return ipa_phonemes

    @staticmethod
    def is_ascii(s: str) -> bool:
        """Return True if every character of ``s`` is in the 7-bit ASCII range."""
        return all(ord(c) < 128 for c in s)

    def ipa2xsampa(self, ipa_phonemes: List[str]) -> List[str]:
        """
        Convert a sequence of IPA symbols (as produced by :meth:`arpa2ipa`) to X-SAMPA.

        A leading stress mark and a trailing ``:`` (prolongation) are converted /
        carried over around the converted base symbol.

        :param ipa_phonemes: IPA symbols, one phoneme per element
        :return: X-SAMPA symbols, one per input element
        :raises AssertionError: if a symbol (including an empty string or a bare
            stress mark) is not in the IPA to X-SAMPA mapping
        """
        xsampa_phonemes = []
        for ipa in ipa_phonemes:
            # convert stress if any; slicing keeps an empty string from raising
            xsampa_stress = ""
            stress_mark = ipa[:1]
            if stress_mark in self._ipa2xsampa_stress:
                xsampa_stress = self._ipa2xsampa_stress[stress_mark]
                ipa = ipa[1:]

            # transfer prolongation
            xsampa_prolongation = ""
            if ipa.endswith(":"):
                xsampa_prolongation = ":"
                ipa = ipa[:-1]

            # convert ipa to xsampa
            xsampa_base = self._ipa2xsampa.get(ipa)
            if xsampa_base is None:
                # raised explicitly instead of via `assert` so the check is not
                # stripped under `python -O`; the exception type is kept as-is
                raise AssertionError(
                    "Cant find [{}] in IPA to X-SAMPA mapping".format(ipa)
                )
            xsampa = xsampa_stress + xsampa_base + xsampa_prolongation

            # internal sanity check on the lookup tables, not input validation
            assert self.is_ascii(xsampa), "X-SAMPA phoneme is non-ASCII: {}".format(
                xsampa
            )
            xsampa_phonemes.append(xsampa)

        return xsampa_phonemes

    def arpa2xsampa(self, arpa_phonemes: List[str]) -> List[str]:
        """
        Convert ARPAbet phonemes to X-SAMPA by going through IPA.

        :param arpa_phonemes: ARPAbet phonemes, optionally with stress digits
        :return: X-SAMPA symbols
        :raises AssertionError: if any phoneme is missing from the mappings
        """
        ipa_phonemes = self.arpa2ipa(arpa_phonemes)
        return self.ipa2xsampa(ipa_phonemes)