Source code for espnet2.text.phoneme_tokenizer

import logging
import re
import warnings
from pathlib import Path
from typing import Iterable, List, Optional, Union

import g2p_en
import jamo
from packaging.version import parse as V
from typeguard import typechecked

from espnet2.text.abs_tokenizer import AbsTokenizer

g2p_choices = [
    None,
    "g2p_en",
    "g2p_en_no_space",
    "pyopenjtalk",
    "pyopenjtalk_kana",
    "pyopenjtalk_accent",
    "pyopenjtalk_accent_with_pause",
    "pyopenjtalk_prosody",
    "pypinyin_g2p",
    "pypinyin_g2p_phone",
    "pypinyin_g2p_phone_without_prosody",
    "espeak_ng_arabic",
    "espeak_ng_german",
    "espeak_ng_french",
    "espeak_ng_spanish",
    "espeak_ng_russian",
    "espeak_ng_greek",
    "espeak_ng_finnish",
    "espeak_ng_hungarian",
    "espeak_ng_dutch",
    "espeak_ng_english_us_vits",
    "espeak_ng_hindi",
    "espeak_ng_italian",
    "espeak_ng_ukrainian",
    "espeak_ng_polish",
    "g2pk",
    "g2pk_no_space",
    "g2pk_explicit_space",
    "korean_jaso",
    "korean_jaso_no_space",
    "g2p_is",
]


[docs]def split_by_space(text) -> List[str]: if " " in text: text = text.replace(" ", " <space> ") return [c.replace("<space>", " ") for c in text.split(" ")] else: return text.split(" ")
[docs]def pyopenjtalk_g2p(text) -> List[str]: import pyopenjtalk # phones is a str object separated by space phones = pyopenjtalk.g2p(text, kana=False) phones = phones.split(" ") return phones
def _extract_fullcontext_label(text): import pyopenjtalk if V(pyopenjtalk.__version__) >= V("0.3.0"): return pyopenjtalk.make_label(pyopenjtalk.run_frontend(text)) else: return pyopenjtalk.run_frontend(text)[1]
[docs]def pyopenjtalk_g2p_accent(text) -> List[str]: phones = [] for labels in _extract_fullcontext_label(text): p = re.findall(r"\-(.*?)\+.*?\/A:([0-9\-]+).*?\/F:.*?_([0-9]+)", labels) if len(p) == 1: phones += [p[0][0], p[0][2], p[0][1]] return phones
[docs]def pyopenjtalk_g2p_accent_with_pause(text) -> List[str]: phones = [] for labels in _extract_fullcontext_label(text): if labels.split("-")[1].split("+")[0] == "pau": phones += ["pau"] continue p = re.findall(r"\-(.*?)\+.*?\/A:([0-9\-]+).*?\/F:.*?_([0-9]+)", labels) if len(p) == 1: phones += [p[0][0], p[0][2], p[0][1]] return phones
[docs]def pyopenjtalk_g2p_kana(text) -> List[str]: import pyopenjtalk kanas = pyopenjtalk.g2p(text, kana=True) return list(kanas)
[docs]def pyopenjtalk_g2p_prosody(text: str, drop_unvoiced_vowels: bool = True) -> List[str]: """Extract phoneme + prosoody symbol sequence from input full-context labels. The algorithm is based on `Prosodic features control by symbols as input of sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks. Args: text (str): Input text. drop_unvoiced_vowels (bool): whether to drop unvoiced vowels. Returns: List[str]: List of phoneme + prosody symbols. Examples: >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody >>> pyopenjtalk_g2p_prosody("こんにちは。") ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$'] .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104 """ labels = _extract_fullcontext_label(text) N = len(labels) phones = [] for n in range(N): lab_curr = labels[n] # current phoneme p3 = re.search(r"\-(.*?)\+", lab_curr).group(1) # deal unvoiced vowels as normal vowels if drop_unvoiced_vowels and p3 in "AEIOU": p3 = p3.lower() # deal with sil at the beginning and the end of text if p3 == "sil": assert n == 0 or n == N - 1 if n == 0: phones.append("^") elif n == N - 1: # check question form or not e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr) if e3 == 0: phones.append("$") elif e3 == 1: phones.append("?") continue elif p3 == "pau": phones.append("_") continue else: phones.append(p3) # accent type and position info (forward or backward) a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr) a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr) a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr) # number of mora in accent phrase f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr) a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1]) # accent phrase border if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl": phones.append("#") # pitch falling elif a1 == 0 and a2_next == a2 + 1 and a2 != f1: phones.append("]") # pitch rising elif a2 == 1 and a2_next == 2: phones.append("[") return phones
def _numeric_feature_by_regex(regex, s): match = re.search(regex, s) if match is None: return -50 return int(match.group(1))
[docs]def pypinyin_g2p(text) -> List[str]: from pypinyin import Style, pinyin phones = [phone[0] for phone in pinyin(text, style=Style.TONE3)] return phones
[docs]def pypinyin_g2p_phone(text) -> List[str]: from pypinyin import Style, pinyin from pypinyin.style._utils import get_finals, get_initials phones = [ p for phone in pinyin(text, style=Style.TONE3) for p in [ get_initials(phone[0], strict=True), ( get_finals(phone[0][:-1], strict=True) + phone[0][-1] if phone[0][-1].isdigit() else ( get_finals(phone[0], strict=True) if phone[0][-1].isalnum() else phone[0] ) ), ] # Remove the case of individual tones as a phoneme if len(p) != 0 and not p.isdigit() ] return phones
[docs]def pypinyin_g2p_phone_without_prosody(text) -> List[str]: from pypinyin import Style, pinyin from pypinyin.style._utils import get_finals, get_initials phones = [] for phone in pinyin(text, style=Style.NORMAL, strict=False): initial = get_initials(phone[0], strict=False) final = get_finals(phone[0], strict=False) if len(initial) != 0: if initial in ["x", "y", "j", "q"]: if final == "un": final = "vn" elif final == "uan": final = "van" elif final == "u": final = "v" if final == "ue": final = "ve" phones.append(initial + "_" + final) else: phones.append(final) return phones
[docs]class G2p_en: """On behalf of g2p_en.G2p. g2p_en.G2p isn't pickalable and it can't be copied to the other processes via multiprocessing module. As a workaround, g2p_en.G2p is instantiated upon calling this class. """ def __init__(self, no_space: bool = False): self.no_space = no_space self.g2p = None def __call__(self, text) -> List[str]: if self.g2p is None: self.g2p = g2p_en.G2p() phones = self.g2p(text) if self.no_space: # remove space which represents word serapater phones = list(filter(lambda s: s != " ", phones)) return phones
[docs]class G2pk: """On behalf of g2pk.G2p. g2pk.G2p isn't pickalable and it can't be copied to the other processes via multiprocessing module. As a workaround, g2pk.G2p is instantiated upon calling this class. """ def __init__( self, descritive=False, group_vowels=False, to_syl=False, no_space=False, explicit_space=False, space_symbol="<space>", ): self.descritive = descritive self.group_vowels = group_vowels self.to_syl = to_syl self.no_space = no_space self.explicit_space = explicit_space self.space_symbol = space_symbol self.g2p = None def __call__(self, text) -> List[str]: if self.g2p is None: import g2pk self.g2p = g2pk.G2p() phones = list( self.g2p( text, descriptive=self.descritive, group_vowels=self.group_vowels, to_syl=self.to_syl, ) ) if self.no_space: # remove space which represents word serapater phones = list(filter(lambda s: s != " ", phones)) if self.explicit_space: # replace space as explicit space symbol phones = list(map(lambda s: s if s != " " else self.space_symbol, phones)) return phones
[docs]class Jaso: PUNC = "!'(),-.:;?" SPACE = " " JAMO_LEADS = "".join([chr(_) for _ in range(0x1100, 0x1113)]) JAMO_VOWELS = "".join([chr(_) for _ in range(0x1161, 0x1176)]) JAMO_TAILS = "".join([chr(_) for _ in range(0x11A8, 0x11C3)]) VALID_CHARS = JAMO_LEADS + JAMO_VOWELS + JAMO_TAILS + PUNC + SPACE def __init__(self, space_symbol=" ", no_space=False): self.space_symbol = space_symbol self.no_space = no_space def _text_to_jaso(self, line: str) -> List[str]: jasos = list(jamo.hangul_to_jamo(line)) return jasos def _remove_non_korean_characters(self, tokens): new_tokens = [token for token in tokens if token in self.VALID_CHARS] return new_tokens def __call__(self, text) -> List[str]: graphemes = [x for x in self._text_to_jaso(text)] graphemes = self._remove_non_korean_characters(graphemes) if self.no_space: graphemes = list(filter(lambda s: s != " ", graphemes)) else: graphemes = [x if x != " " else self.space_symbol for x in graphemes] return graphemes
[docs]class Phonemizer: """Phonemizer module for various languages. This is wrapper module of https://github.com/bootphon/phonemizer. You can define various g2p modules by specifying options for phonemizer. See available options: https://github.com/bootphon/phonemizer/blob/master/phonemizer/phonemize.py#L32 """ def __init__( self, backend, word_separator: Optional[str] = None, syllable_separator: Optional[str] = None, phone_separator: Optional[str] = " ", strip=False, split_by_single_token: bool = False, **phonemizer_kwargs, ): # delayed import from phonemizer.backend import BACKENDS from phonemizer.separator import Separator self.separator = Separator( word=word_separator, syllable=syllable_separator, phone=phone_separator, ) # define logger to suppress the warning in phonemizer logger = logging.getLogger("phonemizer") logger.setLevel(logging.ERROR) self.phonemizer = BACKENDS[backend]( **phonemizer_kwargs, logger=logger, ) self.strip = strip self.split_by_single_token = split_by_single_token def __call__(self, text) -> List[str]: tokens = self.phonemizer.phonemize( [text], separator=self.separator, strip=self.strip, njobs=1, )[0] if not self.split_by_single_token: return tokens.split() else: # "a: ab" -> ["a", ":", "<space>", "a", "b"] # TODO(kan-bayashi): space replacement should be dealt in PhonemeTokenizer return [c.replace(" ", "<space>") for c in tokens]
[docs]class IsG2p: # pylint: disable=too-few-public-methods """Minimal wrapper for https://github.com/grammatek/ice-g2p The g2p module uses a Bi-LSTM model along with a pronunciation dictionary to generate phonemization Unfortunately does not support multi-thread phonemization as of yet """ def __init__( self, dialect: str = "standard", syllabify: bool = True, word_sep: str = ",", use_dict: bool = True, ): self.dialect = dialect self.syllabify = syllabify self.use_dict = use_dict from ice_g2p.transcriber import Transcriber self.transcriber = Transcriber( use_dict=self.use_dict, syllab_symbol=".", stress_label=True, word_sep=word_sep, lang_detect=True, ) def __call__(self, text) -> List[str]: return self.transcriber.transcribe(text).split()
[docs]class PhonemeTokenizer(AbsTokenizer): @typechecked def __init__( self, g2p_type: Union[None, str], non_linguistic_symbols: Union[None, Path, str, Iterable[str]] = None, space_symbol: str = "<space>", remove_non_linguistic_symbols: bool = False, ): if g2p_type is None: self.g2p = split_by_space elif g2p_type == "g2p_en": self.g2p = G2p_en(no_space=False) elif g2p_type == "g2p_en_no_space": self.g2p = G2p_en(no_space=True) elif g2p_type == "pyopenjtalk": self.g2p = pyopenjtalk_g2p elif g2p_type == "pyopenjtalk_kana": self.g2p = pyopenjtalk_g2p_kana elif g2p_type == "pyopenjtalk_accent": self.g2p = pyopenjtalk_g2p_accent elif g2p_type == "pyopenjtalk_accent_with_pause": self.g2p = pyopenjtalk_g2p_accent_with_pause elif g2p_type == "pyopenjtalk_prosody": self.g2p = pyopenjtalk_g2p_prosody elif g2p_type == "pypinyin_g2p": self.g2p = pypinyin_g2p elif g2p_type == "pypinyin_g2p_phone": self.g2p = pypinyin_g2p_phone elif g2p_type == "pypinyin_g2p_phone_without_prosody": self.g2p = pypinyin_g2p_phone_without_prosody elif g2p_type == "espeak_ng_arabic": self.g2p = Phonemizer( language="ar", backend="espeak", with_stress=True, preserve_punctuation=True, ) elif g2p_type == "espeak_ng_german": self.g2p = Phonemizer( language="de", backend="espeak", with_stress=True, preserve_punctuation=True, ) elif g2p_type == "espeak_ng_french": self.g2p = Phonemizer( language="fr-fr", backend="espeak", with_stress=True, preserve_punctuation=True, ) elif g2p_type == "espeak_ng_spanish": self.g2p = Phonemizer( language="es", backend="espeak", with_stress=True, preserve_punctuation=True, ) elif g2p_type == "espeak_ng_russian": self.g2p = Phonemizer( language="ru", backend="espeak", with_stress=True, preserve_punctuation=True, ) elif g2p_type == "espeak_ng_greek": self.g2p = Phonemizer( language="el", backend="espeak", with_stress=True, preserve_punctuation=True, ) elif g2p_type == "espeak_ng_finnish": self.g2p = Phonemizer( language="fi", backend="espeak", with_stress=True, preserve_punctuation=True, ) elif g2p_type == "espeak_ng_hungarian": self.g2p = Phonemizer( language="hu", backend="espeak", with_stress=True, preserve_punctuation=True, ) elif g2p_type == "espeak_ng_dutch": self.g2p = Phonemizer( language="nl", backend="espeak", with_stress=True, preserve_punctuation=True, ) elif g2p_type == "espeak_ng_hindi": self.g2p = Phonemizer( language="hi", backend="espeak", with_stress=True, preserve_punctuation=True, ) elif g2p_type == "espeak_ng_italian": self.g2p = Phonemizer( language="it", backend="espeak", with_stress=True, preserve_punctuation=True, ) elif g2p_type == "espeak_ng_polish": self.g2p = Phonemizer( language="pl", backend="espeak", with_stress=True, preserve_punctuation=True, ) elif g2p_type == "g2pk": self.g2p = G2pk(no_space=False) elif g2p_type == "g2pk_no_space": self.g2p = G2pk(no_space=True) elif g2p_type == "g2pk_explicit_space": self.g2p = G2pk(explicit_space=True, space_symbol=space_symbol) elif g2p_type == "espeak_ng_english_us_vits": # VITS official implementation-like processing # Reference: https://github.com/jaywalnut310/vits self.g2p = Phonemizer( language="en-us", backend="espeak", with_stress=True, preserve_punctuation=True, strip=True, word_separator=" ", phone_separator="", split_by_single_token=True, ) elif g2p_type == "korean_jaso": self.g2p = Jaso(space_symbol=space_symbol, no_space=False) elif g2p_type == "korean_jaso_no_space": self.g2p = Jaso(no_space=True) elif g2p_type == "g2p_is": self.g2p = IsG2p() elif g2p_type == "g2p_is_north": self.g2p = IsG2p(dialect="north") else: raise NotImplementedError(f"Not supported: g2p_type={g2p_type}") self.g2p_type = g2p_type self.space_symbol = space_symbol if non_linguistic_symbols is None: self.non_linguistic_symbols = set() elif isinstance(non_linguistic_symbols, (Path, str)): non_linguistic_symbols = Path(non_linguistic_symbols) try: with non_linguistic_symbols.open("r", encoding="utf-8") as f: self.non_linguistic_symbols = set(line.rstrip() for line in f) except FileNotFoundError: warnings.warn(f"{non_linguistic_symbols} doesn't exist.") self.non_linguistic_symbols = set() else: self.non_linguistic_symbols = set(non_linguistic_symbols) self.remove_non_linguistic_symbols = remove_non_linguistic_symbols def __repr__(self): return ( f"{self.__class__.__name__}(" f'g2p_type="{self.g2p_type}", ' f'space_symbol="{self.space_symbol}", ' f'non_linguistic_symbols="{self.non_linguistic_symbols}"' ")" )
[docs] def text2tokens(self, line: str) -> List[str]: tokens = [] while len(line) != 0: for w in self.non_linguistic_symbols: if line.startswith(w): if not self.remove_non_linguistic_symbols: tokens.append(line[: len(w)]) line = line[len(w) :] break else: t = line[0] tokens.append(t) line = line[1:] line = "".join(tokens) tokens = self.g2p(line) return tokens
[docs] def tokens2text(self, tokens: Iterable[str]) -> str: # phoneme type is not invertible return "".join(tokens)
[docs] def text2tokens_svs(self, syllable: str) -> List[str]: # Note(Yuning): fix syllabel2phoneme mismatch # If needed, customed_dic can be changed into extra input customed_dic = { "へ": ["h", "e"], "は": ["h", "a"], "シ": ["sh", "I"], "ヴぁ": ["v", "a"], "ヴぃ": ["v", "i"], "ヴぇ": ["v", "e"], "ヴぉ": ["v", "o"], "でぇ": ["dy", "e"], "くぁ": ["k", "w", "a"], "くぃ": ["k", "w", "i"], "くぅ": ["k", "w", "u"], "くぇ": ["k", "w", "e"], "くぉ": ["k", "w", "o"], "ぐぁ": ["g", "w", "a"], "ぐぃ": ["g", "w", "i"], "ぐぅ": ["g", "w", "u"], "ぐぇ": ["g", "w", "e"], "ぐぉ": ["g", "w", "o"], "くぉっ": ["k", "w", "o", "cl"], } tokens = self.g2p(syllable) if syllable in customed_dic: tokens = customed_dic[syllable] return tokens