[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/espnet/notebook/blob/master/espnet2_tts_realtime_demo.ipynb)

# ESPnet2-TTS realtime demonstration

This notebook demonstrates real-time end-to-end text-to-speech (E2E-TTS) using ESPnet2-TTS and the ParallelWaveGAN repository.

- ESPnet2-TTS: https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/tts1
- ParallelWaveGAN: https://github.com/kan-bayashi/ParallelWaveGAN

Author: Tomoki Hayashi ([@kan-bayashi](https://github.com/kan-bayashi))

## Installation

In [None]:
# NOTE: pip reports incompatibility errors caused by preinstalled libraries; you can ignore them.
!pip install -q espnet==202308 pypinyin==0.44.0 parallel_wavegan==0.5.4 gdown==4.4.0 espnet_model_zoo


## Single speaker model demo


### Model Selection

Please select a model: English, Japanese, and Mandarin are supported.

You can try either an end-to-end text2wav model or a combination of a text2mel model and a vocoder.
If you use a text2wav model, no vocoder is needed (it is automatically disabled).

**Text2wav models**:
- VITS

**Text2mel models**:
- Tacotron2
- Transformer-TTS
- (Conformer) FastSpeech
- (Conformer) FastSpeech2

**Vocoders**:
- Parallel WaveGAN
- Multi-band MelGAN
- HiFiGAN
- Style MelGAN


> The terms of use follow those of each corpus. We use the following corpora:
- `ljspeech_*`: LJSpeech dataset 
 - https://keithito.com/LJ-Speech-Dataset/
- `jsut_*`: JSUT corpus
 - https://sites.google.com/site/shinnosuketakamichi/publication/jsut
- `jvs_*`: JVS corpus + JSUT corpus
 - https://sites.google.com/site/shinnosuketakamichi/research-topics/jvs_corpus
 - https://sites.google.com/site/shinnosuketakamichi/publication/jsut
- `tsukuyomi_*`: つくよみちゃんコーパス + JSUT corpus
 - https://tyc.rei-yumesaki.net/material/corpus/
 - https://sites.google.com/site/shinnosuketakamichi/publication/jsut
- `csmsc_*`: Chinese Standard Mandarin Speech Corpus
 - https://www.data-baker.com/open_source.html 



In [None]:
#@title Choose English model { run: "auto" }
# Colab form cell: assigns the globals `lang`, `tag`, and `vocoder_tag`.
# The English, Japanese, and Mandarin selection cells all assign these same
# three names, so whichever of them was executed last decides the model.
# `lang` is only interpolated into the input prompt printed by the synthesis cell.
lang = 'English'
# `tag` is forwarded as `model_tag` to `Text2Speech.from_pretrained`.
tag = 'kan-bayashi/ljspeech_vits' #@param ["kan-bayashi/ljspeech_tacotron2", "kan-bayashi/ljspeech_fastspeech", "kan-bayashi/ljspeech_fastspeech2", "kan-bayashi/ljspeech_conformer_fastspeech2", "kan-bayashi/ljspeech_joint_finetune_conformer_fastspeech2_hifigan", "kan-bayashi/ljspeech_joint_train_conformer_fastspeech2_hifigan", "kan-bayashi/ljspeech_vits"] {type:"string"}
# `vocoder_tag` is forwarded as `vocoder_tag` after passing through `str_or_none`;
# presumably "none" is mapped to None there (no external vocoder) — TODO confirm.
vocoder_tag = "none" #@param ["none", "parallel_wavegan/ljspeech_parallel_wavegan.v1", "parallel_wavegan/ljspeech_full_band_melgan.v2", "parallel_wavegan/ljspeech_multi_band_melgan.v2", "parallel_wavegan/ljspeech_hifigan.v1", "parallel_wavegan/ljspeech_style_melgan.v1"] {type:"string"}

In [None]:
#@title Choose Japanese model { run: "auto" }
# Colab form cell: assigns the globals `lang`, `tag`, and `vocoder_tag`.
# The English, Japanese, and Mandarin selection cells all assign these same
# three names, so whichever of them was executed last decides the model.
# `lang` is only interpolated into the input prompt printed by the synthesis cell.
lang = 'Japanese'
# `tag` is forwarded as `model_tag` to `Text2Speech.from_pretrained`.
tag = 'kan-bayashi/jsut_full_band_vits_prosody' #@param ["kan-bayashi/jsut_tacotron2", "kan-bayashi/jsut_transformer", "kan-bayashi/jsut_fastspeech", "kan-bayashi/jsut_fastspeech2", "kan-bayashi/jsut_conformer_fastspeech2", "kan-bayashi/jsut_conformer_fastspeech2_accent", "kan-bayashi/jsut_conformer_fastspeech2_accent_with_pause", "kan-bayashi/jsut_vits_accent_with_pause", "kan-bayashi/jsut_full_band_vits_accent_with_pause", "kan-bayashi/jsut_tacotron2_prosody", "kan-bayashi/jsut_transformer_prosody", "kan-bayashi/jsut_conformer_fastspeech2_tacotron2_prosody", "kan-bayashi/jsut_vits_prosody", "kan-bayashi/jsut_full_band_vits_prosody", "kan-bayashi/jvs_jvs010_vits_prosody", "kan-bayashi/tsukuyomi_full_band_vits_prosody"] {type:"string"}
# `vocoder_tag` is forwarded as `vocoder_tag` after passing through `str_or_none`;
# presumably "none" is mapped to None there (no external vocoder) — TODO confirm.
vocoder_tag = 'none' #@param ["none", "parallel_wavegan/jsut_parallel_wavegan.v1", "parallel_wavegan/jsut_multi_band_melgan.v2", "parallel_wavegan/jsut_style_melgan.v1", "parallel_wavegan/jsut_hifigan.v1"] {type:"string"}

In [None]:
#@title Choose Mandarin model { run: "auto" }
# Colab form cell: assigns the globals `lang`, `tag`, and `vocoder_tag`.
# The English, Japanese, and Mandarin selection cells all assign these same
# three names, so whichever of them was executed last decides the model.
# `lang` is only interpolated into the input prompt printed by the synthesis cell.
lang = 'Mandarin'
# `tag` is forwarded as `model_tag` to `Text2Speech.from_pretrained`.
tag = 'kan-bayashi/csmsc_full_band_vits' #@param ["kan-bayashi/csmsc_tacotron2", "kan-bayashi/csmsc_transformer", "kan-bayashi/csmsc_fastspeech", "kan-bayashi/csmsc_fastspeech2", "kan-bayashi/csmsc_conformer_fastspeech2", "kan-bayashi/csmsc_vits", "kan-bayashi/csmsc_full_band_vits"] {type: "string"}
# `vocoder_tag` is forwarded as `vocoder_tag` after passing through `str_or_none`;
# presumably "none" is mapped to None there (no external vocoder) — TODO confirm.
vocoder_tag = "none" #@param ["none", "parallel_wavegan/csmsc_parallel_wavegan.v1", "parallel_wavegan/csmsc_multi_band_melgan.v2", "parallel_wavegan/csmsc_hifigan.v1", "parallel_wavegan/csmsc_style_melgan.v1"] {type:"string"}

### Model Setup

In [None]:
import torch

from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none

# Build the `text2speech` inference object from the `tag` / `vocoder_tag`
# globals chosen in one of the model-selection cells.

# Use the GPU when one is attached to the runtime; otherwise fall back to the
# CPU so the cell does not fail on CPU-only runtimes (inference is slower there).
device = "cuda" if torch.cuda.is_available() else "cpu"

text2speech = Text2Speech.from_pretrained(
    model_tag=str_or_none(tag),
    vocoder_tag=str_or_none(vocoder_tag),
    device=device,
    # Only for Tacotron 2 & Transformer
    threshold=0.5,
    # Only for Tacotron 2
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    # Only for FastSpeech & FastSpeech2 & VITS
    speed_control_alpha=1.0,
    # Only for VITS
    noise_scale=0.333,
    noise_scale_dur=0.333,
)

### Synthesis

In [None]:
import time

import torch
from IPython.display import Audio, display

# Synthesize one user-provided sentence with `text2speech`, report the
# real-time factor (processing time / duration of generated audio), and
# play the result.

# decide the input sentence by yourself
print(f"Input your favorite sentence in {lang}.")
x = input()

# synthesis
with torch.no_grad():
    # perf_counter is the clock intended for measuring short intervals.
    start = time.perf_counter()
    wav = text2speech(x)["wav"]
    if wav.is_cuda:
        # CUDA kernels run asynchronously; wait for them to finish so the
        # measured time covers the whole synthesis and not just the launch.
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

# len(wav) / fs is the duration of the generated audio in seconds.
rtf = elapsed / (len(wav) / text2speech.fs)
# ".5f" = five decimal places (the former "5f" only set a minimum field width).
print(f"RTF = {rtf:.5f}")

# let us listen to generated samples
display(Audio(wav.view(-1).cpu().numpy(), rate=text2speech.fs))

## Multi-speaker Model Demo

### Model Selection

Currently, we provide only English multi-speaker pretrained models.

> The terms of use follow those of each corpus. We use the following corpora:
- `libritts_*`: LibriTTS corpus
 - http://www.openslr.org/60
- `vctk_*`: English Multi-speaker Corpus for CSTR Voice Cloning Toolkit
 - http://www.udialogue.org/download/cstr-vctk-corpus.html



In [None]:
#@title English multi-speaker pretrained model { run: "auto" }
# Colab form cell: assigns the globals `lang`, `tag`, and `vocoder_tag`,
# overwriting any values set by the single-speaker selection cells.
# `lang` is only interpolated into the input prompt printed by the synthesis cell.
lang = 'English'
# `tag` is forwarded as `model_tag` to `Text2Speech.from_pretrained` and is also
# passed to `ModelDownloader.download_and_unpack` in the speaker-selection cell.
tag = 'kan-bayashi/vctk_full_band_multi_spk_vits' #@param ["kan-bayashi/vctk_gst_tacotron2", "kan-bayashi/vctk_gst_transformer", "kan-bayashi/vctk_xvector_tacotron2", "kan-bayashi/vctk_xvector_transformer", "kan-bayashi/vctk_xvector_conformer_fastspeech2", "kan-bayashi/vctk_gst+xvector_tacotron2", "kan-bayashi/vctk_gst+xvector_transformer", "kan-bayashi/vctk_gst+xvector_conformer_fastspeech2", "kan-bayashi/vctk_multi_spk_vits", "kan-bayashi/vctk_full_band_multi_spk_vits", "kan-bayashi/libritts_xvector_transformer", "kan-bayashi/libritts_xvector_conformer_fastspeech2", "kan-bayashi/libritts_gst+xvector_transformer", "kan-bayashi/libritts_gst+xvector_conformer_fastspeech2", "kan-bayashi/libritts_xvector_vits"] {type:"string"}
# `vocoder_tag` is forwarded as `vocoder_tag` after passing through `str_or_none`;
# presumably "none" is mapped to None there (no external vocoder) — TODO confirm.
vocoder_tag = "none" #@param ["none", "parallel_wavegan/vctk_parallel_wavegan.v1.long", "parallel_wavegan/vctk_multi_band_melgan.v2", "parallel_wavegan/vctk_style_melgan.v1", "parallel_wavegan/vctk_hifigan.v1", "parallel_wavegan/libritts_parallel_wavegan.v1.long", "parallel_wavegan/libritts_multi_band_melgan.v2", "parallel_wavegan/libritts_hifigan.v1", "parallel_wavegan/libritts_style_melgan.v1"] {type:"string"}

### Model Setup

In [None]:
import torch

from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none

# Build the multi-speaker `text2speech` inference object from the `tag` /
# `vocoder_tag` globals chosen in the multi-speaker model-selection cell.

# Use the GPU when one is attached to the runtime; otherwise fall back to the
# CPU so the cell does not fail on CPU-only runtimes (inference is slower there).
device = "cuda" if torch.cuda.is_available() else "cpu"

text2speech = Text2Speech.from_pretrained(
    model_tag=str_or_none(tag),
    vocoder_tag=str_or_none(vocoder_tag),
    device=device,
    # Only for Tacotron 2 & Transformer
    threshold=0.5,
    # Only for Tacotron 2
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    # Only for FastSpeech & FastSpeech2 & VITS
    speed_control_alpha=1.0,
    # Only for VITS
    noise_scale=0.333,
    noise_scale_dur=0.333,
)

### Speaker selection

For a multi-speaker model, we need to provide an X-vector, a speaker ID, and/or reference speech to determine the speaker characteristics.
For the X-vector, you can select a speaker from the dumped X-vectors.
For the reference speech, you can use any speech, but please make sure its sampling rate matches the one expected by the model.

In [None]:
import glob
import os

import kaldiio
import numpy as np
# torch is used for the dummy GST reference below; import it here so this cell
# also works when the single-speaker cells (which import torch) were skipped.
import torch

from espnet_model_zoo.downloader import ModelDownloader

# Prepare the optional speaker-conditioning inputs for the multi-speaker model.
# Each of `spembs`, `sids`, and `speech` stays None unless `text2speech`
# reports that it uses that input; all three are consumed by the synthesis cell.

# Get model directory path
d = ModelDownloader()
model_dir = os.path.dirname(d.download_and_unpack(tag)["train_config"])

# X-vector selection
spembs = None
if text2speech.use_spembs:
    # NOTE(review): the "tr" substring test runs against the full path, so it
    # also matches whenever `model_dir` itself contains "tr" — confirm that it
    # really narrows the result to the intended dump subset.
    xvector_arks = [
        p
        for p in glob.glob(f"{model_dir}/../../dump/**/spk_xvector.ark", recursive=True)
        if "tr" in p
    ]
    if not xvector_arks:
        raise FileNotFoundError(
            f"no spk_xvector.ark found under {model_dir}/../../dump"
        )
    xvector_ark = xvector_arks[0]
    xvectors = dict(kaldiio.load_ark(xvector_ark))
    spks = list(xvectors.keys())

    # randomly select speaker
    random_spk_idx = np.random.randint(0, len(spks))
    spk = spks[random_spk_idx]
    spembs = xvectors[spk]
    print(f"selected spk: {spk}")

# Speaker ID selection
sids = None
if text2speech.use_sids:
    spk2sid_files = glob.glob(f"{model_dir}/../../dump/**/spk2sid", recursive=True)
    if not spk2sid_files:
        raise FileNotFoundError(f"no spk2sid found under {model_dir}/../../dump")
    spk2sid = spk2sid_files[0]
    # Each non-empty line is "<speaker name> <speaker id>"; blank lines are skipped.
    with open(spk2sid, encoding="utf-8") as f:
        lines = [line.strip() for line in f if line.strip()]
    sid2spk = {int(line.split()[1]): line.split()[0] for line in lines}

    # randomly select speaker
    # The ID is drawn from [1, len(sid2spk)); presumably ID 0 is reserved
    # (e.g. for an unknown speaker) — TODO confirm against the spk2sid file.
    sids = np.array(np.random.randint(1, len(sid2spk)))
    spk = sid2spk[int(sids)]
    print(f"selected spk: {spk}")

# Reference speech selection for GST
speech = None
if text2speech.use_speech:
    # you can change here to load your own reference speech
    # e.g.
    # import soundfile as sf
    # speech, fs = sf.read("/path/to/reference.wav")
    # speech = torch.from_numpy(speech).float()
    # Default: low-amplitude random noise as a placeholder reference.
    speech = torch.randn(50000,) * 0.01

### Synthesis

In [None]:
import time

import torch
from IPython.display import Audio, display

# Synthesize one user-provided sentence with the multi-speaker `text2speech`
# object, conditioning on the `speech` / `spembs` / `sids` values prepared in
# the speaker-selection cell, then report the real-time factor
# (processing time / duration of generated audio) and play the result.

# decide the input sentence by yourself
print(f"Input your favorite sentence in {lang}.")
x = input()

# synthesis
with torch.no_grad():
    # perf_counter is the clock intended for measuring short intervals.
    start = time.perf_counter()
    wav = text2speech(x, speech=speech, spembs=spembs, sids=sids)["wav"]
    if wav.is_cuda:
        # CUDA kernels run asynchronously; wait for them to finish so the
        # measured time covers the whole synthesis and not just the launch.
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

# len(wav) / fs is the duration of the generated audio in seconds.
rtf = elapsed / (len(wav) / text2speech.fs)
# ".5f" = five decimal places (the former "5f" only set a minimum field width).
print(f"RTF = {rtf:.5f}")

# let us listen to generated samples
display(Audio(wav.view(-1).cpu().numpy(), rate=text2speech.fs))