ESPnet-S2ST realtime demonstration
This notebook demonstrates realtime end-to-end speech-to-speech translation using ESPnet-ST-v2.
Authors: Jiatong Shi (@ftshijt)
Setup Environments
!pip install --upgrade --no-cache-dir gdown
!git clone --depth 5 -b merge_s2st_st https://github.com/ftshijt/espnet.git
!cd espnet && pip install .
!git clone --depth 1 https://github.com/kan-bayashi/ParallelWaveGAN.git
!cd ParallelWaveGAN && pip install .
!pip install -q espnet_model_zoo
!pip install pysndfile
!pip install sacrebleu
!pip install mosestokenizer
!git clone https://github.com/facebookresearch/SimulEval.git
!cd SimulEval && pip install -e .
!pip install typeguard==2.13.3
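The models in this demo are loaded with device="cuda", so make sure the Colab runtime has a GPU attached before continuing. A quick sanity check:

import torch
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())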
Offline Speech-to-speech translation (S2ST)
In this demonstration, we show an example of a discrete-unit-based speech-to-speech translation model. Specifically, the model is trained on the Spanish-to-English subset of the CVSS-C corpus. The source speech/transcription of CVSS-C comes from Common Voice (read speech); the target transcription comes from CoVoST2 (a speech-to-text translation corpus); and the target speech is synthesized with a single-speaker TTS system.
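At a high level, the system runs in two stages: the translation model maps source speech to a sequence of discrete units (clusters of self-supervised speech features; the vocoder below is built on 500 HuBERT-derived units), and a separately trained unit-based vocoder synthesizes the English waveform from those units. The sketch below only illustrates this decomposition with placeholder functions; it is not the ESPnet API, whose actual calls appear in the following cells.

# Conceptual sketch of discrete-unit S2ST (placeholders only, not ESPnet code)
def speech_to_units(source_wav):
    # translation model: Spanish speech -> discrete target units, e.g. [12, 481, 7, ...]
    raise NotImplementedError

def units_to_speech(units):
    # unit-based vocoder: discrete units -> English waveform
    raise NotImplementedError

# target_wav = units_to_speech(speech_to_units(source_wav))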
Model download
# Download pretrained s2st model
!gdown 1wNaEebJDDcgi8RZhniKKEMFV15ktcNRE
!unzip -o /content/s2st_train_s2st_discrete_unit_raw_fbank_es_en_train.loss.best.zip
# Download pretrained unit-based vocoder
!gdown 1ezVM3YujTVZSytOeWtD0MsXdqw19AaXa
!unzip -o /content/hubert6_500_unit_vocoder.zip
Model Setup
import time
import torch
import string
from espnet2.bin.s2st_inference import Speech2Speech
# temporary workaround for a buggy checkpoint: reuse the source feature stats as the target feature stats
!cp /content/exp/s2st_stats_raw_es_en/train/src_feats_stats.npz /content/exp/s2st_stats_raw_es_en/train/tgt_feats_stats.npz
lang = "es"
fs = 16000
speech2speech = Speech2Speech(
    model_file="/content/exp/s2st_train_s2st_discrete_unit_raw_fbank_es_en/362epoch.pth",
    train_config="/content/exp/s2st_train_s2st_discrete_unit_raw_fbank_es_en/config.yaml",
    minlenratio=0.0,
    maxlenratio=4,
    beam_size=3,
    vocoder_file="/content/unit_pretrained_vocoder/checkpoint-50000steps.pkl",
    device="cuda",
)

def text_normalizer(text):
    text = text.upper()
    return text.translate(str.maketrans('', '', string.punctuation))
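text_normalizer uppercases the text and strips punctuation so that ASR hypotheses and reference translations are compared in the same form when computing ASR-BLEU below. For example:

print(text_normalizer("How are you?"))  # -> HOW ARE YOU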
# Load ASR models for ASR-BLEU
import time
import torch
import string
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.asr_inference import Speech2Text as asr
tag = "asapp/e_branchformer_librispeech"
d = ModelDownloader()
# It may take a while to download and build the model
asr_model = asr(
    **d.download_and_unpack(tag),
    device="cuda",
    minlenratio=0.0,
    maxlenratio=0.0,
    ctc_weight=0.3,
    beam_size=10,
    batch_size=0,
    nbest=1,
)
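The ASR model above is used for ASR-BLEU evaluation: the translated speech is transcribed by an English ASR system, and the transcript is scored against the reference translation with sentence-level BLEU (effective_order=True keeps short sentences from collapsing to zero). A minimal, self-contained example of the scoring step with sacrebleu (the strings are made up for illustration):

from sacrebleu.metrics import BLEU

demo_bleu = BLEU(effective_order=True)
result = demo_bleu.sentence_score("THIS IS A SMALL TEST", ["THIS IS A SMALL TEST"])
print(result)        # perfect match -> BLEU = 100.00
print(result.score)  # 100.0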
Translate our example recordings
!git clone https://github.com/ftshijt/ESPnet_st_egs.git
import torch
import pandas as pd
import soundfile as sf
import librosa.display
from IPython.display import display, Audio
import matplotlib.pyplot as plt
from sacrebleu.metrics import BLEU
bleu = BLEU(effective_order=True)
egs = pd.read_csv("ESPnet_st_egs/s2st/egs.csv")
for index, row in egs.iterrows():
    if row["lang"] == lang or lang == "multilingual":
        speech, rate = sf.read("ESPnet_st_egs/" + row["path"])
        assert fs == int(row["sr"])
        tensor_speech = torch.tensor(speech, dtype=torch.double).unsqueeze(0).float()
        length = tensor_speech.new_full([1], dtype=torch.long, fill_value=tensor_speech.size(1))
        output_dict = speech2speech(tensor_speech, length)
        output_wav = output_dict["wav"].cpu().numpy()
        sf.write(
            f"ESPnet_st_egs/{row['path']}.predict.wav",
            output_wav,
            fs,
            "PCM_16",
        )
        print(f"Input Speech: ESPnet_st_egs/{row['path']}")
        # let us listen to the samples
        display(Audio(speech, rate=fs))
        librosa.display.waveshow(speech, sr=fs)
        plt.show()
        print(f"Reference source text: {text_normalizer(row['src_text'])}")
        print(f"Reference target text: {text_normalizer(row['tgt_text'])}")
        print(f"Output speech: ESPnet_st_egs/{row['path']}.predict.wav")
        print(f"Unit: {output_dict}")
        display(Audio(output_wav, rate=fs))
        librosa.display.waveshow(output_wav, sr=fs)
        plt.show()
        # run ASR on the output speech for ASR-BLEU
        text, *_ = asr_model(output_wav)[0]
        print(text)
        print(f"ASR hypothesis: {text_normalizer(text)}")
        print(f"ASR BLEU: {bleu.sentence_score(text_normalizer(text), [text_normalizer(row['tgt_text'])])}")
        print("*" * 50)
Translate your own pre-recordings
- Upload your own pre-recorded audio files
- Translate your voice with the S2ST system
from google.colab import files
from IPython.display import display, Audio
import soundfile
import librosa.display
import matplotlib.pyplot as plt
uploaded = files.upload()
for file_name in uploaded.keys():
    speech, rate = soundfile.read(file_name)
    assert rate == fs, "mismatch in sampling rate"
    tensor_speech = torch.tensor(speech, dtype=torch.double).unsqueeze(0).float()
    length = tensor_speech.new_full([1], dtype=torch.long, fill_value=tensor_speech.size(1))
    output_dict = speech2speech(tensor_speech, length)
    output_wav = output_dict["wav"].cpu().numpy()
    print(f"Input Speech: {file_name}")
    display(Audio(speech, rate=rate))
    librosa.display.waveshow(speech, sr=rate)
    plt.show()
    print("*" * 50)
    print("Output speech: predict.wav")
    display(Audio(output_wav, rate=fs))
    librosa.display.waveshow(output_wav, sr=fs)
    plt.show()
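Note that the loop above asserts that the uploaded audio is sampled at 16 kHz, which is what the model expects. If your recording uses a different sampling rate, one option is to resample it before inference, for example with librosa (a sketch assuming a mono recording):

import librosa

speech, rate = soundfile.read(file_name)
if rate != fs:
    # resample to the 16 kHz expected by the S2ST model
    speech = librosa.resample(speech, orig_sr=rate, target_sr=fs)
    rate = fs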
Translate your own live-recordings
- Record your own voice (if you cannot speak Spanish, you can find a Spanish audio clip on the web and play it from your phone to simulate a real-time recording).
- Translate your voice with the S2ST system
# from https://gist.github.com/korakot/c21c3476c024ad6d56d5f48b0bca92be
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode
RECORD = """
const sleep = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
const reader = new FileReader()
reader.onloadend = e => resolve(e.srcElement.result)
reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
stream = await navigator.mediaDevices.getUserMedia({ audio: true })
recorder = new MediaRecorder(stream)
chunks = []
recorder.ondataavailable = e => chunks.push(e.data)
recorder.start()
await sleep(time)
recorder.onstop = async ()=>{
blob = new Blob(chunks)
text = await b2text(blob)
resolve(text)
}
recorder.stop()
})
"""
def record(sec, filename='audio.wav'):
    display(Javascript(RECORD))
    s = output.eval_js('record(%d)' % (sec * 1000))
    b = b64decode(s.split(',')[1])
    with open(filename, 'wb+') as f:
        f.write(b)
audio = 'audio.wav'
second = 5
print(f"Speak to your microphone {second} sec...")
record(second, audio)
print("Done!")
import librosa
import librosa.display
speech, rate = librosa.load(audio, sr=16000)
librosa.display.waveshow(speech, sr=rate)
import matplotlib.pyplot as plt
plt.show()
import pysndfile
pysndfile.sndio.write('audio_ds.wav', speech, rate=rate, format='wav', enc='pcm16')
from IPython.display import display, Audio
display(Audio(speech, rate=rate))
tensor_speech = torch.tensor(speech, dtype=torch.double).unsqueeze(0).float()
length = tensor_speech.new_full([1], dtype=torch.long, fill_value=tensor_speech.size(1))
output_dict = speech2speech(tensor_speech, length)
output_wav = output_dict["wav"].cpu().numpy()
print(f"Output speech: predict.wav")
display(Audio(output_wav, rate=fs))
librosa.display.waveshow(output_wav, sr=fs)
plt.show()
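The translated audio above is only played back inside the notebook; to keep it as the predict.wav referred to in the printout, write it to disk the same way the offline loop did:

import soundfile as sf

# save the translated speech next to the recorded audio.wav
sf.write("predict.wav", output_wav, fs, "PCM_16")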