ESPnet-S2ST realtime demonstration

This notebook provides a demonstration of real-time end-to-end speech-to-speech translation using ESPnet-ST-v2.

Authors: Jiatong Shi ([@ftshijt](https://github.com/ftshijt))

Setup Environment

[ ]:
!pip install --upgrade --no-cache-dir gdown
!git clone --depth 5 -b merge_s2st_st https://github.com/ftshijt/espnet.git
!cd espnet && pip install .
!git clone --depth 1 https://github.com/kan-bayashi/ParallelWaveGAN.git
!cd ParallelWaveGAN && pip install .
!pip install -q espnet_model_zoo
!pip install pysndfile
!pip install sacrebleu
!pip install mosestokenizer
!git clone https://github.com/facebookresearch/SimulEval.git
!cd SimulEval && pip install -e .
!pip install typeguard==2.13.3

Offline Speech-to-speech translation (S2ST)

In this demonstration, we show an example discrete-unit-based speech-to-speech translation model. Specifically, the model is trained on the Spanish-to-English subset of the CVSS-C corpus. The source speech/transcription of CVSS comes from Common Voice (read speech); the target transcription is from CoVoST 2 (a speech-to-text translation corpus); and the target speech is synthesized with a single-speaker TTS system.
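Conceptually, such a model translates source speech directly into a sequence of target-language discrete units (e.g., clustered HuBERT features), and a unit-based vocoder converts those units back into a waveform. The sketch below only illustrates this flow with hypothetical helper names; the actual ESPnet inference API is used in the cells that follow.

# A minimal conceptual sketch (hypothetical helper names, not the actual ESPnet API):
# the translation model maps source speech to target-language discrete units,
# and a unit-based vocoder converts the units back into a waveform.
def discrete_unit_s2st(source_wav, speech_to_unit, unit_vocoder):
    units = speech_to_unit(source_wav)   # e.g. [12, 481, 481, 77, ...]
    return unit_vocoder(units)           # synthesized target-language speech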

Model download

[ ]:
# Download pretrained s2st model

!gdown 1wNaEebJDDcgi8RZhniKKEMFV15ktcNRE
!unzip -o /content/s2st_train_s2st_discrete_unit_raw_fbank_es_en_train.loss.best.zip

# Download pretrained unit-based vocoder
!gdown 1ezVM3YujTVZSytOeWtD0MsXdqw19AaXa
!unzip -o /content/hubert6_500_unit_vocoder.zip

Model Setup

[ ]:
import time
import torch
import string
from espnet2.bin.s2st_inference import Speech2Speech

# temporary workaround for a buggy checkpoint
!cp /content/exp/s2st_stats_raw_es_en/train/src_feats_stats.npz /content/exp/s2st_stats_raw_es_en/train/tgt_feats_stats.npz

lang = "es"
fs = 16000

speech2speech = Speech2Speech(
    model_file="/content/exp/s2st_train_s2st_discrete_unit_raw_fbank_es_en/362epoch.pth",
    train_config="/content/exp/s2st_train_s2st_discrete_unit_raw_fbank_es_en/config.yaml",
    minlenratio=0.0,
    maxlenratio=4,
    beam_size=3,
    vocoder_file="/content/unit_pretrained_vocoder/checkpoint-50000steps.pkl",
    device="cuda",
)

def text_normalizer(text):
    text = text.upper()
    return text.translate(str.maketrans('', '', string.punctuation))
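
For reference, text_normalizer simply uppercases the text and strips punctuation so that BLEU is later computed on normalized transcripts, for example:

# Quick check of the normalization used before BLEU scoring
print(text_normalizer("Hello, world!"))  # prints: HELLO WORLD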
[ ]:
# Load ASR models for ASR-BLEU
import time
import torch
import string
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.asr_inference import Speech2Text as asr

tag = "asapp/e_branchformer_librispeech"

d = ModelDownloader()
# It may take a while to download and build the models
asr_model = asr(
    **d.download_and_unpack(tag),
    device="cuda",
    minlenratio=0.0,
    maxlenratio=0.0,
    ctc_weight=0.3,
    beam_size=10,
    batch_size=0,
    nbest=1
)
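
ASR-BLEU evaluates the translated speech by transcribing it with the ASR model above and computing BLEU between the transcript and the reference target text. A minimal sketch of that scoring step, assuming output_wav is a synthesized waveform and ref_text its reference translation (both hypothetical placeholders here; the full loop appears in the next section):

from sacrebleu.metrics import BLEU

def asr_bleu(output_wav, ref_text):
    # transcribe the synthesized English speech, then score against the reference
    hyp, *_ = asr_model(output_wav)[0]
    bleu = BLEU(effective_order=True)
    return bleu.sentence_score(text_normalizer(hyp), [text_normalizer(ref_text)])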

Translate our example recordings

[ ]:
!git clone https://github.com/ftshijt/ESPnet_st_egs.git
[ ]:
import torch
import pandas as pd
import soundfile as sf
import librosa.display
from IPython.display import display, Audio
import matplotlib.pyplot as plt
from sacrebleu.metrics import BLEU

bleu = BLEU(effective_order=True)

egs = pd.read_csv("ESPnet_st_egs/s2st/egs.csv")
for index, row in egs.iterrows():
  if row["lang"] == lang or lang == "multilingual":
    speech, rate = sf.read("ESPnet_st_egs/" + row["path"])
    assert fs == int(row["sr"])
    tensor_speech = torch.tensor(speech, dtype=torch.double).unsqueeze(0).float()
    length = tensor_speech.new_full([1], dtype=torch.long, fill_value=tensor_speech.size(1))
    output_dict = speech2speech(tensor_speech, length)

    output_wav = output_dict["wav"].cpu().numpy()
    sf.write(
        "ESPnet_st_egs/{row['path']}.predict.wav",
        output_wav,
        fs,
        "PCM_16",
    )
    print(f"Input Speech: ESPnet_st_egs/{row['path']}")
    # let us listen to samples
    display(Audio(speech, rate=fs))
    librosa.display.waveshow(speech, sr=fs)
    plt.show()
    print(f"Reference source text: {text_normalizer(row['src_text'])}")
    print(f"Reference target text: {text_normalizer(row['tgt_text'])}")
    print(f"Output speech: ESPnet_st_egs/{row['path']}.predict.wav")
    print(f"Unit: {output_dict}")
    display(Audio(output_wav, rate=fs))
    librosa.display.waveshow(output_wav, sr=fs)
    plt.show()

    # ASR recognition to the output samples
    text, *_ = asr_model(output_wav)[0]
    print(text)
    print(f"ASR hypothesis: {text_normalizer(text)}")
    print(f"ASR BLEU: {bleu.sentence_score(text_normalizer(text), [text_normalizer(row['tgt_text'])])}")
    print("*" * 50)

Translate your own pre-recordings

  1. Upload your own pre-recorded audio files

  2. Translate your voice with the S2ST system

[ ]:
from google.colab import files
from IPython.display import display, Audio
import soundfile
import librosa.display
import matplotlib.pyplot as plt

uploaded = files.upload()

for file_name in uploaded.keys():
  speech, rate = soundfile.read(file_name)
  assert rate == fs, "mismatch in sampling rate"
  tensor_speech = torch.tensor(speech, dtype=torch.double).unsqueeze(0).float()
  length = tensor_speech.new_full([1], dtype=torch.long, fill_value=tensor_speech.size(1))
  output_dict = speech2speech(tensor_speech, length)

  output_wav = output_dict["wav"].cpu().numpy()

  print(f"Input Speech: {file_name}")
  display(Audio(speech, rate=rate))
  librosa.display.waveshow(speech, sr=rate)
  plt.show()
  print("*" * 50)
  print(f"Output speech: predict.wav")
  display(Audio(output_wav, rate=fs))
  librosa.display.waveshow(output_wav, sr=fs)
  plt.show()

Translate your own live-recordings

  1. Record your own voice (if you cannot speak Spanish, you can find a Spanish recording online and play it from your phone to simulate a real-time recording).

  2. Translate your voice with the S2ST system

[ ]:
# from https://gist.github.com/korakot/c21c3476c024ad6d56d5f48b0bca92be

from IPython.display import Javascript
from google.colab import output
from base64 import b64decode

RECORD = """
const sleep = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.srcElement.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  recorder = new MediaRecorder(stream)
  chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.start()
  await sleep(time)
  recorder.onstop = async ()=>{
    blob = new Blob(chunks)
    text = await b2text(blob)
    resolve(text)
  }
  recorder.stop()
})
"""

def record(sec, filename='audio.wav'):
  display(Javascript(RECORD))
  s = output.eval_js('record(%d)' % (sec * 1000))
  b = b64decode(s.split(',')[1])
  with open(filename, 'wb+') as f:
    f.write(b)

audio = 'audio.wav'
second = 5
print(f"Speak to your microphone {second} sec...")
record(second, audio)
print("Done!")


import librosa
import librosa.display
speech, rate = librosa.load(audio, sr=16000)
librosa.display.waveshow(speech, sr=rate)

import matplotlib.pyplot as plt
plt.show()

import pysndfile
pysndfile.sndio.write('audio_ds.wav', speech, rate=rate, format='wav', enc='pcm16')

from IPython.display import display, Audio
display(Audio(speech, rate=rate))
[ ]:
tensor_speech = torch.tensor(speech, dtype=torch.double).unsqueeze(0).float()
length = tensor_speech.new_full([1], dtype=torch.long, fill_value=tensor_speech.size(1))
output_dict = speech2speech(tensor_speech, length)

output_wav = output_dict["wav"].cpu().numpy()

print(f"Output speech: predict.wav")
display(Audio(output_wav, rate=fs))
librosa.display.waveshow(output_wav, sr=fs)
plt.show()
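
As an optional sanity check, you can also transcribe the translated output with the ASR model loaded earlier (no ASR-BLEU here, since a live recording has no reference translation):

# Optional: transcribe the translated speech with the pre-loaded English ASR model
# (assumes the ASR model cell above has been run)
text, *_ = asr_model(output_wav)[0]
print(f"ASR hypothesis of the translated speech: {text_normalizer(text)}")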