ESPnet2 real streaming Transformer demonstration
Details in "Streaming Transformer ASR with Blockwise Synchronous Beam Search" (https://arxiv.org/abs/2006.14941)
This notebook demonstrates streaming Transformer-based ASR with ESPnet2.
You can recognize either a recorded audio file or live speech from a microphone.
Author: Keqi Deng (UCAS)
Train a streaming Transformer model
You can train a streaming Transformer model on your own corpus by following the example recipe at https://github.com/espnet/espnet/blob/master/egs2/aishell/asr1/run_streaming.sh
Download pre-trained model and audio file for demo
You can download a pre-trained model from the ESPnet model zoo (espnet_model_zoo) or directly from Hugging Face. Pick one of the following tags:
For the Mandarin task (pre-trained on AISHELL-1):
tag='Emiru Tsunoo/aishell_asr_train_asr_streaming_transformer_raw_zh_char_sp_valid.acc.ave'
For the English task (pre-trained on TEDLIUM 2):
tag='D-Keqi/espnet_asr_train_asr_streaming_transformer_raw_en_bpe500_sp_valid.acc.ave'
Import packages
Make sure that you have installed the latest ESPnet
import sys
import wave

import numpy as np

import espnet  # verifies that ESPnet is installed
from espnet2.bin.asr_inference_streaming import Speech2TextStreaming
from espnet_model_zoo.downloader import ModelDownloader
Prepare for inference
d = ModelDownloader()
speech2text = Speech2TextStreaming(
    # Download the chosen pre-trained model and unpack its config and weights.
    **d.download_and_unpack(tag),
    token_type=None,
    bpemodel=None,
    maxlenratio=0.0,
    minlenratio=0.0,
    beam_size=20,
    ctc_weight=0.5,
    lm_weight=0.0,
    penalty=0.0,
    nbest=1,
    device="cpu",
    disable_repetition_detection=True,
    decoder_text_length_limit=0,
    encoded_feat_length_limit=0,
)
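Each call to speech2text feeds one chunk of the waveform and returns the current n-best partial hypotheses; passing is_final=True finalizes the utterance. A minimal sketch, assuming speech is a 16 kHz mono float waveform that has already been loaded:
# Feed one 640-sample chunk; is_final=False keeps the decoder state open.
results = speech2text(speech=speech[:640], is_final=False)
if results:
    text, tokens, token_ids, hyp = results[0]  # best partial hypothesis
    print(text)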
prev_lines = 0
def progress_output(text):
    global prev_lines
    lines = ['']
    # Wrap the hypothesis into lines of at most 100 characters.
    for i in text:
        if len(lines[-1]) > 100:
            lines.append('')
        lines[-1] += i
    # Redraw each line: start a fresh row when the text has grown past the
    # previously printed area, otherwise clear and rewrite the existing row.
    for i, line in enumerate(lines):
        if i == prev_lines:
            sys.stderr.write('\n\r')
        else:
            sys.stderr.write('\r\033[B\033[K')
        sys.stderr.write(line)
    prev_lines = len(lines)
    sys.stderr.flush()
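For example, successive calls update the displayed partial hypothesis on stderr rather than printing a new line each time:
progress_output("HELLO")
progress_output("HELLO WORLD")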
def recognize(wavfile):
    # Read the whole 16-bit PCM waveform from the wav file.
    with wave.open(wavfile, 'rb') as wav:
        ch = wav.getnchannels()
        bits = wav.getsampwidth()
        rate = wav.getframerate()
        nframes = wav.getnframes()
        buf = wav.readframes(-1)
    data = np.frombuffer(buf, dtype='int16')
    # 32767 is the largest value of a signed 16-bit integer; dividing by it
    # normalizes the samples to the float range [-1, 1].
    speech = data.astype(np.float16) / 32767.0
    # Simulate streaming by feeding 640-sample chunks (40 ms at 16 kHz).
    sim_chunk_length = 640
    if sim_chunk_length > 0:
        for i in range(len(speech) // sim_chunk_length):
            results = speech2text(speech=speech[i * sim_chunk_length:(i + 1) * sim_chunk_length], is_final=False)
            if results is not None and len(results) > 0:
                nbests = [text for text, token, token_int, hyp in results]
                progress_output(nbests[0])
            else:
                progress_output("")
        # Flush the remaining samples and finalize the hypothesis.
        results = speech2text(speech[(i + 1) * sim_chunk_length:len(speech)], is_final=True)
    else:
        results = speech2text(speech, is_final=True)
    nbests = [text for text, token, token_int, hyp in results]
    progress_output(nbests[0])
Recognize the audio file
#You can upload your own audio file for recognition; we also provide demo audio files that you can download from Google Drive.
#Uncomment the line that matches the task you chose above.
#For the Mandarin task, the demo file comes from AISHELL-1: https://drive.google.com/file/d/1l8w93r8Bs5FtC3A-1ydEqFQdP4k6FiUL/view?usp=sharing
#wavfile='./BAC009S0724W0121.wav'
#For the English task, the demo file comes from LibriSpeech: https://drive.google.com/file/d/1l71ZUNQ6qQk95T54H0tH_OEwZvWnEL4u/view?usp=sharing
#wavfile='./61-70968-0000.wav'
recognize(wavfile)
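If the demo files are not already on disk, they can be fetched from the Google Drive links above before calling recognize; a minimal sketch using the gdown package (gdown itself is an assumption for convenience, not part of the original demo):
import gdown

# The file ID comes from the Mandarin demo link above.
gdown.download('https://drive.google.com/uc?id=1l8w93r8Bs5FtC3A-1ydEqFQdP4k6FiUL', './BAC009S0724W0121.wav', quiet=False)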
Recognize live speech from the microphone
Install PyAudio first (e.g., pip install pyaudio; it requires the PortAudio library), then import it.
import pyaudio
Streaming recognition with pyaudio
CHUNK = 2048
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
RECORD_SECONDS = 5

p = pyaudio.PyAudio()
stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)

for i in range(0, int(RATE / CHUNK * RECORD_SECONDS) + 1):
    data = stream.read(CHUNK)
    data = np.frombuffer(data, dtype='int16')
    # 32767 is the largest value of a signed 16-bit integer; dividing by it
    # normalizes the samples to the float range [-1, 1].
    data = data.astype(np.float16) / 32767.0
    if i == int(RATE / CHUNK * RECORD_SECONDS):
        # Last chunk: finalize the hypothesis.
        results = speech2text(speech=data, is_final=True)
        break
    results = speech2text(speech=data, is_final=False)
    if results is not None and len(results) > 0:
        nbests = [text for text, token, token_int, hyp in results]
        progress_output(nbests[0])
    else:
        progress_output("")

nbests = [text for text, token, token_int, hyp in results]
progress_output(nbests[0])
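Once recording is finished, release the PyAudio resources:
# Close the input stream and terminate the PyAudio session.
stream.stop_stream()
stream.close()
p.terminate()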