ESPnet2 real streaming Transformer demonstration

Details in “Streaming Transformer ASR with Blockwise Synchronous Beam Search” (https://arxiv.org/abs/2006.14941)

This notebook is meant to be run locally and demonstrates streaming ASR based on the Transformer using ESPnet2.

You can recognize either a recorded audio file or live speech from your microphone.

Author: Keqi Deng (UCAS)

Train a streaming Transformer model

You can train a streaming Transformer model on your own corpus by following the example recipe at https://github.com/espnet/espnet/blob/master/egs2/aishell/asr1/run_streaming.sh
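
Streaming is enabled by the blockwise (contextual-block) Transformer encoder, which processes the input in fixed-size blocks with a small look-ahead. As a rough orientation, the streaming-specific encoder options in the recipe's training config look like the sketch below, written as a Python dict mirroring the YAML; the option names follow ESPnet2's contextual block encoder, and the values are illustrative rather than the recipe's exact settings.

[ ]:
# Illustrative sketch only; in the real recipe these options live in the
# training YAML (encoder / encoder_conf), not in Python code.
encoder = "contextual_block_transformer"
encoder_conf = {
    "block_size": 40,   # frames processed per block
    "hop_size": 16,     # frames the block window advances per step
    "look_ahead": 16,   # future frames each block may attend to
}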

Download pre-trained model and audio file for demo

You can download the pre-trained model from the ESPnet model zoo or directly from Hugging Face.

For the Mandarin task (pretrained on AISHELL-1)

[ ]:
tag='Emiru Tsunoo/aishell_asr_train_asr_streaming_transformer_raw_zh_char_sp_valid.acc.ave'

For the English task (pretrained on Tedlium2)

[ ]:
tag='D-Keqi/espnet_asr_train_asr_streaming_transformer_raw_en_bpe500_sp_valid.acc.ave'
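
If you want to browse other pretrained models registered in the zoo, the downloader also offers a query interface; a minimal sketch, assuming the query method of espnet_model_zoo's ModelDownloader behaves as documented in its README:

[ ]:
from espnet_model_zoo.downloader import ModelDownloader

d = ModelDownloader()
# List the names of all registered models and keep the streaming ones.
names = d.query("name")
print([n for n in names if "streaming" in n])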

Import packages

Make sure that you have installed the latest ESPnet and the espnet_model_zoo package.
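
If the packages are missing, a typical in-notebook setup looks like this (package names as published on PyPI):

[ ]:
!pip install -U espnet espnet_model_zoo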

[ ]:
import sys
import wave

import numpy as np

import espnet  # not used directly below, but confirms that ESPnet is installed
from espnet2.bin.asr_inference_streaming import Speech2TextStreaming
from espnet_model_zoo.downloader import ModelDownloader

Prepare for inference

[ ]:
d = ModelDownloader()
speech2text = Speech2TextStreaming(
    **d.download_and_unpack(tag),
    token_type=None,
    bpemodel=None,
    maxlenratio=0.0,
    minlenratio=0.0,
    beam_size=20,
    ctc_weight=0.5,  # interpolate CTC and attention decoder scores equally
    lm_weight=0.0,   # no external language model
    penalty=0.0,
    nbest=1,
    device="cpu",    # set to "cuda" to decode on a GPU
    disable_repetition_detection=True,
    decoder_text_length_limit=0,
    encoded_feat_length_limit=0,
)
[ ]:
prev_lines = 0
def progress_output(text):
    """Redraw the (possibly multi-line) partial hypothesis in place on stderr."""
    global prev_lines
    # Wrap the text into lines of roughly 100 characters.
    lines = ['']
    for ch in text:
        if len(lines[-1]) > 100:
            lines.append('')
        lines[-1] += ch
    for i, line in enumerate(lines):
        if i == prev_lines:
            # First line beyond what was printed last time: open a new line.
            sys.stderr.write('\n\r')
        else:
            # Column 0, cursor down one line, clear it, then rewrite the line.
            sys.stderr.write('\r\033[B\033[K')
        sys.stderr.write(line)

    prev_lines = len(lines)
    sys.stderr.flush()
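
To see how the helper redraws the partial hypothesis in place, you can feed it a growing string; a small self-contained demo (the text and timing are arbitrary):

[ ]:
import time

for partial in ["HELLO", "HELLO WOR", "HELLO WORLD"]:
    progress_output(partial)
    time.sleep(0.5)  # simulate partial results arriving chunk by chunk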
[ ]:
def recognize(wavfile):
    # Read the 16 kHz, 16-bit mono waveform into memory.
    with wave.open(wavfile, 'rb') as wav:
        assert wav.getnchannels() == 1, "expected mono audio"
        assert wav.getsampwidth() == 2, "expected 16-bit samples"
        assert wav.getframerate() == 16000, "expected a 16 kHz sampling rate"
        buf = wav.readframes(wav.getnframes())
    data = np.frombuffer(buf, dtype='int16')
    # 32767 is the maximum value of a signed 16-bit integer; dividing by it
    # normalizes the samples to [-1.0, 1.0].
    speech = data.astype(np.float16) / 32767.0
    # Feed 640-sample chunks (40 ms at 16 kHz) to simulate streaming input.
    sim_chunk_length = 640
    if sim_chunk_length > 0:
        n_chunks = len(speech) // sim_chunk_length
        for i in range(n_chunks):
            results = speech2text(speech=speech[i*sim_chunk_length:(i+1)*sim_chunk_length], is_final=False)
            if results is not None and len(results) > 0:
                nbests = [text for text, token, token_int, hyp in results]
                progress_output(nbests[0])
            else:
                progress_output("")
        # Pass the leftover samples and finalize the hypothesis.
        results = speech2text(speech=speech[n_chunks*sim_chunk_length:], is_final=True)
    else:
        results = speech2text(speech=speech, is_final=True)
    nbests = [text for text, token, token_int, hyp in results]
    progress_output(nbests[0])

Recognize the audio file
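
If you prefer to fetch a demo file programmatically instead of downloading it by hand, the gdown package can pull files from Google Drive by ID; a minimal sketch, assuming gdown is installed (pip install gdown) and using the file ID from the Mandarin demo link below:

[ ]:
import gdown

# File ID taken from the AISHELL-1 demo link; swap in the English ID if needed.
gdown.download('https://drive.google.com/uc?id=1l8w93r8Bs5FtC3A-1ydEqFQdP4k6FiUL',
               'BAC009S0724W0121.wav', quiet=False)
wavfile = './BAC009S0724W0121.wav'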

[ ]:
# You can upload your own audio file for recognition; we also provide demo audio
# files on Google Drive (download one, then uncomment the matching wavfile line).
# For the Mandarin task, the demo file comes from AISHELL-1:
# https://drive.google.com/file/d/1l8w93r8Bs5FtC3A-1ydEqFQdP4k6FiUL/view?usp=sharing
#wavfile = './BAC009S0724W0121.wav'
# For the English task, the demo file comes from LibriSpeech:
# https://drive.google.com/file/d/1l71ZUNQ6qQk95T54H0tH_OEwZvWnEL4u/view?usp=sharing
#wavfile = './61-70968-0000.wav'
recognize(wavfile)

Recognize live speech from the microphone

Install and import PyAudio

[ ]:
# If PyAudio is missing, install it first (pip install pyaudio; on Linux you
# may also need the PortAudio development headers).
import pyaudio
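
If recording picks up the wrong microphone, you can enumerate the available input devices with PyAudio's device-query API and pass the chosen index as input_device_index to p.open():

[ ]:
p = pyaudio.PyAudio()
for idx in range(p.get_device_count()):
    info = p.get_device_info_by_index(idx)
    if info.get('maxInputChannels', 0) > 0:  # keep only devices that can record
        print(idx, info['name'])
p.terminate()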

Streaming recognition with PyAudio

[ ]:
CHUNK = 2048
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
RECORD_SECONDS = 5

p = pyaudio.PyAudio()
stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True,
                frames_per_buffer=CHUNK)
for i in range(0, int(RATE / CHUNK * RECORD_SECONDS) + 1):
    data = stream.read(CHUNK)
    data = np.frombuffer(data, dtype='int16')
    # 32767 is the maximum value of a signed 16-bit integer; dividing by it
    # normalizes the samples to [-1.0, 1.0].
    data = data.astype(np.float16) / 32767.0
    if i == int(RATE / CHUNK * RECORD_SECONDS):
        # Last chunk: finalize the hypothesis.
        results = speech2text(speech=data, is_final=True)
        break
    results = speech2text(speech=data, is_final=False)
    if results is not None and len(results) > 0:
        nbests = [text for text, token, token_int, hyp in results]
        progress_output(nbests[0])
    else:
        progress_output("")
nbests = [text for text, token, token_int, hyp in results]
progress_output(nbests[0])

# Release the audio device.
stream.stop_stream()
stream.close()
p.terminate()