ESPNET 2 pass SLU Demonstration
Less than 1 minute
ESPNET 2 pass SLU Demonstration
This notebook demonstrates the Two-Pass End-to-End Spoken Language Understanding (SLU) model.
Paper Link: https://arxiv.org/abs/2207.06670
ESPnet2-SLU: https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/slu1
Author: Siddhant Arora
! python -m pip install transformers
!git clone https://github.com/espnet/espnet /espnet
!pip install /espnet
%pip install -q espnet_model_zoo
%pip install fairseq@git+https://github.com//pytorch/fairseq.git@f2146bdc7abf293186de9449bfa2272775e39e1d#egg=fairseq
Download Audio File
# !gdown --id 1LxoxCoFgx3u8CvKb1loybGFtArKKPcAH -O /content/audio_file.wav
!gdown --id 18ANT62ittt7Ai2E8bQRlvT0ZVXXsf1eE -O /content/audio_file.wav
import os
import soundfile
from IPython.display import display, Audio
# Read the downloaded utterance; the second value returned by soundfile.read
# is kept as `sr` and reused below as the playback rate.
mixwav_mc, sr = soundfile.read("/content/audio_file.wav")
# Show the (transposed) waveform as an audio widget in the notebook output.
display(Audio(mixwav_mc.T, rate=sr))
Download and Load pretrained First Pass Model
# Clone the first-pass model repository from Hugging Face into /content/slurp_first_pass_model.
!git lfs clone https://huggingface.co/espnet/siddhana_slurp_new_asr_train_asr_conformer_raw_en_word_valid.acc.ave_10best /content/slurp_first_pass_model
from espnet2.bin.asr_inference import Speech2Text
# Build the first-pass model from the training config and checkpoint found in the clone;
# nbest=1 requests a single hypothesis.
speech2text_slurp = Speech2Text.from_pretrained(
asr_train_config="/content/slurp_first_pass_model/exp/asr_train_asr_conformer_raw_en_word/config.yaml",
asr_model_file="/content/slurp_first_pass_model/exp/asr_train_asr_conformer_raw_en_word/valid.acc.ave_10best.pth",
nbest=1,
)
# Run the first pass on the waveform loaded earlier.
nbests_orig = speech2text_slurp(mixwav_mc)
# Keep only the first field of the first hypothesis; it is used below as a
# whitespace-separated string (intent token followed by sub-word tokens).
text, *_ = nbests_orig[0]
def text_normalizer(sub_word_transcript):
    """Join sub-word tokens into a plain, space-separated transcript.

    The first token is copied with every "▁" marker removed. Each later
    token that contains "▁" starts a new word (a space is inserted and the
    marker is removed); a token without the marker is glued onto the
    preceding text unchanged.

    Args:
        sub_word_transcript: Sequence of sub-word token strings.

    Returns:
        The reassembled transcript, or "" when no tokens are given.
    """
    # A hypothesis that holds only the intent token yields an empty token
    # list; return an empty transcript instead of failing on index 0.
    if not sub_word_transcript:
        return ""
    pieces = [sub_word_transcript[0].replace("▁", "")]
    for sub_word in sub_word_transcript[1:]:
        if "▁" in sub_word:
            pieces.append(" " + sub_word.replace("▁", ""))
        else:
            pieces.append(sub_word)
    # Single join instead of repeated string concatenation.
    return "".join(pieces)
# The hypothesis is "<scenario>_<action> <sub-word> <sub-word> ...":
# the first whitespace-separated field carries the intent, the rest the transcript.
_first_pass_fields = text.split()
_scenario, *_action_parts = _first_pass_fields[0].split("_")
_action = "_".join(_action_parts)
intent_text = f"{{scenario: {_scenario}, action: {_action}}}"
print(f"INTENT: {intent_text}")
# Everything after the intent field is turned back into plain words.
transcript = text_normalizer(_first_pass_fields[1:])
print(f"ASR hypothesis: {transcript}")
print("First pass SLU model fails to predict the correct action.")
Download and Load pretrained Second Pass Model
# Clone the second-pass model repository from Hugging Face into /content/slurp_second_pass_model.
!git lfs clone https://huggingface.co/espnet/slurp_slu_2pass /content/slurp_second_pass_model
from espnet2.bin.slu_inference import Speech2Understand
from transformers import AutoModel, AutoTokenizer
# Build the second-pass model from the training config and checkpoint found in the clone;
# nbest=1 requests a single hypothesis.
speech2text_second_pass_slurp = Speech2Understand.from_pretrained(
slu_train_config="/content/slurp_second_pass_model/exp/slu_train_asr_bert_conformer_deliberation_raw_en_word/config.yaml",
slu_model_file="/content/slurp_second_pass_model/exp/slu_train_asr_bert_conformer_deliberation_raw_en_word/valid.acc.ave_10best.pth",
nbest=1,
)
from espnet2.tasks.slu import SLUTask
# Build the preprocessing object from the arguments stored on the loaded model.
# NOTE(review): the positional False is presumably the `train` flag (inference mode) — confirm
# against SLUTask.build_preprocess_fn.
preprocess_fn=SLUTask.build_preprocess_fn(
speech2text_second_pass_slurp.asr_train_args, False
)
import numpy as np
# Turn the first-pass transcript into token ids using the second-pass model's own
# cleaner, tokenizer and token-id converter (all taken from preprocess_fn).
transcript = preprocess_fn.text_cleaner(transcript)
tokens = preprocess_fn.transcript_tokenizer.text2tokens(transcript)
text_ints = np.array(preprocess_fn.transcript_token_id_converter.tokens2ids(tokens), dtype=np.int64)
import torch
# The second pass receives both the waveform and the first-pass transcript ids.
nbests = speech2text_second_pass_slurp(mixwav_mc,torch.tensor(text_ints))
# Keep only the first field of the first hypothesis; it is used below as a
# whitespace-separated string.
text1, *_ = nbests[0]
# The hypothesis is "<scenario>_<action> <sub-word> <sub-word> ...":
# the first whitespace-separated field carries the intent, the rest the transcript.
_second_pass_fields = text1.split()
_scenario, *_action_parts = _second_pass_fields[0].split("_")
_action = "_".join(_action_parts)
intent_text = f"{{scenario: {_scenario}, action: {_action}}}"
print(f"INTENT: {intent_text}")
# Everything after the intent field is turned back into plain words.
transcript = text_normalizer(_second_pass_fields[1:])
print(f"ASR hypothesis: {transcript}")
print("Second pass SLU model successfully recognizes the correct action.")