Source code for espnet2.enh.separator.uses_separator

from collections import OrderedDict
from typing import Dict, List, Optional, Tuple, Union

import torch
from torch_complex.tensor import ComplexTensor

from espnet2.enh.layers.complex_utils import is_complex, new_complex_like
from espnet2.enh.layers.uses import USES
from espnet2.enh.separator.abs_separator import AbsSeparator

class USESSeparator(AbsSeparator):
    def __init__(
        self,
        input_dim: int,
        num_spk: int = 2,
        enc_channels: int = 256,
        bottleneck_size: int = 64,
        num_blocks: int = 6,
        num_spatial_blocks: int = 3,
        ref_channel: Optional[int] = None,
        segment_size: int = 64,
        memory_size: int = 20,
        memory_types: int = 1,
        # Transformer-related arguments
        rnn_type: str = "lstm",
        bidirectional: bool = True,
        hidden_size: int = 128,
        att_heads: int = 4,
        dropout: float = 0.0,
        norm_type: str = "cLN",
        activation: str = "relu",
        ch_mode: Union[str, List[str]] = "att",
        ch_att_dim: int = 256,
        eps: float = 1e-5,
        additional: dict = {},
    ):
        """Unconstrained Speech Enhancement and Separation (USES) Network.

        Reference:
            [1] W. Zhang, K. Saijo, Z.-Q. Wang, S. Watanabe, and Y. Qian,
            "Toward Universal Speech Enhancement for Diverse Input Conditions,"
            in Proc. ASRU, 2023.

        Args:
            input_dim (int): input feature dimension.
                Not used as the model is independent of the input size.
            num_spk (int): number of speakers.
            enc_channels (int): feature dimension after the Conv1D encoder.
            bottleneck_size (int): dimension of the bottleneck feature.
                Must be a multiple of `att_heads`.
            num_blocks (int): number of processing blocks.
            num_spatial_blocks (int): number of processing blocks with
                channel modeling.
            ref_channel (Optional[int]): index of the reference channel
                (used in channel modeling modules).
            segment_size (int): number of frames in each non-overlapping segment.
                This is used to segment long utterances into smaller chunks for
                efficient processing.
            memory_size (int): group size of global memory tokens.
                The basic use of memory tokens is to store the history
                information from previous segments.
                The memory tokens are updated by the output of the last block
                after processing each segment.
            memory_types (int): number of memory token groups.
                Each group corresponds to a different type of processing, i.e.,
                the first group is used for denoising without dereverberation,
                and the second group is used for denoising with dereverberation.
            rnn_type (str): select from 'RNN', 'LSTM' and 'GRU'.
            bidirectional (bool): whether the inter-chunk RNN layers are
                bidirectional.
            hidden_size (int): dimension of the hidden state.
            att_heads (int): number of attention heads.
            dropout (float): dropout ratio. Default is 0.
            norm_type (str): type of normalization to use after each inter- or
                intra-chunk NN block.
            activation (str): the nonlinear activation function.
            ch_mode (str or list): mode of channel modeling.
                Select from "att" and "tac".
            ch_att_dim (int): dimension of the channel attention.
            eps (float): epsilon for layer normalization.
            additional (dict): extra arguments used to update the `opt`
                dictionary passed to the USES model; these can be updated at
                inference time to process different data.
""" super().__init__() self._num_spk = num_spk self.enc_channels = enc_channels self.ref_channel = ref_channel # used to project each complex-valued time-frequency bin to an embedding self.post_encoder = torch.nn.Conv2d(2, enc_channels, (3, 3), padding=(1, 1)) assert bottleneck_size % att_heads == 0, (bottleneck_size, att_heads) opt = { "memory_types": memory_types, } # arguments in `opt` can be updated at inference time to process different data opt.update(additional) self.uses = USES( enc_channels, output_size=enc_channels * num_spk, bottleneck_size=bottleneck_size, num_blocks=num_blocks, num_spatial_blocks=num_spatial_blocks, segment_size=segment_size, memory_size=memory_size, **opt, # Transformer-specific arguments rnn_type=rnn_type, bidirectional=bidirectional, hidden_size=hidden_size, att_heads=att_heads, dropout=dropout, norm_type=norm_type, activation=activation, ch_mode=ch_mode, ch_att_dim=ch_att_dim, eps=eps, ) # used to project each embedding back to the complex-valued time-frequency bin self.pre_decoder = torch.nn.ConvTranspose2d( enc_channels, 2, (3, 3), padding=(1, 1) )
    def forward(
        self,
        input: Union[torch.Tensor, ComplexTensor],
        ilens: torch.Tensor,
        additional: Optional[Dict] = None,
    ) -> Tuple[List[Union[torch.Tensor, ComplexTensor]], torch.Tensor, OrderedDict]:
        """Forward.

        Args:
            input (torch.Tensor or ComplexTensor): STFT spectrum [B, T, (C,) F (,2)]
                B is the batch size
                T is the number of time frames
                C is the number of microphone channels (optional)
                F is the number of frequency bins
                2 is real and imaginary parts
                    (optional if input is a complex tensor)
            ilens (torch.Tensor): input lengths [Batch]
            additional (Dict or None): other data included in model
                "mode": one of ("no_dereverb", "dereverb", "both")
                    1. "no_dereverb": only use the first memory group for
                        denoising without dereverberation
                    2. "dereverb": only use the second memory group for
                        denoising with dereverberation
                    3. "both": use both memory groups for denoising with and
                        without dereverberation

        Returns:
            masked (List[Union[torch.Tensor, ComplexTensor]]): [(B, T, F), ...]
            ilens (torch.Tensor): (B,)
            others (OrderedDict): predicted data, e.g. masks:
                OrderedDict[
                    'mask_spk1': torch.Tensor(Batch, Frames, Freq),
                    'mask_spk2': torch.Tensor(Batch, Frames, Freq),
                    ...
                    'mask_spkn': torch.Tensor(Batch, Frames, Freq),
                ]
        """
        # B, 2, T, (C,) F
        if is_complex(input):
            feature = torch.stack([input.real, input.imag], dim=1)
        else:
            assert input.size(-1) == 2, input.shape
            feature = input.moveaxis(-1, 1)

        # B, C, 2, F, T
        if feature.ndim == 4:
            feature = feature.moveaxis(-1, -2).unsqueeze(1)
        elif feature.ndim == 5:
            feature = feature.permute(0, 3, 1, 4, 2).contiguous()
        else:
            raise ValueError(f"Invalid input shape: {feature.shape}")

        B, C, RI, F, T = feature.shape
        feature = feature.reshape(-1, RI, F, T)
        feature = self.post_encoder(feature)  # B*C, enc_channels, F, T
        feature = feature.reshape(B, C, -1, F, T).contiguous()

        others = {}
        # B, enc_channels * num_spk, F, T
        if additional is not None:
            mode = additional.get("mode", "no_dereverb")
            if mode == "no_dereverb":
                processed = self.uses(feature, ref_channel=self.ref_channel)
            elif mode == "dereverb":
                processed = self.uses(feature, ref_channel=self.ref_channel, mem_idx=1)
            elif mode == "both":
                # For training with multi-condition data
                # 1. denoised output without dereverberation
                processed = self.uses(feature, ref_channel=self.ref_channel, mem_idx=0)
                # 2. denoised output with dereverberation
                processed2 = self.uses(feature, ref_channel=self.ref_channel, mem_idx=1)
                processed2 = processed2.reshape(
                    B * self.num_spk, self.enc_channels, F, T
                )
                processed2 = self.pre_decoder(processed2)
                specs2 = processed2.reshape(B, self.num_spk, 2, F, T).moveaxis(-1, -2)
                # B, num_spk, T, F
                if not is_complex(input):
                    for spk in range(specs2.size(1)):
                        others[f"dereverb{spk + 1}"] = ComplexTensor(
                            specs2[:, spk, 0], specs2[:, spk, 1]
                        )
                else:
                    for spk in range(specs2.size(1)):
                        others[f"dereverb{spk + 1}"] = new_complex_like(
                            input, (specs2[:, spk, 0], specs2[:, spk, 1])
                        )
            else:
                raise ValueError(mode)
        else:
            mode = ""
            processed = self.uses(feature, ref_channel=self.ref_channel)

        processed = processed.reshape(B * self.num_spk, self.enc_channels, F, T)
        processed = self.pre_decoder(processed)
        specs = processed.reshape(B, self.num_spk, 2, F, T).moveaxis(-1, -2)
        # B, num_spk, T, F
        if not is_complex(input):
            specs = list(ComplexTensor(specs[:, :, 0], specs[:, :, 1]).unbind(1))
        else:
            specs = list(
                new_complex_like(input, (specs[:, :, 0], specs[:, :, 1])).unbind(1)
            )

        return specs, ilens, others
    @property
    def num_spk(self):
        return self._num_spk
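
# Example usage: a minimal, untested sketch (not part of the original module)
# showing how the separator consumes a multi-channel STFT spectrum with real
# and imaginary parts stacked in the last dimension, per the forward()
# docstring above. All concrete shapes and the "mode" value are illustrative
# assumptions, not values prescribed by the model.
if __name__ == "__main__":
    B, T, C, F = 2, 128, 4, 129  # batch, frames, mics, frequency bins (assumed)
    separator = USESSeparator(input_dim=F, num_spk=2, ref_channel=0)

    # [B, T, C, F, 2] real-valued input; a complex tensor of shape
    # [B, T, C, F] would also be accepted
    spectrum = torch.randn(B, T, C, F, 2)
    ilens = torch.full((B,), T, dtype=torch.long)

    # "no_dereverb" selects the first memory group (denoising only)
    specs, ilens, others = separator(
        spectrum, ilens, additional={"mode": "no_dereverb"}
    )
    # `specs` holds num_spk complex spectra, each of shape (B, T, F)
    print(len(specs), specs[0].shape)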