# Source code for espnet2.asr.preencoder.sinc

#!/usr/bin/env python3
#  2020, Technische Universität München;  Ludwig Kürzinger
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Sinc convolutions for raw audio input."""

import math
from collections import OrderedDict
from typing import Optional, Tuple, Union

import humanfriendly
import torch
from typeguard import check_argument_types

from espnet2.asr.preencoder.abs_preencoder import AbsPreEncoder
from espnet2.layers.sinc_conv import LogCompression, SincConv

class LightweightSincConvs(AbsPreEncoder):
    """Lightweight Sinc Convolutions.

    Instead of using precomputed features, end-to-end speech recognition
    can also be done directly from raw audio using sinc convolutions, as
    described in "Lightweight End-to-End Speech Recognition from Raw Audio
    Data Using Sinc-Convolutions" by Kürzinger et al.

    To use Sinc convolutions in your model instead of the default f-bank
    frontend, set this module as your pre-encoder with `preencoder: sinc`
    and use the input of the sliding window frontend with
    `frontend: sliding_window` in your yaml configuration file.
    So that the process flow is:

    Frontend (SlidingWindow) -> SpecAug -> Normalization ->
    Pre-encoder (LightweightSincConvs) -> Encoder -> Decoder

    Note that this method also performs data augmentation in time domain
    (vs. in spectral domain in the default frontend).
    Use `plot_sinc_filters.py` to visualize the learned Sinc filters.
    """

    def __init__(
        self,
        fs: Union[int, str, float] = 16000,
        in_channels: int = 1,
        out_channels: int = 256,
        activation_type: str = "leakyrelu",
        dropout_type: str = "dropout",
        windowing_type: str = "hamming",
        scale_type: str = "mel",
    ):
        """Initialize the module.

        Args:
            fs: Sample rate. A string is parsed with
                `humanfriendly.parse_size`.
            in_channels: Number of input channels.
            out_channels: Number of output channels (for each input channel).
            activation_type: Choice of activation function.
            dropout_type: Choice of dropout function.
            windowing_type: Choice of windowing function.
            scale_type: Choice of filter-bank initialization scale.

        Raises:
            NotImplementedError: If `dropout_type` or `activation_type` is
                not one of the supported choices.
        """
        assert check_argument_types()
        super().__init__()
        if isinstance(fs, str):
            fs = humanfriendly.parse_size(fs)
        self.fs = fs
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.activation_type = activation_type
        self.dropout_type = dropout_type
        self.windowing_type = windowing_type
        self.scale_type = scale_type

        self.choices_dropout = {
            "dropout": torch.nn.Dropout,
            "spatial": SpatialDropout,
            "dropout2d": torch.nn.Dropout2d,
        }
        if dropout_type not in self.choices_dropout:
            raise NotImplementedError(
                f"Dropout type has to be one of "
                f"{list(self.choices_dropout.keys())}",
            )

        self.choices_activation = {
            "leakyrelu": torch.nn.LeakyReLU,
            "relu": torch.nn.ReLU,
        }
        if activation_type not in self.choices_activation:
            raise NotImplementedError(
                f"Activation type has to be one of "
                f"{list(self.choices_activation.keys())}",
            )

        # initialization
        self._create_sinc_convs()
        # Sinc filters require custom initialization
        self.espnet_initialization_fn()

    def _create_sinc_convs(self):
        """Build the sinc filter layer and the convolutional blocks.

        Stores the sinc layer in `self.filters` and the complete stack
        (sinc block followed by five convolutional blocks) in `self.blocks`.
        """
        blocks = OrderedDict()

        # SincConvBlock
        out_channels = 128
        self.filters = SincConv(
            self.in_channels,
            out_channels,
            kernel_size=101,
            stride=1,
            fs=self.fs,
            window_func=self.windowing_type,
            scale_type=self.scale_type,
        )
        block = OrderedDict(
            [
                ("Filters", self.filters),
                ("LogCompression", LogCompression()),
                ("BatchNorm", torch.nn.BatchNorm1d(out_channels, affine=True)),
                ("AvgPool", torch.nn.AvgPool1d(2)),
            ]
        )
        blocks["SincConvBlock"] = torch.nn.Sequential(block)
        in_channels = out_channels

        # First convolutional block, connects the sinc output to the front-end "body"
        out_channels = 128
        blocks["DConvBlock1"] = self.gen_lsc_block(
            in_channels,
            out_channels,
            depthwise_kernel_size=25,
            depthwise_stride=2,
            pointwise_groups=0,
            avgpool=True,
            dropout_probability=0.1,
        )
        in_channels = out_channels

        # Second convolutional block, multiple convolutional layers
        out_channels = self.out_channels
        for layer in [2, 3, 4]:
            blocks[f"DConvBlock{layer}"] = self.gen_lsc_block(
                in_channels, out_channels, depthwise_kernel_size=9, depthwise_stride=1
            )
            # Only the first layer of this group changes the channel count.
            in_channels = out_channels

        # Third Convolutional block, acts as coupling to encoder
        out_channels = self.out_channels
        blocks["DConvBlock5"] = self.gen_lsc_block(
            in_channels,
            out_channels,
            depthwise_kernel_size=7,
            depthwise_stride=1,
            pointwise_groups=0,
        )

        self.blocks = torch.nn.Sequential(blocks)

    def gen_lsc_block(
        self,
        in_channels: int,
        out_channels: int,
        depthwise_kernel_size: int = 9,
        depthwise_stride: int = 1,
        depthwise_groups=None,
        pointwise_groups=0,
        dropout_probability: float = 0.15,
        avgpool=False,
    ):
        """Generate a convolutional block for Lightweight Sinc convolutions.

        Each block consists of either a depthwise or a depthwise-separable
        convolutions together with dropout, (batch-)normalization layer, and
        an optional average-pooling layer.

        Args:
            in_channels: Number of input channels.
            out_channels: Number of output channels.
            depthwise_kernel_size: Kernel size of the depthwise convolution.
            depthwise_stride: Stride of the depthwise convolution.
            depthwise_groups: Number of groups of the depthwise convolution.
                If falsy (None or 0), GCD(in_channels, out_channels) is used.
            pointwise_groups: Number of groups of the pointwise convolution.
                If falsy, no pointwise convolution is added.
            dropout_probability: Dropout probability in the block.
            avgpool: If True, an AvgPool layer is inserted.

        Returns:
            torch.nn.Sequential: Neural network building block.
        """
        block = OrderedDict()
        if not depthwise_groups:
            # Conv1d requires both channel counts to be divisible by `groups`;
            # the greatest common divisor is the largest value that satisfies
            # this for any (in_channels, out_channels) pair.
            depthwise_groups = math.gcd(in_channels, out_channels)
        block["depthwise"] = torch.nn.Conv1d(
            in_channels,
            out_channels,
            depthwise_kernel_size,
            depthwise_stride,
            groups=depthwise_groups,
        )
        if pointwise_groups:
            block["pointwise"] = torch.nn.Conv1d(
                out_channels, out_channels, 1, 1, groups=pointwise_groups
            )
        block["activation"] = self.choices_activation[self.activation_type]()
        block["batchnorm"] = torch.nn.BatchNorm1d(out_channels, affine=True)
        if avgpool:
            block["avgpool"] = torch.nn.AvgPool1d(2)
        block["dropout"] = self.choices_dropout[self.dropout_type](dropout_probability)
        return torch.nn.Sequential(block)

    def espnet_initialization_fn(self):
        """Initialize sinc filters with filterbank values.

        Also resets the affine parameters of every BatchNorm1d layer in
        `self.blocks` to weight 1 and bias 0.
        """
        self.filters.init_filters()
        for block in self.blocks:
            for layer in block:
                if isinstance(layer, torch.nn.BatchNorm1d) and layer.affine:
                    torch.nn.init.ones_(layer.weight)
                    torch.nn.init.zeros_(layer.bias)

    def forward(
        self, input: torch.Tensor, input_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply Lightweight Sinc Convolutions.

        The input shall be formatted as (B, T, C_in, D_in)
        with B as batch size, T as time dimension, C_in as channels,
        and D_in as feature dimension.

        The output will then be (B, T, C_out*D_out)
        with C_out and D_out as output dimensions.

        The current module structure only handles D_in=400, so that D_out=1.
        Remark for the multichannel case: C_out is the number of out_channels
        given at initialization multiplied with C_in.

        Args:
            input: Input tensor of shape (B, T, C_in, D_in).
            input_lengths: Lengths of the input sequences; returned unchanged.

        Returns:
            Tuple of the output tensor of shape (B, T, C_out*D_out) and the
            unmodified `input_lengths`.
        """
        # Transform input data:
        #   (B, T, C_in, D_in) -> (B*T, C_in, D_in)
        B, T, C_in, D_in = input.size()
        input_frames = input.view(B * T, C_in, D_in)
        # Call the module itself (not .forward) so registered hooks run.
        output_frames = self.blocks(input_frames)

        # ---TRANSFORM: (B*T, C_out, D_out) -> (B, T, C_out*D_out)
        _, C_out, D_out = output_frames.size()
        output_frames = output_frames.view(B, T, C_out * D_out)
        return output_frames, input_lengths  # no state in this layer

    def output_size(self) -> int:
        """Get the output size."""
        return self.out_channels * self.in_channels
class SpatialDropout(torch.nn.Module):
    """Spatial dropout module.

    Apply dropout to full channels on tensors of input (B, C, D)
    """

    def __init__(
        self,
        dropout_probability: float = 0.15,
        shape: Optional[Union[tuple, list]] = None,
    ):
        """Initialize.

        Args:
            dropout_probability: Dropout probability.
            shape (tuple, list): Dimension order the input is permuted to
                before dropout is applied. Defaults to (0, 2, 1).
        """
        assert check_argument_types()
        super().__init__()
        if shape is None:
            shape = (0, 2, 1)
        self.dropout = torch.nn.Dropout2d(dropout_probability)
        # Kept as a 1-tuple wrapping the order, as in the original layout.
        self.shape = (shape,)

    @staticmethod
    def _invert_permutation(order):
        """Return the dimension order that undoes `permute(*order)`."""
        inverse = [0] * len(order)
        for position, dim in enumerate(order):
            # Python indexing also resolves negative dims correctly here.
            inverse[dim] = position
        return tuple(inverse)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward of spatial dropout module.

        Args:
            x: Input tensor.

        Returns:
            Tensor with the same dimension layout as `x`.
        """
        (order,) = self.shape
        y = x.permute(*order)
        y = self.dropout(y)
        # Applying `order` twice only restores the layout for self-inverse
        # permutations such as (0, 2, 1); use the true inverse instead.
        return y.permute(*self._invert_permutation(order))