Source code for espnet.nets.pytorch_backend.transducer.blocks

"""Set of methods to create custom architecture."""

from typing import Any, Dict, List, Tuple, Union

import torch

from espnet.nets.pytorch_backend.conformer.convolution import ConvolutionModule
from espnet.nets.pytorch_backend.conformer.encoder_layer import (
    EncoderLayer as ConformerEncoderLayer,
)
from espnet.nets.pytorch_backend.nets_utils import get_activation
from espnet.nets.pytorch_backend.transducer.conv1d_nets import CausalConv1d, Conv1d
from espnet.nets.pytorch_backend.transducer.transformer_decoder_layer import (
    TransformerDecoderLayer,
)
from espnet.nets.pytorch_backend.transducer.vgg2l import VGG2L
from espnet.nets.pytorch_backend.transformer.attention import (
    MultiHeadedAttention,
    RelPositionMultiHeadedAttention,
)
from espnet.nets.pytorch_backend.transformer.embedding import (
    PositionalEncoding,
    RelPositionalEncoding,
    ScaledPositionalEncoding,
)
from espnet.nets.pytorch_backend.transformer.encoder_layer import EncoderLayer
from espnet.nets.pytorch_backend.transformer.positionwise_feed_forward import (
    PositionwiseFeedForward,
)
from espnet.nets.pytorch_backend.transformer.repeat import MultiSequential
from espnet.nets.pytorch_backend.transformer.subsampling import Conv2dSubsampling


def verify_block_arguments(
    net_part: str,
    block: Dict[str, Any],
    num_block: int,
) -> Tuple[int, int]:
    """Verify block arguments are valid.

    Args:
        net_part: Network part, either 'encoder' or 'decoder'.
        block: Block parameters.
        num_block: Block ID.

    Return:
        block_io: Input and output dimension of the block.

    """
    block_type = block.get("type")

    if block_type is None:
        raise ValueError(
            "Block %d in %s doesn't have a type assigned." % (num_block, net_part)
        )

    if block_type == "transformer":
        arguments = {"d_hidden", "d_ff", "heads"}
    elif block_type == "conformer":
        arguments = {
            "d_hidden",
            "d_ff",
            "heads",
            "macaron_style",
            "use_conv_mod",
        }

        if net_part == "decoder":
            raise ValueError("Decoder does not support 'conformer'.")

        if block.get("use_conv_mod", None) is True and "conv_mod_kernel" not in block:
            raise ValueError(
                "Block %d: 'use_conv_mod' is True but "
                "'conv_mod_kernel' is not specified" % num_block
            )
    elif block_type == "causal-conv1d":
        arguments = {"idim", "odim", "kernel_size"}

        if net_part == "encoder":
            raise ValueError("Encoder does not support 'causal-conv1d'.")
    elif block_type == "conv1d":
        arguments = {"idim", "odim", "kernel_size"}

        if net_part == "decoder":
            raise ValueError("Decoder does not support 'conv1d'.")
    else:
        raise NotImplementedError(
            "Wrong type. Currently supported: "
            "causal-conv1d, conformer, conv1d or transformer."
        )

    if not arguments.issubset(block):
        raise ValueError(
            "%s in %s in position %d: Expected block arguments: %s. "
            "See tutorial page for more information."
            % (block_type, net_part, num_block, arguments)
        )

    if block_type in ("transformer", "conformer"):
        block_io = (block["d_hidden"], block["d_hidden"])
    else:
        block_io = (block["idim"], block["odim"])

    return block_io

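# Example (illustrative; the block values below are hypothetical): a valid
# transformer block definition yields its input/output dimensions.
#
#     >>> block = {"type": "transformer", "d_hidden": 256, "d_ff": 1024, "heads": 4}
#     >>> verify_block_arguments("encoder", block, 1)
#     (256, 256)
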
def prepare_input_layer(
    input_layer_type: str,
    feats_dim: int,
    blocks: List[Dict[str, Any]],
    dropout_rate: float,
    pos_enc_dropout_rate: float,
) -> Dict[str, Any]:
    """Prepare input layer arguments.

    Args:
        input_layer_type: Input layer type.
        feats_dim: Dimension of input features.
        blocks: Blocks parameters for network part.
        dropout_rate: Dropout rate for input layer.
        pos_enc_dropout_rate: Dropout rate for input layer pos. enc.

    Return:
        input_block: Input block parameters.

    """
    input_block = {}

    first_block_type = blocks[0].get("type", None)

    if first_block_type == "causal-conv1d":
        input_block["type"] = "c-embed"
    else:
        input_block["type"] = input_layer_type

    input_block["dropout-rate"] = dropout_rate
    input_block["pos-dropout-rate"] = pos_enc_dropout_rate

    input_block["idim"] = feats_dim

    if first_block_type in ("transformer", "conformer"):
        input_block["odim"] = blocks[0].get("d_hidden", 0)
    else:
        input_block["odim"] = blocks[0].get("idim", 0)

    return input_block

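# Example (illustrative; hypothetical values): with a transformer first block,
# the requested layer type is kept and the output size is the block's d_hidden.
#
#     >>> blocks = [{"type": "transformer", "d_hidden": 256, "d_ff": 1024, "heads": 4}]
#     >>> prepare_input_layer("conv2d", 80, blocks, 0.1, 0.1)
#     {'type': 'conv2d', 'dropout-rate': 0.1, 'pos-dropout-rate': 0.1,
#      'idim': 80, 'odim': 256}
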
def prepare_body_model(
    net_part: str,
    blocks: List[Dict[str, Any]],
) -> int:
    """Prepare model body blocks.

    Args:
        net_part: Network part, either 'encoder' or 'decoder'.
        blocks: Blocks parameters for network part.

    Return:
        : Network output dimension.

    """
    cmp_io = [
        verify_block_arguments(net_part, b, (i + 1)) for i, b in enumerate(blocks)
    ]

    if {"transformer", "conformer"} <= {b["type"] for b in blocks}:
        raise NotImplementedError(
            net_part + ": transformer and conformer blocks "
            "can't be used together in the same net part."
        )

    for i in range(1, len(cmp_io)):
        if cmp_io[(i - 1)][1] != cmp_io[i][0]:
            raise ValueError(
                "Output/Input mismatch between blocks %d and %d in %s"
                % (i, (i + 1), net_part)
            )

    return cmp_io[-1][1]

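# Example (illustrative; hypothetical values): two chained transformer blocks
# with matching input/output sizes pass verification, and the output dimension
# of the last block is returned.
#
#     >>> blocks = [
#     ...     {"type": "transformer", "d_hidden": 256, "d_ff": 1024, "heads": 4},
#     ...     {"type": "transformer", "d_hidden": 256, "d_ff": 1024, "heads": 4},
#     ... ]
#     >>> prepare_body_model("encoder", blocks)
#     256
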
def get_pos_enc_and_att_class(
    net_part: str, pos_enc_type: str, self_attn_type: str
) -> Tuple[
    Union[PositionalEncoding, ScaledPositionalEncoding, RelPositionalEncoding],
    Union[MultiHeadedAttention, RelPositionMultiHeadedAttention],
]:
    """Get positional encoding and self-attention module class.

    Args:
        net_part: Network part, either 'encoder' or 'decoder'.
        pos_enc_type: Positional encoding type.
        self_attn_type: Self-attention type.

    Return:
        pos_enc_class: Positional encoding class.
        self_attn_class: Self-attention class.

    """
    if pos_enc_type == "abs_pos":
        pos_enc_class = PositionalEncoding
    elif pos_enc_type == "scaled_abs_pos":
        pos_enc_class = ScaledPositionalEncoding
    elif pos_enc_type == "rel_pos":
        if net_part == "encoder" and self_attn_type != "rel_self_attn":
            raise ValueError("'rel_pos' is only compatible with 'rel_self_attn'")

        pos_enc_class = RelPositionalEncoding
    else:
        raise NotImplementedError(
            "pos_enc_type should be either 'abs_pos', 'scaled_abs_pos' or 'rel_pos'"
        )

    if self_attn_type == "rel_self_attn":
        self_attn_class = RelPositionMultiHeadedAttention
    else:
        self_attn_class = MultiHeadedAttention

    return pos_enc_class, self_attn_class

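# Example (illustrative): relative positional encoding must be paired with
# relative self-attention on the encoder side.
#
#     >>> pos_enc_class, self_attn_class = get_pos_enc_and_att_class(
#     ...     "encoder", "rel_pos", "rel_self_attn"
#     ... )
#     >>> pos_enc_class is RelPositionalEncoding
#     True
#     >>> self_attn_class is RelPositionMultiHeadedAttention
#     True
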
def build_input_layer(
    block: Dict[str, Any],
    pos_enc_class: torch.nn.Module,
    padding_idx: int,
) -> Tuple[Union[Conv2dSubsampling, VGG2L, torch.nn.Sequential], int]:
    """Build input layer.

    Args:
        block: Architecture definition of input layer.
        pos_enc_class: Positional encoding class.
        padding_idx: Padding symbol ID for embedding layer (if provided).

    Returns:
        : Input layer module.
        subsampling_factor: Subsampling factor.

    """
    input_type = block["type"]

    idim = block["idim"]
    odim = block["odim"]

    dropout_rate = block["dropout-rate"]
    pos_dropout_rate = block["pos-dropout-rate"]

    if pos_enc_class.__name__ == "RelPositionalEncoding":
        pos_enc_class_subsampling = pos_enc_class(odim, pos_dropout_rate)
    else:
        pos_enc_class_subsampling = None

    if input_type == "linear":
        return (
            torch.nn.Sequential(
                torch.nn.Linear(idim, odim),
                torch.nn.LayerNorm(odim),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
                pos_enc_class(odim, pos_dropout_rate),
            ),
            1,
        )
    elif input_type == "conv2d":
        return Conv2dSubsampling(idim, odim, dropout_rate, pos_enc_class_subsampling), 4
    elif input_type == "vgg2l":
        return VGG2L(idim, odim, pos_enc_class_subsampling), 4
    elif input_type == "embed":
        return (
            torch.nn.Sequential(
                torch.nn.Embedding(idim, odim, padding_idx=padding_idx),
                pos_enc_class(odim, pos_dropout_rate),
            ),
            1,
        )
    elif input_type == "c-embed":
        return (
            torch.nn.Sequential(
                torch.nn.Embedding(idim, odim, padding_idx=padding_idx),
                torch.nn.Dropout(dropout_rate),
            ),
            1,
        )
    else:
        raise NotImplementedError(
            "Invalid input layer: %s. Supported: linear, conv2d, vgg2l and embed"
            % input_type
        )

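# Example (illustrative; hypothetical values): a linear input layer projecting
# 80-dim features to 256 units. The block dict mirrors the output format of
# prepare_input_layer above.
#
#     >>> block = {"type": "linear", "idim": 80, "odim": 256,
#     ...          "dropout-rate": 0.1, "pos-dropout-rate": 0.1}
#     >>> layer, subsampling_factor = build_input_layer(block, PositionalEncoding, -1)
#     >>> subsampling_factor
#     1
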
def build_transformer_block(
    net_part: str,
    block: Dict[str, Any],
    pw_layer_type: str,
    pw_activation_type: str,
) -> Union[EncoderLayer, TransformerDecoderLayer]:
    """Build function for transformer block.

    Args:
        net_part: Network part, either 'encoder' or 'decoder'.
        block: Transformer block parameters.
        pw_layer_type: Positionwise layer type.
        pw_activation_type: Positionwise activation type.

    Returns:
        : Function to create transformer (encoder or decoder) block.

    """
    d_hidden = block["d_hidden"]

    dropout_rate = block.get("dropout-rate", 0.0)
    pos_dropout_rate = block.get("pos-dropout-rate", 0.0)
    att_dropout_rate = block.get("att-dropout-rate", 0.0)

    if pw_layer_type != "linear":
        raise NotImplementedError(
            "Transformer block only supports linear positionwise layer."
        )

    if net_part == "encoder":
        transformer_layer_class = EncoderLayer
    elif net_part == "decoder":
        transformer_layer_class = TransformerDecoderLayer

    return lambda: transformer_layer_class(
        d_hidden,
        MultiHeadedAttention(block["heads"], d_hidden, att_dropout_rate),
        PositionwiseFeedForward(
            d_hidden,
            block["d_ff"],
            pos_dropout_rate,
            get_activation(pw_activation_type),
        ),
        dropout_rate,
    )

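# Example (illustrative; hypothetical values): the returned value is a factory,
# not a module. Calling it instantiates a fresh layer each time.
#
#     >>> block = {"type": "transformer", "d_hidden": 256, "d_ff": 1024, "heads": 4}
#     >>> fn = build_transformer_block("encoder", block, "linear", "relu")
#     >>> layer = fn()  # an EncoderLayer instance
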
def build_conformer_block(
    block: Dict[str, Any],
    self_attn_class: torch.nn.Module,
    pw_layer_type: str,
    pw_activation_type: str,
    conv_mod_activation_type: str,
) -> ConformerEncoderLayer:
    """Build function for conformer block.

    Args:
        block: Conformer block parameters.
        self_attn_class: Self-attention module class.
        pw_layer_type: Positionwise layer type.
        pw_activation_type: Positionwise activation type.
        conv_mod_activation_type: Convolutional module activation type.

    Returns:
        : Function to create conformer (encoder) block.

    """
    d_hidden = block["d_hidden"]
    d_ff = block["d_ff"]

    dropout_rate = block.get("dropout-rate", 0.0)
    pos_dropout_rate = block.get("pos-dropout-rate", 0.0)
    att_dropout_rate = block.get("att-dropout-rate", 0.0)

    macaron_style = block["macaron_style"]
    use_conv_mod = block["use_conv_mod"]

    if pw_layer_type == "linear":
        pw_layer = PositionwiseFeedForward
        pw_layer_args = (
            d_hidden,
            d_ff,
            pos_dropout_rate,
            get_activation(pw_activation_type),
        )
    else:
        raise NotImplementedError(
            "Conformer block only supports linear positionwise layer."
        )

    if macaron_style:
        macaron_net = PositionwiseFeedForward
        macaron_net_args = (
            d_hidden,
            d_ff,
            pos_dropout_rate,
            get_activation(pw_activation_type),
        )

    if use_conv_mod:
        conv_mod = ConvolutionModule
        conv_mod_args = (
            d_hidden,
            block["conv_mod_kernel"],
            get_activation(conv_mod_activation_type),
        )

    return lambda: ConformerEncoderLayer(
        d_hidden,
        self_attn_class(block["heads"], d_hidden, att_dropout_rate),
        pw_layer(*pw_layer_args),
        macaron_net(*macaron_net_args) if macaron_style else None,
        conv_mod(*conv_mod_args) if use_conv_mod else None,
        dropout_rate,
    )

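# Example (illustrative; hypothetical values): a conformer block with macaron
# feed-forward and a convolution module of kernel size 31.
#
#     >>> block = {
#     ...     "type": "conformer", "d_hidden": 256, "d_ff": 1024, "heads": 4,
#     ...     "macaron_style": True, "use_conv_mod": True, "conv_mod_kernel": 31,
#     ... }
#     >>> fn = build_conformer_block(
#     ...     block, RelPositionMultiHeadedAttention, "linear", "swish", "swish"
#     ... )
#     >>> layer = fn()  # a ConformerEncoderLayer instance
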
def build_conv1d_block(
    block: Dict[str, Any], block_type: str
) -> Union[Conv1d, CausalConv1d]:
    """Build function for conv1d or causal conv1d block.

    Args:
        block: Conv1d or CausalConv1d block parameters.
        block_type: Block type, either 'conv1d' or 'causal-conv1d'.

    Returns:
        : Function to create conv1d (encoder) or causal conv1d (decoder) block.

    """
    if block_type == "conv1d":
        conv_class = Conv1d
    else:
        conv_class = CausalConv1d

    stride = block.get("stride", 1)
    dilation = block.get("dilation", 1)
    groups = block.get("groups", 1)
    bias = block.get("bias", True)

    use_batch_norm = block.get("use-batch-norm", False)
    use_relu = block.get("use-relu", False)
    dropout_rate = block.get("dropout-rate", 0.0)

    return lambda: conv_class(
        block["idim"],
        block["odim"],
        block["kernel_size"],
        stride=stride,
        dilation=dilation,
        groups=groups,
        bias=bias,
        relu=use_relu,
        batch_norm=use_batch_norm,
        dropout_rate=dropout_rate,
    )

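# Example (illustrative; hypothetical values): a causal conv1d block for the
# decoder side, using the defaults for stride, dilation, groups, and bias.
#
#     >>> block = {"type": "causal-conv1d", "idim": 256, "odim": 256,
#     ...          "kernel_size": 5}
#     >>> fn = build_conv1d_block(block, "causal-conv1d")
#     >>> layer = fn()  # a CausalConv1d instance
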
def build_blocks(
    net_part: str,
    idim: int,
    input_layer_type: str,
    blocks: List[Dict[str, Any]],
    repeat_block: int = 0,
    self_attn_type: str = "self_attn",
    positional_encoding_type: str = "abs_pos",
    positionwise_layer_type: str = "linear",
    positionwise_activation_type: str = "relu",
    conv_mod_activation_type: str = "relu",
    input_layer_dropout_rate: float = 0.0,
    input_layer_pos_enc_dropout_rate: float = 0.0,
    padding_idx: int = -1,
) -> Tuple[
    Union[Conv2dSubsampling, VGG2L, torch.nn.Sequential], MultiSequential, int, int
]:
    """Build custom model blocks.

    Args:
        net_part: Network part, either 'encoder' or 'decoder'.
        idim: Input dimension.
        input_layer_type: Input layer type.
        blocks: Blocks parameters for network part.
        repeat_block: Number of times provided blocks are repeated.
        self_attn_type: Self-attention module type.
        positional_encoding_type: Positional encoding layer type.
        positionwise_layer_type: Positionwise layer type.
        positionwise_activation_type: Positionwise activation type.
        conv_mod_activation_type: Convolutional module activation type.
        input_layer_dropout_rate: Dropout rate for input layer.
        input_layer_pos_enc_dropout_rate: Dropout rate for input layer pos. enc.
        padding_idx: Padding symbol ID for embedding layer.

    Returns:
        in_layer: Input layer module.
        all_blocks: Encoder/Decoder network.
        out_dim: Network output dimension.
        conv_subsampling_factor: Subsampling factor in frontend CNN.

    """
    fn_modules = []

    pos_enc_class, self_attn_class = get_pos_enc_and_att_class(
        net_part, positional_encoding_type, self_attn_type
    )

    input_block = prepare_input_layer(
        input_layer_type,
        idim,
        blocks,
        input_layer_dropout_rate,
        input_layer_pos_enc_dropout_rate,
    )
    out_dim = prepare_body_model(net_part, blocks)

    input_layer, conv_subsampling_factor = build_input_layer(
        input_block,
        pos_enc_class,
        padding_idx,
    )

    for i in range(len(blocks)):
        block_type = blocks[i]["type"]

        if block_type in ("causal-conv1d", "conv1d"):
            module = build_conv1d_block(blocks[i], block_type)
        elif block_type == "conformer":
            module = build_conformer_block(
                blocks[i],
                self_attn_class,
                positionwise_layer_type,
                positionwise_activation_type,
                conv_mod_activation_type,
            )
        elif block_type == "transformer":
            module = build_transformer_block(
                net_part,
                blocks[i],
                positionwise_layer_type,
                positionwise_activation_type,
            )

        fn_modules.append(module)

    if repeat_block > 1:
        fn_modules = fn_modules * repeat_block

    return (
        input_layer,
        MultiSequential(*[fn() for fn in fn_modules]),
        out_dim,
        conv_subsampling_factor,
    )

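# Example (illustrative sketch; configuration values are hypothetical, not
# taken from an ESPnet recipe): a small custom encoder made of a conv2d
# frontend followed by a transformer block repeated twice.
#
#     >>> blocks = [{"type": "transformer", "d_hidden": 256, "d_ff": 1024, "heads": 4}]
#     >>> in_layer, encoder, out_dim, conv_subsampling_factor = build_blocks(
#     ...     "encoder", 80, "conv2d", blocks, repeat_block=2
#     ... )
#     >>> out_dim, conv_subsampling_factor  # encoder holds 2 transformer layers
#     (256, 4)
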