Source code for espnet2.spk.encoder.xvector_encoder

# x-vector, cross checked with SpeechBrain implementation:
# https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/lobes/models/Xvector.py
# adapted for ESPnet-SPK by Jee-weon Jung
from typing import List

import torch.nn as nn
from typeguard import typechecked

from espnet2.asr.encoder.abs_encoder import AbsEncoder


class XvectorEncoder(AbsEncoder):
    """X-vector encoder. Extracts frame-level x-vector embeddings from features.

    Paper: D. Snyder et al., "X-vectors: Robust DNN embeddings for speaker
    recognition," in Proc. IEEE ICASSP, 2018.

    Args:
        input_size: input feature dimension.
        ndim: dimensionality of the hidden representation.
        output_size: output embedding dimension.
        kernel_sizes: kernel size of each of the five TDNN (Conv1d) layers.
        paddings: padding of each TDNN layer.
        dilations: dilation of each TDNN layer.
    """

    @typechecked
    def __init__(
        self,
        input_size: int,
        ndim: int = 512,
        output_size: int = 1500,
        kernel_sizes: List = [5, 3, 3, 1, 1],
        paddings: List = [2, 1, 1, 0, 0],
        dilations: List = [1, 2, 3, 1, 1],
        **kwargs,
    ):
        super().__init__()
        self._output_size = output_size

        in_channels = [input_size] + [ndim] * 4
        out_channels = [ndim] * 4 + [output_size]

        # Five TDNN blocks, each Conv1d -> ReLU -> BatchNorm1d.
        self.layers = nn.ModuleList()
        for idx in range(5):
            self.layers.append(
                nn.Conv1d(
                    in_channels[idx],
                    out_channels[idx],
                    kernel_sizes[idx],
                    dilation=dilations[idx],
                    padding=paddings[idx],
                )
            )
            self.layers.append(nn.ReLU())
            self.layers.append(nn.BatchNorm1d(out_channels[idx]))
    def output_size(self) -> int:
        return self._output_size
    def forward(self, x):
        x = x.permute(0, 2, 1)  # (B, S, D) -> (B, D, S)
        for layer in self.layers:
            x = layer(x)
        return x
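
The snippet below is not part of the module above; it is a minimal usage sketch showing how the encoder maps a batch of features to frame-level embeddings. In ESPnet-SPK the encoder is normally constructed from a training config, and the feature dimension (80) and frame count (200) here are illustrative assumptions.

import torch

from espnet2.spk.encoder.xvector_encoder import XvectorEncoder

# Defaults: ndim=512, output_size=1500.
encoder = XvectorEncoder(input_size=80)
encoder.eval()

# Input features: (batch, frames, feature_dim).
feats = torch.randn(4, 200, 80)

with torch.no_grad():
    out = encoder(feats)

# Frame-level embeddings: (batch, output_size, frames'). With the default
# kernels/dilations/paddings, the dilated TDNN layers trim 6 frames.
print(out.shape)  # torch.Size([4, 1500, 194])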