# Source code for espnet2.asr.state_spaces.pool

# This code is derived from

"""Implements downsampling and upsampling on sequences."""

import torch
import torch.nn.functional as F
from einops import rearrange, reduce, repeat
from torch import nn

from espnet2.asr.state_spaces.base import SequenceModule
from espnet2.asr.state_spaces.components import LinearActivation

"""Simple pooling functions that just downsample or repeat.

stride: Subsample on the layer dimension
expand: Repeat on the feature dimension
"""

def downsample(x, stride=1, expand=1, transposed=False):
    """Subsample a sequence by striding and widen its features by repetition.

    Args:
        x: Input tensor, or None. When ``transposed`` is true the length axis
            is the last one and features sit on axis 1; otherwise the length
            axis is second-to-last and features are last.
        stride: Keep every ``stride``-th position along the length axis.
            Values above 1 are only supported for 3-dimensional inputs.
        expand: Number of times each feature is repeated.
        transposed: Selects the layout described above.

    Returns:
        The downsampled tensor, or None when ``x`` is None.
    """
    if x is None:
        return None

    if stride > 1:
        assert x.ndim == 3, (
            "Downsampling with higher-dimensional inputs is currently not supported."
            "It is recommended to use average or spectral pooling instead."
        )
        # The length axis is last when transposed, second-to-last otherwise.
        x = x[..., ::stride] if transposed else x[..., ::stride, :]

    if expand > 1:
        pattern = "b d ... -> b (d e) ..." if transposed else "b ... d -> b ... (d e)"
        x = repeat(x, pattern, e=expand)

    return x
def upsample(x, stride=1, expand=1, transposed=False):
    """Shrink the feature axis by averaging and stretch the length by repetition.

    Args:
        x: Input tensor, or None. When ``transposed`` is true the last two
            axes are (features, length); otherwise (length, features).
        stride: Number of times each position along the length axis is
            repeated.
        expand: Size of the feature groups that are averaged together.
        transposed: Selects the layout described above.

    Returns:
        The upsampled tensor, or None when ``x`` is None.
    """
    if x is None:
        return None

    if expand > 1:
        # Average every group of ``expand`` features into a single feature.
        mean_pattern = "... (d e) l -> ... d l" if transposed else "... (d e) -> ... d"
        x = reduce(x, mean_pattern, "mean", e=expand)

    if stride > 1:
        # Repeat each position ``stride`` times along the length axis.
        tile_pattern = "... l -> ... (l e)" if transposed else "... l d -> ... (l e) d"
        x = repeat(x, tile_pattern, e=stride)

    return x
class DownSample(SequenceModule):
    """Parameter-free downsampling layer that delegates to ``downsample``.

    Args:
        d_input: Feature size of the input.
        stride: Subsampling factor along the length axis.
        expand: Repetition factor along the feature axis.
        transposed: Layout flag forwarded to ``downsample``.
    """

    def __init__(self, d_input, stride=1, expand=1, transposed=True):
        super().__init__()
        self.d_input = d_input
        self.stride = stride
        self.expand = expand
        self.transposed = transposed

    def forward(self, x):
        """Downsample ``x`` with the configured stride, expand and layout."""
        # downsample() accepts (x, stride, expand, transposed). An extra
        # positional ``False`` used to be passed here, which made every call
        # fail with a TypeError.
        return downsample(x, self.stride, self.expand, self.transposed)

    def step(self, x, state, **kwargs):
        """Pass a single step through unchanged.

        Raises:
            NotImplementedError: If ``stride`` or ``expand`` is greater than 1.
        """
        if self.stride > 1 or self.expand > 1:
            raise NotImplementedError
        return x, state

    @property
    def d_output(self):
        """Feature size of the output: ``d_input * expand``."""
        return self.d_input * self.expand
class DownAvgPool(SequenceModule):
    """Downsampling by average pooling, with optional feature repetition.

    Args:
        d_input: Feature size of the input.
        stride: Pooling window and step applied to every non-batch,
            non-feature axis.
        expand: Repetition factor along the feature axis.
        transposed: If true the input is treated as (b, d, ...); otherwise as
            (b, ..., d) and it is moved to (b, d, ...) internally.
    """

    def __init__(self, d_input, stride=1, expand=1, transposed=True):
        super().__init__()
        self.d_input = d_input
        self.stride = stride
        self.expand = expand
        self.transposed = transposed

    def _pool(self, x):
        """Average-pool a (b, d, ...) tensor by ``self.stride`` on each axis."""
        # einops appears slower than F
        if x.ndim == 3:
            return F.avg_pool1d(x, self.stride, self.stride)
        if x.ndim == 4:
            return F.avg_pool2d(x, self.stride, self.stride)

        # Reduction string e.g. "b d (l1 2) (l2 2) -> b d l1 l2"
        axes = range(x.ndim - 2)
        grouped = " ".join([f"(l{i} {self.stride})" for i in axes])
        pooled = " ".join([f"l{i}" for i in axes])
        reduce_str = "b d " + grouped + " -> b d " + pooled
        return reduce(x, reduce_str, "mean")

    def forward(self, x):
        """Pool ``x`` and return it in the same layout it was given in."""
        features_last = not self.transposed
        if features_last:
            x = rearrange(x, "b ... d -> b d ...")

        if self.stride > 1:
            x = self._pool(x)

        if self.expand > 1:
            x = repeat(x, "b d ... -> b (d e) ...", e=self.expand)

        if features_last:
            x = rearrange(x, "b d ... -> b ... d")
        return x

    def step(self, x, state, **kwargs):
        """Pass a single step through unchanged.

        Raises:
            NotImplementedError: If ``stride`` or ``expand`` is greater than 1.
        """
        if not (self.stride > 1 or self.expand > 1):
            return x, state
        raise NotImplementedError

    @property
    def d_output(self):
        """Feature size of the output: ``d_input * expand``."""
        return self.d_input * self.expand
class DownSpectralPool(SequenceModule):
    """Downsampling by cropping the spectrum of the input.

    Args:
        d_input: Feature size of the input.
        stride: Factor by which every length axis is shortened; each length
            must be divisible by it.
        expand: Repetition factor along the feature axis.
        transposed: If true the input is treated as (b, d, ...); otherwise as
            (b, ..., d) and it is moved to (b, d, ...) internally.
    """

    def __init__(self, d_input, stride=1, expand=1, transposed=True):
        super().__init__()
        self.d_input = d_input
        self.stride = stride
        self.expand = expand
        self.transposed = transposed

    def forward(self, x):
        """Forward pass.

        x: (B, L..., D) when not transposed, (B, D, L...) when transposed.
        """
        if not self.transposed:
            x = rearrange(x, "b ... d -> b d ...")
        shape = x.shape[2:]
        # NOTE(review): the transform into the spectral domain uses ifftn, and
        # so does the transform back below. A forward fftn would be expected
        # here; left unchanged because switching it alters numerical output.
        x_f = torch.fft.ifftn(x, s=shape)

        for axis, length in enumerate(shape):
            assert length % self.stride == 0, "input length must be divisible by stride"
            new_length = length // self.stride
            # Keep the lowest ``new_length`` frequencies: the non-negative ones
            # from the front and the negative ones from the back. The count of
            # negative bins must be ``new_length // 2``; writing it as
            # ``-new_length // 2`` rounds toward -inf and selects one bin too
            # many when ``new_length`` is odd.
            n_negative = new_length // 2
            idx = torch.cat(
                [
                    torch.arange(0, new_length - n_negative),
                    length + torch.arange(-n_negative, 0),
                ]
            ).to(x_f.device)
            x_f = torch.index_select(x_f, 2 + axis, idx)

        x = torch.fft.ifftn(x_f, s=[length // self.stride for length in shape])
        x = x.real

        if self.expand > 1:
            x = repeat(x, "b d ... -> b (d e) ...", e=self.expand)
        if not self.transposed:
            x = rearrange(x, "b d ... -> b ... d")
        return x

    def step(self, x, state, **kwargs):
        """Pass a single step through unchanged.

        Raises:
            NotImplementedError: If ``stride`` or ``expand`` is greater than 1.
        """
        if self.stride > 1 or self.expand > 1:
            raise NotImplementedError
        return x, state

    @property
    def d_output(self):
        """Feature size of the output: ``d_input * expand``."""
        return self.d_input * self.expand
class UpSample(nn.Module):
    """Parameter-free upsampling layer that delegates to ``upsample``.

    Args:
        d_input: Feature size of the input.
        stride: Repetition factor along the length axis.
        expand: Size of the feature groups that are averaged together.
        transposed: Layout flag forwarded to ``upsample``.
    """

    def __init__(self, d_input, stride=1, expand=1, transposed=True):
        super().__init__()
        self.d_input = d_input
        self.stride = stride
        self.expand = expand
        self.transposed = transposed

    def forward(self, x):
        """Upsample ``x`` with the configured stride, expand and layout."""
        return upsample(
            x,
            stride=self.stride,
            expand=self.expand,
            transposed=self.transposed,
        )

    @property
    def d_output(self):
        """Feature size of the output: ``d_input // expand``."""
        return self.d_input // self.expand

    def step(self, x, state, **kwargs):
        """Pass a single step through unchanged.

        Raises:
            NotImplementedError: If ``stride`` or ``expand`` is greater than 1.
        """
        if not (self.stride > 1 or self.expand > 1):
            return x, state
        raise NotImplementedError
""" Pooling functions with trainable parameters """ # For the flexible backbone SequenceModel
class DownLinearPool(SequenceModule):
    """Downsampling that folds strided positions into features, then projects.

    Every group of ``stride`` consecutive positions is merged into the feature
    axis and passed through a ``LinearActivation`` built with sizes
    ``d_input * stride`` and ``d_input * expand``.

    Args:
        d_input: Feature size of the input.
        stride: Number of consecutive positions merged together.
        expand: Multiplier that sets the output feature size.
        transposed: If true the last two axes are (features, length);
            otherwise (length, features).
    """

    def __init__(self, d_input, stride=1, expand=1, transposed=True):
        super().__init__()
        self.d_input = d_input
        self.stride = stride
        self.expand = expand
        self.transposed = transposed

        folded_width = d_input * stride
        projected_width = d_input * expand
        self.linear = LinearActivation(
            folded_width,
            projected_width,
            transposed=transposed,
        )

    def forward(self, x):
        """Fold ``stride`` positions into the feature axis and project."""
        pattern = (
            "... h (l s) -> ... (h s) l"
            if self.transposed
            else "... (l s) h -> ... l (h s)"
        )
        folded = rearrange(x, pattern, s=self.stride)
        return self.linear(folded)

    def step(self, x, state, **kwargs):
        """Pass a single step through unchanged.

        Raises:
            NotImplementedError: If ``stride`` or ``expand`` is greater than 1.
        """
        if not (self.stride > 1 or self.expand > 1):
            return x, state
        raise NotImplementedError

    @property
    def d_output(self):
        """Feature size of the output: ``d_input * expand``."""
        return self.d_input * self.expand
""" Pooling functions with trainable parameters """
class DownPool2d(SequenceModule):
    """2-D average pooling followed by a linear projection of the features.

    Args:
        d_input: Feature size of the input.
        d_output: Feature size produced by the linear projection.
        stride: Kernel size and stride of the average pooling.
        transposed: If true the input is treated as (b, d, l1, l2); otherwise
            as (b, l1, l2, d) and it is moved to channel-first for pooling.
        weight_norm: Forwarded to ``LinearActivation``.
    """

    def __init__(self, d_input, d_output, stride=1, transposed=True, weight_norm=True):
        super().__init__()

        # forward() reads this flag; it was previously never stored, so the
        # attribute lookup failed.
        self.stride = stride
        self.transposed = transposed

        self.linear = LinearActivation(
            d_input,
            d_output,
            transposed=transposed,
            weight_norm=weight_norm,
        )

        # Must be the module itself: a trailing comma used to wrap it in a
        # one-element tuple, which is not callable and is not registered as a
        # submodule.
        self.pool = nn.AvgPool2d(kernel_size=stride, stride=stride)

    def forward(self, x):
        """Pool the two length axes, project the features, return the result."""
        # AvgPool2d pools the last two axes, so a features-last input is moved
        # to channel-first first, mirroring DownAvgPool.
        if not self.transposed:
            x = rearrange(x, "b ... d -> b d ...")
        x = self.pool(x)
        if not self.transposed:
            x = rearrange(x, "b d ... -> b ... d")

        # NOTE(review): the original forward stopped after pooling and returned
        # nothing. Applying the projection here is inferred from ``self.linear``
        # being constructed with (d_input, d_output) - confirm the intended
        # order and return convention.
        x = self.linear(x)
        return x
# DownLinearPool is used by the registry (for isotropic backbone) # DownPool is essentially the same as DownLinearPool. These should be consolidated
class DownPool(SequenceModule):
    """Downsampling that folds strided positions into features, then projects.

    Exactly one of ``d_output`` and ``expand`` must be given; when ``expand``
    is given the output size is ``d_input * expand``.

    Args:
        d_input: Feature size of the input.
        d_output: Feature size of the output, or None.
        expand: Multiplier that sets the output feature size, or None.
        stride: Number of consecutive positions merged together.
        transposed: If true the last two axes are (features, length);
            otherwise (length, features).
        weight_norm: Forwarded to ``LinearActivation``.
        initializer: Forwarded to ``LinearActivation``.
        activation: Forwarded to ``LinearActivation``; also switches its
            ``activate`` flag on when not None.
    """

    def __init__(
        self,
        d_input,
        d_output=None,
        expand=None,
        stride=1,
        transposed=True,
        weight_norm=True,
        initializer=None,
        activation=None,
    ):
        super().__init__()
        assert (d_output is None) + (expand is None) == 1
        if d_output is None:
            d_output = d_input * expand

        self.d_output = d_output
        self.stride = stride
        self.transposed = transposed

        has_activation = activation is not None
        self.linear = LinearActivation(
            d_input * stride,
            d_output,
            transposed=transposed,
            initializer=initializer,
            weight_norm=weight_norm,
            activation=activation,
            activate=has_activation,
        )

    def forward(self, x):
        """Fold ``stride`` positions into features and project.

        Returns:
            A tuple ``(output, None)``.
        """
        pattern = (
            "... h (l s) -> ... (h s) l"
            if self.transposed
            else "... (l s) h -> ... l (h s)"
        )
        folded = rearrange(x, pattern, s=self.stride)
        return self.linear(folded), None

    def step(self, x, state, **kwargs):
        """Step one time step as a recurrent model.

        x: (..., H)

        Inputs are appended to ``state`` until ``stride`` of them have been
        collected; only then is an output produced and the buffer reset.
        """
        if x is None:
            return None, state

        state.append(x)
        if len(state) != self.stride:
            # Still buffering: nothing to emit yet.
            return None, state

        stacked = torch.stack(state, dim=-1)
        x = rearrange(stacked, "... h s -> ... (h s)")
        if self.transposed:
            # Add a length-1 axis for the transposed linear, then drop it.
            x = self.linear(x.unsqueeze(-1)).squeeze(-1)
        else:
            x = self.linear(x)
        return x, []

    def default_state(self, *batch_shape, device=None):
        """Return an empty buffer of pending inputs."""
        return []
class UpPool(SequenceModule):
    """Upsampling that projects features and unfolds them along the length.

    A ``LinearActivation`` built with sizes ``d_input`` and
    ``d_output * stride`` is applied, and each group of ``stride`` output
    features is spread over ``stride`` consecutive positions.

    Args:
        d_input: Feature size of the input.
        d_output: Feature size of the output.
        stride: Number of positions produced per input position.
        transposed: If true the last two axes are (features, length);
            otherwise (length, features).
        weight_norm: Forwarded to ``LinearActivation``.
        initializer: Forwarded to ``LinearActivation``.
        activation: Forwarded to ``LinearActivation``; also switches its
            ``activate`` flag on when not None.
    """

    def __init__(
        self,
        d_input,
        d_output,
        stride,
        transposed=True,
        weight_norm=True,
        initializer=None,
        activation=None,
    ):
        super().__init__()
        self.d_input = d_input
        self._d_output = d_output
        self.stride = stride
        self.transposed = transposed

        has_activation = activation is not None
        self.linear = LinearActivation(
            d_input,
            d_output * stride,
            transposed=transposed,
            initializer=initializer,
            weight_norm=weight_norm,
            activation=activation,
            activate=has_activation,
        )

    def forward(self, x, skip=None):
        """Project, shift by one position, unfold, and add ``skip`` if given.

        Returns:
            A tuple ``(output, None)``.
        """
        projected = self.linear(x)
        if self.transposed:
            # Shift to ensure causality
            shifted = F.pad(projected[..., :-1], (1, 0))
            out = rearrange(shifted, "... (h s) l -> ... h (l s)", s=self.stride)
        else:
            # Shift to ensure causality
            shifted = F.pad(projected[..., :-1, :], (0, 0, 1, 0))
            out = rearrange(shifted, "... l (h s) -> ... (l s) h", s=self.stride)

        if skip is not None:
            out = out + skip
        return out, None

    def step(self, x, state, **kwargs):
        """Step one time step as a recurrent model.

        x: (..., H)

        The head of ``state`` is emitted. When that empties the buffer, ``x``
        must be provided and is projected to refill it with ``stride`` new
        entries; otherwise ``x`` must be None.
        """
        assert len(state) > 0
        y = state[0]
        state = state[1:]

        if len(state) == 0:
            assert x is not None
            if self.transposed:
                # Add a length-1 axis for the transposed linear, then drop it.
                x = self.linear(x.unsqueeze(-1)).squeeze(-1)
            else:
                x = self.linear(x)
            unfolded = rearrange(x, "... (h s) -> ... h s", s=self.stride)
            state = list(torch.unbind(unfolded, dim=-1))
        else:
            assert x is None
        return y, state

    def default_state(self, *batch_shape, device=None):
        """Return ``stride`` zero tensors of shape ``batch_shape + (d_output,)``."""
        zeros = torch.zeros(
            batch_shape + (self.d_output, self.stride), device=device
        )  # (batch, h, s)
        return list(torch.unbind(zeros, dim=-1))  # List of (..., H)

    @property
    def d_output(self):
        """Feature size of the output, as given at construction."""
        return self._d_output
# Maps a short pooling-method name to the downsampling class implementing it.
registry = {
    "sample": DownSample,
    "pool": DownAvgPool,
    "linear": DownLinearPool,
    "spectral": DownSpectralPool,
}