Source code for espnet2.gan_svs.avocodo.avocodo

# Copyright 2023 Yifeng Yu
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Avocodo Modules.

This code is modified from https://github.com/ncsoft/avocodo.

"""

import logging
from typing import Any, Dict, List, Optional, Tuple

import torch
import torch.nn.functional as F
from torch.nn import Conv1d
from torch.nn.utils import spectral_norm, weight_norm

from espnet2.gan_svs.visinger2.visinger2_vocoder import MultiFrequencyDiscriminator
from espnet2.gan_tts.hifigan.residual_block import ResidualBlock
from espnet2.gan_tts.melgan.pqmf import PQMF


def get_padding(kernel_size, dilation=1):
    """Return "same" padding for a stride-1 dilated convolution (odd kernels)."""
    return int((kernel_size * dilation - dilation) / 2)
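
# Worked example: a kernel of size 7 with dilation 3 spans
# 1 + (7 - 1) * 3 = 19 taps, so "same" padding is (19 - 1) // 2 = 9,
# i.e. get_padding(7, 3) == 9.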
class AvocodoGenerator(torch.nn.Module):
    """Avocodo generator module."""

    def __init__(
        self,
        in_channels: int = 80,
        out_channels: int = 1,
        channels: int = 512,
        global_channels: int = -1,
        kernel_size: int = 7,
        upsample_scales: List[int] = [8, 8, 2, 2],
        upsample_kernel_sizes: List[int] = [16, 16, 4, 4],
        resblock_kernel_sizes: List[int] = [3, 7, 11],
        resblock_dilations: List[List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
        projection_filters: List[int] = [0, 1, 1, 1],
        projection_kernels: List[int] = [0, 5, 7, 11],
        use_additional_convs: bool = True,
        bias: bool = True,
        nonlinear_activation: str = "LeakyReLU",
        nonlinear_activation_params: Dict[str, Any] = {"negative_slope": 0.2},
        use_weight_norm: bool = True,
    ):
        """Initialize AvocodoGenerator module.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            channels (int): Number of hidden representation channels.
            global_channels (int): Number of global conditioning channels.
            kernel_size (int): Kernel size of initial and final conv layer.
            upsample_scales (List[int]): List of upsampling scales.
            upsample_kernel_sizes (List[int]): List of kernel sizes for
                upsample layers.
            resblock_kernel_sizes (List[int]): List of kernel sizes for
                residual blocks.
            resblock_dilations (List[List[int]]): List of lists of dilations
                for residual blocks.
            projection_filters (List[int]): List of output channels for the
                projection conv of each upsample stage (0 disables the
                projection for that stage).
            projection_kernels (List[int]): List of kernel sizes for the
                projection conv of each upsample stage.
            use_additional_convs (bool): Whether to use additional conv layers
                in residual blocks.
            bias (bool): Whether to add bias parameter in convolution layers.
            nonlinear_activation (str): Activation function module name.
            nonlinear_activation_params (Dict[str, Any]): Hyperparameters for
                activation function.
            use_weight_norm (bool): Whether to use weight norm. If set to
                true, it will be applied to all of the conv layers.

        """
        super().__init__()

        # check hyperparameters are valid
        assert kernel_size % 2 == 1, "Kernel size must be an odd number."
        assert len(upsample_scales) == len(upsample_kernel_sizes)
        assert len(resblock_dilations) == len(resblock_kernel_sizes)

        # define modules
        self.num_upsamples = len(upsample_kernel_sizes)
        self.num_blocks = len(resblock_kernel_sizes)
        self.input_conv = torch.nn.Conv1d(
            in_channels,
            channels,
            kernel_size,
            1,
            padding=(kernel_size - 1) // 2,
        )
        self.upsamples = torch.nn.ModuleList()
        self.blocks = torch.nn.ModuleList()
        self.output_conv = torch.nn.ModuleList()
        for i in range(len(upsample_kernel_sizes)):
            assert upsample_kernel_sizes[i] == 2 * upsample_scales[i]
            self.upsamples += [
                torch.nn.Sequential(
                    getattr(torch.nn, nonlinear_activation)(
                        **nonlinear_activation_params
                    ),
                    torch.nn.ConvTranspose1d(
                        channels // (2**i),
                        channels // (2 ** (i + 1)),
                        upsample_kernel_sizes[i],
                        upsample_scales[i],
                        padding=upsample_scales[i] // 2 + upsample_scales[i] % 2,
                        output_padding=upsample_scales[i] % 2,
                    ),
                )
            ]
            for j in range(len(resblock_kernel_sizes)):
                self.blocks += [
                    ResidualBlock(
                        kernel_size=resblock_kernel_sizes[j],
                        channels=channels // (2 ** (i + 1)),
                        dilations=resblock_dilations[j],
                        bias=bias,
                        use_additional_convs=use_additional_convs,
                        nonlinear_activation=nonlinear_activation,
                        nonlinear_activation_params=nonlinear_activation_params,
                    )
                ]
            if projection_filters[i] != 0:
                self.output_conv.append(
                    torch.nn.Conv1d(
                        channels // (2 ** (i + 1)),
                        projection_filters[i],
                        projection_kernels[i],
                        1,
                        padding=projection_kernels[i] // 2,
                    )
                )
            else:
                self.output_conv.append(torch.nn.Identity())
        if global_channels > 0:
            self.global_conv = torch.nn.Conv1d(global_channels, channels, 1)

        # apply weight norm
        if use_weight_norm:
            self.apply_weight_norm()

        # reset parameters
        self.reset_parameters()
    def forward(
        self, c: torch.Tensor, g: Optional[torch.Tensor] = None
    ) -> List[torch.Tensor]:
        """Calculate forward propagation.

        Args:
            c (Tensor): Input tensor (B, in_channels, T).
            g (Optional[Tensor]): Global conditioning tensor
                (B, global_channels, 1).

        Returns:
            List[Tensor]: List of output tensors (B, out_channels, T).

        """
        outs = []
        c = self.input_conv(c)
        if g is not None:
            c = c + self.global_conv(g)
        for i in range(self.num_upsamples):
            c = self.upsamples[i](c)
            # average the outputs of the parallel residual blocks
            cs = 0.0  # initialize
            for j in range(self.num_blocks):
                cs += self.blocks[i * self.num_blocks + j](c)
            c = cs / self.num_blocks
            # emit intermediate waveforms from the last three upsample stages
            if i >= (self.num_upsamples - 3):
                _c = F.leaky_relu(c)
                _c = self.output_conv[i](_c)
                _c = torch.tanh(_c)
                outs.append(_c)
            else:
                c = self.output_conv[i](c)
        return outs
    def reset_parameters(self):
        """Reset parameters.

        This initialization follows the official implementation manner.
        https://github.com/jik876/hifi-gan/blob/master/models.py

        """

        def _reset_parameters(m: torch.nn.Module):
            if isinstance(m, (torch.nn.Conv1d, torch.nn.ConvTranspose1d)):
                m.weight.data.normal_(0.0, 0.01)
                logging.debug(f"Reset parameters in {m}.")

        self.apply(_reset_parameters)
    def remove_weight_norm(self):
        """Remove weight normalization module from all of the layers."""

        def _remove_weight_norm(m: torch.nn.Module):
            try:
                logging.debug(f"Weight norm is removed from {m}.")
                torch.nn.utils.remove_weight_norm(m)
            except ValueError:  # this module didn't have weight norm
                return

        self.apply(_remove_weight_norm)
    def apply_weight_norm(self):
        """Apply weight normalization module to all of the layers."""

        def _apply_weight_norm(m: torch.nn.Module):
            if isinstance(m, (torch.nn.Conv1d, torch.nn.ConvTranspose1d)):
                torch.nn.utils.weight_norm(m)
                logging.debug(f"Weight norm is applied to {m}.")

        self.apply(_apply_weight_norm)
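
# A minimal usage sketch with the default hyperparameters (random inputs,
# illustrative only; _demo_avocodo_generator is not part of the ESPnet API).
# Total upsampling is 8 * 8 * 2 * 2 = 256x, and the generator returns three
# waveforms at 1/4, 1/2, and full resolution; the two coarse ones pair with
# the PQMF-analyzed targets in the discriminator below.
def _demo_avocodo_generator():
    generator = AvocodoGenerator()
    mel = torch.randn(2, 80, 32)  # (B, in_channels, T)
    outs = generator(mel)
    for out in outs:
        print(out.shape)
    # torch.Size([2, 1, 2048]), torch.Size([2, 1, 4096]), torch.Size([2, 1, 8192])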
# CoMBD
class CoMBDBlock(torch.nn.Module):
    """CoMBD (Collaborative Multi-band Discriminator) block module."""

    def __init__(
        self,
        h_u: List[int],
        d_k: List[int],
        d_s: List[int],
        d_d: List[int],
        d_g: List[int],
        d_p: List[int],
        op_f: int,
        op_k: int,
        op_g: int,
        use_spectral_norm=False,
    ):
        super().__init__()
        norm_f = spectral_norm if use_spectral_norm else weight_norm

        self.convs = torch.nn.ModuleList()
        filters = [[1, h_u[0]]]
        for i in range(len(h_u) - 1):
            filters.append([h_u[i], h_u[i + 1]])
        for _f, _k, _s, _d, _g, _p in zip(filters, d_k, d_s, d_d, d_g, d_p):
            self.convs.append(
                norm_f(
                    Conv1d(
                        in_channels=_f[0],
                        out_channels=_f[1],
                        kernel_size=_k,
                        stride=_s,
                        dilation=_d,
                        groups=_g,
                        padding=_p,
                    )
                )
            )
        self.projection_conv = norm_f(
            Conv1d(
                in_channels=filters[-1][1],
                out_channels=op_f,
                kernel_size=op_k,
                groups=op_g,
            )
        )
    def forward(self, x):
        """Forward pass through the CoMBD block.

        Args:
            x (Tensor): Input tensor of shape (B, C_in, T_in).

        Returns:
            Tuple[Tensor, List[Tensor]]: Tuple containing the output tensor
                of shape (B, C_out, T_out) and a list of feature maps of
                shape (B, C, T) at each Conv1d layer.

        """
        fmap = []
        for block in self.convs:
            x = block(x)
            x = F.leaky_relu(x, 0.2)
            fmap.append(x)
        x = self.projection_conv(x)
        return x, fmap
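
# A minimal sketch of a CoMBDBlock with hypothetical toy hyperparameters
# (the real settings live in AvocodoDiscriminator below): two stride-1
# convolutions followed by the projection conv, which has no padding and
# therefore trims the sequence by op_k - 1 samples.
def _demo_combd_block():
    block = CoMBDBlock(
        h_u=[16, 64],
        d_k=[7, 5],
        d_s=[1, 1],
        d_d=[1, 1],
        d_g=[1, 1],
        d_p=[3, 2],
        op_f=1,
        op_k=3,
        op_g=1,
    )
    x = torch.randn(2, 1, 1024)
    out, fmaps = block(x)
    print(out.shape, len(fmaps))  # torch.Size([2, 1, 1022]) 2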
class CoMBD(torch.nn.Module):
    """CoMBD (Collaborative Multi-band Discriminator) module.

    Proposed in https://arxiv.org/abs/2206.13404.

    """

    def __init__(self, h, pqmf_list=None, use_spectral_norm=False):
        super().__init__()
        self.h = h
        if pqmf_list is not None:
            self.pqmf = pqmf_list
        else:
            self.pqmf = [
                PQMF(*h["pqmf_config"]["lv2"]),
                PQMF(*h["pqmf_config"]["lv1"]),
            ]

        self.blocks = torch.nn.ModuleList()
        for _h_u, _d_k, _d_s, _d_d, _d_g, _d_p, _op_f, _op_k, _op_g in zip(
            h["combd_h_u"],
            h["combd_d_k"],
            h["combd_d_s"],
            h["combd_d_d"],
            h["combd_d_g"],
            h["combd_d_p"],
            h["combd_op_f"],
            h["combd_op_k"],
            h["combd_op_g"],
        ):
            self.blocks.append(
                CoMBDBlock(
                    _h_u,
                    _d_k,
                    _d_s,
                    _d_d,
                    _d_g,
                    _d_p,
                    _op_f,
                    _op_k,
                    _op_g,
                    use_spectral_norm=use_spectral_norm,
                )
            )

    def _block_forward(self, inputs, blocks, outs, f_maps):
        for x, block in zip(inputs, blocks):
            out, f_map = block(x)
            outs.append(out)
            f_maps.append(f_map)
        return outs, f_maps

    def _pqmf_forward(self, ys, ys_hat):
        # preprocess for multi_scale forward
        multi_scale_inputs = []
        multi_scale_inputs_hat = []
        for pqmf in self.pqmf:
            multi_scale_inputs.append(pqmf.to(ys[-1]).analysis(ys[-1])[:, :1, :])
            multi_scale_inputs_hat.append(
                pqmf.to(ys[-1]).analysis(ys_hat[-1])[:, :1, :]
            )

        outs_real = []
        f_maps_real = []
        # real
        # for hierarchical forward
        outs_real, f_maps_real = self._block_forward(
            ys, self.blocks, outs_real, f_maps_real
        )
        # for multi_scale forward
        outs_real, f_maps_real = self._block_forward(
            multi_scale_inputs, self.blocks[:-1], outs_real, f_maps_real
        )

        outs_fake = []
        f_maps_fake = []
        # predicted
        # for hierarchical forward
        outs_fake, f_maps_fake = self._block_forward(
            ys_hat, self.blocks, outs_fake, f_maps_fake
        )
        # for multi_scale forward
        outs_fake, f_maps_fake = self._block_forward(
            multi_scale_inputs_hat, self.blocks[:-1], outs_fake, f_maps_fake
        )

        return outs_real, outs_fake, f_maps_real, f_maps_fake
    def forward(self, ys, ys_hat):
        """Forward CoMBD.

        Args:
            ys (List[Tensor]): List of ground truth signals of shape
                (B, 1, T).
            ys_hat (List[Tensor]): List of predicted signals of shape
                (B, 1, T).

        Returns:
            Tuple[List[Tensor], List[Tensor], List[List[Tensor]],
                List[List[Tensor]]]: Tuple containing the list of output
                tensors of shape (B, C_out, T_out) for real and fake,
                respectively, and the list of feature maps of shape (B, C, T)
                at each Conv1d layer for real and fake, respectively.

        """
        outs_real, outs_fake, f_maps_real, f_maps_fake = self._pqmf_forward(
            ys, ys_hat
        )
        return outs_real, outs_fake, f_maps_real, f_maps_fake
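
# Data-flow note for CoMBD: with the three default blocks and two PQMF
# analyzers, each call to forward() yields five outputs per side: three from
# the hierarchical pass over ys, plus two from the multi-scale pass, which
# re-feeds the first subband of the PQMF-analyzed full-band signal ys[-1]
# to all but the last block.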
# SBD
class MDC(torch.nn.Module):
    """Multiscale Dilated Convolution from https://arxiv.org/pdf/1609.07093.pdf."""

    def __init__(
        self,
        in_channels,
        out_channels,
        strides,
        kernel_size,
        dilations,
        use_spectral_norm=False,
    ):
        super().__init__()
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        self.d_convs = torch.nn.ModuleList()
        for _k, _d in zip(kernel_size, dilations):
            self.d_convs.append(
                norm_f(
                    Conv1d(
                        in_channels=in_channels,
                        out_channels=out_channels,
                        kernel_size=_k,
                        dilation=_d,
                        padding=get_padding(_k, _d),
                    )
                )
            )
        # NOTE: the padding reuses the last (_k, _d) pair from the loop above,
        # following the reference implementation.
        self.post_conv = norm_f(
            Conv1d(
                in_channels=out_channels,
                out_channels=out_channels,
                kernel_size=3,
                stride=strides,
                padding=get_padding(_k, _d),
            )
        )
        # NOTE: defined but not used in forward(), kept from the reference code
        self.softmax = torch.nn.Softmax(dim=-1)
    def forward(self, x):
        _out = None
        for _l in self.d_convs:
            _x = torch.unsqueeze(_l(x), -1)
            _x = F.leaky_relu(_x, 0.2)
            if _out is None:
                _out = _x
            else:
                _out = torch.cat([_out, _x], dim=-1)
        # sum the stacked multi-scale responses over the last axis
        x = torch.sum(_out, dim=-1)
        x = self.post_conv(x)
        x = F.leaky_relu(x, 0.2)
        return x
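
# A minimal sketch of MDC with hypothetical toy hyperparameters. Note that
# the post-convolution's padding is derived from the last (_k, _d) pair of
# the dilated branches (here 3 and 3, giving padding 3), so the output is
# slightly longer than the input.
def _demo_mdc():
    mdc = MDC(
        in_channels=4,
        out_channels=8,
        strides=1,
        kernel_size=[3, 3, 3],
        dilations=[1, 2, 3],
    )
    x = torch.randn(2, 4, 100)
    print(mdc(x).shape)  # torch.Size([2, 8, 104])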
class SBDBlock(torch.nn.Module):
    """SBD (Sub-band Discriminator) block."""

    def __init__(
        self,
        segment_dim,
        strides,
        filters,
        kernel_size,
        dilations,
        use_spectral_norm=False,
    ):
        super().__init__()
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        self.convs = torch.nn.ModuleList()
        filters_in_out = [(segment_dim, filters[0])]
        for i in range(len(filters) - 1):
            filters_in_out.append([filters[i], filters[i + 1]])
        for _s, _f, _k, _d in zip(strides, filters_in_out, kernel_size, dilations):
            self.convs.append(
                MDC(
                    in_channels=_f[0],
                    out_channels=_f[1],
                    strides=_s,
                    kernel_size=_k,
                    dilations=_d,
                    use_spectral_norm=use_spectral_norm,
                )
            )
        # NOTE: _f holds the last (in, out) channel pair from the loop above
        self.post_conv = norm_f(
            Conv1d(
                in_channels=_f[1],
                out_channels=1,
                kernel_size=3,
                stride=1,
                padding=3 // 2,
            )
        )
    def forward(self, x):
        fmap = []
        for _l in self.convs:
            x = _l(x)
            fmap.append(x)
        x = self.post_conv(x)
        return x, fmap
class MDCDConfig:
    """Config holder for the sub-band discriminator hyperparameters."""

    def __init__(self, h):
        self.pqmf_params = h["pqmf_config"]["sbd"]
        self.f_pqmf_params = h["pqmf_config"]["fsbd"]
        self.filters = h["sbd_filters"]
        self.kernel_sizes = h["sbd_kernel_sizes"]
        self.dilations = h["sbd_dilations"]
        self.strides = h["sbd_strides"]
        self.band_ranges = h["sbd_band_ranges"]
        self.transpose = h["sbd_transpose"]
        self.segment_size = h["segment_size"]
class SBD(torch.nn.Module):
    """SBD (Sub-band Discriminator) from https://arxiv.org/pdf/2206.13404.pdf."""

    def __init__(self, h, use_spectral_norm=False):
        super().__init__()
        self.config = MDCDConfig(h)
        self.pqmf = PQMF(*self.config.pqmf_params)
        if any(h["sbd_transpose"]):
            self.f_pqmf = PQMF(*self.config.f_pqmf_params)
        else:
            self.f_pqmf = None

        self.discriminators = torch.nn.ModuleList()
        for _f, _k, _d, _s, _br, _tr in zip(
            self.config.filters,
            self.config.kernel_sizes,
            self.config.dilations,
            self.config.strides,
            self.config.band_ranges,
            self.config.transpose,
        ):
            if _tr:
                # transposed input: time becomes the channel axis
                segment_dim = self.config.segment_size // _br[1] - _br[0]
            else:
                segment_dim = _br[1] - _br[0]
            self.discriminators.append(
                SBDBlock(
                    segment_dim=segment_dim,
                    filters=_f,
                    kernel_size=_k,
                    dilations=_d,
                    strides=_s,
                    use_spectral_norm=use_spectral_norm,
                )
            )
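
    # Worked example under the default SBD config: the transposed branch uses
    # band range [0, 64] over the 64-band f_pqmf, so each band holds
    # segment_size // 64 = 8192 // 64 = 128 samples; after the transpose in
    # forward() those 128 samples become the block's input channels
    # (segment_dim = 128).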
    def forward(self, y, y_hat):
        """Forward SBD.

        Args:
            y (Tensor): Ground truth signal (B, 1, T).
            y_hat (Tensor): Predicted signal (B, 1, T).

        Returns:
            Tuple[List[Tensor], List[Tensor], List[List[Tensor]],
                List[List[Tensor]]]: Discriminator outputs and feature maps
                for real and fake inputs, respectively.

        """
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        y_in = self.pqmf.analysis(y)
        y_hat_in = self.pqmf.analysis(y_hat)
        if self.f_pqmf is not None:
            y_in_f = self.f_pqmf.analysis(y)
            y_hat_in_f = self.f_pqmf.analysis(y_hat)

        for d, br, tr in zip(
            self.discriminators, self.config.band_ranges, self.config.transpose
        ):
            if tr:
                _y_in = y_in_f[:, br[0] : br[1], :]
                _y_hat_in = y_hat_in_f[:, br[0] : br[1], :]
                _y_in = torch.transpose(_y_in, 1, 2)
                _y_hat_in = torch.transpose(_y_hat_in, 1, 2)
            else:
                _y_in = y_in[:, br[0] : br[1], :]
                _y_hat_in = y_hat_in[:, br[0] : br[1], :]
            y_d_r, fmap_r = d(_y_in)
            y_d_g, fmap_g = d(_y_hat_in)
            y_d_rs.append(y_d_r)
            fmap_rs.append(fmap_r)
            y_d_gs.append(y_d_g)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class AvocodoDiscriminator(torch.nn.Module):
    """Avocodo Discriminator module."""

    def __init__(
        self,
        combd: Dict[str, Any] = {
            "combd_h_u": [
                [16, 64, 256, 1024, 1024, 1024],
                [16, 64, 256, 1024, 1024, 1024],
                [16, 64, 256, 1024, 1024, 1024],
            ],
            "combd_d_k": [
                [7, 11, 11, 11, 11, 5],
                [11, 21, 21, 21, 21, 5],
                [15, 41, 41, 41, 41, 5],
            ],
            "combd_d_s": [
                [1, 1, 4, 4, 4, 1],
                [1, 1, 4, 4, 4, 1],
                [1, 1, 4, 4, 4, 1],
            ],
            "combd_d_d": [
                [1, 1, 1, 1, 1, 1],
                [1, 1, 1, 1, 1, 1],
                [1, 1, 1, 1, 1, 1],
            ],
            "combd_d_g": [
                [1, 4, 16, 64, 256, 1],
                [1, 4, 16, 64, 256, 1],
                [1, 4, 16, 64, 256, 1],
            ],
            "combd_d_p": [
                [3, 5, 5, 5, 5, 2],
                [5, 10, 10, 10, 10, 2],
                [7, 20, 20, 20, 20, 2],
            ],
            "combd_op_f": [1, 1, 1],
            "combd_op_k": [3, 3, 3],
            "combd_op_g": [1, 1, 1],
        },
        sbd: Dict[str, Any] = {
            "use_sbd": True,
            "sbd_filters": [
                [64, 128, 256, 256, 256],
                [64, 128, 256, 256, 256],
                [64, 128, 256, 256, 256],
                [32, 64, 128, 128, 128],
            ],
            "sbd_strides": [
                [1, 1, 3, 3, 1],
                [1, 1, 3, 3, 1],
                [1, 1, 3, 3, 1],
                [1, 1, 3, 3, 1],
            ],
            "sbd_kernel_sizes": [
                [[7, 7, 7], [7, 7, 7], [7, 7, 7], [7, 7, 7], [7, 7, 7]],
                [[5, 5, 5], [5, 5, 5], [5, 5, 5], [5, 5, 5], [5, 5, 5]],
                [[3, 3, 3], [3, 3, 3], [3, 3, 3], [3, 3, 3], [3, 3, 3]],
                [[5, 5, 5], [5, 5, 5], [5, 5, 5], [5, 5, 5], [5, 5, 5]],
            ],
            "sbd_dilations": [
                [[5, 7, 11], [5, 7, 11], [5, 7, 11], [5, 7, 11], [5, 7, 11]],
                [[3, 5, 7], [3, 5, 7], [3, 5, 7], [3, 5, 7], [3, 5, 7]],
                [[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]],
                [[1, 2, 3], [1, 2, 3], [1, 2, 3], [2, 3, 5], [2, 3, 5]],
            ],
            "sbd_band_ranges": [[0, 6], [0, 11], [0, 16], [0, 64]],
            "sbd_transpose": [False, False, False, True],
            "pqmf_config": {
                "sbd": [16, 256, 0.03, 10.0],
                "fsbd": [64, 256, 0.1, 9.0],
            },
            "segment_size": 8192,
        },
        pqmf_config: Dict[str, Any] = {
            "lv1": [2, 256, 0.25, 10.0],
            "lv2": [4, 192, 0.13, 10.0],
        },
        projection_filters: List[int] = [0, 1, 1, 1],
    ):
        super().__init__()
        self.pqmf_lv2 = PQMF(*pqmf_config["lv2"])
        self.pqmf_lv1 = PQMF(*pqmf_config["lv1"])
        self.combd = CoMBD(
            combd,
            [self.pqmf_lv2, self.pqmf_lv1],
            # default to False when the config omits the flag
            use_spectral_norm=combd.get("use_spectral_norm", False),
        )
        self.sbd = SBD(
            sbd,
            use_spectral_norm=sbd.get("use_spectral_norm", False),
        )
        self.projection_filters = projection_filters
    def forward(
        self, y: torch.Tensor, y_hats: List[torch.Tensor]
    ) -> Tuple[
        List[torch.Tensor],
        List[torch.Tensor],
        List[List[torch.Tensor]],
        List[List[torch.Tensor]],
    ]:
        """Calculate forward propagation.

        Args:
            y (Tensor): Ground truth signal (B, 1, T).
            y_hats (List[Tensor]): Generator outputs ordered coarse-to-fine;
                the last entry is the full-band waveform (B, 1, T).

        Returns:
            Tuple[List[Tensor], List[Tensor], List[List[Tensor]],
                List[List[Tensor]]]: Outputs and feature maps of CoMBD and
                SBD for real and fake inputs, respectively.

        """
        ys = [
            self.pqmf_lv2.analysis(y)[:, : self.projection_filters[1]],
            self.pqmf_lv1.analysis(y)[:, : self.projection_filters[2]],
            y,
        ]
        (
            combd_outs_real,
            combd_outs_fake,
            combd_fmaps_real,
            combd_fmaps_fake,
        ) = self.combd(ys, y_hats)
        sbd_outs_real, sbd_outs_fake, sbd_fmaps_real, sbd_fmaps_fake = self.sbd(
            y, y_hats[-1]
        )
        # combine the outputs of both discriminators
        outs_real = combd_outs_real + sbd_outs_real
        outs_fake = combd_outs_fake + sbd_outs_fake
        fmaps_real = combd_fmaps_real + sbd_fmaps_real
        fmaps_fake = combd_fmaps_fake + sbd_fmaps_fake
        return outs_real, outs_fake, fmaps_real, fmaps_fake
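
# A minimal end-to-end sketch with the default configurations (random
# stand-ins for the generator outputs, illustrative only). CoMBD contributes
# five output tensors per side (three hierarchical plus two multi-scale) and
# SBD four, so nine scores are returned for real and fake each.
def _demo_avocodo_discriminator():
    discriminator = AvocodoDiscriminator()
    y = torch.randn(2, 1, 8192)  # ground truth waveform
    y_hats = [  # generator outputs, coarse-to-fine
        torch.randn(2, 1, 2048),
        torch.randn(2, 1, 4096),
        torch.randn(2, 1, 8192),
    ]
    outs_real, outs_fake, fmaps_real, fmaps_fake = discriminator(y, y_hats)
    print(len(outs_real), len(outs_fake))  # 9 9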
class AvocodoDiscriminatorPlus(torch.nn.Module):
    """Avocodo discriminator with an additional multi-frequency discriminator."""

    def __init__(
        self,
        combd: Dict[str, Any] = {
            "combd_h_u": [
                [16, 64, 256, 1024, 1024, 1024],
                [16, 64, 256, 1024, 1024, 1024],
                [16, 64, 256, 1024, 1024, 1024],
            ],
            "combd_d_k": [
                [7, 11, 11, 11, 11, 5],
                [11, 21, 21, 21, 21, 5],
                [15, 41, 41, 41, 41, 5],
            ],
            "combd_d_s": [
                [1, 1, 4, 4, 4, 1],
                [1, 1, 4, 4, 4, 1],
                [1, 1, 4, 4, 4, 1],
            ],
            "combd_d_d": [
                [1, 1, 1, 1, 1, 1],
                [1, 1, 1, 1, 1, 1],
                [1, 1, 1, 1, 1, 1],
            ],
            "combd_d_g": [
                [1, 4, 16, 64, 256, 1],
                [1, 4, 16, 64, 256, 1],
                [1, 4, 16, 64, 256, 1],
            ],
            "combd_d_p": [
                [3, 5, 5, 5, 5, 2],
                [5, 10, 10, 10, 10, 2],
                [7, 20, 20, 20, 20, 2],
            ],
            "combd_op_f": [1, 1, 1],
            "combd_op_k": [3, 3, 3],
            "combd_op_g": [1, 1, 1],
        },
        sbd: Dict[str, Any] = {
            "use_sbd": True,
            "sbd_filters": [
                [64, 128, 256, 256, 256],
                [64, 128, 256, 256, 256],
                [64, 128, 256, 256, 256],
                [32, 64, 128, 128, 128],
            ],
            "sbd_strides": [
                [1, 1, 3, 3, 1],
                [1, 1, 3, 3, 1],
                [1, 1, 3, 3, 1],
                [1, 1, 3, 3, 1],
            ],
            "sbd_kernel_sizes": [
                [[7, 7, 7], [7, 7, 7], [7, 7, 7], [7, 7, 7], [7, 7, 7]],
                [[5, 5, 5], [5, 5, 5], [5, 5, 5], [5, 5, 5], [5, 5, 5]],
                [[3, 3, 3], [3, 3, 3], [3, 3, 3], [3, 3, 3], [3, 3, 3]],
                [[5, 5, 5], [5, 5, 5], [5, 5, 5], [5, 5, 5], [5, 5, 5]],
            ],
            "sbd_dilations": [
                [[5, 7, 11], [5, 7, 11], [5, 7, 11], [5, 7, 11], [5, 7, 11]],
                [[3, 5, 7], [3, 5, 7], [3, 5, 7], [3, 5, 7], [3, 5, 7]],
                [[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]],
                [[1, 2, 3], [1, 2, 3], [1, 2, 3], [2, 3, 5], [2, 3, 5]],
            ],
            "sbd_band_ranges": [[0, 6], [0, 11], [0, 16], [0, 64]],
            "sbd_transpose": [False, False, False, True],
            "pqmf_config": {
                "sbd": [16, 256, 0.03, 10.0],
                "fsbd": [64, 256, 0.1, 9.0],
            },
            "segment_size": 8192,
        },
        pqmf_config: Dict[str, Any] = {
            "lv1": [2, 256, 0.25, 10.0],
            "lv2": [4, 192, 0.13, 10.0],
        },
        projection_filters: List[int] = [0, 1, 1, 1],
        # Multi-frequency discriminator related
        sample_rate: int = 22050,
        multi_freq_disc_params: Dict[str, Any] = {
            "hop_length_factors": [4, 8, 16],
            "hidden_channels": [256, 512, 512],
            "domain": "double",
            "mel_scale": True,
            "divisors": [32, 16, 8, 4, 2, 1, 1],
            "strides": [1, 2, 1, 2, 1, 2, 1],
        },
    ):
        super().__init__()
        self.pqmf_lv2 = PQMF(*pqmf_config["lv2"])
        self.pqmf_lv1 = PQMF(*pqmf_config["lv1"])
        self.combd = CoMBD(
            combd,
            [self.pqmf_lv2, self.pqmf_lv1],
            # default to False when the config omits the flag
            use_spectral_norm=combd.get("use_spectral_norm", False),
        )
        self.sbd = SBD(
            sbd,
            use_spectral_norm=sbd.get("use_spectral_norm", False),
        )

        # Multi-frequency discriminator related
        if "hop_lengths" not in multi_freq_disc_params:
            # convert hop length factors (in milliseconds) to hop lengths
            # (in samples)
            multi_freq_disc_params["hop_lengths"] = []
            for i in range(len(multi_freq_disc_params["hop_length_factors"])):
                multi_freq_disc_params["hop_lengths"].append(
                    int(
                        sample_rate
                        * multi_freq_disc_params["hop_length_factors"][i]
                        / 1000
                    )
                )
            del multi_freq_disc_params["hop_length_factors"]

        self.mfd = MultiFrequencyDiscriminator(
            **multi_freq_disc_params,
        )
        self.projection_filters = projection_filters
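
    # With the default sample_rate=22050 and hop_length_factors=[4, 8, 16]
    # (milliseconds), the derived hop lengths are int(22050 * 4 / 1000) = 88,
    # int(22050 * 8 / 1000) = 176, and int(22050 * 16 / 1000) = 352 samples.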
    def forward(
        self, y: torch.Tensor, y_hats: List[torch.Tensor]
    ) -> Tuple[
        List[torch.Tensor],
        List[torch.Tensor],
        List[List[torch.Tensor]],
        List[List[torch.Tensor]],
    ]:
        """Calculate forward propagation.

        Args:
            y (Tensor): Ground truth signal (B, 1, T).
            y_hats (List[Tensor]): Generator outputs ordered coarse-to-fine;
                the last entry is the full-band waveform (B, 1, T).

        Returns:
            Tuple[List[Tensor], List[Tensor], List[List[Tensor]],
                List[List[Tensor]]]: Outputs and feature maps of CoMBD, SBD,
                and MFD for real and fake inputs, respectively.

        """
        ys = [
            self.pqmf_lv2.analysis(y)[:, : self.projection_filters[1]],
            self.pqmf_lv1.analysis(y)[:, : self.projection_filters[2]],
            y,
        ]
        (
            combd_outs_real,
            combd_outs_fake,
            combd_fmaps_real,
            combd_fmaps_fake,
        ) = self.combd(ys, y_hats)
        sbd_outs_real, sbd_outs_fake, sbd_fmaps_real, sbd_fmaps_fake = self.sbd(
            y, y_hats[-1]
        )
        mfd_fmaps_real = self.mfd(y)
        mfd_fmaps_fake = self.mfd(y_hats[-1])
        # the feature maps of the final MFD branch double as its outputs
        mfd_outs_real = mfd_fmaps_real[-1]
        mfd_outs_fake = mfd_fmaps_fake[-1]
        # combine the outputs of all three discriminators
        outs_real = combd_outs_real + sbd_outs_real + mfd_outs_real
        outs_fake = combd_outs_fake + sbd_outs_fake + mfd_outs_fake
        fmaps_real = combd_fmaps_real + sbd_fmaps_real + mfd_fmaps_real
        fmaps_fake = combd_fmaps_fake + sbd_fmaps_fake + mfd_fmaps_fake
        return outs_real, outs_fake, fmaps_real, fmaps_fake