# Source code for espnet.nets.pytorch_backend.fastspeech.duration_calculator

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2019 Tomoki Hayashi
#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

"""Duration calculator related modules."""

import torch

from espnet.nets.pytorch_backend.e2e_tts_tacotron2 import Tacotron2
from espnet.nets.pytorch_backend.e2e_tts_transformer import Transformer
from espnet.nets.pytorch_backend.nets_utils import pad_list


class DurationCalculator(torch.nn.Module):
    """Duration calculator module for FastSpeech.

    Extracts phoneme durations from the encoder-decoder attentions of a
    pretrained autoregressive teacher model (Transformer or Tacotron 2),
    for use as targets when training a FastSpeech duration predictor.

    Todo:
        * Fix the duplicated calculation of diagonal head decision

    """

    def __init__(self, teacher_model):
        """Initialize duration calculator module.

        Args:
            teacher_model (e2e_tts_transformer.Transformer or
                e2e_tts_tacotron2.Tacotron2): Pretrained auto-regressive model.

        Raises:
            ValueError: If ``teacher_model`` is neither a Transformer nor a
                Tacotron2 instance.

        """
        super().__init__()
        if isinstance(teacher_model, Transformer):
            # -1 is a sentinel meaning "diagonal head not chosen yet";
            # it is resolved lazily on the first forward() call.
            self.register_buffer("diag_head_idx", torch.tensor(-1))
        elif isinstance(teacher_model, Tacotron2):
            # Tacotron 2 has a single location-sensitive attention,
            # so no head selection is needed.
            pass
        else:
            raise ValueError(
                "teacher model should be the instance of "
                "e2e_tts_transformer.Transformer or e2e_tts_tacotron2.Tacotron2."
            )
        self.teacher_model = teacher_model

    def forward(self, xs, ilens, ys, olens, spembs=None):
        """Calculate forward propagation.

        Args:
            xs (Tensor): Batch of the padded sequences of character ids (B, Tmax).
            ilens (Tensor): Batch of lengths of each input sequence (B,).
            ys (Tensor):
                Batch of the padded sequence of target features (B, Lmax, odim).
            olens (Tensor): Batch of lengths of each output sequence (B,).
            spembs (Tensor, optional):
                Batch of speaker embedding vectors (B, spk_embed_dim).

        Returns:
            Tensor: Batch of durations (B, Tmax).

        """
        if isinstance(self.teacher_model, Transformer):
            att_ws = self._calculate_encoder_decoder_attentions(
                xs, ilens, ys, olens, spembs=spembs
            )
            # TODO(kan-bayashi): fix this issue
            # this does not work in multi-gpu case. registered buffer is not saved.
            if int(self.diag_head_idx) == -1:
                self._init_diagonal_head(att_ws)
            # Keep only the attention of the most diagonal head.
            att_ws = att_ws[:, self.diag_head_idx]
        else:
            # NOTE(kan-bayashi): Here we assume that the teacher is tacotron 2
            att_ws = self.teacher_model.calculate_all_attentions(
                xs, ilens, ys, spembs=spembs, keep_tensor=True
            )
        durations = [
            self._calculate_duration(att_w, ilen, olen)
            for att_w, ilen, olen in zip(att_ws, ilens, olens)
        ]
        return pad_list(durations, 0)

    @staticmethod
    def _calculate_duration(att_w, ilen, olen):
        """Convert one attention map into per-phoneme durations.

        For each input index ``i`` in ``[0, ilen)``, the duration is the
        number of output frames (within the first ``olen`` frames) whose
        attention argmax falls on ``i``.

        Args:
            att_w (Tensor): Attention weights (Lmax', Tmax') — assumes
                rows are decoder frames and columns input tokens.
            ilen (int or Tensor): Valid input length.
            olen (int or Tensor): Valid output length.

        Returns:
            Tensor: Durations (ilen,).

        """
        return torch.stack(
            [att_w[:olen, :ilen].argmax(-1).eq(i).sum() for i in range(ilen)]
        )

    def _init_diagonal_head(self, att_ws):
        """Select the most diagonal attention head and register its index.

        A head's "diagonality" score is the per-frame max attention weight,
        averaged over frames and over the batch.

        Args:
            att_ws (Tensor): Stacked source attentions (B, H*L, Lmax, Tmax).

        """
        diagonal_scores = att_ws.max(dim=-1)[0].mean(dim=-1).mean(dim=0)  # (H * L,)
        self.register_buffer("diag_head_idx", diagonal_scores.argmax())

    def _calculate_encoder_decoder_attentions(self, xs, ilens, ys, olens, spembs=None):
        """Gather all source (encoder-decoder) attentions of the teacher.

        Args:
            xs (Tensor): Padded character ids (B, Tmax).
            ilens (Tensor): Input lengths (B,).
            ys (Tensor): Padded target features (B, Lmax, odim).
            olens (Tensor): Output lengths (B,).
            spembs (Tensor, optional): Speaker embeddings (B, spk_embed_dim).

        Returns:
            Tensor: Concatenated source attentions over all decoder layers
            (B, H*L, Lmax, Tmax).

        """
        att_dict = self.teacher_model.calculate_all_attentions(
            xs, ilens, ys, olens, spembs=spembs, skip_output=True, keep_tensor=True
        )
        return torch.cat(
            [att_dict[k] for k in att_dict.keys() if "src_attn" in k], dim=1
        )  # (B, H*L, Lmax, Tmax)