# Source code for espnet.nets.pytorch_backend.transducer.custom_decoder

"""Custom decoder definition for Transducer model."""

from typing import Any, Dict, List, Optional, Tuple, Union

import torch

from espnet.nets.pytorch_backend.transducer.blocks import build_blocks
from espnet.nets.pytorch_backend.transducer.utils import (
    check_batch_states,
    check_state,
    pad_sequence,
)
from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm
from espnet.nets.pytorch_backend.transformer.mask import subsequent_mask
from espnet.nets.transducer_decoder_interface import (
    ExtendedHypothesis,
    Hypothesis,
    TransducerDecoderInterface,
)


class CustomDecoder(TransducerDecoderInterface, torch.nn.Module):
    """Custom decoder module for Transducer model.

    Args:
        odim: Output dimension.
        dec_arch: Decoder block architecture (type and parameters).
        input_layer: Input layer type.
        repeat_block: Number of times dec_arch is repeated.
        joint_activation_type: Type of activation for joint network.
        positional_encoding_type: Positional encoding type.
        positionwise_layer_type: Positionwise layer type.
        positionwise_activation_type: Positionwise activation type.
        input_layer_dropout_rate: Dropout rate for input layer.
        blank_id: Blank symbol ID.

    """

    def __init__(
        self,
        odim: int,
        dec_arch: List,
        input_layer: str = "embed",
        repeat_block: int = 0,
        joint_activation_type: str = "tanh",
        positional_encoding_type: str = "abs_pos",
        positionwise_layer_type: str = "linear",
        positionwise_activation_type: str = "relu",
        input_layer_dropout_rate: float = 0.0,
        blank_id: int = 0,
    ):
        """Construct a CustomDecoder object."""
        torch.nn.Module.__init__(self)

        self.embed, self.decoders, ddim, _ = build_blocks(
            "decoder",
            odim,
            input_layer,
            dec_arch,
            repeat_block=repeat_block,
            positional_encoding_type=positional_encoding_type,
            positionwise_layer_type=positionwise_layer_type,
            positionwise_activation_type=positionwise_activation_type,
            input_layer_dropout_rate=input_layer_dropout_rate,
            padding_idx=blank_id,
        )

        self.after_norm = LayerNorm(ddim)

        self.dlayers = len(self.decoders)
        self.dunits = ddim
        self.odim = odim

        self.blank_id = blank_id
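    # Construction sketch (illustrative): each entry of ``dec_arch`` is a
    # block configuration dictionary consumed by build_blocks. The keys below
    # are an assumption about the transformer block format, not a tuned
    # espnet recipe:
    #
    #     decoder = CustomDecoder(
    #         odim=10,
    #         dec_arch=[{"type": "transformer", "d_hidden": 32, "d_ff": 64, "heads": 2}],
    #     )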
    def set_device(self, device: torch.device):
        """Set GPU device to use.

        Args:
            device: Device ID.

        """
        self.device = device
    def init_state(
        self,
        batch_size: Optional[int] = None,
    ) -> List[Optional[torch.Tensor]]:
        """Initialize decoder states.

        Args:
            batch_size: Batch size.

        Returns:
            state: Initial decoder hidden states. [N x None]

        """
        state = [None] * self.dlayers

        return state
    def forward(
        self, dec_input: torch.Tensor, dec_mask: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Encode label ID sequences.

        Args:
            dec_input: Label ID sequences. (B, U)
            dec_mask: Label mask sequences. (B, U)

        Returns:
            dec_output: Decoder output sequences. (B, U, D_dec)
            dec_output_mask: Mask of decoder output sequences. (B, U)

        """
        dec_input = self.embed(dec_input)

        dec_output, dec_mask = self.decoders(dec_input, dec_mask)
        dec_output = self.after_norm(dec_output)

        return dec_output, dec_mask
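    # Forward-pass sketch (illustrative): encode a padded batch of label IDs
    # with a causal mask built from subsequent_mask.
    #
    #     labels = torch.randint(1, decoder.odim, (2, 5))       # (B, U)
    #     mask = subsequent_mask(labels.size(-1)).unsqueeze(0)  # (1, U, U)
    #     dec_out, _ = decoder(labels, mask)                    # (B, U, D_dec)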
    def score(
        self, hyp: Hypothesis, cache: Dict[str, Any]
    ) -> Tuple[torch.Tensor, List[Optional[torch.Tensor]], torch.Tensor]:
        """One-step forward hypothesis.

        Args:
            hyp: Hypothesis.
            cache: Pairs of (dec_out, dec_state) for each label sequence. (key)

        Returns:
            dec_out: Decoder output sequence. (1, D_dec)
            dec_state: Decoder hidden states. [N x (1, U, D_dec)]
            lm_label: Label ID for LM. (1,)

        """
        labels = torch.tensor([hyp.yseq], device=self.device)
        lm_label = labels[:, -1]

        str_labels = "_".join(list(map(str, hyp.yseq)))

        if str_labels in cache:
            dec_out, dec_state = cache[str_labels]
        else:
            dec_out_mask = subsequent_mask(len(hyp.yseq)).unsqueeze_(0)

            new_state = check_state(hyp.dec_state, (labels.size(1) - 1), self.blank_id)

            dec_out = self.embed(labels)

            dec_state = []
            for s, decoder in zip(new_state, self.decoders):
                dec_out, dec_out_mask = decoder(dec_out, dec_out_mask, cache=s)
                dec_state.append(dec_out)

            dec_out = self.after_norm(dec_out[:, -1])

            cache[str_labels] = (dec_out, dec_state)

        return dec_out[0], dec_state, lm_label
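    # One-step scoring sketch (illustrative, assuming the Hypothesis fields
    # defined in espnet.nets.transducer_decoder_interface):
    #
    #     decoder.set_device(torch.device("cpu"))
    #     hyp = Hypothesis(score=0.0, yseq=[decoder.blank_id],
    #                      dec_state=decoder.init_state())
    #     dec_out, dec_state, lm_label = decoder.score(hyp, cache={})
    #
    # Calling score again with the same label prefix hits the cache and skips
    # the pass through the decoder blocks.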
    def batch_score(
        self,
        hyps: Union[List[Hypothesis], List[ExtendedHypothesis]],
        dec_states: List[Optional[torch.Tensor]],
        cache: Dict[str, Any],
        use_lm: bool,
    ) -> Tuple[torch.Tensor, List[Optional[torch.Tensor]], torch.Tensor]:
        """One-step forward hypotheses.

        Args:
            hyps: Hypotheses.
            dec_states: Decoder hidden states. [N x (B, U, D_dec)]
            cache: Pairs of (h_dec, dec_states) for each label sequence. (keys)
            use_lm: Whether to compute label ID sequences for LM.

        Returns:
            dec_out: Decoder output sequences. (B, D_dec)
            dec_states: Decoder hidden states. [N x (B, U, D_dec)]
            lm_labels: Label ID sequences for LM. (B,)

        """
        final_batch = len(hyps)

        process = []
        done = [None] * final_batch

        # Look up each hypothesis in the cache; only uncached label
        # sequences are batched and run through the decoder blocks.
        for i, hyp in enumerate(hyps):
            str_labels = "_".join(list(map(str, hyp.yseq)))

            if str_labels in cache:
                done[i] = cache[str_labels]
            else:
                process.append((str_labels, hyp.yseq, hyp.dec_state))

        if process:
            labels = pad_sequence([p[1] for p in process], self.blank_id)
            # The legacy torch.LongTensor(..., device=...) constructor rejects
            # non-CPU devices; torch.tensor handles both cases.
            labels = torch.tensor(labels, dtype=torch.long, device=self.device)

            p_dec_states = self.create_batch_states(
                self.init_state(),
                [p[2] for p in process],
                labels,
            )

            dec_out = self.embed(labels)

            dec_out_mask = (
                subsequent_mask(labels.size(-1))
                .unsqueeze_(0)
                .expand(len(process), -1, -1)
            )

            new_states = []
            for s, decoder in zip(p_dec_states, self.decoders):
                dec_out, dec_out_mask = decoder(dec_out, dec_out_mask, cache=s)
                new_states.append(dec_out)

            dec_out = self.after_norm(dec_out[:, -1])

        j = 0
        for i in range(final_batch):
            if done[i] is None:
                state = self.select_state(new_states, j)

                done[i] = (dec_out[j], state)
                cache[process[j][0]] = (dec_out[j], state)

                j += 1

        dec_out = torch.stack([d[0] for d in done])
        dec_states = self.create_batch_states(
            dec_states, [d[1] for d in done], [[0] + h.yseq for h in hyps]
        )

        if use_lm:
            lm_labels = torch.tensor(
                [hyp.yseq[-1] for hyp in hyps], dtype=torch.long, device=self.device
            )

            return dec_out, dec_states, lm_labels

        return dec_out, dec_states, None
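    # batch_score mirrors score for a whole beam: uncached label sequences
    # are padded with blank_id to a common length, run through the decoder
    # blocks in one batch, then scattered back into their original beam
    # positions alongside the cached results.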
    def select_state(
        self, states: List[Optional[torch.Tensor]], idx: int
    ) -> List[Optional[torch.Tensor]]:
        """Get specified ID state from decoder hidden states.

        Args:
            states: Decoder hidden states. [N x (B, U, D_dec)]
            idx: State ID to extract.

        Returns:
            state_idx: Decoder hidden state for given ID. [N x (1, U, D_dec)]

        """
        if states[0] is None:
            return states

        state_idx = [states[layer][idx] for layer in range(self.dlayers)]

        return state_idx
    def create_batch_states(
        self,
        states: List[Optional[torch.Tensor]],
        new_states: List[Optional[torch.Tensor]],
        check_list: List[List[int]],
    ) -> List[Optional[torch.Tensor]]:
        """Create decoder hidden states sequences.

        Args:
            states: Decoder hidden states. [N x (B, U, D_dec)]
            new_states: Decoder hidden states. [B x [N x (1, U, D_dec)]]
            check_list: Label ID sequences.

        Returns:
            states: New decoder hidden states. [N x (B, U, D_dec)]

        """
        if new_states[0][0] is None:
            return states

        max_len = max(len(elem) for elem in check_list) - 1

        for layer in range(self.dlayers):
            states[layer] = check_batch_states(
                [s[layer] for s in new_states], max_len, self.blank_id
            )

        return states
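
if __name__ == "__main__":
    # Minimal smoke test (illustrative). The transformer block configuration
    # is an assumption about the dictionary format build_blocks expects; it
    # is not an espnet-provided recipe.
    decoder = CustomDecoder(
        odim=10,
        dec_arch=[{"type": "transformer", "d_hidden": 32, "d_ff": 64, "heads": 2}],
    )
    decoder.set_device(torch.device("cpu"))

    labels = torch.randint(1, 10, (2, 5))  # (B, U) label ID sequences
    mask = subsequent_mask(labels.size(-1)).unsqueeze(0)  # (1, U, U) causal mask

    dec_out, _ = decoder(labels, mask)
    print(dec_out.shape)  # expected: torch.Size([2, 5, 32])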