Source code for espnet2.bin.spk_inference

#!/usr/bin/env python3
import argparse
import logging
import sys
from pathlib import Path
from typing import Any, Optional, Sequence, Tuple, Union

import numpy as np
import torch
from typeguard import typechecked

from espnet2.fileio.npy_scp import NpyScpWriter
from espnet2.tasks.spk import SpeakerTask
from espnet2.torch_utils.device_funcs import to_device
from espnet2.torch_utils.set_all_random_seed import set_all_random_seed
from espnet2.utils import config_argparse
from espnet2.utils.types import str2triple_str, str_or_none
from espnet.utils.cli_utils import get_commandline_args


[docs]class Speech2Embedding: """Speech2Embedding class Examples: >>> import soundfile >>> speech2spkembed = Speech2Embedding("spk_config.yml", "spk.pth") >>> audio, rate = soundfile.read("speech.wav") >>> speech2spkembed(audio) """ @typechecked def __init__( self, train_config: Union[Path, str, None] = None, model_file: Union[Path, str, None] = None, device: str = "cpu", dtype: str = "float32", batch_size: int = 1, ): spk_model, spk_train_args = SpeakerTask.build_model_from_file( train_config, model_file, device ) self.spk_model = spk_model.eval() self.spk_train_args = spk_train_args self.device = device self.dtype = dtype self.batch_size = batch_size @torch.no_grad() @typechecked def __call__(self, speech: Union[torch.Tensor, np.ndarray]) -> torch.Tensor: """Inference Args: speech: Input speech data Returns: spk_embedding """ # Input as audio signal if isinstance(speech, np.ndarray): speech = torch.tensor(speech) # data: (Nsamples,) -> (1, Nsamples) speech = speech.unsqueeze(0).to(getattr(torch, self.dtype)) logging.info("speech length: " + str(speech.size(1))) batch = {"speech": speech, "extract_embd": True} # a. To device batch = to_device(batch, device=self.device) # b. Forward the model embedding extraction output = self.spk_model(**batch) return output
[docs] @staticmethod def from_pretrained( model_tag: Optional[str] = None, **kwargs: Optional[Any], ): """Build Speech2Embedding instance from the pretrained model. Args: model_tag (Optional[str]): Model tag of the pretrained models. Currently, the tags of espnet_model_zoo are supported. Returns: Speech2Text: Speech2Embedding instance. """ if model_tag is not None: try: from espnet_model_zoo.downloader import ModelDownloader except ImportError: logging.error( "`espnet_model_zoo` is not installed. " "Please install via `pip install -U espnet_model_zoo`." ) raise d = ModelDownloader() kwargs.update(**d.download_and_unpack(model_tag)) return Speech2Embedding(**kwargs)
[docs]@typechecked def inference( output_dir: str, batch_size: int, dtype: str, ngpu: int, seed: int, num_workers: int, log_level: Union[int, str], data_path_and_name_and_type: Sequence[Tuple[str, str, str]], key_file: Optional[str], train_config: Optional[str], model_file: Optional[str], model_tag: Optional[str], ): if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if ngpu > 1: raise NotImplementedError("only single GPU decoding is supported") logging.basicConfig( level=log_level, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", ) # if ngpu >= 1: # device = "cuda" # else: # device = "cpu" # 1. Set random-seed set_all_random_seed(seed) # 2. Build speech2embedding speech2embedding_kwargs = dict( batch_size=batch_size, dtype=dtype, train_config=train_config, model_file=model_file, ) speech2embedding = Speech2Embedding.from_pretrained( model_tag=model_tag, **speech2embedding_kwargs, ) # 3. Build data-iterator loader = SpeakerTask.build_streaming_iterator( data_path_and_name_and_type, dtype=dtype, batch_size=batch_size, key_file=key_file, num_workers=num_workers, preprocess_fn=SpeakerTask.build_preprocess_fn( speech2embedding.spk_train_args, False ), collate_fn=SpeakerTask.build_colate_fn(speech2embedding.spk_train_args, False), inference=True, ) # 4. Start for-loop with NpyScpWriter(output_dir / "embed", output_dir / "embed.scp") as writer: for keys, batch in loader: assert isinstance(batch, dict), type(batch) assert all(isinstance(s, str) for s in keys), keys _bs = len(next(iter(batch.values()))) assert len(keys) == _bs, f"{len(keys)} != {_bs}" batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")} result = speech2embedding(**batch) # Only supporting batch_size==1 key = keys[0] writer[key] = result.cpu().numpy()
[docs]def get_parser(): parser = config_argparse.ArgumentParser( description="Speaker Embedding Extraction", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) # Note(kamo): Use '_' instead of '-' as separator. # '-' is confusing if written in yaml. parser.add_argument( "--log_level", type=lambda x: x.upper(), default="INFO", choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"), help="The verbose level of logging", ) parser.add_argument("--output_dir", type=str, required=True) parser.add_argument( "--ngpu", type=int, default=0, help="The number of gpus. 0 indicates CPU mode", ) parser.add_argument("--seed", type=int, default=0, help="Random seed") parser.add_argument( "--dtype", default="float32", choices=["float16", "float32", "float64"], help="Data type", ) parser.add_argument( "--num_workers", type=int, default=1, help="The number of workers used for DataLoader", ) group = parser.add_argument_group("Input data related") group.add_argument( "--data_path_and_name_and_type", type=str2triple_str, required=True, action="append", ) group.add_argument("--key_file", type=str_or_none) group.add_argument( "--batch_size", type=int, default=1, help="The batch size for inference", ) group = parser.add_argument_group("The model configuration related") group.add_argument( "--train_config", type=str, help="Speaker model training configuration", ) group.add_argument( "--model_file", type=str, help="Speaker model parameter file", ) group.add_argument( "--model_tag", type=str, help="Pretrained model tag. If specify this option, *_train_config and " "*_file will be overwritten", ) return parser
[docs]def main(cmd=None): print(get_commandline_args(), file=sys.stderr) parser = get_parser() args = parser.parse_args(cmd) kwargs = vars(args) kwargs.pop("config", None) inference(**kwargs)
if __name__ == "__main__": main()