Source code for espnet2.text.hugging_face_token_id_converter

from typing import Iterable, List, Union

import numpy as np
from typeguard import typechecked

# `transformers` is treated as an optional dependency: remember whether the
# import worked so that importing this module never fails on its account.
# HuggingFaceTokenIDConverter.__init__ checks this flag and raises ImportError.
try:
    from transformers import AutoTokenizer

    is_transformers_available = True
except ImportError:
    is_transformers_available = False


class HuggingFaceTokenIDConverter:
    """Map between tokens and integer ids through a Hugging Face tokenizer.

    The tokenizer is obtained from ``AutoTokenizer.from_pretrained`` at
    construction time and stored on ``self.tokenizer``; every public method
    simply delegates to it.
    """

    @typechecked
    def __init__(
        self,
        model_name_or_path: str,
    ):
        """Load the tokenizer identified by ``model_name_or_path``.

        Args:
            model_name_or_path: Passed unchanged to
                ``AutoTokenizer.from_pretrained``.

        Raises:
            ImportError: If ``transformers`` could not be imported when this
                module was loaded.
        """
        if not is_transformers_available:
            install_hint = (
                "`transformers` is not available. Please install it via `pip install"
                " transformers` or `cd /path/to/espnet/tools && . ./activate_python.sh"
                " && ./installers/install_transformers.sh`."
            )
            raise ImportError(install_hint)

        loaded_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.tokenizer = loaded_tokenizer

    def get_num_vocabulary_size(self) -> int:
        """Return the ``vocab_size`` attribute of the wrapped tokenizer."""
        # NOTE(review): `vocab_size` may not count tokens added after
        # pretraining -- confirm that this is the size callers expect.
        num_tokens = self.tokenizer.vocab_size
        return num_tokens

    def ids2tokens(self, integers: Union[np.ndarray, Iterable[int]]) -> List[str]:
        """Convert integer ids to tokens.

        Args:
            integers: Ids handed as-is to the tokenizer's
                ``convert_ids_to_tokens``.

        Returns:
            Whatever ``convert_ids_to_tokens`` returns for ``integers``.
        """
        to_tokens = self.tokenizer.convert_ids_to_tokens
        return to_tokens(integers)

    def tokens2ids(self, tokens: Iterable[str]) -> List[int]:
        """Convert tokens to integer ids.

        Args:
            tokens: Tokens handed as-is to the tokenizer's
                ``convert_tokens_to_ids``.

        Returns:
            Whatever ``convert_tokens_to_ids`` returns for ``tokens``.
        """
        to_ids = self.tokenizer.convert_tokens_to_ids
        return to_ids(tokens)