Source code for espnet2.text.build_tokenizer

from pathlib import Path
from typing import Dict, Iterable, Optional, Union

from typeguard import typechecked

from espnet2.text.abs_tokenizer import AbsTokenizer
from espnet2.text.char_tokenizer import CharTokenizer
from espnet2.text.hugging_face_tokenizer import HuggingFaceTokenizer
from espnet2.text.phoneme_tokenizer import PhonemeTokenizer
from espnet2.text.sentencepiece_tokenizer import SentencepiecesTokenizer
from espnet2.text.whisper_tokenizer import OpenAIWhisperTokenizer
from espnet2.text.word_tokenizer import WordTokenizer


[docs]@typechecked def build_tokenizer( token_type: str, bpemodel: Optional[Union[Path, str, Iterable[str]]] = None, non_linguistic_symbols: Optional[Union[Path, str, Iterable[str]]] = None, remove_non_linguistic_symbols: bool = False, space_symbol: str = "<space>", delimiter: Optional[str] = None, g2p_type: Optional[str] = None, nonsplit_symbol: Optional[Iterable[str]] = None, # tokenization encode (text2token) args, e.g. BPE dropout, only applied in training encode_kwargs: Optional[Dict] = None, # only use for whisper whisper_language: Optional[str] = None, whisper_task: Optional[str] = None, sot_asr: bool = False, ) -> AbsTokenizer: """A helper function to instantiate Tokenizer""" if token_type == "bpe": if bpemodel is None: raise ValueError('bpemodel is required if token_type = "bpe"') if remove_non_linguistic_symbols: raise RuntimeError( "remove_non_linguistic_symbols is not implemented for token_type=bpe" ) if encode_kwargs is None: encode_kwargs = dict() return SentencepiecesTokenizer(bpemodel, encode_kwargs) if token_type == "hugging_face": if bpemodel is None: raise ValueError('bpemodel is required if token_type = "hugging_face"') if remove_non_linguistic_symbols: raise RuntimeError( "remove_non_linguistic_symbols is not " + "implemented for token_type=hugging_face" ) return HuggingFaceTokenizer(bpemodel) elif token_type == "word": if remove_non_linguistic_symbols and non_linguistic_symbols is not None: return WordTokenizer( delimiter=delimiter, non_linguistic_symbols=non_linguistic_symbols, remove_non_linguistic_symbols=True, ) else: return WordTokenizer(delimiter=delimiter) elif token_type == "char": return CharTokenizer( non_linguistic_symbols=non_linguistic_symbols, space_symbol=space_symbol, remove_non_linguistic_symbols=remove_non_linguistic_symbols, nonsplit_symbols=nonsplit_symbol, ) elif token_type == "phn": return PhonemeTokenizer( g2p_type=g2p_type, non_linguistic_symbols=non_linguistic_symbols, space_symbol=space_symbol, remove_non_linguistic_symbols=remove_non_linguistic_symbols, ) elif "whisper" in token_type: return OpenAIWhisperTokenizer( model_type=bpemodel, language=whisper_language or "en", task=whisper_task or "transcribe", added_tokens_txt=non_linguistic_symbols, sot=sot_asr, ) else: raise ValueError( f"token_mode must be one of bpe, word, char or phn: " f"{token_type}" )