tokenize_text.py
Reading time: less than 1 minute
tokenize_text.py
Tokenize texts
usage: tokenize_text.py [-h]
[--log_level {CRITICAL,ERROR,WARNING,INFO,DEBUG,NOTSET}]
--input INPUT --output OUTPUT [--field FIELD]
[--token_type {char,bpe,word,phn}]
[--delimiter DELIMITER] [--space_symbol SPACE_SYMBOL]
[--bpemodel BPEMODEL]
[--non_linguistic_symbols NON_LINGUISTIC_SYMBOLS]
[--remove_non_linguistic_symbols REMOVE_NON_LINGUISTIC_SYMBOLS]
[--cleaner {None,tacotron,jaconv,vietnamese,korean_cleaner,whisper_en,whisper_basic}]
[--g2p {None,g2p_en,g2p_en_no_space,pyopenjtalk,pyopenjtalk_kana,pyopenjtalk_accent,pyopenjtalk_accent_with_pause,pyopenjtalk_prosody,pypinyin_g2p,pypinyin_g2p_phone,pypinyin_g2p_phone_without_prosody,espeak_ng_arabic,espeak_ng_german,espeak_ng_french,espeak_ng_spanish,espeak_ng_russian,espeak_ng_greek,espeak_ng_finnish,espeak_ng_hungarian,espeak_ng_dutch,espeak_ng_english_us_vits,espeak_ng_hindi,espeak_ng_italian,espeak_ng_ukrainian,espeak_ng_polish,g2pk,g2pk_no_space,g2pk_explicit_space,korean_jaso,korean_jaso_no_space,g2p_is}]
[--write_vocabulary WRITE_VOCABULARY]
[--vocabulary_size VOCABULARY_SIZE] [--cutoff CUTOFF]
[--add_symbol ADD_SYMBOL]
[--add_nonsplit_symbol ADD_NONSPLIT_SYMBOL]