espnet2.asr.encoder.avhubert_encoder.AVHubertConfig

About 4 min

espnet2.asr.encoder.avhubert_encoder.AVHubertConfig

class espnet2.asr.encoder.avhubert_encoder.AVHubertConfig(sample_rate: int = 16000, label_rate: int = -1, encoder_layers: int = 12, encoder_embed_dim: int = 768, encoder_ffn_embed_dim: int = 3072, encoder_attention_heads: int = 12, activation_fn: str = 'gelu', dropout: float = 0.1, attention_dropout: float = 0.1, activation_dropout: float = 0.0, encoder_layerdrop: float = 0.0, dropout_input: float = 0.0, dropout_features: float = 0.0, final_dim: int = 0, untie_final_proj: bool = False, layer_norm_first: bool = False, conv_feature_layers: str = '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', conv_bias: bool = False, logit_temp: float = 0.1, target_glu: bool = False, feature_grad_mult: float = 1.0, mask_length_audio: int = 10, mask_prob_audio: float = 0.65, mask_length_image: int = 10, mask_prob_image: float = 0.65, mask_selection: str = 'static', mask_other: float = 0, no_mask_overlap: bool = False, mask_min_space: int = 1, mask_channel_length: int = 10, mask_channel_prob: float = 0.0, mask_channel_selection: str = 'static', mask_channel_other: float = 0, no_mask_channel_overlap: bool = False, mask_channel_min_space: int = 1, conv_pos: int = 128, conv_pos_groups: int = 16, latent_temp: Tuple[float, float, float] = (2, 0.5, 0.999995), skip_masked: bool = False, skip_nomask: bool = False, resnet_relu_type: str = 'prelu', resnet_weights: str | None = None, sim_type: str = 'cosine', sub_encoder_layers: int = 0, audio_feat_dim: int = -1, modality_dropout: float = 0, audio_dropout: float = 0, modality_fuse: str = 'concat', selection_type: str = 'same_other_seq', masking_type: str = 'input', decoder_embed_dim: int = 768, decoder_ffn_embed_dim: int = 3072, decoder_layers: int = 6, decoder_layerdrop: float = 0.0, decoder_attention_heads: int = 4, decoder_learned_pos: bool = False, decoder_normalize_before: bool = False, no_token_positional_embeddings: bool = False, decoder_dropout: float = 0.1, decoder_attention_dropout: float = 0.1, decoder_activation_dropout: float = 0.0, max_target_positions: int = 2048, share_decoder_input_output_embed: bool = False, audio_only: bool = False, no_scale_embedding: bool = True)

Bases: object

Configuration taken from the original AV-HuBERT GitHub repository.

activation_dropout : float = 0.0

activation_fn : str = 'gelu'

attention_dropout : float = 0.1

audio_dropout : float = 0

audio_feat_dim : int = -1

audio_only : bool = False

conv_bias : bool = False

conv_feature_layers : str = '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'

conv_pos : int = 128

conv_pos_groups : int = 16

decoder_activation_dropout : float = 0.0

decoder_attention_dropout : float = 0.1

decoder_attention_heads : int = 4

decoder_dropout : float = 0.1

decoder_embed_dim : int = 768

decoder_ffn_embed_dim : int = 3072

decoder_layerdrop : float = 0.0

decoder_layers : int = 6

decoder_learned_pos : bool = False

decoder_normalize_before : bool = False

dropout : float = 0.1

dropout_features : float = 0.0

dropout_input : float = 0.0

encoder_attention_heads : int = 12

encoder_embed_dim : int = 768

encoder_ffn_embed_dim : int = 3072

encoder_layerdrop : float = 0.0

encoder_layers : int = 12

feature_grad_mult : float = 1.0

final_dim : int = 0

label_rate : int = -1

latent_temp : Tuple[float, float, float] = (2, 0.5, 0.999995)

layer_norm_first : bool = False

logit_temp : float = 0.1

mask_channel_length : int = 10

mask_channel_min_space : int = 1

mask_channel_other : float = 0

mask_channel_prob : float = 0.0

mask_channel_selection : str = 'static'

mask_length_audio : int = 10

mask_length_image : int = 10

mask_min_space : int = 1

mask_other : float = 0

mask_prob_audio : float = 0.65

mask_prob_image : float = 0.65

mask_selection : str = 'static'

masking_type : str = 'input'

max_target_positions : int = 2048

modality_dropout : float = 0

modality_fuse : str = 'concat'

no_mask_channel_overlap : bool = False

no_mask_overlap : bool = False

no_scale_embedding : bool = True

no_token_positional_embeddings : bool = False

resnet_relu_type : str = 'prelu'

resnet_weights : str | None = None

sample_rate : int = 16000

selection_type : str = 'same_other_seq'

share_decoder_input_output_embed : bool = False

sim_type : str = 'cosine'

skip_masked : bool = False

skip_nomask : bool = False

sub_encoder_layers : int = 0

target_glu : bool = False

untie_final_proj : bool = False