espnet2.asr.encoder.avhubert_encoder.AVHubertConfig
espnet2.asr.encoder.avhubert_encoder.AVHubertConfig
class espnet2.asr.encoder.avhubert_encoder.AVHubertConfig(sample_rate: int = 16000, label_rate: int = -1, encoder_layers: int = 12, encoder_embed_dim: int = 768, encoder_ffn_embed_dim: int = 3072, encoder_attention_heads: int = 12, activation_fn: str = 'gelu', dropout: float = 0.1, attention_dropout: float = 0.1, activation_dropout: float = 0.0, encoder_layerdrop: float = 0.0, dropout_input: float = 0.0, dropout_features: float = 0.0, final_dim: int = 0, untie_final_proj: bool = False, layer_norm_first: bool = False, conv_feature_layers: str = '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', conv_bias: bool = False, logit_temp: float = 0.1, target_glu: bool = False, feature_grad_mult: float = 1.0, mask_length_audio: int = 10, mask_prob_audio: float = 0.65, mask_length_image: int = 10, mask_prob_image: float = 0.65, mask_selection: str = 'static', mask_other: float = 0, no_mask_overlap: bool = False, mask_min_space: int = 1, mask_channel_length: int = 10, mask_channel_prob: float = 0.0, mask_channel_selection: str = 'static', mask_channel_other: float = 0, no_mask_channel_overlap: bool = False, mask_channel_min_space: int = 1, conv_pos: int = 128, conv_pos_groups: int = 16, latent_temp: Tuple[float, float, float] = (2, 0.5, 0.999995), skip_masked: bool = False, skip_nomask: bool = False, resnet_relu_type: str = 'prelu', resnet_weights: str | None = None, sim_type: str = 'cosine', sub_encoder_layers: int = 0, audio_feat_dim: int = -1, modality_dropout: float = 0, audio_dropout: float = 0, modality_fuse: str = 'concat', selection_type: str = 'same_other_seq', masking_type: str = 'input', decoder_embed_dim: int = 768, decoder_ffn_embed_dim: int = 3072, decoder_layers: int = 6, decoder_layerdrop: float = 0.0, decoder_attention_heads: int = 4, decoder_learned_pos: bool = False, decoder_normalize_before: bool = False, no_token_positional_embeddings: bool = False, decoder_dropout: float = 0.1, decoder_attention_dropout: float = 0.1, decoder_activation_dropout: float = 0.0, max_target_positions: int = 2048, share_decoder_input_output_embed: bool = False, audio_only: bool = False, no_scale_embedding: bool = True)
Bases: object
Configuration options taken from the original AV-HuBERT GitHub implementation.
activation_dropout : float = 0.0
activation_fn : str = 'gelu'
attention_dropout : float = 0.1
audio_dropout : float = 0
audio_feat_dim : int = -1
audio_only : bool = False
conv_bias : bool = False
conv_feature_layers : str = '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
conv_pos : int = 128
conv_pos_groups : int = 16
decoder_activation_dropout : float = 0.0
decoder_attention_dropout : float = 0.1
decoder_attention_heads : int = 4
decoder_dropout : float = 0.1
decoder_embed_dim : int = 768
decoder_ffn_embed_dim : int = 3072
decoder_layerdrop : float = 0.0
decoder_layers : int = 6
decoder_learned_pos : bool = False
decoder_normalize_before : bool = False
dropout : float = 0.1
dropout_features : float = 0.0
dropout_input : float = 0.0
encoder_attention_heads : int = 12
encoder_embed_dim : int = 768
encoder_ffn_embed_dim : int = 3072
encoder_layerdrop : float = 0.0
encoder_layers : int = 12
feature_grad_mult : float = 1.0
final_dim : int = 0
label_rate : int = -1
latent_temp : Tuple[float, float, float] = (2, 0.5, 0.999995)
layer_norm_first : bool = False
logit_temp : float = 0.1
mask_channel_length : int = 10
mask_channel_min_space : int = 1
mask_channel_other : float = 0
mask_channel_prob : float = 0.0
mask_channel_selection : str = 'static'
mask_length_audio : int = 10
mask_length_image : int = 10
mask_min_space : int = 1
mask_other : float = 0
mask_prob_audio : float = 0.65
mask_prob_image : float = 0.65
mask_selection : str = 'static'
masking_type : str = 'input'
max_target_positions : int = 2048
modality_dropout : float = 0
modality_fuse : str = 'concat'
no_mask_channel_overlap : bool = False
no_mask_overlap : bool = False
no_scale_embedding : bool = True
no_token_positional_embeddings : bool = False
resnet_relu_type : str = 'prelu'
resnet_weights : str | None = None
sample_rate : int = 16000
selection_type : str = 'same_other_seq'
share_decoder_input_output_embed : bool = False
sim_type : str = 'cosine'
skip_masked : bool = False
skip_nomask : bool = False
sub_encoder_layers : int = 0
target_glu : bool = False
untie_final_proj : bool = False