File size: 7,071 Bytes
e2a8e6f 8c8d6a5 e2a8e6f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
from typing import List, Optional

from transformers.configuration_utils import PretrainedConfig
class MeralionBestRqConformerConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`MeralionBestRqConformer`]. It is used to
    instantiate a BEST-RQ Conformer model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Every constructor argument is stored unchanged on the instance under the same name, except
    `self_condition_layers`, which is copied into a new list. Any extra keyword arguments are forwarded to
    [`PretrainedConfig`].

    Args:
        input_dim (`int`, *optional*, defaults to 80):
            The number of input features in the mel-frequency spectrogram.
        input_channels (`int`, *optional*, defaults to 1):
            The number of input channels of the convolutional subsampling layers.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the encoder layers and the pooler layer.
        ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        conv_depthwise_kernel_size (`int`, *optional*, defaults to 5):
            Kernel size of the depthwise convolution in the Conformer convolution module.
        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for the input projection layer.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for the activation functions in the feed-forward layers.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for the hidden layers.
        max_source_positions (`int`, *optional*, defaults to 3000):
            The maximum sequence length that this model might ever be used with.
        no_scale_embedding (`bool`, *optional*, defaults to `False`):
            Flag controlling whether the embeddings are scaled by the square root of the hidden size. NOTE(review):
            the name suggests that `True` *disables* scaling; confirm against the model implementation.
        hidden_act (`str`, *optional*, defaults to `"swish"`):
            The non-linear activation function (function or string) in the encoder and pooler.
        conformer_conv_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for the Conformer convolution module.
        position_embeddings_type (`str`, *optional*, defaults to `"relative"`):
            The type of position embeddings to use. Can be `"relative"` or `"rotary"`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for the attention layers.
        rotary_embedding_base (`int`, *optional*, defaults to 10000):
            The base for the rotary position embeddings.
        layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability.
        self_condition_layers (`List[int]`, *optional*, defaults to `None`):
            A list of layer indices where self-conditioning should be applied. `None` (the default) is stored as
            an empty list. The given sequence is copied, so later changes to the caller's list do not affect the
            configuration.
        use_weighted_sum (`bool`, *optional*, defaults to `True`):
            Whether to use a weighted sum of all hidden states for the final output of the LSTM-CTC model.
        lstm_dim (`int`, *optional*, defaults to 768):
            The hidden size of the LSTM layers in the LSTM-CTC head.
        lstm_num_layers (`int`, *optional*, defaults to 2):
            The number of layers in the LSTM of the LSTM-CTC head.
        lstm_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout probability for the LSTM layers in the LSTM-CTC head.
        final_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for the final layer before the CTC loss.
        vocab_size (`int`, *optional*):
            Vocabulary size of the model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`MeralionBestRqModelForCTC`].
        ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`):
            The reduction to apply to the output of `torch.nn.functional.ctc_loss`.
        ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
            Whether to zero infinite losses and gradients in `torch.nn.functional.ctc_loss`.

    Attributes:
        inputs_to_logits_ratio (`int`):
            Always set to 640; it is not configurable through the constructor. NOTE(review): presumably the number
            of raw input samples per output frame; confirm against the code that consumes it.
    """

    model_type = "meralion_bestrq"

    def __init__(
        self,
        input_dim: int = 80,
        input_channels: int = 1,
        num_attention_heads: int = 8,
        hidden_size: int = 1024,  # embed_dim
        ffn_dim: int = 4096,
        num_hidden_layers: int = 24,
        conv_depthwise_kernel_size: int = 5,
        feat_proj_dropout: float = 0.,  # for input_projection
        activation_dropout: float = 0.,
        hidden_dropout: float = 0.,
        max_source_positions: int = 3000,
        no_scale_embedding: bool = False,
        hidden_act: str = "swish",
        conformer_conv_dropout: float = 0.,
        position_embeddings_type: str = "relative",
        attention_dropout: float = 0.,
        rotary_embedding_base: int = 10000,
        layerdrop: float = 0.,
        self_condition_layers: Optional[List[int]] = None,  # asr
        use_weighted_sum: bool = True,  # lstm
        lstm_dim: int = 768,  # lstm
        lstm_num_layers: int = 2,  # lstm
        lstm_dropout_prob: float = 0.,  # lstm
        final_dropout: float = 0.,  # ctc
        vocab_size: Optional[int] = None,  # ctc
        ctc_loss_reduction: str = 'sum',  # ctc
        ctc_zero_infinity: bool = False,  # ctc
        **kwargs,
    ):
        # Input features / convolutional subsampling.
        self.input_dim = input_dim
        self.input_channels = input_channels

        # Conformer encoder architecture.
        self.num_attention_heads = num_attention_heads
        self.hidden_size = hidden_size
        self.ffn_dim = ffn_dim
        self.num_hidden_layers = num_hidden_layers
        self.conv_depthwise_kernel_size = conv_depthwise_kernel_size
        self.feat_proj_dropout = feat_proj_dropout
        self.activation_dropout = activation_dropout
        self.hidden_dropout = hidden_dropout
        self.max_source_positions = max_source_positions
        self.no_scale_embedding = no_scale_embedding
        self.hidden_act = hidden_act
        self.conformer_conv_dropout = conformer_conv_dropout
        self.position_embeddings_type = position_embeddings_type
        self.attention_dropout = attention_dropout
        self.rotary_embedding_base = rotary_embedding_base
        self.layerdrop = layerdrop

        # Build a fresh list for every instance. A literal `[]` default would be
        # created once at function-definition time and shared by every config
        # that relies on the default, so mutating it on one config would leak
        # into all the others.
        self.self_condition_layers = (
            [] if self_condition_layers is None else list(self_condition_layers)
        )

        # LSTM-CTC head.
        self.use_weighted_sum = use_weighted_sum
        self.lstm_dim = lstm_dim
        self.lstm_num_layers = lstm_num_layers
        self.lstm_dropout_prob = lstm_dropout_prob

        # CTC output layer and loss.
        self.final_dropout = final_dropout
        self.vocab_size = vocab_size
        self.ctc_loss_reduction = ctc_loss_reduction
        self.ctc_zero_infinity = ctc_zero_infinity

        # Fixed value; intentionally not exposed as a constructor argument.
        self.inputs_to_logits_ratio = 640

        # Called last, as in the original, so the base class handles the
        # remaining generic keyword arguments after the fields above are set.
        super().__init__(**kwargs)