File size: 7,071 Bytes
e2a8e6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c8d6a5
 
e2a8e6f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from typing import List, Optional

from transformers.configuration_utils import PretrainedConfig


class MeralionBestRqConformerConfig(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`MeralionBestRqConformer`]. It is used to
    instantiate a BEST-RQ Conformer model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        input_dim (`int`, *optional*, defaults to 80):
            The number of input features in the mel-frequency spectrogram.
        input_channels (`int`, *optional*, defaults to 1):
            The number of input channels of the convolutional subsampling layers.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the encoder layers and the pooler layer.
        ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        conv_depthwise_kernel_size (`int`, *optional*, defaults to 5):
            Kernel size of the depthwise convolution in the Conformer convolution module.
        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for the input projection layer.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for the activation functions in the feed-forward layers.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for the hidden layers.
        max_source_positions (`int`, *optional*, defaults to 3000):
            The maximum sequence length that this model might ever be used with.
        no_scale_embedding (`bool`, *optional*, defaults to `False`):
            Whether to scale the embeddings by the square root of the hidden size.
        hidden_act (`str`, *optional*, defaults to `"swish"`):
            The non-linear activation function (function or string) in the encoder and pooler.
        conformer_conv_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for the Conformer convolution module.
        position_embeddings_type (`str`, *optional*, defaults to `"relative"`):
            The type of position embeddings to use. Can be `"relative"` or `"rotary"`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for the attention layers.
        rotary_embedding_base (`int`, *optional*, defaults to 10000):
            The base for the rotary position embeddings.
        layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability.
        self_condition_layers (`List[int]`, *optional*):
            A list of layer indices where self-conditioning should be applied. When not provided, no
            self-conditioning layers are configured (equivalent to an empty list).
        use_weighted_sum (`bool`, *optional*, defaults to `True`):
            Whether to use a weighted sum of all hidden states for the final output of the LSTM-CTC model.
        lstm_dim (`int`, *optional*, defaults to 768):
            The hidden size of the LSTM layers in the LSTM-CTC head.
        lstm_num_layers (`int`, *optional*, defaults to 2):
            The number of layers in the LSTM of the LSTM-CTC head.
        lstm_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout probability for the LSTM layers in the LSTM-CTC head.
        final_dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for the final layer before the CTC loss.
        vocab_size (`int`, *optional*):
            Vocabulary size of the model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`MeralionBestRqModelForCTC`].
        ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`):
            The reduction to apply to the output of `torch.nn.functional.ctc_loss`.
        ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
            Whether to zero infinite losses and gradients in `torch.nn.functional.ctc_loss`.
    """
    model_type = "meralion_bestrq"

    def __init__(
        self,
        input_dim: int = 80,
        input_channels: int = 1,
        num_attention_heads: int = 8,
        hidden_size: int = 1024,  # embed_dim
        ffn_dim: int = 4096,
        num_hidden_layers: int = 24,
        conv_depthwise_kernel_size: int = 5,
        feat_proj_dropout: float = 0.,  # for input_projection
        activation_dropout: float = 0.,
        hidden_dropout: float = 0.,
        max_source_positions: int = 3000,
        no_scale_embedding: bool = False,
        hidden_act: str = "swish",
        conformer_conv_dropout: float = 0.,
        position_embeddings_type: str = "relative",
        attention_dropout: float = 0.,
        rotary_embedding_base: int = 10000,
        layerdrop: float = 0.,
        # NOTE: `None` sentinel instead of a mutable `[]` default, which would be shared
        # (and possibly mutated) across every config instance created with the default.
        self_condition_layers: Optional[List[int]] = None,  # asr
        use_weighted_sum: bool = True,  # lstm
        lstm_dim: int = 768,  # lstm
        lstm_num_layers: int = 2,  # lstm
        lstm_dropout_prob: float = 0.,  # lstm
        final_dropout: float = 0.,  # ctc
        vocab_size: Optional[int] = None,  # ctc
        ctc_loss_reduction: str = 'sum',  # ctc
        ctc_zero_infinity: bool = False,  # ctc
        **kwargs,
    ):

        self.input_dim = input_dim
        self.input_channels = input_channels
        self.num_attention_heads = num_attention_heads
        self.hidden_size = hidden_size
        self.ffn_dim = ffn_dim
        self.num_hidden_layers = num_hidden_layers
        self.conv_depthwise_kernel_size = conv_depthwise_kernel_size
        self.feat_proj_dropout = feat_proj_dropout
        self.activation_dropout = activation_dropout
        self.hidden_dropout = hidden_dropout
        self.max_source_positions = max_source_positions
        self.no_scale_embedding = no_scale_embedding
        self.hidden_act = hidden_act
        self.conformer_conv_dropout = conformer_conv_dropout
        self.position_embeddings_type = position_embeddings_type
        self.attention_dropout = attention_dropout
        self.rotary_embedding_base = rotary_embedding_base
        self.layerdrop = layerdrop
        # Each instance gets its own fresh list; preserves the original `[]` default behavior.
        self.self_condition_layers = list(self_condition_layers) if self_condition_layers is not None else []
        self.use_weighted_sum = use_weighted_sum
        self.lstm_dim = lstm_dim
        self.lstm_num_layers = lstm_num_layers
        self.lstm_dropout_prob = lstm_dropout_prob
        self.final_dropout = final_dropout
        self.vocab_size = vocab_size
        self.ctc_loss_reduction = ctc_loss_reduction
        self.ctc_zero_infinity = ctc_zero_infinity

        # Ratio of input feature frames to output logits — presumably the overall
        # downsampling factor of the feature-extraction + subsampling pipeline.
        # TODO(review): confirm 640 against the model's actual subsampling stack.
        self.inputs_to_logits_ratio = 640
        super().__init__(**kwargs)