yujiepan committed
Commit 94148b8 · verified · 1 Parent(s): 31ddb98

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tekken.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,194 @@
---
library_name: transformers
pipeline_tag: text-generation
inference: true
widget:
- text: Hello!
  example_title: Hello world
  group: Python
base_model:
- mistralai/Voxtral-Small-24B-2507
---

This tiny model is for debugging. It is randomly initialized, using a config adapted from [mistralai/Voxtral-Small-24B-2507](https://huggingface.co/mistralai/Voxtral-Small-24B-2507).

### Example usage:

- vLLM

```bash
vllm serve yujiepan/voxtral-tiny-random --trust-remote-code
```
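
A minimal smoke test of the server started above (not part of the original card): vLLM exposes an OpenAI-compatible API, assumed here to sit at the default `http://localhost:8000/v1`.

```python
# Hypothetical client-side check against the vLLM server launched above.
# Assumes the default OpenAI-compatible endpoint http://localhost:8000/v1.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="yujiepan/voxtral-tiny-random",
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=16,
)
# The checkpoint is randomly initialized, so the reply is gibberish;
# this only verifies that the server loads the model and can generate.
print(response.choices[0].message.content)
```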

- Transformers

```python
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration

model_id = "yujiepan/voxtral-tiny-random"

device = "cuda"
processor = AutoProcessor.from_pretrained(model_id)
model = VoxtralForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map=device)

conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "audio",
                "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/mary_had_lamb.mp3",
            },
            {
                "type": "audio",
                "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/winning_call.mp3",
            },
            {"type": "text", "text": "What sport and what nursery rhyme are referenced?"},
        ],
    }
]

inputs = processor.apply_chat_template(conversation)
inputs = inputs.to(device, dtype=torch.bfloat16)

outputs = model.generate(**inputs, max_new_tokens=32)
decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)

print("\nGenerated response:")
print("=" * 80)
print(decoded_outputs[0])
print("=" * 80)
```

### Code to create this repo:

```python
import json
from pathlib import Path

import accelerate
import torch
from huggingface_hub import file_exists, hf_hub_download
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForCausalLM,
    AutoProcessor,
    GenerationConfig,
    set_seed,
)

source_model_id = "mistralai/Voxtral-Small-24B-2507"
save_folder = "/tmp/yujiepan/voxtral-tiny-random"

processor = AutoProcessor.from_pretrained(source_model_id, trust_remote_code=True)
processor.save_pretrained(save_folder)

with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
    config_json = json.load(f)
config_json['audio_config'].update(
    {
        "head_dim": 32,
        "hidden_size": 64,
        "intermediate_size": 256,
        "num_attention_heads": 2,
        "num_key_value_heads": 2,
        "num_hidden_layers": 2,
    }
)
config_json['hidden_size'] = 64
config_json['text_config'].update(
    {
        "head_dim": 32,
        "hidden_size": 64,
        "intermediate_size": 128,
        "num_attention_heads": 2,
        "num_key_value_heads": 1,
        "num_hidden_layers": 2,
        'tie_word_embeddings': True,
    }
)
with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
    json.dump(config_json, f, indent=2)
config = AutoConfig.from_pretrained(
    save_folder,
    trust_remote_code=True,
)
print(config)
torch.set_default_dtype(torch.bfloat16)
model = AutoModel.from_config(config)
torch.set_default_dtype(torch.float32)
if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
    model.generation_config = GenerationConfig.from_pretrained(
        source_model_id, trust_remote_code=True,
    )
set_seed(42)
model = model.cpu()  # cpu is more stable for random initialization across machines
with torch.no_grad():
    for name, p in sorted(model.named_parameters()):
        torch.nn.init.normal_(p, 0, 0.2)
        print(name, p.shape)
model.save_pretrained(save_folder)
print(model)
```
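
The commit message says the folder was uploaded with `huggingface_hub`. A minimal sketch of that final step, assuming you are authenticated (e.g. via `huggingface-cli login`) and that `yujiepan/voxtral-tiny-random` is the target repo; this step is not part of the script above.

```python
# Hypothetical upload step using huggingface_hub (repo id assumed).
from huggingface_hub import create_repo, upload_folder

repo_id = "yujiepan/voxtral-tiny-random"
create_repo(repo_id, repo_type="model", exist_ok=True)
upload_folder(
    repo_id=repo_id,
    folder_path=save_folder,  # "/tmp/yujiepan/voxtral-tiny-random" from the script above
    commit_message="Upload folder using huggingface_hub",
)
```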

### Printing the model:

```text
VoxtralForConditionalGeneration(
  (audio_tower): VoxtralEncoder(
    (conv1): Conv1d(128, 64, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(64, 64, kernel_size=(3,), stride=(2,), padding=(1,))
    (embed_positions): Embedding(1500, 64)
    (layers): ModuleList(
      (0-1): 2 x VoxtralEncoderLayer(
        (self_attn): VoxtralAttention(
          (k_proj): Linear(in_features=64, out_features=64, bias=False)
          (v_proj): Linear(in_features=64, out_features=64, bias=True)
          (q_proj): Linear(in_features=64, out_features=64, bias=True)
          (out_proj): Linear(in_features=64, out_features=64, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (activation_fn): GELUActivation()
        (fc1): Linear(in_features=64, out_features=256, bias=True)
        (fc2): Linear(in_features=256, out_features=64, bias=True)
        (final_layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
    )
    (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    (avg_pooler): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
  )
  (language_model): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(131072, 64)
      (layers): ModuleList(
        (0-1): 2 x LlamaDecoderLayer(
          (self_attn): LlamaAttention(
            (q_proj): Linear(in_features=64, out_features=64, bias=False)
            (k_proj): Linear(in_features=64, out_features=32, bias=False)
            (v_proj): Linear(in_features=64, out_features=32, bias=False)
            (o_proj): Linear(in_features=64, out_features=64, bias=False)
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear(in_features=64, out_features=128, bias=False)
            (up_proj): Linear(in_features=64, out_features=128, bias=False)
            (down_proj): Linear(in_features=128, out_features=64, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): LlamaRMSNorm((64,), eps=1e-05)
          (post_attention_layernorm): LlamaRMSNorm((64,), eps=1e-05)
        )
      )
      (norm): LlamaRMSNorm((64,), eps=1e-05)
      (rotary_emb): LlamaRotaryEmbedding()
    )
    (lm_head): Linear(in_features=64, out_features=131072, bias=False)
  )
  (multi_modal_projector): VoxtralMultiModalProjector(
    (linear_1): Linear(in_features=256, out_features=64, bias=False)
    (act): GELUActivation()
    (linear_2): Linear(in_features=64, out_features=64, bias=False)
  )
)
```
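
As a quick sanity check (not in the original card), the printed architecture can be confirmed to be tiny by counting parameters after loading the checkpoint:

```python
# Hypothetical size check for the tiny random checkpoint.
from transformers import VoxtralForConditionalGeneration

model = VoxtralForConditionalGeneration.from_pretrained("yujiepan/voxtral-tiny-random")
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params / 1e6:.2f}M parameters")  # expected to be well under 10M given the config above
```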
config.json ADDED
@@ -0,0 +1,54 @@
{
  "architectures": [
    "VoxtralForConditionalGeneration"
  ],
  "audio_config": {
    "activation_dropout": 0.0,
    "activation_function": "gelu",
    "attention_dropout": 0.0,
    "dropout": 0.0,
    "head_dim": 32,
    "hidden_size": 64,
    "initializer_range": 0.02,
    "intermediate_size": 256,
    "layerdrop": 0.0,
    "max_source_positions": 1500,
    "model_type": "voxtral_encoder",
    "num_attention_heads": 2,
    "num_hidden_layers": 2,
    "num_key_value_heads": 2,
    "num_mel_bins": 128,
    "scale_embedding": false,
    "vocab_size": 51866
  },
  "audio_token_id": 24,
  "hidden_size": 64,
  "model_type": "voxtral",
  "projector_hidden_act": "gelu",
  "text_config": {
    "attention_bias": false,
    "attention_dropout": 0.0,
    "head_dim": 32,
    "hidden_act": "silu",
    "hidden_size": 64,
    "initializer_range": 0.02,
    "intermediate_size": 128,
    "max_position_embeddings": 131072,
    "mlp_bias": false,
    "model_type": "llama",
    "num_attention_heads": 2,
    "num_hidden_layers": 2,
    "num_key_value_heads": 1,
    "pretraining_tp": 1,
    "rms_norm_eps": 1e-05,
    "rope_scaling": null,
    "rope_theta": 100000000.0,
    "sliding_window": null,
    "tie_word_embeddings": true,
    "use_cache": true,
    "vocab_size": 131072
  },
  "torch_dtype": "bfloat16",
  "transformers_version": "4.54.0.dev0",
  "vocab_size": 131072
}
generation_config.json ADDED
@@ -0,0 +1,7 @@
{
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 11,
  "transformers_version": "4.54.0.dev0",
  "trust_remote_code": true
}
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fbc8b2aa125d18ced83514cca0b8f156c6713acd05810f2a9b56e4018711483d
size 17438688
preprocessor_config.json ADDED
@@ -0,0 +1,15 @@
{
  "chunk_length": 30,
  "dither": 0.0,
  "feature_extractor_type": "WhisperFeatureExtractor",
  "feature_size": 128,
  "hop_length": 160,
  "n_fft": 400,
  "n_samples": 480000,
  "nb_max_frames": 3000,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "VoxtralProcessor",
  "return_attention_mask": false,
  "sampling_rate": 16000
}
tekken.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4aaf3836c2a5332f029ce85a7a62255c966f47b6797ef81dedd0ade9c862e4a8
size 14894206