Tiny dummy models (collection): randomly initialized tiny models for debugging/testing purposes. 107 items.
This tiny model is for debugging. It is randomly initialized, with a config adapted from openai/gpt-oss-120b.
Note: this model uses a quantized MXFP4 FFN, which requires the Triton MXFP4 kernels:

pip install -U triton git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels
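A quick way to confirm the kernels are importable after installing; a minimal sketch, assuming the package name triton_kernels from the install URL above:

# Sanity check: the MXFP4 path needs both triton and the triton_kernels
# package built from the subdirectory referenced in the install command.
import triton
import triton_kernels  # assumed package name, from python/triton_kernels

print("triton:", triton.__version__)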
Example usage with vLLM:

vllm serve yujiepan/gpt-oss-tiny-random-mxfp4
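Once the server is up, you can query its OpenAI-compatible endpoint; a minimal client sketch, assuming vLLM's default host/port (localhost:8000) and the openai Python package:

# Minimal client against vLLM's OpenAI-compatible API
# (default base URL assumed; adjust host/port if you changed them).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="yujiepan/gpt-oss-tiny-random-mxfp4",
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=16,
)
print(resp.choices[0].message.content)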
Example usage with the Transformers pipeline:

import torch
from transformers import pipeline

model_id = "yujiepan/gpt-oss-tiny-random-mxfp4"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype='auto',
    device_map="cuda",
)
messages = [
    {"role": "user", "content": "Explain quantum mechanics clearly and concisely."},
]
outputs = pipe(
    messages,
    max_new_tokens=16,
)
# The last entry in generated_text is the assistant's reply
print(outputs[0]["generated_text"][-1])
Code used to create this repo:

import json

import safetensors
import torch
from huggingface_hub import hf_hub_download
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    GenerationConfig,
    GptOssForCausalLM,
    pipeline,
    set_seed,
)
source_model_id = "openai/gpt-oss-120b"
save_folder = "/tmp/yujiepan/gpt-oss-tiny-random-mxfp4"
processor = AutoProcessor.from_pretrained(source_model_id)
processor.save_pretrained(save_folder)
with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r') as f:
    config_json = json.load(f)

# Shrink the architecture down to a tiny size
config_json.update({
    "head_dim": 32,
    "hidden_size": 32,  # required by Mxfp4GptOssExperts codes
    "intermediate_size": 64,
    "layer_types": ["sliding_attention", "full_attention"],
    "num_attention_heads": 2,
    "num_hidden_layers": 2,
    "num_key_value_heads": 1,
    "num_local_experts": 32,
    "tie_word_embeddings": True,
})

# Temporarily drop quantization_config so the model is built in bf16;
# it is written back into config.json at the end of the script.
quantization_config = config_json['quantization_config']
del config_json['quantization_config']
with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
    json.dump(config_json, f, indent=2)
config = AutoConfig.from_pretrained(save_folder)
print(config)

torch.set_default_dtype(torch.bfloat16)
model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16)
torch.set_default_dtype(torch.float32)
model.generation_config = GenerationConfig.from_pretrained(
    source_model_id, trust_remote_code=True,
)

# Randomly initialize all parameters with a fixed seed for reproducibility
set_seed(42)
with torch.no_grad():
    for name, p in sorted(model.named_parameters()):
        torch.nn.init.normal_(p, 0, 0.1)
        print(name, p.shape)
model.save_pretrained(save_folder)
# mxfp4: replace the bf16 expert weights with randomly generated
# MXFP4 blocks/scales in the packed uint8 layout
state_dict = model.cpu().state_dict()
del state_dict['lm_head.weight']  # tied to the embedding weights
for i in range(len(model.model.layers)):
    del state_dict[f'model.layers.{i}.mlp.experts.down_proj']
    del state_dict[f'model.layers.{i}.mlp.experts.gate_up_proj']
    state_dict[f'model.layers.{i}.mlp.experts.down_proj_blocks'] = torch.randint(0, 255, size=(
        config.num_local_experts, config.hidden_size, config.intermediate_size // 32, 16), dtype=torch.uint8,
    )
    state_dict[f'model.layers.{i}.mlp.experts.down_proj_scales'] = torch.randint(0, 4, size=(
        config.num_local_experts, config.hidden_size, config.intermediate_size // 32), dtype=torch.uint8,
    )
    state_dict[f'model.layers.{i}.mlp.experts.gate_up_proj_blocks'] = torch.randint(0, 255, size=(
        config.num_local_experts, 2 * config.intermediate_size, config.hidden_size // 32, 16), dtype=torch.uint8,
    )
    state_dict[f'model.layers.{i}.mlp.experts.gate_up_proj_scales'] = torch.randint(0, 4, size=(
        config.num_local_experts, 2 * config.intermediate_size, config.hidden_size // 32), dtype=torch.uint8,
    )
safetensors.torch.save_file(state_dict, f"{save_folder}/model.safetensors")
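# For reference, the shapes above follow the MXFP4 block layout: every 32
# FP4 values are packed into 16 bytes (two 4-bit values per uint8), with one
# uint8 scale per 32-value block. A quick sanity check of that arithmetic,
# using the tiny config values from this script (hidden_size=32,
# intermediate_size=64):
values_per_block = 32
packed_bytes_per_block = values_per_block // 2  # 16, the trailing dim above
down_blocks_per_expert = config.hidden_size * (config.intermediate_size // values_per_block)
print('down_proj blocks per expert:', down_blocks_per_expert)  # 32 * 2 = 64
print('down_proj packed bytes per expert:', down_blocks_per_expert * packed_bytes_per_block)  # 1024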
# Alternative (kept for reference, commented out): quantize the bf16 weights
# with transformers' Mxfp4HfQuantizer instead of generating random
# blocks/scales, mocking the CUDA device capability so the quantizer runs.
# from unittest.mock import Mock
# from transformers.quantizers.auto import AutoHfQuantizer
# from transformers.quantizers.quantizer_mxfp4 import Mxfp4HfQuantizer
# _get_device_capability = torch.cuda.get_device_capability
# torch.cuda.get_device_capability = Mock(return_value=(9, 0))
# set_seed(42)
# bf16_state_dict = model.cpu().state_dict()
# model = AutoModelForCausalLM.from_pretrained(save_folder, torch_dtype=torch.bfloat16, quantization_config=quantization_config)
# for i in range(len(model.model.layers)):
#     model.model.layers[i].mlp.experts.down_proj_bottom_pad = 0
#     model.model.layers[i].mlp.experts.down_proj_right_pad = 0
# hf_quantizer: Mxfp4HfQuantizer = AutoHfQuantizer.from_config(quantization_config)
# hf_quantizer.pre_quantized = False
# ffn_keys = ['model.layers.0.mlp.experts.down_proj', 'model.layers.0.mlp.experts.gate_up_proj',
#             'model.layers.1.mlp.experts.down_proj', 'model.layers.1.mlp.experts.gate_up_proj']
# for key in ffn_keys:
#     hf_quantizer.create_quantized_param(model, bf16_state_dict[key], key, "cuda", bf16_state_dict)
# print('down_proj', model.model.layers[0].mlp.experts.down_proj)
# print('down_proj_blocks', model.model.layers[0].mlp.experts.down_proj_blocks)
# state_dict = model.state_dict()
# del state_dict['lm_head.weight']
# for key in ffn_keys:
#     del state_dict[key]
# for k, v in state_dict.items():
#     if str(v.device) == 'meta':
#         print(k, v.device, v.shape)
# safetensors.torch.save_file(state_dict, f"{save_folder}/model.safetensors")
with open(f"{save_folder}/config.json", "r", encoding='utf-8') as f:
config = json.load(f)
config['quantization_config'] = quantization_config
with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
json.dump(config, f, indent=2)
# torch.cuda.get_device_capability = _get_device_capability
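After the quantization config is restored, the repo should load end to end; a minimal reload check, assuming an MXFP4-capable CUDA GPU and reusing the imports and save_folder from the script above:

# Reload sketch: verifies the saved tiny checkpoint round-trips through
# transformers' MXFP4 loading path.
reloaded = AutoModelForCausalLM.from_pretrained(save_folder, torch_dtype='auto', device_map='cuda')
tokenizer = AutoTokenizer.from_pretrained(save_folder)
inputs = tokenizer("Hello", return_tensors="pt").to('cuda')
print(reloaded.generate(**inputs, max_new_tokens=4))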
Base model: openai/gpt-oss-120b