yujiepan committed (verified) · Commit 0c24726 · Parent(s): 17dcccf

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,213 @@
---
library_name: transformers
pipeline_tag: text-generation
inference: true
widget:
- text: Hello!
  example_title: Hello world
  group: Python
base_model:
- microsoft/Phi-4-mini-flash-reasoning
---

This tiny model is for debugging purposes. It is randomly initialized, using a config adapted from [microsoft/Phi-4-mini-flash-reasoning](https://huggingface.co/microsoft/Phi-4-mini-flash-reasoning).

### Example usage:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

torch.random.manual_seed(0)

model_id = "tiny-random/phi4-flash"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

messages = [{"role": "user", "content": "How to solve 3*x^2+4*x+5=1?"}]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
)

outputs = model.generate(
    **inputs.to(model.device),
    max_new_tokens=600,
    temperature=0.6,
    top_p=0.95,
    do_sample=True,
)
# Decode only the newly generated tokens, skipping the prompt.
outputs = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[-1]:])
print(outputs[0])
```

### Code to create this repo:

```python
import json
from pathlib import Path

import accelerate
import torch
from huggingface_hub import file_exists, hf_hub_download
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoProcessor,
    GenerationConfig,
    set_seed,
)

source_model_id = "microsoft/Phi-4-mini-flash-reasoning"
save_folder = "/tmp/tiny-random/phi4-flash"

processor = AutoProcessor.from_pretrained(source_model_id, trust_remote_code=True)
processor.save_pretrained(save_folder)

# Shrink the source config to a tiny, debug-sized model.
with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
    config_json = json.load(f)
for key in ['AutoConfig', 'AutoModelForCausalLM']:
    config_json['auto_map'][key] = f'{source_model_id}--' + config_json['auto_map'][key]
automap = config_json['auto_map']
config_json['hidden_size'] = 64
config_json['intermediate_size'] = 64
config_json['num_attention_heads'] = 2
config_json['num_hidden_layers'] = 4
config_json['num_key_value_heads'] = 2
config_json['tie_word_embeddings'] = True
config_json['sliding_window'] = 512
config_json['use_cache'] = True
config_json['mb_per_layer'] = 2  # every other layer is Mamba; the first layer is Mamba

with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
    json.dump(config_json, f, indent=2)
config = AutoConfig.from_pretrained(
    save_folder,
    trust_remote_code=True,
)
print(config)
torch.set_default_dtype(torch.bfloat16)
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
torch.set_default_dtype(torch.float32)
if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
    model.generation_config = GenerationConfig.from_pretrained(
        source_model_id, trust_remote_code=True,
    )
set_seed(42)
model = model.cpu()  # CPU is more stable than GPU for reproducible random init across machines
with torch.no_grad():
    for name, p in sorted(model.named_parameters()):
        torch.nn.init.normal_(p, 0, 0.2)
        print(name, p.shape)
model.save_pretrained(save_folder)
print(model)

# Restore the remote-code auto_map and keep sliding_window an int
# (bugfix for: "'<' not supported between instances of 'int' and 'list'").
with open(f"{save_folder}/config.json", "r", encoding='utf-8') as f:
    config_json = json.load(f)
config_json['auto_map'] = automap
config_json['sliding_window'] = 512
with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
    json.dump(config_json, f, indent=2)
for python_file in Path(save_folder).glob('*.py'):
    if python_file.name.startswith('modeling_') or python_file.name.startswith('configuration_'):
        python_file.unlink()
```

### Printing the model:

```text
Phi4FlashForCausalLM(
  (model): Phi4FlashModel(
    (embed_tokens): Embedding(200064, 64, padding_idx=199999)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0): SambaYDecoderLayer(
        (mlp): SambaYMLP(
          (fc1): Linear(in_features=64, out_features=128, bias=False)
          (fc2): Linear(in_features=64, out_features=64, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (attn): Phi3Mamba(
          (in_proj): Linear(in_features=64, out_features=256, bias=False)
          (conv1d): Conv1d(128, 128, kernel_size=(4,), stride=(1,), padding=(3,), groups=128)
          (act): SiLU()
          (x_proj): Linear(in_features=128, out_features=36, bias=False)
          (dt_proj): Linear(in_features=4, out_features=128, bias=True)
          (out_proj): Linear(in_features=128, out_features=64, bias=False)
        )
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
      (1): SambaYDecoderLayer(
        (mlp): SambaYMLP(
          (fc1): Linear(in_features=64, out_features=128, bias=False)
          (fc2): Linear(in_features=64, out_features=64, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (attn): SambaYFlashAttention2(
          (out_proj): Linear(in_features=64, out_features=64, bias=True)
          (Wqkv): Linear(in_features=64, out_features=192, bias=True)
          (inner_cross_attn): FlashDiffCustomAttention(
            (subln): SambaYRMSNorm()
          )
        )
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
      (2): SambaYDecoderLayer(
        (mlp): SambaYMLP(
          (fc1): Linear(in_features=64, out_features=128, bias=False)
          (fc2): Linear(in_features=64, out_features=64, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (attn): Phi3Mamba(
          (in_proj): Linear(in_features=64, out_features=256, bias=False)
          (conv1d): Conv1d(128, 128, kernel_size=(4,), stride=(1,), padding=(3,), groups=128)
          (act): SiLU()
          (x_proj): Linear(in_features=128, out_features=36, bias=False)
          (dt_proj): Linear(in_features=4, out_features=128, bias=True)
          (out_proj): Linear(in_features=128, out_features=64, bias=False)
        )
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
      (3): SambaYDecoderLayer(
        (mlp): SambaYMLP(
          (fc1): Linear(in_features=64, out_features=128, bias=False)
          (fc2): Linear(in_features=64, out_features=64, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (attn): SambaYFlashAttention2(
          (out_proj): Linear(in_features=64, out_features=64, bias=True)
          (Wqkv): Linear(in_features=64, out_features=192, bias=True)
          (inner_cross_attn): FlashDiffCustomAttention(
            (subln): SambaYRMSNorm()
          )
        )
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
    )
    (final_layernorm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=64, out_features=200064, bias=False)
)
```
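
The printout shows the `mb_per_layer=2` interleaving: even-indexed layers (0, 2) use `Phi3Mamba` and odd-indexed layers (1, 3) use `SambaYFlashAttention2`. A minimal sketch of that pattern (illustrative only, not code from the repo):

```python
# Illustrative sketch: the Mamba/attention interleaving visible in the
# printout above, where layer i is Mamba when i % mb_per_layer == 0.
num_hidden_layers = 4
mb_per_layer = 2

layer_kinds = [
    "Phi3Mamba" if i % mb_per_layer == 0 else "SambaYFlashAttention2"
    for i in range(num_hidden_layers)
]
print(layer_kinds)
# ['Phi3Mamba', 'SambaYFlashAttention2', 'Phi3Mamba', 'SambaYFlashAttention2']
```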
added_tokens.json ADDED
@@ -0,0 +1,12 @@
{
  "<|/tool_call|>": 200026,
  "<|/tool|>": 200024,
  "<|assistant|>": 200019,
  "<|end|>": 200020,
  "<|system|>": 200022,
  "<|tag|>": 200028,
  "<|tool_call|>": 200025,
  "<|tool_response|>": 200027,
  "<|tool|>": 200023,
  "<|user|>": 200021
}
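
These are the chat-role and tool tokens appended to the base vocabulary. A quick check that the tokenizer resolves them to the ids above (a sketch, reusing the repo id from the README):

```python
# Sketch: verify the added chat tokens map to the ids listed above.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("tiny-random/phi4-flash", trust_remote_code=True)
print(tok.convert_tokens_to_ids(["<|user|>", "<|assistant|>", "<|end|>"]))
# Expected: [200021, 200019, 200020]
```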
chat_template.jinja ADDED
@@ -0,0 +1 @@
{% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}
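
The template wraps each message as `<|role|>content<|end|>`, inlines tool definitions for system messages that carry them, and appends `<|assistant|>` when a generation prompt is requested. A minimal sketch rendering it with plain `jinja2` (assumed installed; `apply_chat_template` performs the equivalent rendering):

```python
# Sketch: render the chat template above with plain jinja2 to see the prompt format.
from jinja2 import Template

with open("chat_template.jinja", encoding="utf-8") as f:
    template = Template(f.read())

prompt = template.render(
    messages=[{"role": "user", "content": "How to solve 3*x^2+4*x+5=1?"}],
    add_generation_prompt=True,
    eos_token="<|endoftext|>",
)
print(prompt)  # <|user|>How to solve 3*x^2+4*x+5=1?<|end|><|assistant|>
```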
config.json ADDED
@@ -0,0 +1,41 @@
{
  "architectures": [
    "Phi4FlashForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-4-mini-flash-reasoning--configuration_phi4flash.Phi4FlashConfig",
    "AutoModelForCausalLM": "microsoft/Phi-4-mini-flash-reasoning--modeling_phi4flash.Phi4FlashForCausalLM",
    "AutoTokenizer": "Xenova/gpt-4o"
  },
  "bos_token_id": 199999,
  "embd_pdrop": 0.0,
  "eos_token_id": 199999,
  "hidden_act": "silu",
  "hidden_size": 64,
  "initializer_range": 0.02,
  "intermediate_size": 64,
  "layer_norm_eps": 1e-05,
  "lm_head_bias": false,
  "mamba_conv_bias": true,
  "mamba_d_conv": 4,
  "mamba_d_state": 16,
  "mamba_dt_rank": 4,
  "mamba_expand": 2,
  "mamba_proj_bias": false,
  "max_position_embeddings": 262144,
  "mb_per_layer": 2,
  "mlp_bias": false,
  "model_type": "phi4flash",
  "num_attention_heads": 2,
  "num_hidden_layers": 4,
  "num_key_value_heads": 2,
  "pad_token_id": 199999,
  "resid_pdrop": 0.0,
  "rope_theta": 10000.0,
  "sliding_window": 512,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.54.0.dev0",
  "use_cache": true,
  "vocab_size": 200064
}
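
Several module shapes in the README's model printout follow from these values: the attention head dim is `hidden_size / num_attention_heads`, the Mamba inner width is `mamba_expand * hidden_size` (the 128-channel `conv1d`), and `x_proj` projects to `mamba_dt_rank + 2 * mamba_d_state` under the usual Mamba parameterization. A small sanity-check sketch:

```python
# Sketch: derive module shapes seen in the printout from config.json values.
import json

with open("config.json", encoding="utf-8") as f:
    cfg = json.load(f)

head_dim = cfg["hidden_size"] // cfg["num_attention_heads"]   # 64 // 2 = 32
mamba_inner = cfg["mamba_expand"] * cfg["hidden_size"]        # 2 * 64 = 128 (conv1d channels)
x_proj_out = cfg["mamba_dt_rank"] + 2 * cfg["mamba_d_state"]  # 4 + 2*16 = 36
print(head_dim, mamba_inner, x_proj_out)
```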
generation_config.json ADDED
@@ -0,0 +1,10 @@
{
  "_from_model_config": true,
  "bos_token_id": 199999,
  "eos_token_id": [
    200020,
    199999
  ],
  "pad_token_id": 199999,
  "transformers_version": "4.54.0.dev0"
}
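
Since `eos_token_id` is a list, generation stops on either `<|end|>` (200020) or `<|endoftext|>` (199999). Loading it back to confirm (sketch):

```python
# Sketch: generation halts at whichever of the two eos ids is produced first.
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("tiny-random/phi4-flash")
print(gen_cfg.eos_token_id)  # [200020, 199999]
```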
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f18cf02c0c75b4bcd621ebbd2267c20c491e56bcaa0d4bb376638e38c2b7a82e
size 25921976
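
The pointer's `size` of 25,921,976 bytes is consistent with roughly 13M bfloat16 parameters (2 bytes each), dominated by the 200064×64 embedding. A sketch for inspecting the checkpoint once Git LFS has fetched it (assumes the `safetensors` package):

```python
# Sketch: count parameters in the downloaded checkpoint; ~25.9 MB / 2 bytes
# per bfloat16 value gives roughly 13M parameters.
from safetensors import safe_open

total = 0
with safe_open("model.safetensors", framework="pt") as f:
    for name in f.keys():
        shape = f.get_slice(name).get_shape()
        n = 1
        for dim in shape:
            n *= dim
        total += n
print(total)
```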
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
{
  "bos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:382cc235b56c725945e149cc25f191da667c836655efd0857b004320e90e91ea
size 15524095
tokenizer_config.json ADDED
@@ -0,0 +1,111 @@
{
  "add_bos_token": false,
  "add_eos_token": false,
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "199999": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "200018": {
      "content": "<|endofprompt|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "200019": {
      "content": "<|assistant|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "200020": {
      "content": "<|end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "200021": {
      "content": "<|user|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "200022": {
      "content": "<|system|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    },
    "200023": {
      "content": "<|tool|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": false
    },
    "200024": {
      "content": "<|/tool|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": false
    },
    "200025": {
      "content": "<|tool_call|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": false
    },
    "200026": {
      "content": "<|/tool_call|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": false
    },
    "200027": {
      "content": "<|tool_response|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": false
    },
    "200028": {
      "content": "<|tag|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": true,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<|endoftext|>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|endoftext|>",
  "extra_special_tokens": {},
  "model_max_length": 65536,
  "pad_token": "<|endoftext|>",
  "tokenizer_class": "GPT2Tokenizer",
  "unk_token": "<|endoftext|>"
}
vocab.json ADDED
The diff for this file is too large to render. See raw diff