Overwrite with snapshot bb8c23be2 from checkpoint itazap/blt-1b
Browse files- config.json +8 -4
config.json
CHANGED
@@ -31,9 +31,10 @@
|
|
31 |
"num_attention_heads": 12,
|
32 |
"num_key_value_heads": null,
|
33 |
"max_position_embeddings": 8192,
|
34 |
-
"
|
35 |
"dropout": 0.0,
|
36 |
"rope_theta": 10000.0,
|
|
|
37 |
"attn_bias_type": "local_block_causal",
|
38 |
"intermediate_size": 2048
|
39 |
},
|
@@ -47,7 +48,7 @@
|
|
47 |
"num_attention_heads": 16,
|
48 |
"num_key_value_heads": null,
|
49 |
"num_hidden_layers": 1,
|
50 |
-
"
|
51 |
"dropout": 0.0,
|
52 |
"max_position_embeddings": 24576,
|
53 |
"rope_theta": 500000.0,
|
@@ -55,6 +56,7 @@
|
|
55 |
"rope_type": "default"
|
56 |
},
|
57 |
"hidden_act": "silu",
|
|
|
58 |
"intermediate_size": 2816
|
59 |
},
|
60 |
"decoder_config": {
|
@@ -66,7 +68,7 @@
|
|
66 |
"num_attention_heads": 16,
|
67 |
"num_key_value_heads": null,
|
68 |
"num_hidden_layers": 9,
|
69 |
-
"
|
70 |
"dropout": 0.0,
|
71 |
"max_position_embeddings": 24576,
|
72 |
"rope_theta": 500000.0,
|
@@ -74,6 +76,7 @@
|
|
74 |
"rope_type": "default"
|
75 |
},
|
76 |
"hidden_act": "silu",
|
|
|
77 |
"intermediate_size": 2816
|
78 |
},
|
79 |
"global_config": {
|
@@ -81,7 +84,7 @@
|
|
81 |
"num_attention_heads": 16,
|
82 |
"num_key_value_heads": null,
|
83 |
"num_hidden_layers": 25,
|
84 |
-
"
|
85 |
"dropout": 0.0,
|
86 |
"max_position_embeddings": 4096,
|
87 |
"rope_theta": 500000.0,
|
@@ -89,6 +92,7 @@
|
|
89 |
"rope_type": "default"
|
90 |
},
|
91 |
"hidden_act": "silu",
|
|
|
92 |
"intermediate_size": 5632
|
93 |
},
|
94 |
"tie_word_embeddings": false
|
|
|
31 |
"num_attention_heads": 12,
|
32 |
"num_key_value_heads": null,
|
33 |
"max_position_embeddings": 8192,
|
34 |
+
"norm_eps": 1e-05,
|
35 |
"dropout": 0.0,
|
36 |
"rope_theta": 10000.0,
|
37 |
+
"attn_impl": "xformers",
|
38 |
"attn_bias_type": "local_block_causal",
|
39 |
"intermediate_size": 2048
|
40 |
},
|
|
|
48 |
"num_attention_heads": 16,
|
49 |
"num_key_value_heads": null,
|
50 |
"num_hidden_layers": 1,
|
51 |
+
"norm_eps": 1e-05,
|
52 |
"dropout": 0.0,
|
53 |
"max_position_embeddings": 24576,
|
54 |
"rope_theta": 500000.0,
|
|
|
56 |
"rope_type": "default"
|
57 |
},
|
58 |
"hidden_act": "silu",
|
59 |
+
"_attn_implementation": "sdpa",
|
60 |
"intermediate_size": 2816
|
61 |
},
|
62 |
"decoder_config": {
|
|
|
68 |
"num_attention_heads": 16,
|
69 |
"num_key_value_heads": null,
|
70 |
"num_hidden_layers": 9,
|
71 |
+
"norm_eps": 1e-05,
|
72 |
"dropout": 0.0,
|
73 |
"max_position_embeddings": 24576,
|
74 |
"rope_theta": 500000.0,
|
|
|
76 |
"rope_type": "default"
|
77 |
},
|
78 |
"hidden_act": "silu",
|
79 |
+
"_attn_implementation": "sdpa",
|
80 |
"intermediate_size": 2816
|
81 |
},
|
82 |
"global_config": {
|
|
|
84 |
"num_attention_heads": 16,
|
85 |
"num_key_value_heads": null,
|
86 |
"num_hidden_layers": 25,
|
87 |
+
"norm_eps": 1e-05,
|
88 |
"dropout": 0.0,
|
89 |
"max_position_embeddings": 4096,
|
90 |
"rope_theta": 500000.0,
|
|
|
92 |
"rope_type": "default"
|
93 |
},
|
94 |
"hidden_act": "silu",
|
95 |
+
"_attn_implementation": "sdpa",
|
96 |
"intermediate_size": 5632
|
97 |
},
|
98 |
"tie_word_embeddings": false
|