diff --git "a/topology.json" "b/topology.json" --- "a/topology.json" +++ "b/topology.json" @@ -1,9 +1,9 @@ { "tensors": { - "model.embed_tokens.weight": { + "model.layers.20.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ - 262144, + 6912, 1152 ], "dtype": "BF16", @@ -14,47 +14,47 @@ 0 ], "shape": [ - 65536, + 1728, 1152 ], "filename_index": 0 }, { "offsets": [ - 65536, + 1728, 0 ], "shape": [ - 65536, + 1728, 1152 ], "filename_index": 1 }, { "offsets": [ - 131072, + 3456, 0 ], "shape": [ - 65536, + 1728, 1152 ], "filename_index": 2 }, { "offsets": [ - 196608, + 5184, 0 ], "shape": [ - 65536, + 1728, 1152 ], "filename_index": 3 } ] }, - "model.layers.21.post_attention_layernorm.weight": { + "model.layers.19.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -99,10 +99,10 @@ } ] }, - "model.layers.15.self_attn.q_proj.weight": { + "model.layers.18.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ - 1024, + 256, 1152 ], "dtype": "BF16", @@ -113,7 +113,7 @@ 0 ], "shape": [ - 1024, + 256, 288 ], "filename_index": 0 @@ -124,7 +124,7 @@ 288 ], "shape": [ - 1024, + 256, 288 ], "filename_index": 1 @@ -135,7 +135,7 @@ 576 ], "shape": [ - 1024, + 256, 288 ], "filename_index": 2 @@ -146,68 +146,59 @@ 864 ], "shape": [ - 1024, + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.1.mlp.up_proj.weight": { + "model.layers.11.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 288 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 576 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 864 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 3 } ] }, - "model.layers.14.mlp.up_proj.weight": { + "model.layers.7.mlp.up_proj.weight": { "type": "Distributed", "shape": [ 6912, @@ -261,55 +252,10 @@ } ] }, - "model.layers.15.self_attn.q_norm.weight": { - "type": "Distributed", - "shape": [ - 256 - ], - "dtype": "BF16", - "chunks": [ - { - "offsets": [ - 0 - ], - "shape": [ - 64 - ], - "filename_index": 0 - }, - { - "offsets": [ - 64 - ], - "shape": [ - 64 - ], - "filename_index": 1 - }, - { - "offsets": [ - 128 - ], - "shape": [ - 64 - ], - "filename_index": 2 - }, - { - "offsets": [ - 192 - ], - "shape": [ - 64 - ], - "filename_index": 3 - } - ] - }, - "model.layers.21.self_attn.q_norm.weight": { + "model.layers.11.input_layernorm.weight": { "type": "Distributed", "shape": [ - 256 + 1152 ], "dtype": "BF16", "chunks": [ @@ -318,94 +264,85 @@ 0 ], "shape": [ - 64 + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 288 ], "shape": [ - 64 + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 576 ], "shape": [ - 64 + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 864 ], "shape": [ - 64 + 288 ], "filename_index": 3 } ] }, - "model.layers.18.self_attn.k_proj.weight": { + "model.layers.2.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 256, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 256, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 256, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 256, 288 ], "filename_index": 3 } ] }, - "model.layers.11.self_attn.q_norm.weight": { + "model.layers.5.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ 256 @@ -450,56 +387,65 @@ } ] }, - "model.layers.24.self_attn.k_norm.weight": { + "model.layers.22.mlp.down_proj.weight": { "type": "Distributed", "shape": [ - 256 + 1152, + 6912 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 64 + 1152, + 1728 ], "filename_index": 0 }, { "offsets": [ - 64 + 0, + 1728 ], "shape": [ - 64 + 1152, + 1728 ], "filename_index": 1 }, { "offsets": [ - 128 + 0, + 3456 ], "shape": [ - 64 + 1152, + 1728 ], "filename_index": 2 }, { "offsets": [ - 192 + 0, + 5184 ], "shape": [ - 64 + 1152, + 1728 ], "filename_index": 3 } ] }, - "model.layers.1.self_attn.o_proj.weight": { + "model.layers.18.mlp.down_proj.weight": { "type": "Distributed", "shape": [ 1152, - 1024 + 6912 ], "dtype": "BF16", "chunks": [ @@ -509,50 +455,50 @@ 0 ], "shape": [ - 288, - 1024 + 1152, + 1728 ], "filename_index": 0 }, { "offsets": [ - 288, - 0 + 0, + 1728 ], "shape": [ - 288, - 1024 + 1152, + 1728 ], "filename_index": 1 }, { "offsets": [ - 576, - 0 + 0, + 3456 ], "shape": [ - 288, - 1024 + 1152, + 1728 ], "filename_index": 2 }, { "offsets": [ - 864, - 0 + 0, + 5184 ], "shape": [ - 288, - 1024 + 1152, + 1728 ], "filename_index": 3 } ] }, - "model.layers.13.mlp.up_proj.weight": { + "model.layers.9.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ - 6912, + 1024, 1152 ], "dtype": "BF16", @@ -563,50 +509,50 @@ 0 ], "shape": [ - 1728, - 1152 + 1024, + 288 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 0, + 288 ], "shape": [ - 1728, - 1152 + 1024, + 288 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 0, + 576 ], "shape": [ - 1728, - 1152 + 1024, + 288 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 0, + 864 ], "shape": [ - 1728, - 1152 + 1024, + 288 ], "filename_index": 3 } ] }, - "model.layers.25.self_attn.k_norm.weight": { + "model.layers.13.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 256 + 1152 ], "dtype": "BF16", "chunks": [ @@ -615,44 +561,44 @@ 0 ], "shape": [ - 64 + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 288 ], "shape": [ - 64 + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 576 ], "shape": [ - 64 + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 864 ], "shape": [ - 64 + 288 ], "filename_index": 3 } ] }, - "model.layers.17.mlp.down_proj.weight": { + "model.layers.4.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ - 1152, - 6912 + 256, + 1152 ], "dtype": "BF16", "chunks": [ @@ -662,47 +608,47 @@ 0 ], "shape": [ - 1152, - 1728 + 256, + 288 ], "filename_index": 0 }, { "offsets": [ 0, - 1728 + 288 ], "shape": [ - 1152, - 1728 + 256, + 288 ], "filename_index": 1 }, { "offsets": [ 0, - 3456 + 576 ], "shape": [ - 1152, - 1728 + 256, + 288 ], "filename_index": 2 }, { "offsets": [ 0, - 5184 + 864 ], "shape": [ - 1152, - 1728 + 256, + 288 ], "filename_index": 3 } ] }, - "model.layers.23.input_layernorm.weight": { + "model.layers.20.input_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -747,115 +693,97 @@ } ] }, - "model.layers.25.mlp.up_proj.weight": { + "model.layers.24.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ - 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 288 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 576 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 864 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 3 } ] }, - "model.layers.5.self_attn.q_proj.weight": { + "model.layers.23.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ - 1024, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1024, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 1024, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 1024, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 1024, 288 ], "filename_index": 3 } ] }, - "model.layers.5.post_feedforward_layernorm.weight": { + "model.layers.2.input_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -900,10 +828,10 @@ } ] }, - "model.layers.17.mlp.up_proj.weight": { + "model.layers.1.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ - 6912, + 1024, 1152 ], "dtype": "BF16", @@ -914,50 +842,50 @@ 0 ], "shape": [ - 1728, - 1152 + 1024, + 288 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 0, + 288 ], "shape": [ - 1728, - 1152 + 1024, + 288 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 0, + 576 ], "shape": [ - 1728, - 1152 + 1024, + 288 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 0, + 864 ], "shape": [ - 1728, - 1152 + 1024, + 288 ], "filename_index": 3 } ] }, - "model.layers.14.pre_feedforward_layernorm.weight": { + "model.layers.13.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ - 1152 + 256 ], "dtype": "BF16", "chunks": [ @@ -966,43 +894,43 @@ 0 ], "shape": [ - 288 + 64 ], "filename_index": 0 }, { "offsets": [ - 288 + 64 ], "shape": [ - 288 + 64 ], "filename_index": 1 }, { "offsets": [ - 576 + 128 ], "shape": [ - 288 + 64 ], "filename_index": 2 }, { "offsets": [ - 864 + 192 ], "shape": [ - 288 + 64 ], "filename_index": 3 } ] }, - "model.layers.10.self_attn.k_proj.weight": { + "model.layers.20.mlp.up_proj.weight": { "type": "Distributed", "shape": [ - 256, + 6912, 1152 ], "dtype": "BF16", @@ -1013,50 +941,50 @@ 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 1728, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 3456, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 5184, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.7.input_layernorm.weight": { + "model.layers.9.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ - 1152 + 256 ], "dtype": "BF16", "chunks": [ @@ -1065,130 +993,148 @@ 0 ], "shape": [ - 288 + 64 ], "filename_index": 0 }, { "offsets": [ - 288 + 64 ], "shape": [ - 288 + 64 ], "filename_index": 1 }, { "offsets": [ - 576 + 128 ], "shape": [ - 288 + 64 ], "filename_index": 2 }, { "offsets": [ - 864 + 192 ], "shape": [ - 288 + 64 ], "filename_index": 3 } ] }, - "model.layers.19.pre_feedforward_layernorm.weight": { + "model.layers.13.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ + 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 288 + 1728, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 576 + 3456, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 864 + 5184, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.25.pre_feedforward_layernorm.weight": { + "model.layers.13.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ - 1152 + 1152, + 1024 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 288, + 1024 ], "filename_index": 0 }, { "offsets": [ - 288 + 288, + 0 ], "shape": [ - 288 + 288, + 1024 ], "filename_index": 1 }, { "offsets": [ - 576 + 576, + 0 ], "shape": [ - 288 + 288, + 1024 ], "filename_index": 2 }, { "offsets": [ - 864 + 864, + 0 ], "shape": [ - 288 + 288, + 1024 ], "filename_index": 3 } ] }, - "model.layers.6.post_feedforward_layernorm.weight": { + "model.layers.0.input_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -1233,7 +1179,7 @@ } ] }, - "model.layers.19.self_attn.v_proj.weight": { + "model.layers.23.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ 256, @@ -1287,56 +1233,65 @@ } ] }, - "model.layers.14.input_layernorm.weight": { + "model.layers.23.mlp.up_proj.weight": { "type": "Distributed", "shape": [ + 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 288 + 1728, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 576 + 3456, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 864 + 5184, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.4.mlp.up_proj.weight": { + "model.layers.20.mlp.down_proj.weight": { "type": "Distributed", "shape": [ - 6912, - 1152 + 1152, + 6912 ], "dtype": "BF16", "chunks": [ @@ -1346,47 +1301,47 @@ 0 ], "shape": [ - 1728, - 1152 + 1152, + 1728 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 0, + 1728 ], "shape": [ - 1728, - 1152 + 1152, + 1728 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 0, + 3456 ], "shape": [ - 1728, - 1152 + 1152, + 1728 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 0, + 5184 ], "shape": [ - 1728, - 1152 + 1152, + 1728 ], "filename_index": 3 } ] }, - "model.layers.8.mlp.gate_proj.weight": { + "model.layers.4.mlp.up_proj.weight": { "type": "Distributed", "shape": [ 6912, @@ -1440,7 +1395,7 @@ } ] }, - "model.layers.19.input_layernorm.weight": { + "model.layers.14.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -1485,7 +1440,7 @@ } ] }, - "model.layers.15.self_attn.k_norm.weight": { + "model.layers.21.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ 256 @@ -1530,7 +1485,52 @@ } ] }, - "model.layers.8.self_attn.k_proj.weight": { + "model.layers.4.self_attn.k_norm.weight": { + "type": "Distributed", + "shape": [ + 256 + ], + "dtype": "BF16", + "chunks": [ + { + "offsets": [ + 0 + ], + "shape": [ + 64 + ], + "filename_index": 0 + }, + { + "offsets": [ + 64 + ], + "shape": [ + 64 + ], + "filename_index": 1 + }, + { + "offsets": [ + 128 + ], + "shape": [ + 64 + ], + "filename_index": 2 + }, + { + "offsets": [ + 192 + ], + "shape": [ + 64 + ], + "filename_index": 3 + } + ] + }, + "model.layers.13.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ 256, @@ -1584,10 +1584,10 @@ } ] }, - "model.layers.16.self_attn.q_proj.weight": { + "model.layers.19.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ - 1024, + 256, 1152 ], "dtype": "BF16", @@ -1598,7 +1598,7 @@ 0 ], "shape": [ - 1024, + 256, 288 ], "filename_index": 0 @@ -1609,7 +1609,7 @@ 288 ], "shape": [ - 1024, + 256, 288 ], "filename_index": 1 @@ -1620,7 +1620,7 @@ 576 ], "shape": [ - 1024, + 256, 288 ], "filename_index": 2 @@ -1631,59 +1631,68 @@ 864 ], "shape": [ - 1024, + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.9.self_attn.q_norm.weight": { + "model.layers.6.mlp.up_proj.weight": { "type": "Distributed", "shape": [ - 256 + 6912, + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 64 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 64 + 1728, + 0 ], "shape": [ - 64 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 128 + 3456, + 0 ], "shape": [ - 64 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 192 + 5184, + 0 ], "shape": [ - 64 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.5.self_attn.o_proj.weight": { + "model.layers.16.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ 1152, @@ -1737,56 +1746,65 @@ } ] }, - "model.layers.11.post_attention_layernorm.weight": { + "model.layers.4.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ + 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 288 + 1728, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 576 + 3456, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 864 + 5184, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.4.self_attn.q_proj.weight": { + "model.layers.2.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ - 1024, - 1152 + 1152, + 1024 ], "dtype": "BF16", "chunks": [ @@ -1796,105 +1814,96 @@ 0 ], "shape": [ - 1024, - 288 + 288, + 1024 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 288, + 0 ], "shape": [ - 1024, - 288 + 288, + 1024 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 576, + 0 ], "shape": [ - 1024, - 288 + 288, + 1024 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 864, + 0 ], "shape": [ - 1024, - 288 + 288, + 1024 ], "filename_index": 3 } ] }, - "model.layers.3.self_attn.k_proj.weight": { + "model.layers.10.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ - 256, - 1152 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 256, - 288 + 64 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 64 ], "shape": [ - 256, - 288 + 64 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 128 ], "shape": [ - 256, - 288 + 64 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 192 ], "shape": [ - 256, - 288 + 64 ], "filename_index": 3 } ] }, - "model.layers.25.self_attn.o_proj.weight": { + "model.layers.0.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ - 1152, - 1024 + 256, + 1152 ], "dtype": "BF16", "chunks": [ @@ -1904,47 +1913,47 @@ 0 ], "shape": [ - 288, - 1024 + 256, + 288 ], "filename_index": 0 }, { "offsets": [ - 288, - 0 + 0, + 288 ], "shape": [ - 288, - 1024 + 256, + 288 ], "filename_index": 1 }, { "offsets": [ - 576, - 0 + 0, + 576 ], "shape": [ - 288, - 1024 + 256, + 288 ], "filename_index": 2 }, { "offsets": [ - 864, - 0 + 0, + 864 ], "shape": [ - 288, - 1024 + 256, + 288 ], "filename_index": 3 } ] }, - "model.layers.19.self_attn.o_proj.weight": { + "model.layers.10.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ 1152, @@ -1998,11 +2007,11 @@ } ] }, - "model.layers.6.self_attn.o_proj.weight": { + "model.layers.5.mlp.down_proj.weight": { "type": "Distributed", "shape": [ 1152, - 1024 + 6912 ], "dtype": "BF16", "chunks": [ @@ -2012,96 +2021,105 @@ 0 ], "shape": [ - 288, - 1024 + 1152, + 1728 ], "filename_index": 0 }, { "offsets": [ - 288, - 0 + 0, + 1728 ], "shape": [ - 288, - 1024 + 1152, + 1728 ], "filename_index": 1 }, { "offsets": [ - 576, - 0 + 0, + 3456 ], "shape": [ - 288, - 1024 + 1152, + 1728 ], "filename_index": 2 }, { "offsets": [ - 864, - 0 + 0, + 5184 ], "shape": [ - 288, - 1024 + 1152, + 1728 ], "filename_index": 3 } ] }, - "model.layers.7.post_feedforward_layernorm.weight": { + "model.layers.6.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ + 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 288 + 1728, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 576 + 3456, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 864 + 5184, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.13.self_attn.o_proj.weight": { + "model.layers.23.mlp.down_proj.weight": { "type": "Distributed", "shape": [ 1152, - 1024 + 6912 ], "dtype": "BF16", "chunks": [ @@ -2111,155 +2129,137 @@ 0 ], "shape": [ - 288, - 1024 + 1152, + 1728 ], "filename_index": 0 }, { "offsets": [ - 288, - 0 + 0, + 1728 ], "shape": [ - 288, - 1024 + 1152, + 1728 ], "filename_index": 1 }, { "offsets": [ - 576, - 0 + 0, + 3456 ], "shape": [ - 288, - 1024 + 1152, + 1728 ], "filename_index": 2 }, { "offsets": [ - 864, - 0 + 0, + 5184 ], "shape": [ - 288, - 1024 + 1152, + 1728 ], "filename_index": 3 } ] }, - "model.layers.22.self_attn.o_proj.weight": { + "model.layers.18.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ - 1152, - 1024 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 288, - 1024 + 64 ], "filename_index": 0 }, { "offsets": [ - 288, - 0 + 64 ], "shape": [ - 288, - 1024 + 64 ], "filename_index": 1 }, { "offsets": [ - 576, - 0 + 128 ], "shape": [ - 288, - 1024 + 64 ], "filename_index": 2 }, { "offsets": [ - 864, - 0 + 192 ], "shape": [ - 288, - 1024 + 64 ], "filename_index": 3 } ] }, - "model.layers.2.mlp.down_proj.weight": { + "model.layers.25.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ - 1152, - 6912 + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 0 }, { "offsets": [ - 0, - 1728 + 288 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 1 }, { "offsets": [ - 0, - 3456 + 576 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 2 }, { "offsets": [ - 0, - 5184 + 864 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 3 } ] }, - "model.layers.22.pre_feedforward_layernorm.weight": { + "model.layers.22.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -2304,10 +2304,10 @@ } ] }, - "model.layers.24.self_attn.v_proj.weight": { + "model.layers.17.mlp.up_proj.weight": { "type": "Distributed", "shape": [ - 256, + 6912, 1152 ], "dtype": "BF16", @@ -2318,92 +2318,101 @@ 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 1728, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 3456, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 5184, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.9.post_feedforward_layernorm.weight": { + "model.layers.15.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ + 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 256, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 256, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 256, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.0.self_attn.q_norm.weight": { + "model.layers.22.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ 256 @@ -2448,7 +2457,61 @@ } ] }, - "model.layers.24.pre_feedforward_layernorm.weight": { + "model.layers.11.mlp.down_proj.weight": { + "type": "Distributed", + "shape": [ + 1152, + 6912 + ], + "dtype": "BF16", + "chunks": [ + { + "offsets": [ + 0, + 0 + ], + "shape": [ + 1152, + 1728 + ], + "filename_index": 0 + }, + { + "offsets": [ + 0, + 1728 + ], + "shape": [ + 1152, + 1728 + ], + "filename_index": 1 + }, + { + "offsets": [ + 0, + 3456 + ], + "shape": [ + 1152, + 1728 + ], + "filename_index": 2 + }, + { + "offsets": [ + 0, + 5184 + ], + "shape": [ + 1152, + 1728 + ], + "filename_index": 3 + } + ] + }, + "model.layers.16.input_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -2493,7 +2556,7 @@ } ] }, - "model.layers.23.self_attn.q_norm.weight": { + "model.layers.4.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ 256 @@ -2538,7 +2601,7 @@ } ] }, - "model.layers.25.post_feedforward_layernorm.weight": { + "model.layers.10.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -2583,61 +2646,52 @@ } ] }, - "model.layers.2.self_attn.q_proj.weight": { + "model.layers.12.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 1024, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1024, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 1024, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 1024, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 1024, 288 ], "filename_index": 3 } ] }, - "model.layers.8.self_attn.v_proj.weight": { + "model.layers.9.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ 256, @@ -2691,10 +2745,10 @@ } ] }, - "model.layers.23.mlp.gate_proj.weight": { + "model.layers.19.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ - 6912, + 256, 1152 ], "dtype": "BF16", @@ -2705,47 +2759,47 @@ 0 ], "shape": [ - 1728, - 1152 + 256, + 288 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 0, + 288 ], "shape": [ - 1728, - 1152 + 256, + 288 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 0, + 576 ], "shape": [ - 1728, - 1152 + 256, + 288 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 0, + 864 ], "shape": [ - 1728, - 1152 + 256, + 288 ], "filename_index": 3 } ] }, - "model.layers.4.post_attention_layernorm.weight": { + "model.layers.23.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -2790,7 +2844,7 @@ } ] }, - "model.layers.20.mlp.gate_proj.weight": { + "model.layers.16.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ 6912, @@ -2844,11 +2898,11 @@ } ] }, - "model.layers.16.self_attn.v_proj.weight": { + "model.layers.4.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ - 256, - 1152 + 1152, + 1024 ], "dtype": "BF16", "chunks": [ @@ -2858,92 +2912,101 @@ 0 ], "shape": [ - 256, - 288 + 288, + 1024 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 288, + 0 ], "shape": [ - 256, - 288 + 288, + 1024 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 576, + 0 ], "shape": [ - 256, - 288 + 288, + 1024 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 864, + 0 ], "shape": [ - 256, - 288 + 288, + 1024 ], "filename_index": 3 } ] }, - "model.layers.1.input_layernorm.weight": { + "model.layers.10.mlp.up_proj.weight": { "type": "Distributed", "shape": [ + 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 288 + 1728, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 576 + 3456, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 864 + 5184, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.22.post_attention_layernorm.weight": { + "model.layers.24.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -2988,61 +3051,52 @@ } ] }, - "model.layers.20.self_attn.v_proj.weight": { + "model.layers.23.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ - 256, - 1152 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 256, - 288 + 64 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 64 ], "shape": [ - 256, - 288 + 64 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 128 ], "shape": [ - 256, - 288 + 64 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 192 ], "shape": [ - 256, - 288 + 64 ], "filename_index": 3 } ] }, - "model.layers.7.post_attention_layernorm.weight": { + "model.layers.19.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -3087,10 +3141,10 @@ } ] }, - "model.layers.8.self_attn.q_proj.weight": { + "model.layers.14.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ - 1024, + 256, 1152 ], "dtype": "BF16", @@ -3101,7 +3155,7 @@ 0 ], "shape": [ - 1024, + 256, 288 ], "filename_index": 0 @@ -3112,7 +3166,7 @@ 288 ], "shape": [ - 1024, + 256, 288 ], "filename_index": 1 @@ -3123,7 +3177,7 @@ 576 ], "shape": [ - 1024, + 256, 288 ], "filename_index": 2 @@ -3134,14 +3188,14 @@ 864 ], "shape": [ - 1024, + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.1.self_attn.k_proj.weight": { + "model.layers.11.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ 256, @@ -3195,7 +3249,61 @@ } ] }, - "model.layers.21.self_attn.o_proj.weight": { + "model.layers.14.mlp.gate_proj.weight": { + "type": "Distributed", + "shape": [ + 6912, + 1152 + ], + "dtype": "BF16", + "chunks": [ + { + "offsets": [ + 0, + 0 + ], + "shape": [ + 1728, + 1152 + ], + "filename_index": 0 + }, + { + "offsets": [ + 1728, + 0 + ], + "shape": [ + 1728, + 1152 + ], + "filename_index": 1 + }, + { + "offsets": [ + 3456, + 0 + ], + "shape": [ + 1728, + 1152 + ], + "filename_index": 2 + }, + { + "offsets": [ + 5184, + 0 + ], + "shape": [ + 1728, + 1152 + ], + "filename_index": 3 + } + ] + }, + "model.layers.22.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ 1152, @@ -3249,7 +3357,7 @@ } ] }, - "model.layers.4.pre_feedforward_layernorm.weight": { + "model.layers.1.input_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -3294,61 +3402,52 @@ } ] }, - "model.layers.23.self_attn.q_proj.weight": { + "model.layers.0.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 1024, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1024, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 1024, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 1024, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 1024, 288 ], "filename_index": 3 } ] }, - "model.layers.4.input_layernorm.weight": { + "model.layers.6.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -3393,352 +3492,316 @@ } ] }, - "model.layers.20.mlp.down_proj.weight": { + "model.layers.0.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ - 1152, - 6912 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1152, - 1728 + 64 ], "filename_index": 0 }, { "offsets": [ - 0, - 1728 + 64 ], "shape": [ - 1152, - 1728 + 64 ], "filename_index": 1 }, { "offsets": [ - 0, - 3456 + 128 ], "shape": [ - 1152, - 1728 + 64 ], "filename_index": 2 }, { "offsets": [ - 0, - 5184 + 192 ], "shape": [ - 1152, - 1728 + 64 ], "filename_index": 3 } ] }, - "model.layers.10.mlp.gate_proj.weight": { + "model.layers.15.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ - 6912, - 1152 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 64 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 128 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 192 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 3 } ] }, - "model.layers.8.post_attention_layernorm.weight": { + "model.layers.22.mlp.up_proj.weight": { "type": "Distributed", "shape": [ + 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 288 - ], - "shape": [ - 288 - ], - "filename_index": 1 - }, - { - "offsets": [ - 576 - ], - "shape": [ - 288 - ], - "filename_index": 2 - }, - { - "offsets": [ - 864 - ], - "shape": [ - 288 - ], - "filename_index": 3 - } - ] - }, - "model.layers.23.self_attn.o_proj.weight": { - "type": "Distributed", - "shape": [ - 1152, - 1024 - ], - "dtype": "BF16", - "chunks": [ - { - "offsets": [ - 0, - 0 - ], - "shape": [ - 288, - 1024 - ], - "filename_index": 0 - }, - { - "offsets": [ - 288, + 1728, 0 ], "shape": [ - 288, - 1024 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 576, + 3456, 0 ], "shape": [ - 288, - 1024 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 864, + 5184, 0 ], "shape": [ - 288, - 1024 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.22.input_layernorm.weight": { + "model.layers.1.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ + 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 256, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 256, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 256, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.8.self_attn.q_norm.weight": { + "model.layers.24.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ - 256 + 1024, + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 64 + 1024, + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 0, + 288 ], "shape": [ - 64 + 1024, + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 0, + 576 ], "shape": [ - 64 + 1024, + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 0, + 864 ], "shape": [ - 64 + 1024, + 288 ], "filename_index": 3 } ] }, - "model.layers.15.post_feedforward_layernorm.weight": { + "model.layers.21.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ + 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 256, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 256, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 256, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.12.self_attn.q_proj.weight": { + "model.layers.11.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ - 1024, + 6912, 1152 ], "dtype": "BF16", @@ -3749,50 +3812,50 @@ 0 ], "shape": [ - 1024, - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 1728, + 0 ], "shape": [ - 1024, - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 3456, + 0 ], "shape": [ - 1024, - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 5184, + 0 ], "shape": [ - 1024, - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.11.self_attn.k_proj.weight": { + "model.layers.23.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ - 256, + 6912, 1152 ], "dtype": "BF16", @@ -3803,47 +3866,47 @@ 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 1728, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 3456, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 5184, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.16.mlp.up_proj.weight": { + "model.layers.21.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ 6912, @@ -3897,10 +3960,10 @@ } ] }, - "model.layers.12.self_attn.q_norm.weight": { + "model.layers.8.input_layernorm.weight": { "type": "Distributed", "shape": [ - 256 + 1152 ], "dtype": "BF16", "chunks": [ @@ -3909,40 +3972,40 @@ 0 ], "shape": [ - 64 + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 288 ], "shape": [ - 64 + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 576 ], "shape": [ - 64 + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 864 ], "shape": [ - 64 + 288 ], "filename_index": 3 } ] }, - "model.layers.19.self_attn.k_norm.weight": { + "model.layers.15.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ 256 @@ -3987,7 +4050,7 @@ } ] }, - "model.layers.23.pre_feedforward_layernorm.weight": { + "model.layers.5.input_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -4032,10 +4095,10 @@ } ] }, - "model.layers.18.self_attn.q_proj.weight": { + "model.layers.8.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ - 1024, + 6912, 1152 ], "dtype": "BF16", @@ -4046,47 +4109,47 @@ 0 ], "shape": [ - 1024, - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 1728, + 0 ], "shape": [ - 1024, - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 3456, + 0 ], "shape": [ - 1024, - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 5184, + 0 ], "shape": [ - 1024, - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.20.self_attn.k_proj.weight": { + "model.layers.7.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ 256, @@ -4140,11 +4203,11 @@ } ] }, - "model.layers.23.self_attn.v_proj.weight": { + "model.layers.6.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ - 256, - 1152 + 1152, + 1024 ], "dtype": "BF16", "chunks": [ @@ -4154,101 +4217,92 @@ 0 ], "shape": [ - 256, - 288 + 288, + 1024 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 288, + 0 ], "shape": [ - 256, - 288 + 288, + 1024 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 - ], + 576, + 0 + ], "shape": [ - 256, - 288 + 288, + 1024 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 864, + 0 ], "shape": [ - 256, - 288 + 288, + 1024 ], "filename_index": 3 } ] }, - "model.layers.6.self_attn.q_proj.weight": { + "model.layers.19.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ - 1024, - 1152 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1024, - 288 + 64 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 64 ], "shape": [ - 1024, - 288 + 64 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 128 ], "shape": [ - 1024, - 288 + 64 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 192 ], "shape": [ - 1024, - 288 + 64 ], "filename_index": 3 } ] }, - "model.layers.9.post_attention_layernorm.weight": { + "model.layers.16.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -4293,7 +4347,7 @@ } ] }, - "model.layers.4.self_attn.v_proj.weight": { + "model.layers.15.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ 256, @@ -4347,7 +4401,7 @@ } ] }, - "model.layers.4.mlp.down_proj.weight": { + "model.layers.1.mlp.down_proj.weight": { "type": "Distributed", "shape": [ 1152, @@ -4401,160 +4455,151 @@ } ] }, - "model.layers.0.post_feedforward_layernorm.weight": { + "model.layers.9.mlp.down_proj.weight": { "type": "Distributed", "shape": [ - 1152 + 1152, + 6912 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 0 }, { "offsets": [ - 288 + 0, + 1728 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 1 }, { "offsets": [ - 576 + 0, + 3456 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 2 }, { "offsets": [ - 864 + 0, + 5184 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 3 } ] }, - "model.layers.8.mlp.down_proj.weight": { + "model.layers.15.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ - 1152, - 6912 + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 0 }, { "offsets": [ - 0, - 1728 + 288 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 1 }, { "offsets": [ - 0, - 3456 + 576 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 2 }, { "offsets": [ - 0, - 5184 + 864 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 3 } ] }, - "model.layers.11.self_attn.v_proj.weight": { + "model.layers.6.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ - 256, - 1152 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 256, - 288 + 64 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 64 ], "shape": [ - 256, - 288 + 64 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 128 ], "shape": [ - 256, - 288 + 64 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 192 ], "shape": [ - 256, - 288 + 64 ], "filename_index": 3 } ] }, - "model.layers.7.self_attn.q_proj.weight": { + "model.layers.15.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ 1024, @@ -4608,61 +4653,52 @@ } ] }, - "model.layers.0.self_attn.o_proj.weight": { + "model.layers.12.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ - 1152, - 1024 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 288, - 1024 + 64 ], "filename_index": 0 }, { "offsets": [ - 288, - 0 + 64 ], "shape": [ - 288, - 1024 + 64 ], "filename_index": 1 }, { "offsets": [ - 576, - 0 + 128 ], "shape": [ - 288, - 1024 + 64 ], "filename_index": 2 }, { "offsets": [ - 864, - 0 + 192 ], "shape": [ - 288, - 1024 + 64 ], "filename_index": 3 } ] }, - "model.layers.13.post_attention_layernorm.weight": { + "model.layers.24.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -4707,64 +4743,55 @@ } ] }, - "model.layers.19.mlp.down_proj.weight": { + "model.layers.21.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ - 1152, - 6912 + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 0 }, { "offsets": [ - 0, - 1728 + 288 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 1 }, { "offsets": [ - 0, - 3456 + 576 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 2 }, { "offsets": [ - 0, - 5184 + 864 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 3 } ] }, - "model.layers.22.mlp.up_proj.weight": { + "model.layers.11.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ - 6912, + 1024, 1152 ], "dtype": "BF16", @@ -4775,95 +4802,104 @@ 0 ], "shape": [ - 1728, - 1152 + 1024, + 288 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 0, + 288 ], "shape": [ - 1728, - 1152 + 1024, + 288 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 0, + 576 ], "shape": [ - 1728, - 1152 + 1024, + 288 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 0, + 864 ], "shape": [ - 1728, - 1152 + 1024, + 288 ], "filename_index": 3 } ] }, - "model.layers.22.self_attn.k_norm.weight": { + "model.layers.20.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ - 256 + 1152, + 1024 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 64 + 288, + 1024 ], "filename_index": 0 }, { "offsets": [ - 64 + 288, + 0 ], "shape": [ - 64 + 288, + 1024 ], "filename_index": 1 }, { "offsets": [ - 128 + 576, + 0 ], "shape": [ - 64 + 288, + 1024 ], "filename_index": 2 }, { "offsets": [ - 192 + 864, + 0 ], "shape": [ - 64 + 288, + 1024 ], "filename_index": 3 } ] }, - "model.layers.17.self_attn.v_proj.weight": { + "model.layers.5.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ - 256, + 1024, 1152 ], "dtype": "BF16", @@ -4874,7 +4910,7 @@ 0 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 0 @@ -4885,7 +4921,7 @@ 288 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 1 @@ -4896,7 +4932,7 @@ 576 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 2 @@ -4907,17 +4943,17 @@ 864 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 3 } ] }, - "model.layers.18.post_attention_layernorm.weight": { + "model.layers.19.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ - 1152 + 256 ], "dtype": "BF16", "chunks": [ @@ -4926,34 +4962,34 @@ 0 ], "shape": [ - 288 + 64 ], "filename_index": 0 }, { "offsets": [ - 288 + 64 ], "shape": [ - 288 + 64 ], "filename_index": 1 }, { "offsets": [ - 576 + 128 ], "shape": [ - 288 + 64 ], "filename_index": 2 }, { "offsets": [ - 864 + 192 ], "shape": [ - 288 + 64 ], "filename_index": 3 } @@ -5013,208 +5049,154 @@ } ] }, - "model.layers.6.self_attn.q_norm.weight": { - "type": "Distributed", - "shape": [ - 256 - ], - "dtype": "BF16", - "chunks": [ - { - "offsets": [ - 0 - ], - "shape": [ - 64 - ], - "filename_index": 0 - }, - { - "offsets": [ - 64 - ], - "shape": [ - 64 - ], - "filename_index": 1 - }, - { - "offsets": [ - 128 - ], - "shape": [ - 64 - ], - "filename_index": 2 - }, - { - "offsets": [ - 192 - ], - "shape": [ - 64 - ], - "filename_index": 3 - } - ] - }, - "model.layers.4.self_attn.k_norm.weight": { + "model.layers.20.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ - 256 + 1024, + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 64 + 1024, + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 0, + 288 ], "shape": [ - 64 + 1024, + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 0, + 576 ], "shape": [ - 64 + 1024, + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 0, + 864 ], "shape": [ - 64 + 1024, + 288 ], "filename_index": 3 } ] }, - "model.layers.3.mlp.gate_proj.weight": { + "model.layers.11.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ - 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 288 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 576 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 864 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 3 } ] }, - "model.layers.14.self_attn.k_proj.weight": { + "model.layers.22.input_layernorm.weight": { "type": "Distributed", "shape": [ - 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 256, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 256, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 256, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 256, 288 ], "filename_index": 3 } ] }, - "model.layers.22.self_attn.q_norm.weight": { + "model.layers.20.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 256 + 1152 ], "dtype": "BF16", "chunks": [ @@ -5223,43 +5205,43 @@ 0 ], "shape": [ - 64 + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 288 ], "shape": [ - 64 + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 576 ], "shape": [ - 64 + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 864 ], "shape": [ - 64 + 288 ], "filename_index": 3 } ] }, - "model.layers.25.self_attn.v_proj.weight": { + "model.layers.17.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ - 256, + 1024, 1152 ], "dtype": "BF16", @@ -5270,7 +5252,7 @@ 0 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 0 @@ -5281,7 +5263,7 @@ 288 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 1 @@ -5292,7 +5274,7 @@ 576 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 2 @@ -5303,68 +5285,59 @@ 864 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 3 } ] }, - "model.layers.15.self_attn.o_proj.weight": { + "model.norm.weight": { "type": "Distributed", "shape": [ - 1152, - 1024 + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 0 }, { "offsets": [ - 288, - 0 + 288 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 1 }, { "offsets": [ - 576, - 0 + 576 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 2 }, { "offsets": [ - 864, - 0 + 864 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 3 } ] }, - "model.layers.13.mlp.gate_proj.weight": { + "model.layers.15.mlp.up_proj.weight": { "type": "Distributed", "shape": [ 6912, @@ -5418,7 +5391,7 @@ } ] }, - "model.layers.2.post_feedforward_layernorm.weight": { + "model.layers.20.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -5463,250 +5436,259 @@ } ] }, - "model.layers.3.mlp.down_proj.weight": { + "model.layers.1.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ - 1152, - 6912 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1152, - 1728 + 64 ], "filename_index": 0 }, { "offsets": [ - 0, - 1728 + 64 ], "shape": [ - 1152, - 1728 + 64 ], "filename_index": 1 }, { "offsets": [ - 0, - 3456 + 128 ], "shape": [ - 1152, - 1728 + 64 ], "filename_index": 2 }, { "offsets": [ - 0, - 5184 + 192 ], "shape": [ - 1152, - 1728 + 64 ], "filename_index": 3 } ] }, - "model.layers.10.mlp.up_proj.weight": { + "model.layers.21.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ - 6912, - 1152 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 64 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 128 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 192 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 3 } ] }, - "model.layers.19.post_attention_layernorm.weight": { + "model.layers.22.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ + 1024, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 1024, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 1024, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 1024, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 1024, 288 ], "filename_index": 3 } ] }, - "model.layers.16.pre_feedforward_layernorm.weight": { + "model.layers.6.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ + 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 256, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 256, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 256, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.8.post_feedforward_layernorm.weight": { + "model.layers.2.mlp.down_proj.weight": { "type": "Distributed", "shape": [ - 1152 + 1152, + 6912 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 0 }, { "offsets": [ - 288 + 0, + 1728 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 1 }, { "offsets": [ - 576 + 0, + 3456 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 2 }, { "offsets": [ - 864 + 0, + 5184 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 3 } ] }, - "model.layers.3.post_attention_layernorm.weight": { + "model.layers.12.input_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -5751,10 +5733,10 @@ } ] }, - "model.layers.5.self_attn.q_norm.weight": { + "model.layers.5.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ - 256 + 1152 ], "dtype": "BF16", "chunks": [ @@ -5763,151 +5745,133 @@ 0 ], "shape": [ - 64 + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 288 ], "shape": [ - 64 + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 576 ], "shape": [ - 64 + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 864 ], "shape": [ - 64 + 288 ], "filename_index": 3 } ] }, - "model.layers.24.mlp.gate_proj.weight": { + "model.layers.24.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ - 6912, - 1152 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 64 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 128 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 192 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 3 } ] }, - "model.layers.1.self_attn.q_proj.weight": { + "model.layers.5.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 1024, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1024, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 1024, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 1024, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 1024, 288 ], "filename_index": 3 } ] }, - "model.layers.3.pre_feedforward_layernorm.weight": { + "model.layers.11.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ - 1152 + 256 ], "dtype": "BF16", "chunks": [ @@ -5916,88 +5880,97 @@ 0 ], "shape": [ - 288 + 64 ], "filename_index": 0 }, { "offsets": [ - 288 + 64 ], "shape": [ - 288 + 64 ], "filename_index": 1 }, { "offsets": [ - 576 + 128 ], "shape": [ - 288 + 64 ], "filename_index": 2 }, { "offsets": [ - 864 + 192 ], "shape": [ - 288 + 64 ], "filename_index": 3 } ] }, - "model.layers.4.post_feedforward_layernorm.weight": { + "model.layers.21.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ - 1152 + 1152, + 1024 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 288, + 1024 ], "filename_index": 0 }, { "offsets": [ - 288 + 288, + 0 ], "shape": [ - 288 + 288, + 1024 ], "filename_index": 1 }, { "offsets": [ - 576 + 576, + 0 ], "shape": [ - 288 + 288, + 1024 ], "filename_index": 2 }, { "offsets": [ - 864 + 864, + 0 ], "shape": [ - 288 + 288, + 1024 ], "filename_index": 3 } ] }, - "model.layers.11.mlp.gate_proj.weight": { + "model.layers.18.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ - 6912, + 256, 1152 ], "dtype": "BF16", @@ -6008,101 +5981,92 @@ 0 ], "shape": [ - 1728, - 1152 + 256, + 288 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 0, + 288 ], "shape": [ - 1728, - 1152 + 256, + 288 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 0, + 576 ], "shape": [ - 1728, - 1152 + 256, + 288 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 0, + 864 ], "shape": [ - 1728, - 1152 + 256, + 288 ], "filename_index": 3 } ] }, - "model.layers.25.mlp.down_proj.weight": { + "model.layers.4.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 1152, - 6912 + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 0 }, { "offsets": [ - 0, - 1728 + 288 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 1 }, { "offsets": [ - 0, - 3456 + 576 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 2 }, { "offsets": [ - 0, - 5184 + 864 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 3 } ] }, - "model.layers.21.mlp.up_proj.weight": { + "model.layers.9.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ 6912, @@ -6156,61 +6120,52 @@ } ] }, - "model.layers.2.mlp.up_proj.weight": { + "model.layers.20.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ - 6912, - 1152 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 64 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 128 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 192 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 3 } ] }, - "model.layers.1.self_attn.v_proj.weight": { + "model.layers.16.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ 256, @@ -6264,10 +6219,10 @@ } ] }, - "model.layers.0.self_attn.k_norm.weight": { + "model.layers.2.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 256 + 1152 ], "dtype": "BF16", "chunks": [ @@ -6276,517 +6231,607 @@ 0 ], "shape": [ - 64 + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 288 ], "shape": [ - 64 + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 576 ], "shape": [ - 64 + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 864 ], "shape": [ - 64 + 288 ], "filename_index": 3 } ] }, - "model.layers.13.input_layernorm.weight": { + "model.layers.4.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ + 1024, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 1024, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 1024, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 1024, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 1024, 288 ], "filename_index": 3 } ] }, - "model.layers.24.self_attn.q_norm.weight": { + "model.layers.6.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ - 256 + 256, + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 64 + 256, + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 0, + 288 ], "shape": [ - 64 + 256, + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 0, + 576 ], "shape": [ - 64 + 256, + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 0, + 864 ], "shape": [ - 64 + 256, + 288 ], "filename_index": 3 } ] }, - "model.layers.17.input_layernorm.weight": { + "model.layers.22.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ + 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 256, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 256, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 256, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.6.mlp.gate_proj.weight": { + "model.layers.17.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ - 6912, - 1152 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 64 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 128 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 192 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 3 } ] }, - "model.layers.15.self_attn.k_proj.weight": { + "model.layers.13.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ - 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 256, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 256, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 256, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 256, 288 ], "filename_index": 3 } ] }, - "model.layers.18.pre_feedforward_layernorm.weight": { + "model.layers.10.mlp.down_proj.weight": { "type": "Distributed", "shape": [ - 1152 + 1152, + 6912 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 0 }, { "offsets": [ - 288 + 0, + 1728 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 1 }, { "offsets": [ - 576 + 0, + 3456 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 2 }, { "offsets": [ - 864 + 0, + 5184 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 3 } ] }, - "model.layers.21.input_layernorm.weight": { + "model.layers.5.self_attn.o_proj.weight": { + "type": "Distributed", + "shape": [ + 1152, + 1024 + ], + "dtype": "BF16", + "chunks": [ + { + "offsets": [ + 0, + 0 + ], + "shape": [ + 288, + 1024 + ], + "filename_index": 0 + }, + { + "offsets": [ + 288, + 0 + ], + "shape": [ + 288, + 1024 + ], + "filename_index": 1 + }, + { + "offsets": [ + 576, + 0 + ], + "shape": [ + 288, + 1024 + ], + "filename_index": 2 + }, + { + "offsets": [ + 864, + 0 + ], + "shape": [ + 288, + 1024 + ], + "filename_index": 3 + } + ] + }, + "model.layers.21.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ + 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 256, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 256, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 256, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.23.self_attn.k_norm.weight": { + "model.layers.1.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ - 256 + 6912, + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 64 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 64 + 1728, + 0 ], "shape": [ - 64 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 128 + 3456, + 0 ], "shape": [ - 64 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 192 + 5184, + 0 ], "shape": [ - 64 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.15.pre_feedforward_layernorm.weight": { + "model.layers.25.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ + 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 256, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 256, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 256, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.11.mlp.down_proj.weight": { + "model.layers.13.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ - 1152, - 6912 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1152, - 1728 + 64 ], "filename_index": 0 }, { "offsets": [ - 0, - 1728 + 64 ], "shape": [ - 1152, - 1728 + 64 ], "filename_index": 1 }, { "offsets": [ - 0, - 3456 + 128 ], "shape": [ - 1152, - 1728 + 64 ], "filename_index": 2 }, { "offsets": [ - 0, - 5184 + 192 ], "shape": [ - 1152, - 1728 + 64 ], "filename_index": 3 } ] }, - "model.layers.21.pre_feedforward_layernorm.weight": { + "model.layers.4.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -6831,52 +6876,61 @@ } ] }, - "model.layers.24.post_attention_layernorm.weight": { + "model.layers.22.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ + 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 256, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 256, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 256, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.19.self_attn.q_norm.weight": { + "model.layers.7.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ 256 @@ -6921,10 +6975,10 @@ } ] }, - "model.layers.2.self_attn.k_norm.weight": { + "model.layers.21.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 256 + 1152 ], "dtype": "BF16", "chunks": [ @@ -6933,151 +6987,133 @@ 0 ], "shape": [ - 64 + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 288 ], "shape": [ - 64 + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 576 ], "shape": [ - 64 + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 864 ], "shape": [ - 64 + 288 ], "filename_index": 3 } ] }, - "model.layers.16.mlp.gate_proj.weight": { + "model.layers.16.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ - 6912, - 1152 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 64 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 128 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 192 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 3 } ] }, - "model.layers.9.mlp.up_proj.weight": { + "model.layers.6.input_layernorm.weight": { "type": "Distributed", "shape": [ - 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 288 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 576 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 864 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 3 } ] }, - "model.layers.14.self_attn.q_norm.weight": { + "model.layers.12.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ - 256 + 1152 ], "dtype": "BF16", "chunks": [ @@ -7086,44 +7122,44 @@ 0 ], "shape": [ - 64 + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 288 ], "shape": [ - 64 + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 576 ], "shape": [ - 64 + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 864 ], "shape": [ - 64 + 288 ], "filename_index": 3 } ] }, - "model.layers.21.mlp.down_proj.weight": { + "model.layers.8.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ - 1152, - 6912 + 256, + 1152 ], "dtype": "BF16", "chunks": [ @@ -7133,150 +7169,96 @@ 0 ], "shape": [ - 1152, - 1728 + 256, + 288 ], "filename_index": 0 }, { "offsets": [ 0, - 1728 + 288 ], "shape": [ - 1152, - 1728 + 256, + 288 ], "filename_index": 1 }, { "offsets": [ 0, - 3456 + 576 ], "shape": [ - 1152, - 1728 + 256, + 288 ], "filename_index": 2 }, { "offsets": [ 0, - 5184 + 864 ], "shape": [ - 1152, - 1728 + 256, + 288 ], "filename_index": 3 } ] }, - "model.layers.5.mlp.up_proj.weight": { + "model.layers.10.input_layernorm.weight": { "type": "Distributed", "shape": [ - 6912, 1152 ], "dtype": "BF16", - "chunks": [ - { - "offsets": [ - 0, - 0 - ], - "shape": [ - 1728, - 1152 - ], - "filename_index": 0 - }, - { - "offsets": [ - 1728, - 0 - ], - "shape": [ - 1728, - 1152 - ], - "filename_index": 1 - }, - { - "offsets": [ - 3456, - 0 - ], - "shape": [ - 1728, - 1152 - ], - "filename_index": 2 - }, - { - "offsets": [ - 5184, - 0 - ], - "shape": [ - 1728, - 1152 - ], - "filename_index": 3 - } - ] - }, - "model.layers.14.self_attn.k_norm.weight": { - "type": "Distributed", - "shape": [ - 256 - ], - "dtype": "BF16", "chunks": [ { "offsets": [ 0 ], "shape": [ - 64 + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 288 ], "shape": [ - 64 + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 576 ], "shape": [ - 64 + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 864 ], "shape": [ - 64 + 288 ], "filename_index": 3 } ] }, - "model.layers.11.self_attn.q_proj.weight": { + "model.layers.7.mlp.down_proj.weight": { "type": "Distributed", "shape": [ - 1024, - 1152 + 1152, + 6912 ], "dtype": "BF16", "chunks": [ @@ -7286,47 +7268,47 @@ 0 ], "shape": [ - 1024, - 288 + 1152, + 1728 ], "filename_index": 0 }, { "offsets": [ 0, - 288 + 1728 ], "shape": [ - 1024, - 288 + 1152, + 1728 ], "filename_index": 1 }, { "offsets": [ 0, - 576 + 3456 ], "shape": [ - 1024, - 288 + 1152, + 1728 ], "filename_index": 2 }, { "offsets": [ 0, - 864 + 5184 ], "shape": [ - 1024, - 288 + 1152, + 1728 ], "filename_index": 3 } ] }, - "model.layers.7.self_attn.q_norm.weight": { + "model.layers.11.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ 256 @@ -7371,7 +7353,7 @@ } ] }, - "model.layers.10.post_feedforward_layernorm.weight": { + "model.layers.7.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -7416,52 +7398,61 @@ } ] }, - "model.norm.weight": { + "model.layers.2.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ + 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 256, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 256, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 256, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.15.post_attention_layernorm.weight": { + "model.layers.7.input_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -7506,52 +7497,61 @@ } ] }, - "model.layers.0.input_layernorm.weight": { + "model.layers.13.mlp.up_proj.weight": { "type": "Distributed", "shape": [ + 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 288 + 1728, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 576 + 3456, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 864 + 5184, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.22.post_feedforward_layernorm.weight": { + "model.layers.0.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -7596,7 +7596,7 @@ } ] }, - "model.layers.18.mlp.up_proj.weight": { + "model.layers.7.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ 6912, @@ -7650,55 +7650,64 @@ } ] }, - "model.layers.8.pre_feedforward_layernorm.weight": { + "model.layers.14.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ + 1024, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 1024, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 1024, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 1024, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 1024, 288 ], "filename_index": 3 } ] }, - "model.layers.14.self_attn.q_proj.weight": { + "model.layers.1.mlp.up_proj.weight": { "type": "Distributed", "shape": [ - 1024, + 6912, 1152 ], "dtype": "BF16", @@ -7709,104 +7718,95 @@ 0 ], "shape": [ - 1024, - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 1728, + 0 ], "shape": [ - 1024, - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 3456, + 0 ], "shape": [ - 1024, - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 5184, + 0 ], "shape": [ - 1024, - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.24.self_attn.o_proj.weight": { + "model.layers.23.input_layernorm.weight": { "type": "Distributed", "shape": [ - 1152, - 1024 + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 0 }, { "offsets": [ - 288, - 0 + 288 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 1 }, { "offsets": [ - 576, - 0 + 576 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 2 }, { "offsets": [ - 864, - 0 + 864 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 3 } ] }, - "model.layers.5.self_attn.v_proj.weight": { + "model.layers.25.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ - 256, + 1024, 1152 ], "dtype": "BF16", @@ -7817,7 +7817,7 @@ 0 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 0 @@ -7828,7 +7828,7 @@ 288 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 1 @@ -7839,7 +7839,7 @@ 576 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 2 @@ -7850,167 +7850,203 @@ 864 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 3 } ] }, - "model.layers.1.mlp.gate_proj.weight": { + "model.layers.3.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 288 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 576 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 864 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 3 } ] }, - "model.layers.23.post_attention_layernorm.weight": { + "model.layers.12.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ + 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 256, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 256, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 256, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.0.self_attn.v_proj.weight": { + "model.layers.12.self_attn.q_norm.weight": { + "type": "Distributed", + "shape": [ + 256 + ], + "dtype": "BF16", + "chunks": [ + { + "offsets": [ + 0 + ], + "shape": [ + 64 + ], + "filename_index": 0 + }, + { + "offsets": [ + 64 + ], + "shape": [ + 64 + ], + "filename_index": 1 + }, + { + "offsets": [ + 128 + ], + "shape": [ + 64 + ], + "filename_index": 2 + }, + { + "offsets": [ + 192 + ], + "shape": [ + 64 + ], + "filename_index": 3 + } + ] + }, + "model.layers.18.input_layernorm.weight": { "type": "Distributed", "shape": [ - 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 256, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 256, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 256, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 256, 288 ], "filename_index": 3 } ] }, - "model.layers.5.mlp.gate_proj.weight": { + "model.layers.0.mlp.up_proj.weight": { "type": "Distributed", "shape": [ 6912, @@ -8064,7 +8100,7 @@ } ] }, - "model.layers.16.self_attn.q_norm.weight": { + "model.layers.25.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ 256 @@ -8109,271 +8145,244 @@ } ] }, - "model.layers.2.self_attn.v_proj.weight": { + "model.layers.2.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ - 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 256, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 256, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 256, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 256, 288 ], "filename_index": 3 } ] }, - "model.layers.14.self_attn.v_proj.weight": { + "model.layers.9.input_layernorm.weight": { "type": "Distributed", "shape": [ - 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 256, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 256, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 256, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 256, 288 ], "filename_index": 3 } ] }, - "model.layers.25.post_attention_layernorm.weight": { + "model.layers.0.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ + 1024, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 1024, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 1024, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 1024, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 1024, 288 ], "filename_index": 3 } ] }, - "model.layers.2.mlp.gate_proj.weight": { + "model.layers.18.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 288 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 576 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 864 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 3 } ] }, - "model.layers.6.mlp.up_proj.weight": { + "model.layers.9.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 288 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 576 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 864 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 3 } ] }, - "model.layers.16.self_attn.k_norm.weight": { + "model.layers.11.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 256 + 1152 ], "dtype": "BF16", "chunks": [ @@ -8382,43 +8391,43 @@ 0 ], "shape": [ - 64 + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 288 ], "shape": [ - 64 + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 576 ], "shape": [ - 64 + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 864 ], "shape": [ - 64 + 288 ], "filename_index": 3 } ] }, - "model.layers.5.self_attn.k_norm.weight": { + "model.layers.4.input_layernorm.weight": { "type": "Distributed", "shape": [ - 256 + 1152 ], "dtype": "BF16", "chunks": [ @@ -8427,44 +8436,44 @@ 0 ], "shape": [ - 64 + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 288 ], "shape": [ - 64 + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 576 ], "shape": [ - 64 + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 864 ], "shape": [ - 64 + 288 ], "filename_index": 3 } ] }, - "model.layers.0.mlp.down_proj.weight": { + "model.layers.14.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ 1152, - 6912 + 1024 ], "dtype": "BF16", "chunks": [ @@ -8474,137 +8483,101 @@ 0 ], "shape": [ - 1152, - 1728 + 288, + 1024 ], "filename_index": 0 }, { "offsets": [ - 0, - 1728 - ], - "shape": [ - 1152, - 1728 - ], - "filename_index": 1 - }, - { - "offsets": [ - 0, - 3456 - ], - "shape": [ - 1152, - 1728 - ], - "filename_index": 2 - }, - { - "offsets": [ - 0, - 5184 - ], - "shape": [ - 1152, - 1728 - ], - "filename_index": 3 - } - ] - }, - "model.layers.5.pre_feedforward_layernorm.weight": { - "type": "Distributed", - "shape": [ - 1152 - ], - "dtype": "BF16", - "chunks": [ - { - "offsets": [ + 288, 0 ], "shape": [ - 288 - ], - "filename_index": 0 - }, - { - "offsets": [ - 288 - ], - "shape": [ - 288 + 288, + 1024 ], "filename_index": 1 }, { "offsets": [ - 576 + 576, + 0 ], "shape": [ - 288 + 288, + 1024 ], "filename_index": 2 }, { "offsets": [ - 864 + 864, + 0 ], "shape": [ - 288 + 288, + 1024 ], "filename_index": 3 } ] }, - "model.layers.14.post_feedforward_layernorm.weight": { + "model.layers.8.mlp.up_proj.weight": { "type": "Distributed", "shape": [ + 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 288 + 1728, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 576 + 3456, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 864 + 5184, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.2.post_attention_layernorm.weight": { + "model.layers.8.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -8649,7 +8622,7 @@ } ] }, - "model.layers.2.self_attn.k_proj.weight": { + "model.layers.9.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ 256, @@ -8703,97 +8676,115 @@ } ] }, - "model.layers.3.post_feedforward_layernorm.weight": { + "model.layers.12.mlp.up_proj.weight": { "type": "Distributed", "shape": [ + 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 288 + 1728, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 576 + 3456, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 864 + 5184, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.3.input_layernorm.weight": { + "model.layers.3.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ + 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 256, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 256, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 256, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.14.post_attention_layernorm.weight": { + "model.layers.12.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -8838,7 +8829,7 @@ } ] }, - "model.layers.7.mlp.gate_proj.weight": { + "model.layers.24.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ 6912, @@ -8892,119 +8883,101 @@ } ] }, - "model.layers.16.mlp.down_proj.weight": { + "model.layers.25.input_layernorm.weight": { "type": "Distributed", "shape": [ - 1152, - 6912 + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 0 }, { "offsets": [ - 0, - 1728 + 288 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 1 }, { "offsets": [ - 0, - 3456 + 576 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 2 }, { "offsets": [ - 0, - 5184 + 864 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 3 } ] }, - "model.layers.23.self_attn.k_proj.weight": { + "model.layers.6.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 256, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 256, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 256, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 256, 288 ], "filename_index": 3 } ] }, - "model.layers.16.self_attn.k_proj.weight": { + "model.layers.17.mlp.down_proj.weight": { "type": "Distributed", "shape": [ - 256, - 1152 + 1152, + 6912 ], "dtype": "BF16", "chunks": [ @@ -9014,47 +8987,47 @@ 0 ], "shape": [ - 256, - 288 + 1152, + 1728 ], "filename_index": 0 }, { "offsets": [ 0, - 288 + 1728 ], "shape": [ - 256, - 288 + 1152, + 1728 ], "filename_index": 1 }, { "offsets": [ 0, - 576 + 3456 ], "shape": [ - 256, - 288 + 1152, + 1728 ], "filename_index": 2 }, { "offsets": [ 0, - 864 + 5184 ], "shape": [ - 256, - 288 + 1152, + 1728 ], "filename_index": 3 } ] }, - "model.layers.0.pre_feedforward_layernorm.weight": { + "model.layers.20.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -9099,7 +9072,7 @@ } ] }, - "model.layers.2.input_layernorm.weight": { + "model.layers.22.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -9144,7 +9117,7 @@ } ] }, - "model.layers.0.mlp.up_proj.weight": { + "model.layers.25.mlp.up_proj.weight": { "type": "Distributed", "shape": [ 6912, @@ -9198,65 +9171,56 @@ } ] }, - "model.layers.9.self_attn.q_proj.weight": { + "model.layers.1.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 1024, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1024, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 1024, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 1024, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 1024, 288 ], "filename_index": 3 } ] }, - "model.layers.10.mlp.down_proj.weight": { + "model.layers.10.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ - 1152, - 6912 + 6912, + 1152 ], "dtype": "BF16", "chunks": [ @@ -9266,50 +9230,50 @@ 0 ], "shape": [ - 1152, - 1728 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 0, - 1728 + 1728, + 0 ], "shape": [ - 1152, - 1728 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 0, - 3456 + 3456, + 0 ], "shape": [ - 1152, - 1728 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 0, - 5184 + 5184, + 0 ], "shape": [ - 1152, - 1728 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.13.self_attn.q_proj.weight": { + "model.layers.5.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ - 1024, + 256, 1152 ], "dtype": "BF16", @@ -9320,7 +9284,7 @@ 0 ], "shape": [ - 1024, + 256, 288 ], "filename_index": 0 @@ -9331,7 +9295,7 @@ 288 ], "shape": [ - 1024, + 256, 288 ], "filename_index": 1 @@ -9342,7 +9306,7 @@ 576 ], "shape": [ - 1024, + 256, 288 ], "filename_index": 2 @@ -9353,17 +9317,17 @@ 864 ], "shape": [ - 1024, + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.17.self_attn.k_norm.weight": { + "model.layers.17.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ - 256 + 1152 ], "dtype": "BF16", "chunks": [ @@ -9372,139 +9336,139 @@ 0 ], "shape": [ - 64 + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 288 ], "shape": [ - 64 + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 576 ], "shape": [ - 64 + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 864 ], "shape": [ - 64 + 288 ], "filename_index": 3 } ] }, - "model.layers.16.post_feedforward_layernorm.weight": { + "model.embed_tokens.weight": { "type": "Distributed", "shape": [ + 262144, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 65536, + 1152 ], "filename_index": 0 }, { "offsets": [ - 288 + 65536, + 0 ], "shape": [ - 288 + 65536, + 1152 ], "filename_index": 1 }, { "offsets": [ - 576 + 131072, + 0 ], "shape": [ - 288 + 65536, + 1152 ], "filename_index": 2 }, { "offsets": [ - 864 + 196608, + 0 ], "shape": [ - 288 + 65536, + 1152 ], "filename_index": 3 } ] }, - "model.layers.9.self_attn.k_proj.weight": { + "model.layers.6.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 256, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 256, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 256, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 256, 288 ], "filename_index": 3 } ] }, - "model.layers.11.pre_feedforward_layernorm.weight": { + "model.layers.1.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -9549,7 +9513,7 @@ } ] }, - "model.layers.2.self_attn.o_proj.weight": { + "model.layers.3.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ 1152, @@ -9603,61 +9567,52 @@ } ] }, - "model.layers.9.mlp.down_proj.weight": { + "model.layers.15.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 1152, - 6912 + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 0 }, { "offsets": [ - 0, - 1728 + 288 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 1 }, { "offsets": [ - 0, - 3456 + 576 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 2 }, { "offsets": [ - 0, - 5184 + 864 ], "shape": [ - 1152, - 1728 + 288 ], "filename_index": 3 } ] }, - "model.layers.7.mlp.down_proj.weight": { + "model.layers.25.mlp.down_proj.weight": { "type": "Distributed", "shape": [ 1152, @@ -9711,11 +9666,11 @@ } ] }, - "model.layers.25.mlp.gate_proj.weight": { + "model.layers.18.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ - 6912, - 1152 + 1152, + 1024 ], "dtype": "BF16", "chunks": [ @@ -9725,149 +9680,149 @@ 0 ], "shape": [ - 1728, - 1152 + 288, + 1024 ], "filename_index": 0 }, { "offsets": [ - 1728, + 288, 0 ], "shape": [ - 1728, - 1152 + 288, + 1024 ], "filename_index": 1 }, { "offsets": [ - 3456, + 576, 0 ], "shape": [ - 1728, - 1152 + 288, + 1024 ], "filename_index": 2 }, { "offsets": [ - 5184, + 864, 0 ], "shape": [ - 1728, - 1152 + 288, + 1024 ], "filename_index": 3 } ] }, - "model.layers.20.self_attn.o_proj.weight": { + "model.layers.8.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ - 1152, - 1024 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 288, - 1024 + 64 ], "filename_index": 0 }, { "offsets": [ - 288, - 0 + 64 ], "shape": [ - 288, - 1024 + 64 ], "filename_index": 1 }, { "offsets": [ - 576, - 0 + 128 ], "shape": [ - 288, - 1024 + 64 ], "filename_index": 2 }, { "offsets": [ - 864, - 0 + 192 ], "shape": [ - 288, - 1024 + 64 ], "filename_index": 3 } ] }, - "model.layers.24.input_layernorm.weight": { + "model.layers.3.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ + 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 288 + 1728, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 576 + 3456, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 864 + 5184, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.4.self_attn.k_proj.weight": { + "model.layers.2.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ - 256, + 1024, 1152 ], "dtype": "BF16", @@ -9878,7 +9833,7 @@ 0 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 0 @@ -9889,7 +9844,7 @@ 288 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 1 @@ -9900,7 +9855,7 @@ 576 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 2 @@ -9911,63 +9866,72 @@ 864 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 3 } ] }, - "model.layers.12.pre_feedforward_layernorm.weight": { + "model.layers.3.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ + 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 256, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 256, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 256, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.24.mlp.up_proj.weight": { + "model.layers.11.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ - 6912, - 1152 + 1152, + 1024 ], "dtype": "BF16", "chunks": [ @@ -9977,92 +9941,101 @@ 0 ], "shape": [ - 1728, - 1152 + 288, + 1024 ], "filename_index": 0 }, { "offsets": [ - 1728, + 288, 0 ], "shape": [ - 1728, - 1152 + 288, + 1024 ], "filename_index": 1 }, { "offsets": [ - 3456, + 576, 0 ], "shape": [ - 1728, - 1152 + 288, + 1024 ], "filename_index": 2 }, { "offsets": [ - 5184, + 864, 0 ], "shape": [ - 1728, - 1152 + 288, + 1024 ], "filename_index": 3 } ] }, - "model.layers.24.post_feedforward_layernorm.weight": { + "model.layers.11.mlp.up_proj.weight": { "type": "Distributed", "shape": [ + 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 288 + 1728, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 576 + 3456, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 864 + 5184, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.1.self_attn.q_norm.weight": { + "model.layers.3.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ 256 @@ -10107,65 +10080,56 @@ } ] }, - "model.layers.5.self_attn.k_proj.weight": { + "model.layers.20.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ - 256, - 1152 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 256, - 288 + 64 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 64 ], "shape": [ - 256, - 288 + 64 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 128 ], "shape": [ - 256, - 288 + 64 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 192 ], "shape": [ - 256, - 288 + 64 ], "filename_index": 3 } ] }, - "model.layers.25.self_attn.q_proj.weight": { + "model.layers.24.mlp.down_proj.weight": { "type": "Distributed", "shape": [ - 1024, - 1152 + 1152, + 6912 ], "dtype": "BF16", "chunks": [ @@ -10175,158 +10139,95 @@ 0 ], "shape": [ - 1024, - 288 + 1152, + 1728 ], "filename_index": 0 }, { "offsets": [ 0, - 288 + 1728 ], "shape": [ - 1024, - 288 + 1152, + 1728 ], "filename_index": 1 }, { "offsets": [ 0, - 576 + 3456 ], "shape": [ - 1024, - 288 + 1152, + 1728 ], "filename_index": 2 }, { "offsets": [ 0, - 864 - ], - "shape": [ - 1024, - 288 - ], - "filename_index": 3 - } - ] - }, - "model.layers.18.self_attn.o_proj.weight": { - "type": "Distributed", - "shape": [ - 1152, - 1024 - ], - "dtype": "BF16", - "chunks": [ - { - "offsets": [ - 0, - 0 - ], - "shape": [ - 288, - 1024 - ], - "filename_index": 0 - }, - { - "offsets": [ - 288, - 0 - ], - "shape": [ - 288, - 1024 - ], - "filename_index": 1 - }, - { - "offsets": [ - 576, - 0 - ], - "shape": [ - 288, - 1024 - ], - "filename_index": 2 - }, - { - "offsets": [ - 864, - 0 + 5184 ], "shape": [ - 288, - 1024 + 1152, + 1728 ], "filename_index": 3 } ] }, - "model.layers.15.self_attn.v_proj.weight": { + "model.layers.17.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 256, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 256, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 256, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 256, 288 ], "filename_index": 3 } ] }, - "model.layers.2.self_attn.q_norm.weight": { + "model.layers.16.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 256 + 1152 ], "dtype": "BF16", "chunks": [ @@ -10335,94 +10236,85 @@ 0 ], "shape": [ - 64 + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 288 ], "shape": [ - 64 + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 576 ], "shape": [ - 64 + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 864 ], "shape": [ - 64 + 288 ], "filename_index": 3 } ] }, - "model.layers.21.self_attn.q_proj.weight": { + "model.layers.7.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 1024, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1024, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 1024, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 1024, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 1024, 288 ], "filename_index": 3 } ] }, - "model.layers.11.mlp.up_proj.weight": { + "model.layers.18.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ 6912, @@ -10476,115 +10368,97 @@ } ] }, - "model.layers.6.self_attn.k_proj.weight": { + "model.layers.1.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ - 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 256, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 256, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 256, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 256, 288 ], "filename_index": 3 } ] }, - "model.layers.12.self_attn.o_proj.weight": { + "model.layers.15.input_layernorm.weight": { "type": "Distributed", "shape": [ - 1152, - 1024 + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 0 }, { "offsets": [ - 288, - 0 + 288 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 1 }, { "offsets": [ - 576, - 0 + 576 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 2 }, { "offsets": [ - 864, - 0 + 864 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 3 } ] }, - "model.layers.20.self_attn.q_norm.weight": { + "model.layers.14.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ 256 @@ -10629,7 +10503,7 @@ } ] }, - "model.layers.17.post_attention_layernorm.weight": { + "model.layers.10.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -10674,65 +10548,56 @@ } ] }, - "model.layers.23.mlp.up_proj.weight": { + "model.layers.13.input_layernorm.weight": { "type": "Distributed", "shape": [ - 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 288 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 576 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 864 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 3 } ] }, - "model.layers.14.mlp.gate_proj.weight": { + "model.layers.8.mlp.down_proj.weight": { "type": "Distributed", "shape": [ - 6912, - 1152 + 1152, + 6912 ], "dtype": "BF16", "chunks": [ @@ -10742,191 +10607,254 @@ 0 ], "shape": [ - 1728, - 1152 + 1152, + 1728 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 0, + 1728 ], "shape": [ - 1728, - 1152 + 1152, + 1728 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 0, + 3456 ], "shape": [ - 1728, - 1152 + 1152, + 1728 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 0, + 5184 ], "shape": [ - 1728, - 1152 + 1152, + 1728 ], "filename_index": 3 } ] }, - "model.layers.9.self_attn.o_proj.weight": { + "model.layers.24.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ - 1152, - 1024 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 288, - 1024 + 64 ], "filename_index": 0 }, { "offsets": [ - 288, - 0 + 64 ], "shape": [ - 288, - 1024 + 64 ], "filename_index": 1 }, { "offsets": [ - 576, - 0 + 128 ], "shape": [ - 288, - 1024 + 64 ], "filename_index": 2 }, { "offsets": [ - 864, - 0 + 192 ], "shape": [ - 288, - 1024 + 64 ], "filename_index": 3 } ] }, - "model.layers.12.post_attention_layernorm.weight": { + "model.layers.14.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ + 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 256, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 256, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 256, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.18.self_attn.q_norm.weight": { + "model.layers.1.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ - 256 + 1152, + 1024 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 64 + 288, + 1024 ], "filename_index": 0 }, { "offsets": [ - 64 + 288, + 0 ], "shape": [ - 64 + 288, + 1024 ], "filename_index": 1 }, { "offsets": [ - 128 + 576, + 0 ], "shape": [ - 64 + 288, + 1024 ], "filename_index": 2 }, { "offsets": [ - 192 + 864, + 0 ], "shape": [ - 64 + 288, + 1024 ], "filename_index": 3 } ] }, - "model.layers.20.mlp.up_proj.weight": { + "model.layers.16.self_attn.q_proj.weight": { + "type": "Distributed", + "shape": [ + 1024, + 1152 + ], + "dtype": "BF16", + "chunks": [ + { + "offsets": [ + 0, + 0 + ], + "shape": [ + 1024, + 288 + ], + "filename_index": 0 + }, + { + "offsets": [ + 0, + 288 + ], + "shape": [ + 1024, + 288 + ], + "filename_index": 1 + }, + { + "offsets": [ + 0, + 576 + ], + "shape": [ + 1024, + 288 + ], + "filename_index": 2 + }, + { + "offsets": [ + 0, + 864 + ], + "shape": [ + 1024, + 288 + ], + "filename_index": 3 + } + ] + }, + "model.layers.15.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ 6912, @@ -10980,11 +10908,11 @@ } ] }, - "model.layers.16.self_attn.o_proj.weight": { + "model.layers.10.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ - 1152, - 1024 + 256, + 1152 ], "dtype": "BF16", "chunks": [ @@ -10994,47 +10922,47 @@ 0 ], "shape": [ - 288, - 1024 + 256, + 288 ], "filename_index": 0 }, { "offsets": [ - 288, - 0 + 0, + 288 ], "shape": [ - 288, - 1024 + 256, + 288 ], "filename_index": 1 }, { "offsets": [ - 576, - 0 + 0, + 576 ], "shape": [ - 288, - 1024 + 256, + 288 ], "filename_index": 2 }, { "offsets": [ - 864, - 0 + 0, + 864 ], "shape": [ - 288, - 1024 + 256, + 288 ], "filename_index": 3 } ] }, - "model.layers.6.self_attn.k_norm.weight": { + "model.layers.7.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ 256 @@ -11079,7 +11007,7 @@ } ] }, - "model.layers.15.mlp.up_proj.weight": { + "model.layers.2.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ 6912, @@ -11133,52 +11061,61 @@ } ] }, - "model.layers.1.post_attention_layernorm.weight": { + "model.layers.15.mlp.down_proj.weight": { "type": "Distributed", "shape": [ - 1152 + 1152, + 6912 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 0 }, { "offsets": [ - 288 + 0, + 1728 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 1 }, { "offsets": [ - 576 + 0, + 3456 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 2 }, { "offsets": [ - 864 + 0, + 5184 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 3 } ] }, - "model.layers.3.mlp.up_proj.weight": { + "model.layers.14.mlp.up_proj.weight": { "type": "Distributed", "shape": [ 6912, @@ -11232,61 +11169,52 @@ } ] }, - "model.layers.22.mlp.gate_proj.weight": { + "model.layers.17.input_layernorm.weight": { "type": "Distributed", "shape": [ - 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 288 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 576 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 864 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 3 } ] }, - "model.layers.21.self_attn.k_proj.weight": { + "model.layers.2.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ 256, @@ -11340,65 +11268,164 @@ } ] }, - "model.layers.11.self_attn.o_proj.weight": { + "model.layers.5.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 1152, - 1024 + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 0 }, { "offsets": [ - 288, - 0 + 288 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 1 }, { "offsets": [ - 576, + 576 + ], + "shape": [ + 288 + ], + "filename_index": 2 + }, + { + "offsets": [ + 864 + ], + "shape": [ + 288 + ], + "filename_index": 3 + } + ] + }, + "model.layers.11.self_attn.k_proj.weight": { + "type": "Distributed", + "shape": [ + 256, + 1152 + ], + "dtype": "BF16", + "chunks": [ + { + "offsets": [ + 0, 0 ], "shape": [ - 288, - 1024 + 256, + 288 + ], + "filename_index": 0 + }, + { + "offsets": [ + 0, + 288 + ], + "shape": [ + 256, + 288 + ], + "filename_index": 1 + }, + { + "offsets": [ + 0, + 576 + ], + "shape": [ + 256, + 288 ], "filename_index": 2 }, { "offsets": [ - 864, + 0, + 864 + ], + "shape": [ + 256, + 288 + ], + "filename_index": 3 + } + ] + }, + "model.layers.3.mlp.down_proj.weight": { + "type": "Distributed", + "shape": [ + 1152, + 6912 + ], + "dtype": "BF16", + "chunks": [ + { + "offsets": [ + 0, 0 ], "shape": [ - 288, - 1024 + 1152, + 1728 + ], + "filename_index": 0 + }, + { + "offsets": [ + 0, + 1728 + ], + "shape": [ + 1152, + 1728 + ], + "filename_index": 1 + }, + { + "offsets": [ + 0, + 3456 + ], + "shape": [ + 1152, + 1728 + ], + "filename_index": 2 + }, + { + "offsets": [ + 0, + 5184 + ], + "shape": [ + 1152, + 1728 ], "filename_index": 3 } ] }, - "model.layers.18.mlp.gate_proj.weight": { + "model.layers.7.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ - 6912, - 1152 + 1152, + 1024 ], "dtype": "BF16", "chunks": [ @@ -11408,47 +11435,47 @@ 0 ], "shape": [ - 1728, - 1152 + 288, + 1024 ], "filename_index": 0 }, { "offsets": [ - 1728, + 288, 0 ], "shape": [ - 1728, - 1152 + 288, + 1024 ], "filename_index": 1 }, { "offsets": [ - 3456, + 576, 0 ], "shape": [ - 1728, - 1152 + 288, + 1024 ], "filename_index": 2 }, { "offsets": [ - 5184, + 864, 0 ], "shape": [ - 1728, - 1152 + 288, + 1024 ], "filename_index": 3 } ] }, - "model.layers.10.pre_feedforward_layernorm.weight": { + "model.layers.3.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -11493,199 +11520,217 @@ } ] }, - "model.layers.11.self_attn.k_norm.weight": { + "model.layers.4.mlp.down_proj.weight": { "type": "Distributed", "shape": [ - 256 + 1152, + 6912 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 64 + 1152, + 1728 ], "filename_index": 0 }, { "offsets": [ - 64 + 0, + 1728 ], "shape": [ - 64 + 1152, + 1728 ], "filename_index": 1 }, { "offsets": [ - 128 + 0, + 3456 ], "shape": [ - 64 + 1152, + 1728 ], "filename_index": 2 }, { "offsets": [ - 192 + 0, + 5184 ], "shape": [ - 64 + 1152, + 1728 ], "filename_index": 3 } ] }, - "model.layers.19.mlp.up_proj.weight": { + "model.layers.22.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ - 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 288 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 576 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 864 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 3 } ] }, - "model.layers.15.input_layernorm.weight": { + "model.layers.3.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ + 1024, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 1024, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 1024, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 1024, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 1024, 288 ], "filename_index": 3 } ] }, - "model.layers.19.post_feedforward_layernorm.weight": { + "model.layers.23.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ - 1152 + 1152, + 1024 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 288, + 1024 ], "filename_index": 0 }, { "offsets": [ - 288 + 288, + 0 ], "shape": [ - 288 + 288, + 1024 ], "filename_index": 1 }, { "offsets": [ - 576 + 576, + 0 ], "shape": [ - 288 + 288, + 1024 ], "filename_index": 2 }, { "offsets": [ - 864 + 864, + 0 ], "shape": [ - 288 + 288, + 1024 ], "filename_index": 3 } ] }, - "model.layers.13.self_attn.k_norm.weight": { + "model.layers.3.input_layernorm.weight": { "type": "Distributed", "shape": [ - 256 + 1152 ], "dtype": "BF16", "chunks": [ @@ -11694,89 +11739,98 @@ 0 ], "shape": [ - 64 + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 288 ], "shape": [ - 64 + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 576 ], "shape": [ - 64 + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 864 ], "shape": [ - 64 + 288 ], "filename_index": 3 } ] }, - "model.layers.8.self_attn.k_norm.weight": { + "model.layers.6.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ - 256 + 1024, + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 64 + 1024, + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 0, + 288 ], "shape": [ - 64 + 1024, + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 0, + 576 ], "shape": [ - 64 + 1024, + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 0, + 864 ], "shape": [ - 64 + 1024, + 288 ], "filename_index": 3 } ] }, - "model.layers.18.mlp.down_proj.weight": { + "model.layers.18.mlp.up_proj.weight": { "type": "Distributed", "shape": [ - 1152, - 6912 + 6912, + 1152 ], "dtype": "BF16", "chunks": [ @@ -11786,47 +11840,47 @@ 0 ], "shape": [ - 1152, - 1728 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 0, - 1728 + 1728, + 0 ], "shape": [ - 1152, - 1728 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 0, - 3456 + 3456, + 0 ], "shape": [ - 1152, - 1728 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 0, - 5184 + 5184, + 0 ], "shape": [ - 1152, - 1728 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.10.self_attn.q_norm.weight": { + "model.layers.2.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ 256 @@ -11871,97 +11925,115 @@ } ] }, - "model.layers.0.post_attention_layernorm.weight": { + "model.layers.24.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ + 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 256, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 256, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 256, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.9.input_layernorm.weight": { + "model.layers.19.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ + 1024, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 1024, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 1024, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 1024, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 1024, 288 ], "filename_index": 3 } ] }, - "model.layers.13.post_feedforward_layernorm.weight": { + "model.layers.15.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -12006,154 +12078,163 @@ } ] }, - "model.layers.17.mlp.gate_proj.weight": { + "model.layers.2.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ - 6912, - 1152 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 64 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 128 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 192 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 3 } ] }, - "model.layers.10.input_layernorm.weight": { + "model.layers.17.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ + 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 256, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 256, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 256, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.12.self_attn.k_norm.weight": { + "model.layers.0.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ - 256 + 256, + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 64 + 256, + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 0, + 288 ], "shape": [ - 64 + 256, + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 0, + 576 ], "shape": [ - 64 + 256, + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 0, + 864 ], "shape": [ - 64 + 256, + 288 ], "filename_index": 3 } ] }, - "model.layers.1.self_attn.k_norm.weight": { + "model.layers.9.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ - 256 + 1152 ], "dtype": "BF16", "chunks": [ @@ -12162,139 +12243,139 @@ 0 ], "shape": [ - 64 + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 288 ], "shape": [ - 64 + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 576 ], "shape": [ - 64 + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 864 ], "shape": [ - 64 + 288 ], "filename_index": 3 } ] }, - "model.layers.3.self_attn.q_proj.weight": { + "model.layers.16.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 1024, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1024, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 1024, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 1024, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 1024, 288 ], "filename_index": 3 } ] }, - "model.layers.2.pre_feedforward_layernorm.weight": { + "model.layers.16.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ + 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 256, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 256, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 256, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.6.pre_feedforward_layernorm.weight": { + "model.layers.14.input_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -12339,10 +12420,10 @@ } ] }, - "model.layers.23.post_feedforward_layernorm.weight": { + "model.layers.25.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ - 1152 + 256 ], "dtype": "BF16", "chunks": [ @@ -12351,43 +12432,43 @@ 0 ], "shape": [ - 288 + 64 ], "filename_index": 0 }, { "offsets": [ - 288 + 64 ], "shape": [ - 288 + 64 ], "filename_index": 1 }, { "offsets": [ - 576 + 128 ], "shape": [ - 288 + 64 ], "filename_index": 2 }, { "offsets": [ - 864 + 192 ], "shape": [ - 288 + 64 ], "filename_index": 3 } ] }, - "model.layers.7.self_attn.k_norm.weight": { + "model.layers.13.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 256 + 1152 ], "dtype": "BF16", "chunks": [ @@ -12396,133 +12477,151 @@ 0 ], "shape": [ - 64 + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 288 ], "shape": [ - 64 + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 576 ], "shape": [ - 64 + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 864 ], "shape": [ - 64 + 288 ], "filename_index": 3 } ] }, - "model.layers.6.input_layernorm.weight": { + "model.layers.5.mlp.up_proj.weight": { "type": "Distributed", "shape": [ + 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 288 + 1728, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 576 + 3456, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 864 + 5184, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.20.post_feedforward_layernorm.weight": { + "model.layers.3.mlp.up_proj.weight": { "type": "Distributed", "shape": [ + 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 288 + 1728, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 576 + 3456, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 864 + 5184, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.20.self_attn.k_norm.weight": { + "model.layers.14.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ - 256 + 1152 ], "dtype": "BF16", "chunks": [ @@ -12531,34 +12630,34 @@ 0 ], "shape": [ - 64 + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 288 ], "shape": [ - 64 + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 576 ], "shape": [ - 64 + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 864 ], "shape": [ - 64 + 288 ], "filename_index": 3 } @@ -12618,106 +12717,52 @@ } ] }, - "model.layers.4.mlp.gate_proj.weight": { + "model.layers.21.input_layernorm.weight": { "type": "Distributed", "shape": [ - 6912, 1152 ], "dtype": "BF16", - "chunks": [ - { - "offsets": [ - 0, - 0 - ], - "shape": [ - 1728, - 1152 - ], - "filename_index": 0 - }, - { - "offsets": [ - 1728, - 0 - ], - "shape": [ - 1728, - 1152 - ], - "filename_index": 1 - }, - { - "offsets": [ - 3456, - 0 - ], - "shape": [ - 1728, - 1152 - ], - "filename_index": 2 - }, - { - "offsets": [ - 5184, - 0 - ], - "shape": [ - 1728, - 1152 - ], - "filename_index": 3 - } - ] - }, - "model.layers.10.self_attn.k_norm.weight": { - "type": "Distributed", - "shape": [ - 256 - ], - "dtype": "BF16", "chunks": [ { "offsets": [ 0 ], "shape": [ - 64 + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 288 ], "shape": [ - 64 + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 576 ], "shape": [ - 64 + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 864 ], "shape": [ - 64 + 288 ], "filename_index": 3 } ] }, - "model.layers.3.self_attn.q_norm.weight": { + "model.layers.8.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ 256 @@ -12762,7 +12807,7 @@ } ] }, - "model.layers.5.mlp.down_proj.weight": { + "model.layers.19.mlp.down_proj.weight": { "type": "Distributed", "shape": [ 1152, @@ -12816,10 +12861,10 @@ } ] }, - "model.layers.18.input_layernorm.weight": { + "model.layers.16.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ - 1152 + 256 ], "dtype": "BF16", "chunks": [ @@ -12828,151 +12873,133 @@ 0 ], "shape": [ - 288 + 64 ], "filename_index": 0 }, { "offsets": [ - 288 + 64 ], "shape": [ - 288 + 64 ], "filename_index": 1 }, { "offsets": [ - 576 + 128 ], "shape": [ - 288 + 64 ], "filename_index": 2 }, { "offsets": [ - 864 + 192 ], "shape": [ - 288 + 64 ], "filename_index": 3 } ] }, - "model.layers.25.self_attn.k_proj.weight": { + "model.layers.19.input_layernorm.weight": { "type": "Distributed", "shape": [ - 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 256, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 256, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 256, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 256, 288 ], "filename_index": 3 } ] }, - "model.layers.15.mlp.gate_proj.weight": { + "model.layers.3.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ - 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 288 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 576 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 864 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 3 } ] }, - "model.layers.12.self_attn.v_proj.weight": { + "model.layers.8.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ - 256, + 1024, 1152 ], "dtype": "BF16", @@ -12983,7 +13010,7 @@ 0 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 0 @@ -12994,7 +13021,7 @@ 288 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 1 @@ -13005,7 +13032,7 @@ 576 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 2 @@ -13016,68 +13043,59 @@ 864 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 3 } ] }, - "model.layers.7.self_attn.o_proj.weight": { + "model.layers.4.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ - 1152, - 1024 + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 0 }, { "offsets": [ - 288, - 0 + 288 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 1 }, { "offsets": [ - 576, - 0 + 576 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 2 }, { "offsets": [ - 864, - 0 + 864 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 3 } ] }, - "model.layers.4.self_attn.q_norm.weight": { + "model.layers.0.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ 256 @@ -13122,65 +13140,11 @@ } ] }, - "model.layers.19.self_attn.k_proj.weight": { - "type": "Distributed", - "shape": [ - 256, - 1152 - ], - "dtype": "BF16", - "chunks": [ - { - "offsets": [ - 0, - 0 - ], - "shape": [ - 256, - 288 - ], - "filename_index": 0 - }, - { - "offsets": [ - 0, - 288 - ], - "shape": [ - 256, - 288 - ], - "filename_index": 1 - }, - { - "offsets": [ - 0, - 576 - ], - "shape": [ - 256, - 288 - ], - "filename_index": 2 - }, - { - "offsets": [ - 0, - 864 - ], - "shape": [ - 256, - 288 - ], - "filename_index": 3 - } - ] - }, - "model.layers.19.mlp.gate_proj.weight": { + "model.layers.19.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ - 6912, - 1152 + 1152, + 1024 ], "dtype": "BF16", "chunks": [ @@ -13190,50 +13154,50 @@ 0 ], "shape": [ - 1728, - 1152 + 288, + 1024 ], "filename_index": 0 }, { "offsets": [ - 1728, + 288, 0 ], "shape": [ - 1728, - 1152 + 288, + 1024 ], "filename_index": 1 }, { "offsets": [ - 3456, + 576, 0 ], "shape": [ - 1728, - 1152 + 288, + 1024 ], "filename_index": 2 }, { "offsets": [ - 5184, + 864, 0 ], "shape": [ - 1728, - 1152 + 288, + 1024 ], "filename_index": 3 } ] }, - "model.layers.18.post_feedforward_layernorm.weight": { + "model.layers.6.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ - 1152 + 256 ], "dtype": "BF16", "chunks": [ @@ -13242,89 +13206,98 @@ 0 ], "shape": [ - 288 + 64 ], "filename_index": 0 }, { "offsets": [ - 288 + 64 ], "shape": [ - 288 + 64 ], "filename_index": 1 }, { "offsets": [ - 576 + 128 ], "shape": [ - 288 + 64 ], "filename_index": 2 }, { "offsets": [ - 864 + 192 ], "shape": [ - 288 + 64 ], "filename_index": 3 } ] }, - "model.layers.13.pre_feedforward_layernorm.weight": { + "model.layers.13.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ + 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 256, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 256, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 256, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.17.self_attn.q_proj.weight": { + "model.layers.8.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ - 1024, - 1152 + 1152, + 1024 ], "dtype": "BF16", "chunks": [ @@ -13334,92 +13307,101 @@ 0 ], "shape": [ - 1024, - 288 + 288, + 1024 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 288, + 0 ], "shape": [ - 1024, - 288 + 288, + 1024 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 576, + 0 ], "shape": [ - 1024, - 288 + 288, + 1024 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 864, + 0 ], "shape": [ - 1024, - 288 + 288, + 1024 ], "filename_index": 3 } ] }, - "model.layers.1.post_feedforward_layernorm.weight": { + "model.layers.13.mlp.down_proj.weight": { "type": "Distributed", "shape": [ - 1152 + 1152, + 6912 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 0 }, { "offsets": [ - 288 + 0, + 1728 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 1 }, { "offsets": [ - 576 + 0, + 3456 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 2 }, { "offsets": [ - 864 + 0, + 5184 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 3 } ] }, - "model.layers.21.self_attn.v_proj.weight": { + "model.layers.20.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ 256, @@ -13473,164 +13455,155 @@ } ] }, - "model.layers.14.self_attn.o_proj.weight": { + "model.layers.14.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ - 1152, - 1024 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 288, - 1024 + 64 ], "filename_index": 0 }, { "offsets": [ - 288, - 0 + 64 ], "shape": [ - 288, - 1024 + 64 ], "filename_index": 1 }, { "offsets": [ - 576, - 0 + 128 ], "shape": [ - 288, - 1024 + 64 ], "filename_index": 2 }, { "offsets": [ - 864, - 0 + 192 ], "shape": [ - 288, - 1024 + 64 ], "filename_index": 3 } ] }, - "model.layers.13.self_attn.q_norm.weight": { + "model.layers.10.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ - 256 + 256, + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 64 + 256, + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 0, + 288 ], "shape": [ - 64 + 256, + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 0, + 576 ], "shape": [ - 64 + 256, + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 0, + 864 ], "shape": [ - 64 + 256, + 288 ], "filename_index": 3 } ] }, - "model.layers.14.mlp.down_proj.weight": { + "model.layers.23.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ - 1152, - 6912 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1152, - 1728 + 64 ], "filename_index": 0 }, { "offsets": [ - 0, - 1728 + 64 ], "shape": [ - 1152, - 1728 + 64 ], "filename_index": 1 }, { "offsets": [ - 0, - 3456 + 128 ], "shape": [ - 1152, - 1728 + 64 ], "filename_index": 2 }, { "offsets": [ - 0, - 5184 + 192 ], "shape": [ - 1152, - 1728 + 64 ], "filename_index": 3 } ] }, - "model.layers.1.mlp.down_proj.weight": { + "model.layers.24.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ 1152, - 6912 + 1024 ], "dtype": "BF16", "chunks": [ @@ -13640,50 +13613,50 @@ 0 ], "shape": [ - 1152, - 1728 + 288, + 1024 ], "filename_index": 0 }, { "offsets": [ - 0, - 1728 + 288, + 0 ], "shape": [ - 1152, - 1728 + 288, + 1024 ], "filename_index": 1 }, { "offsets": [ - 0, - 3456 + 576, + 0 ], "shape": [ - 1152, - 1728 + 288, + 1024 ], "filename_index": 2 }, { "offsets": [ - 0, - 5184 + 864, + 0 ], "shape": [ - 1152, - 1728 + 288, + 1024 ], "filename_index": 3 } ] }, - "model.layers.6.self_attn.v_proj.weight": { + "model.layers.5.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ - 256, + 6912, 1152 ], "dtype": "BF16", @@ -13694,204 +13667,195 @@ 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 1728, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 3456, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 5184, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.21.self_attn.k_norm.weight": { + "model.layers.12.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ - 256 + 1024, + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 64 + 1024, + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 0, + 288 ], "shape": [ - 64 + 1024, + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 0, + 576 ], "shape": [ - 64 + 1024, + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 0, + 864 ], "shape": [ - 64 + 1024, + 288 ], "filename_index": 3 } ] }, - "model.layers.12.self_attn.k_proj.weight": { + "model.layers.21.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 256, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 256, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 256, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 256, 288 ], "filename_index": 3 } ] }, - "model.layers.8.self_attn.o_proj.weight": { + "model.layers.7.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 1152, - 1024 + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 0 }, { "offsets": [ - 288, - 0 + 288 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 1 }, { "offsets": [ - 576, - 0 + 576 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 2 }, { "offsets": [ - 864, - 0 + 864 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 3 } ] }, - "model.layers.4.self_attn.o_proj.weight": { + "model.layers.4.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ - 1152, - 1024 + 256, + 1152 ], "dtype": "BF16", "chunks": [ @@ -13901,239 +13865,257 @@ 0 ], "shape": [ - 288, - 1024 + 256, + 288 ], "filename_index": 0 }, { "offsets": [ - 288, - 0 + 0, + 288 ], "shape": [ - 288, - 1024 + 256, + 288 ], "filename_index": 1 }, { "offsets": [ - 576, - 0 + 0, + 576 ], "shape": [ - 288, - 1024 + 256, + 288 ], "filename_index": 2 }, { "offsets": [ - 864, - 0 + 0, + 864 ], "shape": [ - 288, - 1024 + 256, + 288 ], "filename_index": 3 } ] }, - "model.layers.10.post_attention_layernorm.weight": { + "model.layers.7.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ + 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 256, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 256, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 256, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.5.post_attention_layernorm.weight": { + "model.layers.22.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ + 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 288 + 1728, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 576 + 3456, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 864 + 5184, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.6.post_attention_layernorm.weight": { + "model.layers.20.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ + 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 256, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 256, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 256, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.21.mlp.gate_proj.weight": { + "model.layers.17.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ - 6912, - 1152 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 64 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 128 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 192 ], "shape": [ - 1728, - 1152 + 64 ], "filename_index": 3 } ] }, - "model.layers.17.post_feedforward_layernorm.weight": { + "model.layers.1.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ - 1152 + 256 ], "dtype": "BF16", "chunks": [ @@ -14142,43 +14124,43 @@ 0 ], "shape": [ - 288 + 64 ], "filename_index": 0 }, { "offsets": [ - 288 + 64 ], "shape": [ - 288 + 64 ], "filename_index": 1 }, { "offsets": [ - 576 + 128 ], "shape": [ - 288 + 64 ], "filename_index": 2 }, { "offsets": [ - 864 + 192 ], "shape": [ - 288 + 64 ], "filename_index": 3 } ] }, - "model.layers.0.mlp.gate_proj.weight": { + "model.layers.13.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ - 6912, + 1024, 1152 ], "dtype": "BF16", @@ -14189,47 +14171,47 @@ 0 ], "shape": [ - 1728, - 1152 + 1024, + 288 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 0, + 288 ], "shape": [ - 1728, - 1152 + 1024, + 288 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 0, + 576 ], "shape": [ - 1728, - 1152 + 1024, + 288 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 0, + 864 ], "shape": [ - 1728, - 1152 + 1024, + 288 ], "filename_index": 3 } ] }, - "model.layers.12.input_layernorm.weight": { + "model.layers.17.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -14274,10 +14256,10 @@ } ] }, - "model.layers.10.self_attn.v_proj.weight": { + "model.layers.19.mlp.up_proj.weight": { "type": "Distributed", "shape": [ - 256, + 6912, 1152 ], "dtype": "BF16", @@ -14288,47 +14270,47 @@ 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 1728, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 3456, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 5184, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.20.pre_feedforward_layernorm.weight": { + "model.layers.18.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -14373,7 +14355,7 @@ } ] }, - "model.layers.20.post_attention_layernorm.weight": { + "model.layers.24.input_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -14418,7 +14400,7 @@ } ] }, - "model.layers.12.post_feedforward_layernorm.weight": { + "model.layers.0.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ 1152 @@ -14463,7 +14445,7 @@ } ] }, - "model.layers.7.mlp.up_proj.weight": { + "model.layers.25.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ 6912, @@ -14517,10 +14499,10 @@ } ] }, - "model.layers.13.self_attn.v_proj.weight": { + "model.layers.0.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ - 256, + 6912, 1152 ], "dtype": "BF16", @@ -14531,95 +14513,104 @@ 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 1728, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 3456, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 5184, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.16.post_attention_layernorm.weight": { + "model.layers.6.mlp.down_proj.weight": { "type": "Distributed", "shape": [ - 1152 + 1152, + 6912 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 0 }, { "offsets": [ - 288 + 0, + 1728 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 1 }, { "offsets": [ - 576 + 0, + 3456 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 2 }, { "offsets": [ - 864 + 0, + 5184 ], "shape": [ - 288 + 1152, + 1728 ], "filename_index": 3 } ] }, - "model.layers.22.self_attn.q_proj.weight": { + "model.layers.5.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ - 1024, + 256, 1152 ], "dtype": "BF16", @@ -14630,7 +14621,7 @@ 0 ], "shape": [ - 1024, + 256, 288 ], "filename_index": 0 @@ -14641,7 +14632,7 @@ 288 ], "shape": [ - 1024, + 256, 288 ], "filename_index": 1 @@ -14652,7 +14643,7 @@ 576 ], "shape": [ - 1024, + 256, 288 ], "filename_index": 2 @@ -14663,125 +14654,107 @@ 864 ], "shape": [ - 1024, + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.20.self_attn.q_proj.weight": { + "model.layers.8.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 1024, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1024, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 1024, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 1024, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 1024, 288 ], "filename_index": 3 } ] }, - "model.layers.17.self_attn.o_proj.weight": { + "model.layers.19.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 1152, - 1024 + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 0 }, { "offsets": [ - 288, - 0 + 288 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 1 }, { "offsets": [ - 576, - 0 + 576 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 2 }, { "offsets": [ - 864, - 0 + 864 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 3 } ] }, - "model.layers.8.mlp.up_proj.weight": { + "model.layers.12.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ - 6912, + 256, 1152 ], "dtype": "BF16", @@ -14792,105 +14765,96 @@ 0 ], "shape": [ - 1728, - 1152 + 256, + 288 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 0, + 288 ], "shape": [ - 1728, - 1152 + 256, + 288 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 0, + 576 ], "shape": [ - 1728, - 1152 + 256, + 288 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 0, + 864 ], "shape": [ - 1728, - 1152 + 256, + 288 ], "filename_index": 3 } ] }, - "model.layers.19.self_attn.q_proj.weight": { + "model.layers.25.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 1024, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1024, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 1024, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 1024, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 1024, 288 ], "filename_index": 3 } ] }, - "model.layers.22.mlp.down_proj.weight": { + "model.layers.17.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ - 1152, - 6912 + 256, + 1152 ], "dtype": "BF16", "chunks": [ @@ -14900,149 +14864,149 @@ 0 ], "shape": [ - 1152, - 1728 + 256, + 288 ], "filename_index": 0 }, { "offsets": [ 0, - 1728 + 288 ], "shape": [ - 1152, - 1728 + 256, + 288 ], "filename_index": 1 }, { "offsets": [ 0, - 3456 + 576 ], "shape": [ - 1152, - 1728 + 256, + 288 ], "filename_index": 2 }, { "offsets": [ 0, - 5184 + 864 ], "shape": [ - 1152, - 1728 + 256, + 288 ], "filename_index": 3 } ] }, - "model.layers.25.input_layernorm.weight": { + "model.layers.2.mlp.up_proj.weight": { "type": "Distributed", "shape": [ + 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 288 + 1728, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 576 + 3456, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 864 + 5184, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.3.self_attn.v_proj.weight": { + "model.layers.25.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 256, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 256, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 256, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 256, 288 ], "filename_index": 3 } ] }, - "model.layers.13.self_attn.k_proj.weight": { + "model.layers.21.mlp.up_proj.weight": { "type": "Distributed", "shape": [ - 256, + 6912, 1152 ], "dtype": "BF16", @@ -15053,158 +15017,140 @@ 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 1728, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 3456, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 5184, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.10.self_attn.o_proj.weight": { + "model.layers.14.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 1152, - 1024 + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 0 }, { "offsets": [ - 288, - 0 + 288 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 1 }, { "offsets": [ - 576, - 0 + 576 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 2 }, { "offsets": [ - 864, - 0 + 864 ], "shape": [ - 288, - 1024 + 288 ], "filename_index": 3 } ] }, - "model.layers.0.self_attn.q_proj.weight": { + "model.layers.9.self_attn.q_norm.weight": { "type": "Distributed", "shape": [ - 1024, - 1152 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1024, - 288 + 64 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 64 ], "shape": [ - 1024, - 288 + 64 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 128 ], "shape": [ - 1024, - 288 + 64 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 192 ], "shape": [ - 1024, - 288 + 64 ], "filename_index": 3 } ] }, - "model.layers.9.pre_feedforward_layernorm.weight": { + "model.layers.18.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ - 1152 + 256 ], "dtype": "BF16", "chunks": [ @@ -15213,44 +15159,44 @@ 0 ], "shape": [ - 288 + 64 ], "filename_index": 0 }, { "offsets": [ - 288 + 64 ], "shape": [ - 288 + 64 ], "filename_index": 1 }, { "offsets": [ - 576 + 128 ], "shape": [ - 288 + 64 ], "filename_index": 2 }, { "offsets": [ - 864 + 192 ], "shape": [ - 288 + 64 ], "filename_index": 3 } ] }, - "model.layers.12.mlp.gate_proj.weight": { + "model.layers.12.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ - 6912, - 1152 + 1152, + 1024 ], "dtype": "BF16", "chunks": [ @@ -15260,95 +15206,104 @@ 0 ], "shape": [ - 1728, - 1152 + 288, + 1024 ], "filename_index": 0 }, { "offsets": [ - 1728, + 288, 0 ], "shape": [ - 1728, - 1152 + 288, + 1024 ], "filename_index": 1 }, { "offsets": [ - 3456, + 576, 0 ], "shape": [ - 1728, - 1152 + 288, + 1024 ], "filename_index": 2 }, { "offsets": [ - 5184, + 864, 0 ], "shape": [ - 1728, - 1152 + 288, + 1024 ], "filename_index": 3 } ] }, - "model.layers.21.post_feedforward_layernorm.weight": { + "model.layers.23.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ + 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 256, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 256, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 256, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.12.mlp.up_proj.weight": { + "model.layers.18.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ - 6912, + 1024, 1152 ], "dtype": "BF16", @@ -15359,95 +15314,104 @@ 0 ], "shape": [ - 1728, - 1152 + 1024, + 288 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 0, + 288 ], "shape": [ - 1728, - 1152 + 1024, + 288 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 0, + 576 ], "shape": [ - 1728, - 1152 + 1024, + 288 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 0, + 864 ], "shape": [ - 1728, - 1152 + 1024, + 288 ], "filename_index": 3 } ] }, - "model.layers.17.self_attn.q_norm.weight": { + "model.layers.17.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ - 256 + 6912, + 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 64 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 64 + 1728, + 0 ], "shape": [ - 64 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 128 + 3456, + 0 ], "shape": [ - 64 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 192 + 5184, + 0 ], "shape": [ - 64 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.22.self_attn.k_proj.weight": { + "model.layers.9.mlp.up_proj.weight": { "type": "Distributed", "shape": [ - 256, + 6912, 1152 ], "dtype": "BF16", @@ -15458,51 +15422,51 @@ 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 1728, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 3456, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 5184, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.24.self_attn.q_proj.weight": { + "model.layers.14.mlp.down_proj.weight": { "type": "Distributed", "shape": [ - 1024, - 1152 + 1152, + 6912 ], "dtype": "BF16", "chunks": [ @@ -15512,50 +15476,50 @@ 0 ], "shape": [ - 1024, - 288 + 1152, + 1728 ], "filename_index": 0 }, { "offsets": [ 0, - 288 + 1728 ], "shape": [ - 1024, - 288 + 1152, + 1728 ], "filename_index": 1 }, { "offsets": [ 0, - 576 + 3456 ], "shape": [ - 1024, - 288 + 1152, + 1728 ], "filename_index": 2 }, { "offsets": [ 0, - 864 + 5184 ], "shape": [ - 1024, - 288 + 1152, + 1728 ], "filename_index": 3 } ] }, - "model.layers.7.pre_feedforward_layernorm.weight": { + "model.layers.22.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ - 1152 + 256 ], "dtype": "BF16", "chunks": [ @@ -15564,40 +15528,40 @@ 0 ], "shape": [ - 288 + 64 ], "filename_index": 0 }, { "offsets": [ - 288 + 64 ], "shape": [ - 288 + 64 ], "filename_index": 1 }, { "offsets": [ - 576 + 128 ], "shape": [ - 288 + 64 ], "filename_index": 2 }, { "offsets": [ - 864 + 192 ], "shape": [ - 288 + 64 ], "filename_index": 3 } ] }, - "model.layers.15.mlp.down_proj.weight": { + "model.layers.0.mlp.down_proj.weight": { "type": "Distributed", "shape": [ 1152, @@ -15651,56 +15615,65 @@ } ] }, - "model.layers.1.pre_feedforward_layernorm.weight": { + "model.layers.25.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ - 1152 + 1152, + 1024 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 288, + 1024 ], "filename_index": 0 }, { "offsets": [ - 288 + 288, + 0 ], "shape": [ - 288 + 288, + 1024 ], "filename_index": 1 }, { "offsets": [ - 576 + 576, + 0 ], "shape": [ - 288 + 288, + 1024 ], "filename_index": 2 }, { "offsets": [ - 864 + 864, + 0 ], "shape": [ - 288 + 288, + 1024 ], "filename_index": 3 } ] }, - "model.layers.6.mlp.down_proj.weight": { + "model.layers.1.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ - 1152, - 6912 + 256, + 1152 ], "dtype": "BF16", "chunks": [ @@ -15710,85 +15683,94 @@ 0 ], "shape": [ - 1152, - 1728 + 256, + 288 ], "filename_index": 0 }, { "offsets": [ 0, - 1728 + 288 ], "shape": [ - 1152, - 1728 + 256, + 288 ], "filename_index": 1 }, { "offsets": [ 0, - 3456 + 576 ], "shape": [ - 1152, - 1728 + 256, + 288 ], "filename_index": 2 }, { "offsets": [ 0, - 5184 + 864 ], "shape": [ - 1152, - 1728 + 256, + 288 ], "filename_index": 3 } ] }, - "model.layers.17.pre_feedforward_layernorm.weight": { + "model.layers.23.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ + 1024, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 1024, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 1024, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 1024, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 1024, 288 ], "filename_index": 3 @@ -15840,10 +15822,10 @@ } ] }, - "model.layers.22.self_attn.v_proj.weight": { + "model.layers.7.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ - 256, + 1024, 1152 ], "dtype": "BF16", @@ -15854,7 +15836,7 @@ 0 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 0 @@ -15865,7 +15847,7 @@ 288 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 1 @@ -15876,7 +15858,7 @@ 576 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 2 @@ -15887,18 +15869,18 @@ 864 ], "shape": [ - 256, + 1024, 288 ], "filename_index": 3 } ] }, - "model.layers.17.self_attn.k_proj.weight": { + "model.layers.9.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ - 256, - 1152 + 1152, + 1024 ], "dtype": "BF16", "chunks": [ @@ -15908,47 +15890,47 @@ 0 ], "shape": [ - 256, - 288 + 288, + 1024 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 288, + 0 ], "shape": [ - 256, - 288 + 288, + 1024 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 576, + 0 ], "shape": [ - 256, - 288 + 288, + 1024 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 864, + 0 ], "shape": [ - 256, - 288 + 288, + 1024 ], "filename_index": 3 } ] }, - "model.layers.9.self_attn.v_proj.weight": { + "model.layers.8.self_attn.v_proj.weight": { "type": "Distributed", "shape": [ 256, @@ -16002,11 +15984,11 @@ } ] }, - "model.layers.7.self_attn.v_proj.weight": { + "model.layers.16.mlp.down_proj.weight": { "type": "Distributed", "shape": [ - 256, - 1152 + 1152, + 6912 ], "dtype": "BF16", "chunks": [ @@ -16016,50 +15998,50 @@ 0 ], "shape": [ - 256, - 288 + 1152, + 1728 ], "filename_index": 0 }, { "offsets": [ 0, - 288 + 1728 ], "shape": [ - 256, - 288 + 1152, + 1728 ], "filename_index": 1 }, { "offsets": [ 0, - 576 + 3456 ], "shape": [ - 256, - 288 + 1152, + 1728 ], "filename_index": 2 }, { "offsets": [ 0, - 864 + 5184 ], "shape": [ - 256, - 288 + 1152, + 1728 ], "filename_index": 3 } ] }, - "model.layers.0.self_attn.k_proj.weight": { + "model.layers.19.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ - 256, + 6912, 1152 ], "dtype": "BF16", @@ -16070,294 +16052,303 @@ 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 1728, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 3456, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 5184, + 0 ], "shape": [ - 256, - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.9.mlp.gate_proj.weight": { + "model.layers.9.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 0 }, { "offsets": [ - 1728, - 0 + 288 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 1 }, { "offsets": [ - 3456, - 0 + 576 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 2 }, { "offsets": [ - 5184, - 0 + 864 ], "shape": [ - 1728, - 1152 + 288 ], "filename_index": 3 } ] }, - "model.layers.5.input_layernorm.weight": { + "model.layers.24.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ + 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 256, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 256, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 256, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.11.post_feedforward_layernorm.weight": { + "model.layers.12.mlp.gate_proj.weight": { "type": "Distributed", "shape": [ + 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 288 + 1728, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 576 + 3456, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 864 + 5184, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.20.input_layernorm.weight": { + "model.layers.0.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ - 1152 + 1152, + 1024 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 288, + 1024 ], "filename_index": 0 }, { "offsets": [ - 288 + 288, + 0 ], "shape": [ - 288 + 288, + 1024 ], "filename_index": 1 }, { "offsets": [ - 576 + 576, + 0 ], "shape": [ - 288 + 288, + 1024 ], "filename_index": 2 }, { "offsets": [ - 864 + 864, + 0 ], "shape": [ - 288 + 288, + 1024 ], "filename_index": 3 } ] }, - "model.layers.18.self_attn.v_proj.weight": { + "model.layers.10.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 256, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 256, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 256, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 256, 288 ], "filename_index": 3 } ] }, - "model.layers.13.mlp.down_proj.weight": { + "model.layers.15.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ 1152, - 6912 + 1024 ], "dtype": "BF16", "chunks": [ @@ -16367,51 +16358,51 @@ 0 ], "shape": [ - 1152, - 1728 + 288, + 1024 ], "filename_index": 0 }, { "offsets": [ - 0, - 1728 + 288, + 0 ], "shape": [ - 1152, - 1728 + 288, + 1024 ], "filename_index": 1 }, { "offsets": [ - 0, - 3456 + 576, + 0 ], "shape": [ - 1152, - 1728 + 288, + 1024 ], "filename_index": 2 }, { "offsets": [ - 0, - 5184 + 864, + 0 ], "shape": [ - 1152, - 1728 + 288, + 1024 ], "filename_index": 3 } ] }, - "model.layers.23.mlp.down_proj.weight": { + "model.layers.21.self_attn.q_proj.weight": { "type": "Distributed", "shape": [ - 1152, - 6912 + 1024, + 1152 ], "dtype": "BF16", "chunks": [ @@ -16421,95 +16412,104 @@ 0 ], "shape": [ - 1152, - 1728 + 1024, + 288 ], "filename_index": 0 }, { "offsets": [ 0, - 1728 + 288 ], "shape": [ - 1152, - 1728 + 1024, + 288 ], "filename_index": 1 }, { "offsets": [ 0, - 3456 + 576 ], "shape": [ - 1152, - 1728 + 1024, + 288 ], "filename_index": 2 }, { "offsets": [ 0, - 5184 + 864 ], "shape": [ - 1152, - 1728 + 1024, + 288 ], "filename_index": 3 } ] }, - "model.layers.18.self_attn.k_norm.weight": { + "model.layers.21.mlp.down_proj.weight": { "type": "Distributed", "shape": [ - 256 + 1152, + 6912 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 64 + 1152, + 1728 ], "filename_index": 0 }, { "offsets": [ - 64 + 0, + 1728 ], "shape": [ - 64 + 1152, + 1728 ], "filename_index": 1 }, { "offsets": [ - 128 + 0, + 3456 ], "shape": [ - 64 + 1152, + 1728 ], "filename_index": 2 }, { "offsets": [ - 192 + 0, + 5184 ], "shape": [ - 64 + 1152, + 1728 ], "filename_index": 3 } ] }, - "model.layers.9.self_attn.k_norm.weight": { + "model.layers.18.pre_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 256 + 1152 ], "dtype": "BF16", "chunks": [ @@ -16518,196 +16518,187 @@ 0 ], "shape": [ - 64 + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 288 ], "shape": [ - 64 + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 576 ], "shape": [ - 64 + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 864 ], "shape": [ - 64 + 288 ], "filename_index": 3 } ] }, - "model.layers.8.input_layernorm.weight": { + "model.layers.25.self_attn.k_proj.weight": { "type": "Distributed", "shape": [ + 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ + 256, 288 ], "filename_index": 0 }, { "offsets": [ + 0, 288 ], "shape": [ + 256, 288 ], "filename_index": 1 }, { "offsets": [ + 0, 576 ], "shape": [ + 256, 288 ], "filename_index": 2 }, { "offsets": [ + 0, 864 ], "shape": [ + 256, 288 ], "filename_index": 3 } ] }, - "model.layers.24.mlp.down_proj.weight": { + "model.layers.5.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ - 1152, - 6912 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 1152, - 1728 + 64 ], "filename_index": 0 }, { "offsets": [ - 0, - 1728 + 64 ], "shape": [ - 1152, - 1728 + 64 ], "filename_index": 1 }, { "offsets": [ - 0, - 3456 + 128 ], "shape": [ - 1152, - 1728 + 64 ], "filename_index": 2 }, { "offsets": [ - 0, - 5184 + 192 ], "shape": [ - 1152, - 1728 + 64 ], "filename_index": 3 } ] }, - "model.layers.24.self_attn.k_proj.weight": { + "model.layers.10.self_attn.k_norm.weight": { "type": "Distributed", "shape": [ - 256, - 1152 + 256 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 256, - 288 + 64 ], "filename_index": 0 }, { "offsets": [ - 0, - 288 + 64 ], "shape": [ - 256, - 288 + 64 ], "filename_index": 1 }, { "offsets": [ - 0, - 576 + 128 ], "shape": [ - 256, - 288 + 64 ], "filename_index": 2 }, { "offsets": [ - 0, - 864 + 192 ], "shape": [ - 256, - 288 + 64 ], "filename_index": 3 } ] }, - "model.layers.25.self_attn.q_norm.weight": { + "model.layers.8.post_attention_layernorm.weight": { "type": "Distributed", "shape": [ - 256 + 1152 ], "dtype": "BF16", "chunks": [ @@ -16716,184 +16707,193 @@ 0 ], "shape": [ - 64 + 288 ], "filename_index": 0 }, { "offsets": [ - 64 + 288 ], "shape": [ - 64 + 288 ], "filename_index": 1 }, { "offsets": [ - 128 + 576 ], "shape": [ - 64 + 288 ], "filename_index": 2 }, { "offsets": [ - 192 + 864 ], "shape": [ - 64 + 288 ], "filename_index": 3 } ] }, - "model.layers.11.input_layernorm.weight": { + "model.layers.24.mlp.up_proj.weight": { "type": "Distributed", "shape": [ + 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 288 + 1728, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 576 + 3456, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 864 + 5184, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.7.self_attn.k_proj.weight": { + "model.layers.23.post_feedforward_layernorm.weight": { "type": "Distributed", "shape": [ - 256, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ - 0, 0 ], "shape": [ - 256, 288 ], "filename_index": 0 }, { "offsets": [ - 0, 288 ], "shape": [ - 256, 288 ], "filename_index": 1 }, { "offsets": [ - 0, 576 ], "shape": [ - 256, 288 ], "filename_index": 2 }, { "offsets": [ - 0, 864 ], "shape": [ - 256, 288 ], "filename_index": 3 } ] }, - "model.layers.16.input_layernorm.weight": { + "model.layers.16.mlp.up_proj.weight": { "type": "Distributed", "shape": [ + 6912, 1152 ], "dtype": "BF16", "chunks": [ { "offsets": [ + 0, 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 0 }, { "offsets": [ - 288 + 1728, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 1 }, { "offsets": [ - 576 + 3456, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 2 }, { "offsets": [ - 864 + 5184, + 0 ], "shape": [ - 288 + 1728, + 1152 ], "filename_index": 3 } ] }, - "model.layers.3.self_attn.o_proj.weight": { + "model.layers.17.self_attn.o_proj.weight": { "type": "Distributed", "shape": [ 1152,