fdschmidt93 committed
Commit b0221f6 · 1 Parent(s): b687d12

initial commit

__init__.py ADDED
File without changes
adapter_config.json ADDED
@@ -0,0 +1,254 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": {
+ "base_model_class": "NLLBLLM2Vec",
+ "parent_library": "modeling_nllbllm2vec"
+ },
+ "base_model_name_or_path": "fdschmidt93/NLLBLLM2vec",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 32,
+ "lora_dropout": 0.0,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 16,
+ "rank_pattern": {},
+ "revision": null,
+ "task_type": "FEATURE_EXTRACTION",
+ "target_modules": [
+ "llm2vec.layers.0.self_attn.q_proj",
+ "llm2vec.layers.0.self_attn.k_proj",
+ "llm2vec.layers.0.self_attn.v_proj",
+ "llm2vec.layers.0.self_attn.o_proj",
+ "llm2vec.layers.0.mlp.gate_proj",
+ "llm2vec.layers.0.mlp.up_proj",
+ "llm2vec.layers.0.mlp.down_proj",
+ "llm2vec.layers.1.self_attn.q_proj",
+ "llm2vec.layers.1.self_attn.k_proj",
+ "llm2vec.layers.1.self_attn.v_proj",
+ "llm2vec.layers.1.self_attn.o_proj",
+ "llm2vec.layers.1.mlp.gate_proj",
+ "llm2vec.layers.1.mlp.up_proj",
+ "llm2vec.layers.1.mlp.down_proj",
+ "llm2vec.layers.2.self_attn.q_proj",
+ "llm2vec.layers.2.self_attn.k_proj",
+ "llm2vec.layers.2.self_attn.v_proj",
+ "llm2vec.layers.2.self_attn.o_proj",
+ "llm2vec.layers.2.mlp.gate_proj",
+ "llm2vec.layers.2.mlp.up_proj",
+ "llm2vec.layers.2.mlp.down_proj",
+ "llm2vec.layers.3.self_attn.q_proj",
+ "llm2vec.layers.3.self_attn.k_proj",
+ "llm2vec.layers.3.self_attn.v_proj",
+ "llm2vec.layers.3.self_attn.o_proj",
+ "llm2vec.layers.3.mlp.gate_proj",
+ "llm2vec.layers.3.mlp.up_proj",
+ "llm2vec.layers.3.mlp.down_proj",
+ "llm2vec.layers.4.self_attn.q_proj",
+ "llm2vec.layers.4.self_attn.k_proj",
+ "llm2vec.layers.4.self_attn.v_proj",
+ "llm2vec.layers.4.self_attn.o_proj",
+ "llm2vec.layers.4.mlp.gate_proj",
+ "llm2vec.layers.4.mlp.up_proj",
+ "llm2vec.layers.4.mlp.down_proj",
+ "llm2vec.layers.5.self_attn.q_proj",
+ "llm2vec.layers.5.self_attn.k_proj",
+ "llm2vec.layers.5.self_attn.v_proj",
+ "llm2vec.layers.5.self_attn.o_proj",
+ "llm2vec.layers.5.mlp.gate_proj",
+ "llm2vec.layers.5.mlp.up_proj",
+ "llm2vec.layers.5.mlp.down_proj",
+ "llm2vec.layers.6.self_attn.q_proj",
+ "llm2vec.layers.6.self_attn.k_proj",
+ "llm2vec.layers.6.self_attn.v_proj",
+ "llm2vec.layers.6.self_attn.o_proj",
+ "llm2vec.layers.6.mlp.gate_proj",
+ "llm2vec.layers.6.mlp.up_proj",
+ "llm2vec.layers.6.mlp.down_proj",
+ "llm2vec.layers.7.self_attn.q_proj",
+ "llm2vec.layers.7.self_attn.k_proj",
+ "llm2vec.layers.7.self_attn.v_proj",
+ "llm2vec.layers.7.self_attn.o_proj",
+ "llm2vec.layers.7.mlp.gate_proj",
+ "llm2vec.layers.7.mlp.up_proj",
+ "llm2vec.layers.7.mlp.down_proj",
+ "llm2vec.layers.8.self_attn.q_proj",
+ "llm2vec.layers.8.self_attn.k_proj",
+ "llm2vec.layers.8.self_attn.v_proj",
+ "llm2vec.layers.8.self_attn.o_proj",
+ "llm2vec.layers.8.mlp.gate_proj",
+ "llm2vec.layers.8.mlp.up_proj",
+ "llm2vec.layers.8.mlp.down_proj",
+ "llm2vec.layers.9.self_attn.q_proj",
+ "llm2vec.layers.9.self_attn.k_proj",
+ "llm2vec.layers.9.self_attn.v_proj",
+ "llm2vec.layers.9.self_attn.o_proj",
+ "llm2vec.layers.9.mlp.gate_proj",
+ "llm2vec.layers.9.mlp.up_proj",
+ "llm2vec.layers.9.mlp.down_proj",
+ "llm2vec.layers.10.self_attn.q_proj",
+ "llm2vec.layers.10.self_attn.k_proj",
+ "llm2vec.layers.10.self_attn.v_proj",
+ "llm2vec.layers.10.self_attn.o_proj",
+ "llm2vec.layers.10.mlp.gate_proj",
+ "llm2vec.layers.10.mlp.up_proj",
+ "llm2vec.layers.10.mlp.down_proj",
+ "llm2vec.layers.11.self_attn.q_proj",
+ "llm2vec.layers.11.self_attn.k_proj",
+ "llm2vec.layers.11.self_attn.v_proj",
+ "llm2vec.layers.11.self_attn.o_proj",
+ "llm2vec.layers.11.mlp.gate_proj",
+ "llm2vec.layers.11.mlp.up_proj",
+ "llm2vec.layers.11.mlp.down_proj",
+ "llm2vec.layers.12.self_attn.q_proj",
+ "llm2vec.layers.12.self_attn.k_proj",
+ "llm2vec.layers.12.self_attn.v_proj",
+ "llm2vec.layers.12.self_attn.o_proj",
+ "llm2vec.layers.12.mlp.gate_proj",
+ "llm2vec.layers.12.mlp.up_proj",
+ "llm2vec.layers.12.mlp.down_proj",
+ "llm2vec.layers.13.self_attn.q_proj",
+ "llm2vec.layers.13.self_attn.k_proj",
+ "llm2vec.layers.13.self_attn.v_proj",
+ "llm2vec.layers.13.self_attn.o_proj",
+ "llm2vec.layers.13.mlp.gate_proj",
+ "llm2vec.layers.13.mlp.up_proj",
+ "llm2vec.layers.13.mlp.down_proj",
+ "llm2vec.layers.14.self_attn.q_proj",
+ "llm2vec.layers.14.self_attn.k_proj",
+ "llm2vec.layers.14.self_attn.v_proj",
+ "llm2vec.layers.14.self_attn.o_proj",
+ "llm2vec.layers.14.mlp.gate_proj",
+ "llm2vec.layers.14.mlp.up_proj",
+ "llm2vec.layers.14.mlp.down_proj",
+ "llm2vec.layers.15.self_attn.q_proj",
+ "llm2vec.layers.15.self_attn.k_proj",
+ "llm2vec.layers.15.self_attn.v_proj",
+ "llm2vec.layers.15.self_attn.o_proj",
+ "llm2vec.layers.15.mlp.gate_proj",
+ "llm2vec.layers.15.mlp.up_proj",
+ "llm2vec.layers.15.mlp.down_proj",
+ "llm2vec.layers.16.self_attn.q_proj",
+ "llm2vec.layers.16.self_attn.k_proj",
+ "llm2vec.layers.16.self_attn.v_proj",
+ "llm2vec.layers.16.self_attn.o_proj",
+ "llm2vec.layers.16.mlp.gate_proj",
+ "llm2vec.layers.16.mlp.up_proj",
+ "llm2vec.layers.16.mlp.down_proj",
+ "llm2vec.layers.17.self_attn.q_proj",
+ "llm2vec.layers.17.self_attn.k_proj",
+ "llm2vec.layers.17.self_attn.v_proj",
+ "llm2vec.layers.17.self_attn.o_proj",
+ "llm2vec.layers.17.mlp.gate_proj",
+ "llm2vec.layers.17.mlp.up_proj",
+ "llm2vec.layers.17.mlp.down_proj",
+ "llm2vec.layers.18.self_attn.q_proj",
+ "llm2vec.layers.18.self_attn.k_proj",
+ "llm2vec.layers.18.self_attn.v_proj",
+ "llm2vec.layers.18.self_attn.o_proj",
+ "llm2vec.layers.18.mlp.gate_proj",
+ "llm2vec.layers.18.mlp.up_proj",
+ "llm2vec.layers.18.mlp.down_proj",
+ "llm2vec.layers.19.self_attn.q_proj",
+ "llm2vec.layers.19.self_attn.k_proj",
+ "llm2vec.layers.19.self_attn.v_proj",
+ "llm2vec.layers.19.self_attn.o_proj",
+ "llm2vec.layers.19.mlp.gate_proj",
+ "llm2vec.layers.19.mlp.up_proj",
+ "llm2vec.layers.19.mlp.down_proj",
+ "llm2vec.layers.20.self_attn.q_proj",
+ "llm2vec.layers.20.self_attn.k_proj",
+ "llm2vec.layers.20.self_attn.v_proj",
+ "llm2vec.layers.20.self_attn.o_proj",
+ "llm2vec.layers.20.mlp.gate_proj",
+ "llm2vec.layers.20.mlp.up_proj",
+ "llm2vec.layers.20.mlp.down_proj",
+ "llm2vec.layers.21.self_attn.q_proj",
+ "llm2vec.layers.21.self_attn.k_proj",
+ "llm2vec.layers.21.self_attn.v_proj",
+ "llm2vec.layers.21.self_attn.o_proj",
+ "llm2vec.layers.21.mlp.gate_proj",
+ "llm2vec.layers.21.mlp.up_proj",
+ "llm2vec.layers.21.mlp.down_proj",
+ "llm2vec.layers.22.self_attn.q_proj",
+ "llm2vec.layers.22.self_attn.k_proj",
+ "llm2vec.layers.22.self_attn.v_proj",
+ "llm2vec.layers.22.self_attn.o_proj",
+ "llm2vec.layers.22.mlp.gate_proj",
+ "llm2vec.layers.22.mlp.up_proj",
+ "llm2vec.layers.22.mlp.down_proj",
+ "llm2vec.layers.23.self_attn.q_proj",
+ "llm2vec.layers.23.self_attn.k_proj",
+ "llm2vec.layers.23.self_attn.v_proj",
+ "llm2vec.layers.23.self_attn.o_proj",
+ "llm2vec.layers.23.mlp.gate_proj",
+ "llm2vec.layers.23.mlp.up_proj",
+ "llm2vec.layers.23.mlp.down_proj",
+ "llm2vec.layers.24.self_attn.q_proj",
+ "llm2vec.layers.24.self_attn.k_proj",
+ "llm2vec.layers.24.self_attn.v_proj",
+ "llm2vec.layers.24.self_attn.o_proj",
+ "llm2vec.layers.24.mlp.gate_proj",
+ "llm2vec.layers.24.mlp.up_proj",
+ "llm2vec.layers.24.mlp.down_proj",
+ "llm2vec.layers.25.self_attn.q_proj",
+ "llm2vec.layers.25.self_attn.k_proj",
+ "llm2vec.layers.25.self_attn.v_proj",
+ "llm2vec.layers.25.self_attn.o_proj",
+ "llm2vec.layers.25.mlp.gate_proj",
+ "llm2vec.layers.25.mlp.up_proj",
+ "llm2vec.layers.25.mlp.down_proj",
+ "llm2vec.layers.26.self_attn.q_proj",
+ "llm2vec.layers.26.self_attn.k_proj",
+ "llm2vec.layers.26.self_attn.v_proj",
+ "llm2vec.layers.26.self_attn.o_proj",
+ "llm2vec.layers.26.mlp.gate_proj",
+ "llm2vec.layers.26.mlp.up_proj",
+ "llm2vec.layers.26.mlp.down_proj",
+ "llm2vec.layers.27.self_attn.q_proj",
+ "llm2vec.layers.27.self_attn.k_proj",
+ "llm2vec.layers.27.self_attn.v_proj",
+ "llm2vec.layers.27.self_attn.o_proj",
+ "llm2vec.layers.27.mlp.gate_proj",
+ "llm2vec.layers.27.mlp.up_proj",
+ "llm2vec.layers.27.mlp.down_proj",
+ "llm2vec.layers.28.self_attn.q_proj",
+ "llm2vec.layers.28.self_attn.k_proj",
+ "llm2vec.layers.28.self_attn.v_proj",
+ "llm2vec.layers.28.self_attn.o_proj",
+ "llm2vec.layers.28.mlp.gate_proj",
+ "llm2vec.layers.28.mlp.up_proj",
+ "llm2vec.layers.28.mlp.down_proj",
+ "llm2vec.layers.29.self_attn.q_proj",
+ "llm2vec.layers.29.self_attn.k_proj",
+ "llm2vec.layers.29.self_attn.v_proj",
+ "llm2vec.layers.29.self_attn.o_proj",
+ "llm2vec.layers.29.mlp.gate_proj",
+ "llm2vec.layers.29.mlp.up_proj",
+ "llm2vec.layers.29.mlp.down_proj",
+ "llm2vec.layers.30.self_attn.q_proj",
+ "llm2vec.layers.30.self_attn.k_proj",
+ "llm2vec.layers.30.self_attn.v_proj",
+ "llm2vec.layers.30.self_attn.o_proj",
+ "llm2vec.layers.30.mlp.gate_proj",
+ "llm2vec.layers.30.mlp.up_proj",
+ "llm2vec.layers.30.mlp.down_proj",
+ "llm2vec.layers.31.self_attn.q_proj",
+ "llm2vec.layers.31.self_attn.k_proj",
+ "llm2vec.layers.31.self_attn.v_proj",
+ "llm2vec.layers.31.self_attn.o_proj",
+ "llm2vec.layers.31.mlp.gate_proj",
+ "llm2vec.layers.31.mlp.up_proj",
+ "llm2vec.layers.31.mlp.down_proj"
+ ],
+ "use_dora": false,
+ "use_rslora": false
+ }
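
This is a standard PEFT LoRA adapter (r=16, alpha=32, feature-extraction task) that targets every attention and MLP projection of the 32 `llm2vec` Llama layers. A minimal, non-authoritative sketch of how such an adapter could be attached with the `peft` library is shown below; the base repo id is taken from `base_model_name_or_path`/`_name_or_path` in this commit, and `ADAPTER_REPO_OR_DIR` is a placeholder for wherever `adapter_config.json` and `adapter_model.safetensors` live.

```python
# Sketch only, not the repository's official loading code.
import torch
from transformers import AutoModel
from peft import PeftModel

# trust_remote_code is needed because the architecture is defined in
# modeling_nllbllm2vec.py via the auto_map in config.json.
base = AutoModel.from_pretrained(
    "fdschmidt93/NLLBLLM2Vec",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)

# "ADAPTER_REPO_OR_DIR" is a hypothetical path/repo containing this adapter.
model = PeftModel.from_pretrained(base, "ADAPTER_REPO_OR_DIR")
model.eval()
```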
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7fd0c6b2576840df0e26ccac0f6eb18e27945cefcd33668a3720cb20fec94b66
+ size 167833136
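
The safetensors blobs are stored via Git LFS, so the repository only tracks pointer files like the one above: the spec version, the SHA-256 of the actual payload, and its size in bytes. A small, hypothetical check that a locally resolved file matches this pointer:

```python
# Hypothetical verification of a downloaded adapter_model.safetensors against
# the oid/size recorded in the Git LFS pointer above.
import hashlib
from pathlib import Path

data = Path("adapter_model.safetensors").read_bytes()  # assumes the resolved LFS file is local
assert len(data) == 167833136, "size mismatch with LFS pointer"
assert hashlib.sha256(data).hexdigest() == (
    "7fd0c6b2576840df0e26ccac0f6eb18e27945cefcd33668a3720cb20fec94b66"
), "sha256 mismatch with LFS pointer"
print("pointer and payload agree")
```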
config.json ADDED
@@ -0,0 +1,41 @@
+ {
+ "_name_or_path": "fdschmidt93/NLLBLLM2Vec",
+ "architectures": [
+ "NLLBLLM2Vec"
+ ],
+ "auto_map": {
+ "AutoConfig": "configuration_nllbllm2vec.NLLBLLM2VecConfig",
+ "AutoModel": "modeling_nllbllm2vec.NLLBLLM2Vec"
+ },
+ "llm2vec_config": {
+ "_name_or_path": "McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp",
+ "bos_token_id": 128000,
+ "eos_token_id": 128001,
+ "intermediate_size": 14336,
+ "max_position_embeddings": 8192,
+ "model_type": "llama",
+ "num_key_value_heads": 8,
+ "rms_norm_eps": 1e-05,
+ "rope_theta": 500000,
+ "torch_dtype": "bfloat16",
+ "use_cache": false,
+ "vocab_size": 128256
+ },
+ "model_type": "nllb-llm2vec",
+ "nllb_config": {
+ "_name_or_path": "facebook/nllb-200-distilled-600M",
+ "architectures": [
+ "M2M100Encoder"
+ ],
+ "decoder_layerdrop": 0,
+ "encoder_layerdrop": 0,
+ "max_length": 200,
+ "model_type": "m2m_100",
+ "tokenizer_class": "NllbTokenizer",
+ "torch_dtype": "bfloat16",
+ "vocab_size": 256206
+ },
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.44.2"
+ }
+
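
`config.json` wires the composite model into the `auto_map` mechanism, so the custom classes in `configuration_nllbllm2vec.py` and `modeling_nllbllm2vec.py` are resolved when `trust_remote_code=True`. A hedged usage sketch, assuming the repo id from `_name_or_path`:

```python
# Sketch: load the composite NLLB encoder + LLM2Vec model through the auto classes.
import torch
from transformers import AutoConfig, AutoModel

config = AutoConfig.from_pretrained("fdschmidt93/NLLBLLM2Vec", trust_remote_code=True)
print(config.model_type)                 # "nllb-llm2vec"
print(config.nllb_config.model_type)     # "m2m_100" (NLLB-200 distilled-600M encoder)
print(config.llm2vec_config.model_type)  # "llama" (LLM2Vec Llama-3-8B backbone)

model = AutoModel.from_pretrained(
    "fdschmidt93/NLLBLLM2Vec",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype" in the config
)
```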
configuration_nllbllm2vec.py ADDED
@@ -0,0 +1,84 @@
+ from transformers import AutoConfig
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.models.llama.configuration_llama import LlamaConfig
+ from transformers.models.m2m_100.configuration_m2m_100 import M2M100Config
+
+ NLLBLLM2VEC_TYPE = "nllb-llm2vec"
+
+ DEFAULT_M2M100_CONFIG = {
+     "activation_dropout": 0.0,
+     "activation_function": "relu",
+     "architectures": ["M2M100Encoder"],
+     "attention_dropout": 0.1,
+     "bos_token_id": 0,
+     "d_model": 1024,
+     "decoder_attention_heads": 16,
+     "decoder_ffn_dim": 4096,
+     "decoder_layerdrop": 0,
+     "decoder_layers": 12,
+     "decoder_start_token_id": 2,
+     "dropout": 0.1,
+     "encoder_attention_heads": 16,
+     "encoder_ffn_dim": 4096,
+     "encoder_layerdrop": 0,
+     "encoder_layers": 12,
+     "eos_token_id": 2,
+     "init_std": 0.02,
+     "is_encoder_decoder": True,
+     "max_position_embeddings": 1024,
+     "model_type": "m2m_100",
+     "num_hidden_layers": 12,
+     "pad_token_id": 1,
+     "scale_embedding": True,
+     "torch_dtype": "float32",
+     "transformers_version": "4.21.0.dev0",
+     "use_cache": True,
+     "vocab_size": 256206,
+     "tokenizer_class": "NllbTokenizer",
+     "max_length": 200,
+ }
+
+ DEFAULT_LLAMA_CONFIG = {
+     "attention_bias": False,
+     "attention_dropout": 0,
+     "bos_token_id": 128000,
+     "eos_token_id": 128001,
+     "hidden_act": "silu",
+     "hidden_size": 4096,
+     "initializer_range": 0.02,
+     "intermediate_size": 14336,
+     "max_position_embeddings": 8192,
+     "model_type": "llama",
+     "num_attention_heads": 32,
+     "num_hidden_layers": 32,
+     "num_key_value_heads": 8,
+     "pretraining_tp": 1,
+     "rms_norm_eps": 0.00001,
+     "rope_scaling": None,
+     "rope_theta": 500000,
+     "tie_word_embeddings": False,
+     "torch_dtype": "bfloat16",
+     "transformers_version": "4.40.0.dev0",
+     "use_cache": False,
+     "vocab_size": 128256,
+ }
+
+
+ class NLLBLLM2VecConfig(PretrainedConfig):
+     model_type = "nllb-llm2vec"
+     is_composition = False
+
+     def __init__(
+         self,
+         nllb_config: dict = DEFAULT_M2M100_CONFIG,
+         llm2vec_config: dict = DEFAULT_LLAMA_CONFIG,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self.nllb_config = M2M100Config(**nllb_config)
+         self.llm2vec_config = LlamaConfig(**llm2vec_config)
+
+
+ AutoConfig.register(NLLBLLM2VEC_TYPE, NLLBLLM2VecConfig)
+
+ NLLBLLM2VecConfig.register_for_auto_class()
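
The config class nests an `M2M100Config` (NLLB-200 encoder) and a `LlamaConfig` (LLM2Vec backbone) and registers itself under the `nllb-llm2vec` model type. A minimal usage sketch, assuming it is run from a checkout that has the file above on the Python path:

```python
# Sketch: the composite config materializes its two sub-configs as regular
# transformers config objects, so downstream code can read standard fields.
from configuration_nllbllm2vec import NLLBLLM2VecConfig

cfg = NLLBLLM2VecConfig()  # falls back to DEFAULT_M2M100_CONFIG / DEFAULT_LLAMA_CONFIG
assert cfg.model_type == "nllb-llm2vec"
assert cfg.nllb_config.d_model == 1024          # from the M2M100 defaults above
assert cfg.llm2vec_config.hidden_size == 4096   # from the Llama defaults above
```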
model-00001-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2409158d0692d1b6d8ade7342ddf40bbaeda0ed6bb545624d679bd3549d4d894
+ size 4795637544
model-00002-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:302a8d954626008cb14b1f74ae39d399ade67fe2d3404a7788ed5cb792b68cfa
+ size 4832007544
model-00003-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a77f2fa237b592921ffecb745ac613764da66643fbe98145435532cc1e63707a
+ size 4999813200
model-00004-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a22835800d55155cfc48b8ec2f06a173d203eec52af80c0923c5ad057ee977e9
+ size 4999813232
model-00005-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1482c1048034b6bbb552b0d6a3969f5e76149e8dd4a7cc45ddb6cfae4d524a92
+ size 4832007592
model-00006-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c6a5b0755ef742857b9f785cd6b61c9f3f16677581208d105562f53d6094647b
+ size 4999813232
model-00007-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e2b0493c68cadb801fe6b5c0ff4d6c9d2093eca333c98736665c36ab7c523900
+ size 2231487200
model.safetensors.index.json ADDED
@@ -0,0 +1,493 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 31690522624
4
+ },
5
+ "weight_map": {
6
+ "llm2vec.embed_tokens.weight": "model-00001-of-00007.safetensors",
7
+ "llm2vec.layers.0.input_layernorm.weight": "model-00001-of-00007.safetensors",
8
+ "llm2vec.layers.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
9
+ "llm2vec.layers.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
10
+ "llm2vec.layers.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
11
+ "llm2vec.layers.0.post_attention_layernorm.weight": "model-00001-of-00007.safetensors",
12
+ "llm2vec.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
13
+ "llm2vec.layers.0.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
14
+ "llm2vec.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
15
+ "llm2vec.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
16
+ "llm2vec.layers.1.input_layernorm.weight": "model-00002-of-00007.safetensors",
17
+ "llm2vec.layers.1.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
18
+ "llm2vec.layers.1.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
19
+ "llm2vec.layers.1.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
20
+ "llm2vec.layers.1.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
21
+ "llm2vec.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
22
+ "llm2vec.layers.1.self_attn.o_proj.weight": "model-00001-of-00007.safetensors",
23
+ "llm2vec.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
24
+ "llm2vec.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
25
+ "llm2vec.layers.10.input_layernorm.weight": "model-00003-of-00007.safetensors",
26
+ "llm2vec.layers.10.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
27
+ "llm2vec.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
28
+ "llm2vec.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
29
+ "llm2vec.layers.10.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
30
+ "llm2vec.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
31
+ "llm2vec.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
32
+ "llm2vec.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
33
+ "llm2vec.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
34
+ "llm2vec.layers.11.input_layernorm.weight": "model-00003-of-00007.safetensors",
35
+ "llm2vec.layers.11.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
36
+ "llm2vec.layers.11.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
37
+ "llm2vec.layers.11.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
38
+ "llm2vec.layers.11.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
39
+ "llm2vec.layers.11.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
40
+ "llm2vec.layers.11.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
41
+ "llm2vec.layers.11.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
42
+ "llm2vec.layers.11.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
43
+ "llm2vec.layers.12.input_layernorm.weight": "model-00004-of-00007.safetensors",
44
+ "llm2vec.layers.12.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
45
+ "llm2vec.layers.12.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
46
+ "llm2vec.layers.12.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
47
+ "llm2vec.layers.12.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
48
+ "llm2vec.layers.12.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
49
+ "llm2vec.layers.12.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
50
+ "llm2vec.layers.12.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
51
+ "llm2vec.layers.12.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
52
+ "llm2vec.layers.13.input_layernorm.weight": "model-00004-of-00007.safetensors",
53
+ "llm2vec.layers.13.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
54
+ "llm2vec.layers.13.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
55
+ "llm2vec.layers.13.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
56
+ "llm2vec.layers.13.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
57
+ "llm2vec.layers.13.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
58
+ "llm2vec.layers.13.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
59
+ "llm2vec.layers.13.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
60
+ "llm2vec.layers.13.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
61
+ "llm2vec.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors",
62
+ "llm2vec.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
63
+ "llm2vec.layers.14.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
64
+ "llm2vec.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
65
+ "llm2vec.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
66
+ "llm2vec.layers.14.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
67
+ "llm2vec.layers.14.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
68
+ "llm2vec.layers.14.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
69
+ "llm2vec.layers.14.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
70
+ "llm2vec.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors",
71
+ "llm2vec.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
72
+ "llm2vec.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
73
+ "llm2vec.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
74
+ "llm2vec.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
75
+ "llm2vec.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
76
+ "llm2vec.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
77
+ "llm2vec.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
78
+ "llm2vec.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
79
+ "llm2vec.layers.16.input_layernorm.weight": "model-00004-of-00007.safetensors",
80
+ "llm2vec.layers.16.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
81
+ "llm2vec.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
82
+ "llm2vec.layers.16.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
83
+ "llm2vec.layers.16.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
84
+ "llm2vec.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
85
+ "llm2vec.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
86
+ "llm2vec.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
87
+ "llm2vec.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
88
+ "llm2vec.layers.17.input_layernorm.weight": "model-00004-of-00007.safetensors",
89
+ "llm2vec.layers.17.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
90
+ "llm2vec.layers.17.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
91
+ "llm2vec.layers.17.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
92
+ "llm2vec.layers.17.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
93
+ "llm2vec.layers.17.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
94
+ "llm2vec.layers.17.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
95
+ "llm2vec.layers.17.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
96
+ "llm2vec.layers.17.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
97
+ "llm2vec.layers.18.input_layernorm.weight": "model-00005-of-00007.safetensors",
98
+ "llm2vec.layers.18.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
99
+ "llm2vec.layers.18.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
100
+ "llm2vec.layers.18.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
101
+ "llm2vec.layers.18.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
102
+ "llm2vec.layers.18.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
103
+ "llm2vec.layers.18.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
104
+ "llm2vec.layers.18.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
105
+ "llm2vec.layers.18.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
106
+ "llm2vec.layers.19.input_layernorm.weight": "model-00005-of-00007.safetensors",
107
+ "llm2vec.layers.19.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
108
+ "llm2vec.layers.19.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
109
+ "llm2vec.layers.19.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
110
+ "llm2vec.layers.19.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
111
+ "llm2vec.layers.19.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
112
+ "llm2vec.layers.19.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
113
+ "llm2vec.layers.19.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
114
+ "llm2vec.layers.19.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
115
+ "llm2vec.layers.2.input_layernorm.weight": "model-00002-of-00007.safetensors",
116
+ "llm2vec.layers.2.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
117
+ "llm2vec.layers.2.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
118
+ "llm2vec.layers.2.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
119
+ "llm2vec.layers.2.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
120
+ "llm2vec.layers.2.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
121
+ "llm2vec.layers.2.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
122
+ "llm2vec.layers.2.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
123
+ "llm2vec.layers.2.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
124
+ "llm2vec.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors",
125
+ "llm2vec.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
126
+ "llm2vec.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
127
+ "llm2vec.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
128
+ "llm2vec.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
129
+ "llm2vec.layers.20.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
130
+ "llm2vec.layers.20.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
131
+ "llm2vec.layers.20.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
132
+ "llm2vec.layers.20.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
133
+ "llm2vec.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors",
134
+ "llm2vec.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
135
+ "llm2vec.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
136
+ "llm2vec.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
137
+ "llm2vec.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
138
+ "llm2vec.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
139
+ "llm2vec.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
140
+ "llm2vec.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
141
+ "llm2vec.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
142
+ "llm2vec.layers.22.input_layernorm.weight": "model-00005-of-00007.safetensors",
143
+ "llm2vec.layers.22.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
144
+ "llm2vec.layers.22.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
145
+ "llm2vec.layers.22.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
146
+ "llm2vec.layers.22.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
147
+ "llm2vec.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
148
+ "llm2vec.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
149
+ "llm2vec.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
150
+ "llm2vec.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
151
+ "llm2vec.layers.23.input_layernorm.weight": "model-00006-of-00007.safetensors",
152
+ "llm2vec.layers.23.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
153
+ "llm2vec.layers.23.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
154
+ "llm2vec.layers.23.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
155
+ "llm2vec.layers.23.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
156
+ "llm2vec.layers.23.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
157
+ "llm2vec.layers.23.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
158
+ "llm2vec.layers.23.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
159
+ "llm2vec.layers.23.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
160
+ "llm2vec.layers.24.input_layernorm.weight": "model-00006-of-00007.safetensors",
161
+ "llm2vec.layers.24.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
162
+ "llm2vec.layers.24.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
163
+ "llm2vec.layers.24.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
164
+ "llm2vec.layers.24.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
165
+ "llm2vec.layers.24.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
166
+ "llm2vec.layers.24.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
167
+ "llm2vec.layers.24.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
168
+ "llm2vec.layers.24.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
169
+ "llm2vec.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors",
170
+ "llm2vec.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
171
+ "llm2vec.layers.25.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
172
+ "llm2vec.layers.25.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
173
+ "llm2vec.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
174
+ "llm2vec.layers.25.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
175
+ "llm2vec.layers.25.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
176
+ "llm2vec.layers.25.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
177
+ "llm2vec.layers.25.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
178
+ "llm2vec.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors",
179
+ "llm2vec.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
180
+ "llm2vec.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
181
+ "llm2vec.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
182
+ "llm2vec.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
183
+ "llm2vec.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
184
+ "llm2vec.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
185
+ "llm2vec.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
186
+ "llm2vec.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
187
+ "llm2vec.layers.27.input_layernorm.weight": "model-00006-of-00007.safetensors",
188
+ "llm2vec.layers.27.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
189
+ "llm2vec.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
190
+ "llm2vec.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
191
+ "llm2vec.layers.27.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
192
+ "llm2vec.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
193
+ "llm2vec.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
194
+ "llm2vec.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
195
+ "llm2vec.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
196
+ "llm2vec.layers.28.input_layernorm.weight": "model-00006-of-00007.safetensors",
197
+ "llm2vec.layers.28.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
198
+ "llm2vec.layers.28.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
199
+ "llm2vec.layers.28.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
200
+ "llm2vec.layers.28.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
201
+ "llm2vec.layers.28.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
202
+ "llm2vec.layers.28.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
203
+ "llm2vec.layers.28.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
204
+ "llm2vec.layers.28.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
205
+ "llm2vec.layers.29.input_layernorm.weight": "model-00007-of-00007.safetensors",
206
+ "llm2vec.layers.29.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
207
+ "llm2vec.layers.29.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
208
+ "llm2vec.layers.29.mlp.up_proj.weight": "model-00007-of-00007.safetensors",
209
+ "llm2vec.layers.29.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
210
+ "llm2vec.layers.29.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
211
+ "llm2vec.layers.29.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
212
+ "llm2vec.layers.29.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
213
+ "llm2vec.layers.29.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
214
+ "llm2vec.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors",
215
+ "llm2vec.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
216
+ "llm2vec.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
217
+ "llm2vec.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
218
+ "llm2vec.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
219
+ "llm2vec.layers.3.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
220
+ "llm2vec.layers.3.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
221
+ "llm2vec.layers.3.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
222
+ "llm2vec.layers.3.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
223
+ "llm2vec.layers.30.input_layernorm.weight": "model-00007-of-00007.safetensors",
224
+ "llm2vec.layers.30.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
225
+ "llm2vec.layers.30.mlp.gate_proj.weight": "model-00007-of-00007.safetensors",
226
+ "llm2vec.layers.30.mlp.up_proj.weight": "model-00007-of-00007.safetensors",
227
+ "llm2vec.layers.30.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
228
+ "llm2vec.layers.30.self_attn.k_proj.weight": "model-00007-of-00007.safetensors",
229
+ "llm2vec.layers.30.self_attn.o_proj.weight": "model-00007-of-00007.safetensors",
230
+ "llm2vec.layers.30.self_attn.q_proj.weight": "model-00007-of-00007.safetensors",
231
+ "llm2vec.layers.30.self_attn.v_proj.weight": "model-00007-of-00007.safetensors",
232
+ "llm2vec.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors",
233
+ "llm2vec.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
234
+ "llm2vec.layers.31.mlp.gate_proj.weight": "model-00007-of-00007.safetensors",
235
+ "llm2vec.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors",
236
+ "llm2vec.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
237
+ "llm2vec.layers.31.self_attn.k_proj.weight": "model-00007-of-00007.safetensors",
238
+ "llm2vec.layers.31.self_attn.o_proj.weight": "model-00007-of-00007.safetensors",
239
+ "llm2vec.layers.31.self_attn.q_proj.weight": "model-00007-of-00007.safetensors",
240
+ "llm2vec.layers.31.self_attn.v_proj.weight": "model-00007-of-00007.safetensors",
241
+ "llm2vec.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors",
242
+ "llm2vec.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
243
+ "llm2vec.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
244
+ "llm2vec.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
245
+ "llm2vec.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
246
+ "llm2vec.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
247
+ "llm2vec.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
248
+ "llm2vec.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
249
+ "llm2vec.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
250
+ "llm2vec.layers.5.input_layernorm.weight": "model-00002-of-00007.safetensors",
251
+ "llm2vec.layers.5.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
252
+ "llm2vec.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
253
+ "llm2vec.layers.5.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
254
+ "llm2vec.layers.5.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
255
+ "llm2vec.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
256
+ "llm2vec.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
257
+ "llm2vec.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
258
+ "llm2vec.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
259
+ "llm2vec.layers.6.input_layernorm.weight": "model-00003-of-00007.safetensors",
260
+ "llm2vec.layers.6.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
261
+ "llm2vec.layers.6.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
262
+ "llm2vec.layers.6.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
263
+ "llm2vec.layers.6.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
264
+ "llm2vec.layers.6.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
265
+ "llm2vec.layers.6.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
266
+ "llm2vec.layers.6.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
267
+ "llm2vec.layers.6.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
268
+ "llm2vec.layers.7.input_layernorm.weight": "model-00003-of-00007.safetensors",
269
+ "llm2vec.layers.7.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
270
+ "llm2vec.layers.7.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
271
+ "llm2vec.layers.7.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
272
+ "llm2vec.layers.7.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
273
+ "llm2vec.layers.7.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
274
+ "llm2vec.layers.7.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
275
+ "llm2vec.layers.7.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
276
+ "llm2vec.layers.7.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
277
+ "llm2vec.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors",
278
+ "llm2vec.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
279
+ "llm2vec.layers.8.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
280
+ "llm2vec.layers.8.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
281
+ "llm2vec.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
282
+ "llm2vec.layers.8.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
283
+ "llm2vec.layers.8.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
284
+ "llm2vec.layers.8.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
285
+ "llm2vec.layers.8.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
286
+ "llm2vec.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors",
287
+ "llm2vec.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
288
+ "llm2vec.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
289
+ "llm2vec.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
290
+ "llm2vec.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
291
+ "llm2vec.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
292
+ "llm2vec.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
293
+ "llm2vec.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
294
+ "llm2vec.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
295
+ "llm2vec.norm.weight": "model-00007-of-00007.safetensors",
296
+ "nllb_encoder.embed_tokens.weight": "model-00001-of-00007.safetensors",
297
+ "nllb_encoder.layer_norm.bias": "model-00001-of-00007.safetensors",
298
+ "nllb_encoder.layer_norm.weight": "model-00001-of-00007.safetensors",
299
+ "nllb_encoder.layers.0.fc1.bias": "model-00001-of-00007.safetensors",
300
+ "nllb_encoder.layers.0.fc1.weight": "model-00001-of-00007.safetensors",
301
+ "nllb_encoder.layers.0.fc2.bias": "model-00001-of-00007.safetensors",
302
+ "nllb_encoder.layers.0.fc2.weight": "model-00001-of-00007.safetensors",
303
+ "nllb_encoder.layers.0.final_layer_norm.bias": "model-00001-of-00007.safetensors",
304
+ "nllb_encoder.layers.0.final_layer_norm.weight": "model-00001-of-00007.safetensors",
305
+ "nllb_encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
306
+ "nllb_encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
307
+ "nllb_encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
308
+ "nllb_encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
309
+ "nllb_encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
310
+ "nllb_encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
311
+ "nllb_encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
312
+ "nllb_encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
313
+ "nllb_encoder.layers.0.self_attn_layer_norm.bias": "model-00001-of-00007.safetensors",
314
+ "nllb_encoder.layers.0.self_attn_layer_norm.weight": "model-00001-of-00007.safetensors",
315
+ "nllb_encoder.layers.1.fc1.bias": "model-00001-of-00007.safetensors",
316
+ "nllb_encoder.layers.1.fc1.weight": "model-00001-of-00007.safetensors",
317
+ "nllb_encoder.layers.1.fc2.bias": "model-00001-of-00007.safetensors",
318
+ "nllb_encoder.layers.1.fc2.weight": "model-00001-of-00007.safetensors",
319
+ "nllb_encoder.layers.1.final_layer_norm.bias": "model-00001-of-00007.safetensors",
320
+ "nllb_encoder.layers.1.final_layer_norm.weight": "model-00001-of-00007.safetensors",
321
+ "nllb_encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
322
+ "nllb_encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
323
+ "nllb_encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
324
+ "nllb_encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
325
+ "nllb_encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
326
+ "nllb_encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
327
+ "nllb_encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
328
+ "nllb_encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
329
+ "nllb_encoder.layers.1.self_attn_layer_norm.bias": "model-00001-of-00007.safetensors",
330
+ "nllb_encoder.layers.1.self_attn_layer_norm.weight": "model-00001-of-00007.safetensors",
331
+ "nllb_encoder.layers.10.fc1.bias": "model-00001-of-00007.safetensors",
332
+ "nllb_encoder.layers.10.fc1.weight": "model-00001-of-00007.safetensors",
333
+ "nllb_encoder.layers.10.fc2.bias": "model-00001-of-00007.safetensors",
334
+ "nllb_encoder.layers.10.fc2.weight": "model-00001-of-00007.safetensors",
335
+ "nllb_encoder.layers.10.final_layer_norm.bias": "model-00001-of-00007.safetensors",
336
+ "nllb_encoder.layers.10.final_layer_norm.weight": "model-00001-of-00007.safetensors",
337
+ "nllb_encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
338
+ "nllb_encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
339
+ "nllb_encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
340
+ "nllb_encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
341
+ "nllb_encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
342
+ "nllb_encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
343
+ "nllb_encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
344
+ "nllb_encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
345
+ "nllb_encoder.layers.10.self_attn_layer_norm.bias": "model-00001-of-00007.safetensors",
346
+ "nllb_encoder.layers.10.self_attn_layer_norm.weight": "model-00001-of-00007.safetensors",
347
+ "nllb_encoder.layers.11.fc1.bias": "model-00001-of-00007.safetensors",
348
+ "nllb_encoder.layers.11.fc1.weight": "model-00001-of-00007.safetensors",
349
+ "nllb_encoder.layers.11.fc2.bias": "model-00001-of-00007.safetensors",
350
+ "nllb_encoder.layers.11.fc2.weight": "model-00001-of-00007.safetensors",
351
+ "nllb_encoder.layers.11.final_layer_norm.bias": "model-00001-of-00007.safetensors",
352
+ "nllb_encoder.layers.11.final_layer_norm.weight": "model-00001-of-00007.safetensors",
353
+ "nllb_encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
354
+ "nllb_encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
355
+ "nllb_encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
356
+ "nllb_encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
357
+ "nllb_encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
358
+ "nllb_encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
359
+ "nllb_encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
360
+ "nllb_encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
361
+ "nllb_encoder.layers.11.self_attn_layer_norm.bias": "model-00001-of-00007.safetensors",
362
+ "nllb_encoder.layers.11.self_attn_layer_norm.weight": "model-00001-of-00007.safetensors",
363
+ "nllb_encoder.layers.2.fc1.bias": "model-00001-of-00007.safetensors",
364
+ "nllb_encoder.layers.2.fc1.weight": "model-00001-of-00007.safetensors",
365
+ "nllb_encoder.layers.2.fc2.bias": "model-00001-of-00007.safetensors",
366
+ "nllb_encoder.layers.2.fc2.weight": "model-00001-of-00007.safetensors",
367
+ "nllb_encoder.layers.2.final_layer_norm.bias": "model-00001-of-00007.safetensors",
368
+ "nllb_encoder.layers.2.final_layer_norm.weight": "model-00001-of-00007.safetensors",
369
+ "nllb_encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
370
+ "nllb_encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
371
+ "nllb_encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
372
+ "nllb_encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
373
+ "nllb_encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
374
+ "nllb_encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
375
+ "nllb_encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
376
+ "nllb_encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
377
+ "nllb_encoder.layers.2.self_attn_layer_norm.bias": "model-00001-of-00007.safetensors",
378
+ "nllb_encoder.layers.2.self_attn_layer_norm.weight": "model-00001-of-00007.safetensors",
379
+ "nllb_encoder.layers.3.fc1.bias": "model-00001-of-00007.safetensors",
380
+ "nllb_encoder.layers.3.fc1.weight": "model-00001-of-00007.safetensors",
381
+ "nllb_encoder.layers.3.fc2.bias": "model-00001-of-00007.safetensors",
382
+ "nllb_encoder.layers.3.fc2.weight": "model-00001-of-00007.safetensors",
383
+ "nllb_encoder.layers.3.final_layer_norm.bias": "model-00001-of-00007.safetensors",
384
+ "nllb_encoder.layers.3.final_layer_norm.weight": "model-00001-of-00007.safetensors",
385
+ "nllb_encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
386
+ "nllb_encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
387
+ "nllb_encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
388
+ "nllb_encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
389
+ "nllb_encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
390
+ "nllb_encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
391
+ "nllb_encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
392
+ "nllb_encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
393
+ "nllb_encoder.layers.3.self_attn_layer_norm.bias": "model-00001-of-00007.safetensors",
394
+ "nllb_encoder.layers.3.self_attn_layer_norm.weight": "model-00001-of-00007.safetensors",
395
+ "nllb_encoder.layers.4.fc1.bias": "model-00001-of-00007.safetensors",
396
+ "nllb_encoder.layers.4.fc1.weight": "model-00001-of-00007.safetensors",
397
+ "nllb_encoder.layers.4.fc2.bias": "model-00001-of-00007.safetensors",
398
+ "nllb_encoder.layers.4.fc2.weight": "model-00001-of-00007.safetensors",
399
+ "nllb_encoder.layers.4.final_layer_norm.bias": "model-00001-of-00007.safetensors",
400
+ "nllb_encoder.layers.4.final_layer_norm.weight": "model-00001-of-00007.safetensors",
401
+ "nllb_encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
402
+ "nllb_encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
403
+ "nllb_encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
404
+ "nllb_encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
405
+ "nllb_encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
406
+ "nllb_encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
407
+ "nllb_encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
408
+ "nllb_encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
409
+ "nllb_encoder.layers.4.self_attn_layer_norm.bias": "model-00001-of-00007.safetensors",
410
+ "nllb_encoder.layers.4.self_attn_layer_norm.weight": "model-00001-of-00007.safetensors",
411
+ "nllb_encoder.layers.5.fc1.bias": "model-00001-of-00007.safetensors",
412
+ "nllb_encoder.layers.5.fc1.weight": "model-00001-of-00007.safetensors",
413
+ "nllb_encoder.layers.5.fc2.bias": "model-00001-of-00007.safetensors",
414
+ "nllb_encoder.layers.5.fc2.weight": "model-00001-of-00007.safetensors",
415
+ "nllb_encoder.layers.5.final_layer_norm.bias": "model-00001-of-00007.safetensors",
416
+ "nllb_encoder.layers.5.final_layer_norm.weight": "model-00001-of-00007.safetensors",
417
+ "nllb_encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
418
+ "nllb_encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
419
+ "nllb_encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
420
+ "nllb_encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
421
+ "nllb_encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
422
+ "nllb_encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
423
+ "nllb_encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
424
+ "nllb_encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
425
+ "nllb_encoder.layers.5.self_attn_layer_norm.bias": "model-00001-of-00007.safetensors",
426
+ "nllb_encoder.layers.5.self_attn_layer_norm.weight": "model-00001-of-00007.safetensors",
427
+ "nllb_encoder.layers.6.fc1.bias": "model-00001-of-00007.safetensors",
428
+ "nllb_encoder.layers.6.fc1.weight": "model-00001-of-00007.safetensors",
429
+ "nllb_encoder.layers.6.fc2.bias": "model-00001-of-00007.safetensors",
430
+ "nllb_encoder.layers.6.fc2.weight": "model-00001-of-00007.safetensors",
431
+ "nllb_encoder.layers.6.final_layer_norm.bias": "model-00001-of-00007.safetensors",
432
+ "nllb_encoder.layers.6.final_layer_norm.weight": "model-00001-of-00007.safetensors",
433
+ "nllb_encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
434
+ "nllb_encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
435
+ "nllb_encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
436
+ "nllb_encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
437
+ "nllb_encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
438
+ "nllb_encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
439
+ "nllb_encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
440
+ "nllb_encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
441
+ "nllb_encoder.layers.6.self_attn_layer_norm.bias": "model-00001-of-00007.safetensors",
442
+ "nllb_encoder.layers.6.self_attn_layer_norm.weight": "model-00001-of-00007.safetensors",
443
+ "nllb_encoder.layers.7.fc1.bias": "model-00001-of-00007.safetensors",
444
+ "nllb_encoder.layers.7.fc1.weight": "model-00001-of-00007.safetensors",
445
+ "nllb_encoder.layers.7.fc2.bias": "model-00001-of-00007.safetensors",
446
+ "nllb_encoder.layers.7.fc2.weight": "model-00001-of-00007.safetensors",
447
+ "nllb_encoder.layers.7.final_layer_norm.bias": "model-00001-of-00007.safetensors",
448
+ "nllb_encoder.layers.7.final_layer_norm.weight": "model-00001-of-00007.safetensors",
449
+ "nllb_encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
450
+ "nllb_encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
451
+ "nllb_encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
452
+ "nllb_encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
453
+ "nllb_encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
454
+ "nllb_encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
455
+ "nllb_encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
456
+ "nllb_encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
457
+ "nllb_encoder.layers.7.self_attn_layer_norm.bias": "model-00001-of-00007.safetensors",
458
+ "nllb_encoder.layers.7.self_attn_layer_norm.weight": "model-00001-of-00007.safetensors",
459
+ "nllb_encoder.layers.8.fc1.bias": "model-00001-of-00007.safetensors",
460
+ "nllb_encoder.layers.8.fc1.weight": "model-00001-of-00007.safetensors",
461
+ "nllb_encoder.layers.8.fc2.bias": "model-00001-of-00007.safetensors",
462
+ "nllb_encoder.layers.8.fc2.weight": "model-00001-of-00007.safetensors",
463
+ "nllb_encoder.layers.8.final_layer_norm.bias": "model-00001-of-00007.safetensors",
464
+ "nllb_encoder.layers.8.final_layer_norm.weight": "model-00001-of-00007.safetensors",
465
+ "nllb_encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
466
+ "nllb_encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
467
+ "nllb_encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
468
+ "nllb_encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
469
+ "nllb_encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
470
+ "nllb_encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
471
+ "nllb_encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
472
+ "nllb_encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
473
+ "nllb_encoder.layers.8.self_attn_layer_norm.bias": "model-00001-of-00007.safetensors",
474
+ "nllb_encoder.layers.8.self_attn_layer_norm.weight": "model-00001-of-00007.safetensors",
475
+ "nllb_encoder.layers.9.fc1.bias": "model-00001-of-00007.safetensors",
476
+ "nllb_encoder.layers.9.fc1.weight": "model-00001-of-00007.safetensors",
477
+ "nllb_encoder.layers.9.fc2.bias": "model-00001-of-00007.safetensors",
478
+ "nllb_encoder.layers.9.fc2.weight": "model-00001-of-00007.safetensors",
479
+ "nllb_encoder.layers.9.final_layer_norm.bias": "model-00001-of-00007.safetensors",
480
+ "nllb_encoder.layers.9.final_layer_norm.weight": "model-00001-of-00007.safetensors",
481
+ "nllb_encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
482
+ "nllb_encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
483
+ "nllb_encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00007.safetensors",
484
+ "nllb_encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00007.safetensors",
485
+ "nllb_encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
486
+ "nllb_encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
487
+ "nllb_encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
488
+ "nllb_encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
489
+ "nllb_encoder.layers.9.self_attn_layer_norm.bias": "model-00001-of-00007.safetensors",
490
+ "nllb_encoder.layers.9.self_attn_layer_norm.weight": "model-00001-of-00007.safetensors",
491
+ "up_proj.weight": "model-00007-of-00007.safetensors"
492
+ }
493
+ }
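The `weight_map` above assigns every parameter of the merged model to one of seven safetensors shards; the NLLB encoder weights shown here live in the first shard, while `up_proj.weight` sits in the last. As a minimal sketch of how such a sharded index can be inspected (assuming a local copy of `model.safetensors.index.json` in the working directory; the grouping logic below is illustrative, not part of this repository):

import json
from collections import defaultdict

# Load the sharded-checkpoint index written by save_pretrained.
with open("model.safetensors.index.json") as f:
    index = json.load(f)

# Group parameter names by the shard file they are stored in.
shards = defaultdict(list)
for param_name, shard_file in index["weight_map"].items():
    shards[shard_file].append(param_name)

for shard_file, params in sorted(shards.items()):
    print(f"{shard_file}: {len(params)} tensors")

`transformers` consumes the same index when `from_pretrained` resolves which shard to download and load for each tensor.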
modeling_llama_encoder.py ADDED
@@ -0,0 +1,195 @@
+ import importlib.metadata
+
+ import torch
+ from packaging import version
+ from torch import nn
+ from transformers import LlamaConfig, LlamaModel, LlamaPreTrainedModel
+ from transformers.cache_utils import Cache, StaticCache
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+ from transformers.models.llama.modeling_llama import (
+     LlamaAttention,
+     LlamaDecoderLayer,
+     LlamaFlashAttention2,
+     LlamaMLP,
+     LlamaRMSNorm,
+     LlamaRotaryEmbedding,
+     LlamaSdpaAttention,
+ )
+ from transformers.utils import logging
+ from transformers.utils.import_utils import _is_package_available
+
+ logger = logging.get_logger(__name__)
+
+
+ def is_transformers_attn_greater_or_equal_4_43_1():
+     if not _is_package_available("transformers"):
+         return False
+
+     return version.parse(importlib.metadata.version("transformers")) >= version.parse(
+         "4.43.1"
+     )
+
+
+ class ModifiedLlamaAttention(LlamaAttention):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.is_causal = False
+
+
+ class ModifiedLlamaFlashAttention2(LlamaFlashAttention2):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.is_causal = False
+
+
+ class ModifiedLlamaSdpaAttention(LlamaSdpaAttention):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.is_causal = False
+
+
+ LLAMA_ATTENTION_CLASSES = {
+     "eager": ModifiedLlamaAttention,
+     "flash_attention_2": ModifiedLlamaFlashAttention2,
+     "sdpa": ModifiedLlamaSdpaAttention,
+ }
+
+
+ class ModifiedLlamaDecoderLayer(LlamaDecoderLayer):
+     def __init__(self, config: LlamaConfig, layer_idx: int):
+         nn.Module.__init__(self)
+         self.hidden_size = config.hidden_size
+
+         self.self_attn = LLAMA_ATTENTION_CLASSES[config._attn_implementation](
+             config=config, layer_idx=layer_idx
+         )
+
+         self.mlp = LlamaMLP(config)
+         self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.post_attention_layernorm = LlamaRMSNorm(
+             config.hidden_size, eps=config.rms_norm_eps
+         )
+
+
+ class LlamaEncoderModel(LlamaModel):
+     _no_split_modules = ["ModifiedLlamaDecoderLayer"]
+
+     def __init__(self, config: LlamaConfig):
+         if not is_transformers_attn_greater_or_equal_4_43_1():
+             raise ValueError(
+                 "The current implementation of LlamaEncoderModel follows modeling_llama.py of transformers version >= 4.43.1"
+             )
+         LlamaPreTrainedModel.__init__(self, config)
+         self.padding_idx = config.pad_token_id
+         self.vocab_size = config.vocab_size
+
+         self.embed_tokens = nn.Embedding(
+             config.vocab_size, config.hidden_size, self.padding_idx
+         )
+         self.layers = nn.ModuleList(
+             [
+                 ModifiedLlamaDecoderLayer(config, layer_idx)
+                 for layer_idx in range(config.num_hidden_layers)
+             ]
+         )
+         self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.rotary_emb = LlamaRotaryEmbedding(config=config)
+         self.gradient_checkpointing = False
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def _update_causal_mask(
+         self,
+         attention_mask,
+         input_tensor,
+         cache_position,
+         past_key_values: Cache,
+         output_attentions: bool,
+     ):
+         if self.config._attn_implementation == "flash_attention_2":
+             if attention_mask is not None and 0.0 in attention_mask:
+                 return attention_mask
+             return None
+
+         # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+         # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+         # to infer the attention mask.
+         past_seen_tokens = (
+             past_key_values.get_seq_length() if past_key_values is not None else 0
+         )
+         using_static_cache = isinstance(past_key_values, StaticCache)
+
+         # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+         # if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+         #     if AttentionMaskConverter._ignore_causal_mask_sdpa(
+         #         attention_mask,
+         #         inputs_embeds=input_tensor,
+         #         past_key_values_length=past_seen_tokens,
+         #         is_training=self.training,
+         #     ):
+         #         return None
+
+         dtype, device = input_tensor.dtype, input_tensor.device
+         min_dtype = torch.finfo(dtype).min
+         sequence_length = input_tensor.shape[1]
+         if using_static_cache:
+             target_length = past_key_values.get_max_length()
+         else:
+             target_length = (
+                 attention_mask.shape[-1]
+                 if isinstance(attention_mask, torch.Tensor)
+                 else past_seen_tokens + sequence_length + 1
+             )
+
+         causal_mask = torch.zeros(
+             (sequence_length, target_length), dtype=dtype, device=device
+         )  # in original implementation - torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
+         # Commenting out next 2 lines to disable causal masking
+         # if sequence_length != 1:
+         #     causal_mask = torch.triu(causal_mask, diagonal=1)
+         causal_mask *= torch.arange(
+             target_length, device=device
+         ) > cache_position.reshape(-1, 1)
+         causal_mask = causal_mask[None, None, :, :].expand(
+             input_tensor.shape[0], 1, -1, -1
+         )
+         if attention_mask is not None:
+             causal_mask = (
+                 causal_mask.clone()
+             )  # copy to contiguous memory for in-place edit
+             if attention_mask.dim() == 2:
+                 mask_length = attention_mask.shape[-1]
+                 padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[
+                     :, None, None, :
+                 ].eq(0.0)
+                 causal_mask[..., :mask_length] = causal_mask[
+                     ..., :mask_length
+                 ].masked_fill(padding_mask, min_dtype)
+             elif attention_mask.dim() == 4:
+                 # backwards compatibility: we allow passing a 4D attention mask shorter than the input length with
+                 # cache. In that case, the 4D attention mask attends to the newest tokens only.
+                 if attention_mask.shape[-2] < cache_position[0] + sequence_length:
+                     offset = cache_position[0]
+                 else:
+                     offset = 0
+                 mask_shape = attention_mask.shape
+                 mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype
+                 causal_mask[
+                     : mask_shape[0],
+                     : mask_shape[1],
+                     offset : mask_shape[2] + offset,
+                     : mask_shape[3],
+                 ] = mask_slice
+
+         if (
+             self.config._attn_implementation == "sdpa"
+             and attention_mask is not None
+             and attention_mask.device.type == "cuda"
+             and not output_attentions
+         ):
+             causal_mask = AttentionMaskConverter._unmask_unattended(
+                 causal_mask, min_dtype
+             )
+
+         return causal_mask
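The file above turns Llama into a bidirectional encoder: each attention class sets `is_causal = False`, and `_update_causal_mask` replaces the triangular causal mask with an all-zero mask, so only padding positions remain masked. A quick standalone sketch, assuming transformers >= 4.43.1, that the file is importable from the working directory, and a toy randomly initialized config (the config values below are illustrative only):

import torch
from transformers import LlamaConfig

from modeling_llama_encoder import LlamaEncoderModel  # the file added above

# Tiny random-weight config, just to exercise the encoder end to end.
config = LlamaConfig(
    vocab_size=128,
    hidden_size=64,
    intermediate_size=128,
    num_hidden_layers=2,
    num_attention_heads=4,
    num_key_value_heads=4,
    pad_token_id=0,
)
encoder = LlamaEncoderModel(config)

input_ids = torch.tensor([[5, 6, 7, 0, 0]])       # last two positions are padding
attention_mask = torch.tensor([[1, 1, 1, 0, 0]])
out = encoder(input_ids=input_ids, attention_mask=attention_mask)
print(out.last_hidden_state.shape)  # torch.Size([1, 5, 64])

With the causal mask disabled, every token can attend to later tokens as well, which is what the mean-pooled sentence embeddings in `modeling_nllbllm2vec.py` below rely on.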
modeling_nllbllm2vec.py ADDED
@@ -0,0 +1,481 @@
+ from typing import Any, Dict, List, Optional, Tuple, cast
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from transformers.models.auto import AutoModel
+ from transformers.modeling_outputs import BaseModelOutputWithPooling
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.models.m2m_100.modeling_m2m_100 import M2M100Encoder
+
+ from .configuration_nllbllm2vec import NLLBLLM2VecConfig
+ from .modeling_llama_encoder import LlamaEncoderModel
+
+
+ class NLLBLLM2Vec(PreTrainedModel):
+     """
+     NLLBLLM2Vec model combining the NLLB and Llama encoders.
+
+     Args:
+         config (Optional[NLLBLLM2VecConfig]): Configuration object.
+         nllb_encoder (Optional[M2M100Encoder]): Pre-initialized NLLB encoder.
+         llm2vec (Optional[LlamaEncoderModel]): Pre-initialized Llama encoder.
+         *inputs: Additional positional arguments.
+         **kwargs: Additional keyword arguments.
+     """
+
+     config_class = NLLBLLM2VecConfig
+     model_type = "nllb-llm2vec"
+
+     def __init__(
+         self,
+         config: Optional[NLLBLLM2VecConfig] = None,
+         nllb_encoder: Optional[M2M100Encoder] = None,
+         llm2vec: Optional[LlamaEncoderModel] = None,
+         *inputs,
+         **kwargs,
+     ):
+         # Ensure that either config is not None or both encoders are provided
+         if config is None and (nllb_encoder is None or llm2vec is None):
+             raise ValueError(
+                 "Either `config` must be provided, or both `nllb_encoder` and `llm2vec` must be specified."
+             )
+
+         if config is not None:
+             super().__init__(config, *inputs, **kwargs)
+             self.nllb_encoder = nllb_encoder or M2M100Encoder(config.nllb_config)
+             self.llm2vec = llm2vec or LlamaEncoderModel(config.llm2vec_config)
+             self.config = config
+         else:
+             # Both encoders are provided
+             self.nllb_encoder = cast(M2M100Encoder, nllb_encoder)
+             self.llm2vec = cast(LlamaEncoderModel, llm2vec)
+             self.config = NLLBLLM2VecConfig(
+                 nllb_config=self.nllb_encoder.config,  # type: ignore
+                 llm2vec_config=self.llm2vec.config,  # type: ignore
+             )
+             super().__init__(self.config, *inputs, **kwargs)
+
+         self.up_proj = nn.Linear(
+             self.nllb_encoder.config.d_model,
+             self.llm2vec.config.hidden_size,
+             bias=False,
+         )
+         # Additional initialization logic can go here
+
+     def forward(
+         self,
+         input_ids: torch.Tensor,
+         attention_mask: torch.Tensor,
+         indices: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+         *args,
+         **kwargs,
+     ) -> BaseModelOutputWithPooling:
+         """
+         Forward pass of the model.
+
+         Args:
+             input_ids (torch.Tensor): Input token IDs.
+             attention_mask (torch.Tensor): Attention mask.
+             indices (Optional[Tuple[torch.Tensor, torch.Tensor]]): Precomputed input indices and offsets.
+
+         Returns:
+             BaseModelOutputWithPooling: Model outputs with last hidden state and pooled output.
+         """
+         # Compute input indices and offsets if not provided
+         if indices is None:
+             seq_indices, seq_offsets = self._get_input_offsets(attention_mask)
+         else:
+             seq_indices, seq_offsets = indices
+
+         with torch.inference_mode():
+             nllb_outputs = self.nllb_encoder(
+                 input_ids=input_ids,
+                 attention_mask=attention_mask,
+             )
+             nllb_last_hidden_state = nllb_outputs.last_hidden_state
+             nllb_last_hidden_state = self.up_proj(nllb_last_hidden_state)
+         if self.training:
+             # Tensors created under inference mode cannot take part in autograd,
+             # so copy them into regular tensors before training on them.
+             nllb_last_hidden_state = nllb_last_hidden_state.detach().clone()
+         outputs = self.llm2vec(
+             inputs_embeds=nllb_last_hidden_state,
+             attention_mask=attention_mask,
+         )
+         pooler_output = self._mean_embedding(
+             hidden_states=outputs.last_hidden_state,
+             input_indices=seq_indices,
+             offsets=seq_offsets,
+         )
+         return BaseModelOutputWithPooling(
+             last_hidden_state=outputs.last_hidden_state,
+             pooler_output=pooler_output,
+         )
+
+     @property
+     def tokenizer(self):
+         """
+         Get the tokenizer associated with the model.
+
+         Returns:
+             PreTrainedTokenizer: The tokenizer instance.
+         """
+         if not hasattr(self, "_tokenizer"):
+             from transformers import AutoTokenizer
+
+             self._tokenizer = AutoTokenizer.from_pretrained(
+                 "facebook/nllb-200-distilled-600M", padding_side="right"
+             )
+         return self._tokenizer
+
+     def encode(
+         self,
+         inputs: List[str],
+         src_lang: str = "eng_Latn",
+         tokenize_kwargs: Optional[Dict[str, Any]] = None,
+     ) -> torch.Tensor:
+         """
+         Encode input texts into embeddings.
+
+         Args:
+             inputs (List[str]): List of input texts.
+             src_lang (str): Source language code.
+             tokenize_kwargs (Optional[Dict[str, Any]]): Additional keyword arguments for the tokenizer.
+                 Defaults to:
+                 >>> tokenize_kwargs = {
+                 ...     "padding": True,
+                 ...     "truncation": True,
+                 ...     "max_length": 512,
+                 ...     "return_tensors": "pt",
+                 ... }
+
+         Returns:
+             torch.Tensor: Mean-pooled sequence embeddings of the inputs.
+         """
+         if tokenize_kwargs is None:
+             tokenize_kwargs = {
+                 "padding": True,
+                 "truncation": True,
+                 "max_length": 512,
+                 "return_tensors": "pt",
+             }
+
+         tokenizer = self.tokenizer
+         tokenizer.src_lang = src_lang
+         device = next(self.parameters()).device
+         batch = tokenizer(inputs, **tokenize_kwargs).to(device)
+         device_type = device.type  # e.g., 'cuda' or 'cpu'
+
+         with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
+             return self(**batch).pooler_output
+
+     @staticmethod
+     def _get_input_offsets(
+         attention_mask: torch.Tensor,
+     ) -> Tuple[torch.Tensor, torch.Tensor]:
+         """
+         Compute indices and offsets for mean pooling using EmbeddingBag.
+
+         Args:
+             attention_mask (torch.Tensor): Attention mask of shape (batch_size, seq_len).
+
+         Returns:
+             Tuple[torch.Tensor, torch.Tensor]: A tuple containing:
+                 - input_indices: Indices of non-padded tokens in the flattened input.
+                 - offsets: Offsets indicating the start index of each sequence in the flattened input.
+         """
+         # Find the indices of non-padded tokens in flattened hidden_states
+         input_indices = attention_mask.view(-1).nonzero(as_tuple=False).squeeze()
+
+         # Compute the offsets: for each sequence, where it starts in the flattened input
+         non_padded_lengths = attention_mask.sum(
+             dim=1
+         )  # Count non-padded tokens per sequence
+         offsets = torch.cat(
+             [
+                 torch.tensor([0], device=attention_mask.device),
+                 non_padded_lengths.cumsum(dim=0)[:-1],
+             ]
+         )
+         return input_indices, offsets
+
+     @staticmethod
+     def _mean_embedding(
+         hidden_states: torch.Tensor,
+         input_indices: torch.Tensor,
+         offsets: torch.Tensor,
+     ) -> torch.Tensor:
+         """
+         Compute the mean of non-padded embeddings using `embedding_bag`,
+         properly handling padding with offsets.
+
+         Args:
+             hidden_states (torch.Tensor): Hidden states of shape (batch_size, seq_len, embed_dim).
+             input_indices (torch.Tensor): Indices of non-padded tokens in flattened form.
+             offsets (torch.Tensor): Offsets specifying the start of each sequence.
+
+         Returns:
+             torch.Tensor: Pooled mean embeddings of shape (batch_size, embed_dim).
+         """
+         # Flatten hidden_states to 2D: shape (batch_size * seq_len, embedding_dim)
+         batch_size, seq_len, embed_dim = hidden_states.shape
+         token_embeds = hidden_states.view(-1, embed_dim)
+
+         # Use embedding_bag with mode 'mean' and appropriate indices
+         return F.embedding_bag(
+             input=input_indices,  # Indices of non-padded tokens in flattened form
+             weight=token_embeds,  # The flattened hidden states as embedding matrix
+             offsets=offsets,  # Offsets specifying start of each sequence
+             mode="mean",  # Aggregation mode
+         )
+
+
+ AutoModel.register(NLLBLLM2VecConfig, NLLBLLM2Vec)
+
+
+ def repl():
+     # Development helper: wrap a freshly initialized model in LoRA adapters,
+     # save it, and print the resulting safetensors index.
+     cfg = NLLBLLM2VecConfig()
+     model = NLLBLLM2Vec(cfg)
+
+     from peft.mapping import get_peft_model
+     from peft.tuners.lora.config import LoraConfig
+
+     lora_config = LoraConfig(
+         r=16,
+         lora_alpha=32,
+         lora_dropout=0.0,
+         bias="none",
+         task_type="FEATURE_EXTRACTION",
+         # Target every attention and MLP projection of the 32 llm2vec layers
+         # (the same module list as in adapter_config.json).
+         target_modules=[
+             f"llm2vec.layers.{layer}.{proj}"
+             for layer in range(32)
+             for proj in (
+                 "self_attn.q_proj",
+                 "self_attn.k_proj",
+                 "self_attn.v_proj",
+                 "self_attn.o_proj",
+                 "mlp.gate_proj",
+                 "mlp.up_proj",
+                 "mlp.down_proj",
+             )
+         ],
+     )
+     peft_model = get_peft_model(model, lora_config)
+     peft_model.save_pretrained("../nllb-llm2vec-saved")
+     import json
+
+     with open("./model.safetensors.index.json", "r") as f:
+         print(json.load(f))
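Putting the pieces together, a minimal usage sketch. The repository id below is taken from `base_model_name_or_path` in `adapter_config.json` above, and `trust_remote_code=True` is assumed to be required because the `NLLBLLM2Vec` architecture is defined by the files in this commit; treat this as illustrative rather than the canonical loading recipe:

import torch
from transformers import AutoModel

# Assumption: the hub id matches base_model_name_or_path from adapter_config.json.
model = AutoModel.from_pretrained(
    "fdschmidt93/NLLBLLM2vec",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
model.eval()

sentences = [
    "A quick brown fox jumps over the lazy dog.",
    "Sentence embeddings from a frozen NLLB encoder.",
]
with torch.no_grad():
    embeddings = model.encode(sentences, src_lang="eng_Latn")
print(embeddings.shape)  # (2, llm2vec hidden size)

`encode` tokenizes with the NLLB tokenizer (`src_lang` selects the NLLB language code), runs the frozen NLLB encoder, projects its hidden states into the Llama encoder via `up_proj`, and returns the `pooler_output`, i.e. the `embedding_bag` mean over non-padded token embeddings.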