Why is the 0.6B model not better than v2-m3 on my test data?
Language            v2-m3     0.6B      4B
hi (Hindi)          77.2566   69.2133   75.18
id (Indonesian)     77.2766   76.8766   79.8933
th (Thai)           79.2833   76.5566   80.4366
I use it like this:
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from transformers import AutoTokenizer, AutoModelForCausalLM


class RerankModel():

    def __init__(self, model_name, local_rank):
        self.name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')
        self.reranker = AutoModelForCausalLM.from_pretrained(model_name).to(local_rank).eval()  # , device_map="auto"
        self.reranker = DDP(self.reranker, device_ids=[local_rank], output_device=local_rank)
        self.local_rank = local_rank
        # Token ids whose last-position logits are read as the "no"/"yes" scores.
        self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
        self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
        self.max_length = 1024
        # Chat-template prefix/suffix wrapped around every formatted (instruction, query, doc) pair.
        self.prefix = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
        self.suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
        self.prefix_tokens = self.tokenizer.encode(self.prefix, add_special_tokens=False)
        self.suffix_tokens = self.tokenizer.encode(self.suffix, add_special_tokens=False)

    def process_inputs(self, pairs):
        # Tokenize the bare pairs first, leaving room for the prefix and suffix tokens.
        inputs = self.tokenizer(pairs,
                                padding=False,
                                truncation='longest_first',
                                return_attention_mask=False,
                                max_length=self.max_length - len(self.prefix_tokens) - len(self.suffix_tokens))
        # Wrap every example with the chat-template prefix and suffix.
        for i, ele in enumerate(inputs['input_ids']):
            inputs['input_ids'][i] = self.prefix_tokens + ele + self.suffix_tokens
        inputs = self.tokenizer.pad(inputs,
                                    padding=True,
                                    return_tensors="pt",
                                    max_length=self.max_length)
        for key in inputs:
            inputs[key] = inputs[key].to(self.local_rank)
        return inputs

    @torch.no_grad()
    def compute_logits(self, inputs):
        # Last-position logits; the relevance score is the probability of "yes" over {"yes", "no"}.
        batch_scores = self.reranker.module(**inputs).logits[:, -1, :]
        true_vector = batch_scores[:, self.token_true_id]
        false_vector = batch_scores[:, self.token_false_id]
        batch_scores = torch.stack([false_vector, true_vector], dim=1)
        batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
        scores = batch_scores[:, 1].exp()
        return scores

    def __call__(self, sentence_pairs, batch_size=64):
        # Note: batch_size is currently unused; all pairs are scored in a single forward pass.
        inputs = self.process_inputs(sentence_pairs)
        scores = self.compute_logits(inputs)
        return scores
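One quick sanity check is to decode a processed example back to text to see the exact prompt the model receives (sketch only; `model` stands for an already-constructed RerankModel instance and the pair string is made up):

# Decode one processed example to inspect the full prompt the model actually receives.
pair = "<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n<Query>: what is a panda?\n<Document>: The giant panda is a bear species endemic to China."
inputs = model.process_inputs([pair])
print(model.tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=False))
# Should print the system prompt, the formatted pair, then the assistant "<think>\n\n</think>" suffix.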
def format_instruction(instruction, query, doc):
    if instruction is None:
        instruction = 'Given a web search query, retrieve relevant passages that answer the query'
    # Angle-bracket field tags mark the instruction, query, and document in the prompt.
    output = "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}".format(
        instruction=instruction, query=query, doc=doc)
    return output
reranker_task = 'Given a web search query, retrieve relevant passages that answer the query'
rerank_pairs = [
    format_instruction(reranker_task, question, sgement_dict[doc_id])
    for doc_id in recall_api
]
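For completeness, here is roughly how the scores can be used to reorder the recalled candidates (sketch only; `reranker` stands for an already-constructed RerankModel instance, and `recall_api` and `rerank_pairs` are the same names as above):

# Score the formatted pairs and re-sort the recalled doc ids by predicted relevance.
scores = reranker(rerank_pairs)  # one P("yes") score per pair
ranked = sorted(zip(recall_api, scores.tolist()), key=lambda x: x[1], reverse=True)
reranked_doc_ids = [doc_id for doc_id, _ in ranked]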
I don't know if you noticed, but in this implementation prefix and suffix tokens are concatenated around each query-document pair. How can we judge their influence on the scores?
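For example, would a quick ablation like the one below be a fair way to measure it: scoring the same pairs once with the prefix/suffix tokens and once with them dropped, then comparing the scores and rankings? (Sketch only; `model` stands for an already-constructed RerankModel instance, and emptying the token lists is just for the experiment.)

def score_without_template(model, pairs):
    # Temporarily drop the chat-template prefix/suffix, score, then restore them.
    saved = (model.prefix_tokens, model.suffix_tokens)
    model.prefix_tokens, model.suffix_tokens = [], []
    try:
        return model(pairs)
    finally:
        model.prefix_tokens, model.suffix_tokens = saved

with_template = model(rerank_pairs)
without_template = score_without_template(model, rerank_pairs)
print(with_template.tolist())
print(without_template.tolist())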