Why is the 0.6B model not better than v2-m3 on my test data?

#11
by hookzeng - opened

lang             v2-m3     0.6B      4B
hi (Hindi)       77.2566   69.2133   75.18
id (Indonesian)  77.2766   76.8766   79.8933
th (Thai)        79.2833   76.5566   80.4366

I use it like this:

import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from transformers import AutoTokenizer, AutoModelForCausalLM


class RerankModel():

    def __init__(self, model_name, local_rank):
        self.name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                       padding_side='left')
        self.reranker = AutoModelForCausalLM.from_pretrained(model_name).to(
            local_rank).eval()  # , device_map="auto"
        self.reranker = DDP(self.reranker,
                            device_ids=[local_rank],
                            output_device=local_rank)
        self.local_rank = local_rank

        # Token ids used to read off the "no"/"yes" logits.
        self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
        self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
        self.max_length = 1024
        self.prefix = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
        self.suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
        self.prefix_tokens = self.tokenizer.encode(self.prefix,
                                                   add_special_tokens=False)
        self.suffix_tokens = self.tokenizer.encode(self.suffix,
                                                   add_special_tokens=False)

    def process_inputs(self, pairs):
        # Tokenize the query/document pairs, then wrap each sequence with the
        # pre-encoded chat prefix and suffix tokens before left-padding.
        inputs = self.tokenizer(pairs,
                                padding=False,
                                truncation='longest_first',
                                return_attention_mask=False,
                                max_length=self.max_length -
                                len(self.prefix_tokens) -
                                len(self.suffix_tokens))
        for i, ele in enumerate(inputs['input_ids']):
            inputs['input_ids'][
                i] = self.prefix_tokens + ele + self.suffix_tokens
        inputs = self.tokenizer.pad(inputs,
                                    padding=True,
                                    return_tensors="pt",
                                    max_length=self.max_length)
        for key in inputs:
            inputs[key] = inputs[key].to(self.local_rank)
        return inputs

    @torch.no_grad()
    def compute_logits(self, inputs):
        # Score = P("yes") from a softmax over the "no"/"yes" logits at the
        # last position.
        batch_scores = self.reranker.module(**inputs).logits[:, -1, :]
        true_vector = batch_scores[:, self.token_true_id]
        false_vector = batch_scores[:, self.token_false_id]
        batch_scores = torch.stack([false_vector, true_vector], dim=1)
        batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
        scores = batch_scores[:, 1].exp()
        return scores

    def __call__(self, sentence_pairs, batch_size=64):
        # Note: batch_size is currently unused; all pairs go through in one batch.
        inputs = self.process_inputs(sentence_pairs)
        scores = self.compute_logits(inputs)
        return scores

def format_instruction(instruction, query, doc):
    if instruction is None:
        instruction = 'Given a web search query, retrieve relevant passages that answer the query'
    output = "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}".format(
        instruction=instruction, query=query, doc=doc)
    return output

reranker_task = 'Given a web search query, retrieve relevant passages that answer the query'

rerank_pairs = [
    format_instruction(reranker_task, question, sgement_dict[doc_id])
    for doc_id in recall_api
]
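For reference, this is roughly how I invoke it end to end (a minimal sketch: the 0.6B checkpoint name and local_rank=0 are just for illustration, `question` / `sgement_dict` / `recall_api` come from my retrieval pipeline, and DDP assumes the process group is already initialized):

# Minimal usage sketch; assumes torch.distributed has been initialized so DDP works,
# and that question / sgement_dict / recall_api are defined as above.
model = RerankModel("Qwen/Qwen3-Reranker-0.6B", local_rank=0)
scores = model(rerank_pairs)  # probability that the model answers "yes" for each pair
ranked = sorted(zip(recall_api, scores.tolist()), key=lambda x: x[1], reverse=True)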

I don't know if you noticed that, in the implementation, there are prefix and suffix tokens concatenated around the query and document. How could we judge their influence?
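One rough way to check (an ablation sketch, not the official evaluation): score the same pairs once with the chat prefix/suffix tokens and once with them removed, then compare the scores and the resulting rankings. `model` and `rerank_pairs` below are assumed to be the RerankModel instance and pair list from the code above.

# Rough ablation sketch; `model` and `rerank_pairs` are assumed to exist as above.
with_template = model(rerank_pairs)

# Drop the pre-encoded chat template tokens and score again.
model.prefix_tokens, model.suffix_tokens = [], []
without_template = model(rerank_pairs)

print("with prefix/suffix:   ", with_template.tolist())
print("without prefix/suffix:", without_template.tolist())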
