removed prob_calibrated_output logic, modified complete_prompt, polished docs
- varco_arena/main.py +2 -2
- varco_arena/setup.py +1 -1
- varco_arena/varco_arena_core/data_utils.py +2 -2
- varco_arena/varco_arena_core/elo.py +1 -1
- varco_arena/varco_arena_core/eval_utils.py +26 -10
- varco_arena/varco_arena_core/prompts/base_prompt.py +29 -9
- varco_arena/varco_arena_core/prompts/llmbar.py +81 -81
varco_arena/main.py
CHANGED
@@ -75,9 +75,9 @@ def main(
        return

    # prompt user whether to continue
-    flag = input("[*] Run …
+    flag = input("[*] Run Arena-Lite? (y/n) : ")
    if not flag.lower() == "y" and not flag.lower() == "yes":
-        print("[-] …
+        print("[-] Arena-Lite Stopped")
        return

    manager = Manager(
varco_arena/setup.py
CHANGED
@@ -19,7 +19,7 @@ setup(
    version="1.0.0",  # package version
    author="Text AI Lab, Generation Model Team",  # author name
    author_email="ncsoft_generationmodelteamd@ncsoft.com",  # author email
-    description="…
+    description="Arena-Lite is a project that determines which model is the best, based on the outputs each model generated.",  # brief description of the package
    packages=find_packages(),  # automatically find and add the package folders
    install_requires=required_packages,  # other packages required to install this package
    dependency_links=["git+https://github.com/shobrook/openlimit.git#egg=openlimit"],
varco_arena/varco_arena_core/data_utils.py
CHANGED
@@ -108,9 +108,9 @@ def _fill_mcguffin_field(df: pd.DataFrame) -> pd.DataFrame:


def num_uniq_queries(dataset: List[Dict]) -> int:
-    """use this function to check if the input jsonl files are suitable to intended use of …
+    """use this function to check if the input jsonl files are suitable to intended use of Arena-Lite

-    … (1 removed line truncated)
+    Arena-Lite: Let {n_models} of LLMs compete each other on {len_file} number of instructions
    --> dataset.instruction + dataset.source shouldn't change by adding another model output file, if it does, it is would be a buggy input.
    """
    df = pd.DataFrame(dataset)
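The docstring above encodes an invariant: the set of (instruction, source) pairs must be identical across every per-model output file, so adding another model's jsonl must not change the unique-query count. A minimal standalone sketch of that check; the function and field names here are illustrative, not the repo's exact API:

# Hypothetical illustration of the invariant described in the docstring above.
from typing import Dict, List

import pandas as pd


def unique_query_keys(dataset: List[Dict]) -> set:
    """Return the set of (instruction, source) pairs found in one jsonl file."""
    df = pd.DataFrame(dataset)
    return set(zip(df["instruction"], df["source"]))


def check_queries_stable(per_model_datasets: List[List[Dict]]) -> bool:
    """True if every model output file covers exactly the same queries."""
    key_sets = [unique_query_keys(d) for d in per_model_datasets]
    return all(keys == key_sets[0] for keys in key_sets)


if __name__ == "__main__":
    model_a = [{"instruction": "Translate to French", "source": "hello", "generated": "bonjour"}]
    model_b = [{"instruction": "Translate to French", "source": "hello", "generated": "salut"}]
    print(check_queries_stable([model_a, model_b]))  # True: same queries, different outputs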
varco_arena/varco_arena_core/elo.py
CHANGED
@@ -26,7 +26,7 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
    Y = np.zeros(n)
    Y[df["winner"] == "A"] = 1.0

-    WARNING = "elo.py:L{L} compute_mle_elo() // Warning: Seeing this message indicates the regression result for elo is unreliable. You should be test-running the …
+    WARNING = "elo.py:L{L} compute_mle_elo() // Warning: Seeing this message indicates the regression result for elo is unreliable. You should be test-running the Arena-Lite or something odd (perfect one-sided wins) is happening\n\nto avoid logistic regressor error, manually putting other class"
    if (Y == 0).all():
        print(WARNING.format(L=32))
        Y[-1] = 1.0
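For context on the warning: compute_mle_elo fits Elo scores as a logistic regression over pairwise outcomes (the Bradley-Terry style recipe), and scikit-learn's LogisticRegression refuses to fit when Y contains only one class, which is exactly the perfectly one-sided case the message describes; flipping one label keeps the fit from crashing at the cost of a slightly distorted rating. A minimal sketch of the idea, not the repo's exact implementation:

# Minimal sketch of MLE Elo via logistic regression (Bradley-Terry style),
# similar in spirit to compute_mle_elo() above; the repo's exact code may differ.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression


def mle_elo(df: pd.DataFrame, scale: int = 400, base: int = 10, init_rating: int = 1000) -> pd.Series:
    models = pd.concat([df["model_a"], df["model_b"]]).unique()
    idx = {m: i for i, m in enumerate(models)}

    # one row per match: +log(base) for model_a, -log(base) for model_b
    X = np.zeros((len(df), len(models)))
    for row, (ma, mb) in enumerate(zip(df["model_a"], df["model_b"])):
        X[row, idx[ma]] = +np.log(base)
        X[row, idx[mb]] = -np.log(base)

    Y = np.zeros(len(df))
    Y[df["winner"] == "A"] = 1.0

    if (Y == 0).all() or (Y == 1).all():
        # LogisticRegression needs two classes in Y; a perfectly one-sided
        # result sheet is the situation the WARNING above is about.
        Y[-1] = 1.0 - Y[-1]

    lr = LogisticRegression(fit_intercept=False)
    lr.fit(X, Y)
    return pd.Series(scale * lr.coef_[0] + init_rating, index=models)


matches = pd.DataFrame(
    {"model_a": ["m1", "m1", "m2"], "model_b": ["m2", "m3", "m3"], "winner": ["A", "A", "B"]}
)
print(mle_elo(matches).sort_values(ascending=False))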
varco_arena/varco_arena_core/eval_utils.py
CHANGED
@@ -195,18 +195,34 @@ async def async_eval_w_prompt(
    # prob_calibration: bool = True, # default True.
):
    # complete the prompt
-    kwargs = …
-    … (9 more removed lines truncated)
+    kwargs = vars(position_1st).copy()
+    kwargs["out_a"] = position_1st.generated
+    kwargs["out_b"] = position_2nd.generated
+    # Remove 'generated' from kwargs if it was copied from position_1st, as it's now 'out_a'
+    if "generated" in kwargs:
+        del kwargs["generated"]
+    # Remove 'instruction', 'source', 'task' if they are not needed as separate keys
+    # and are already covered by the vars(position_1st).copy()
+    # For now, keep them as they might be explicitly used in some templates.
+    # If 'inst', 'src', 'task' are expected as specific keys in templates,
+    # ensure they are present.
+    # Assuming 'instruction' -> 'inst', 'source' -> 'src' etc. is handled by prompt templates.
+    # If not, explicit mapping might be needed.
+    # For now, let's assume the template uses the original field names from input.jsonl.
+    # If the template expects 'inst' but input.jsonl has 'instruction',
+    # then a mapping is required.
+    # Based on the original code, it seems 'inst', 'src', 'task' are the expected keys.
+    # Let's ensure these are explicitly set if they exist in position_1st.
+    if hasattr(position_1st, "instruction"):
+        kwargs["inst"] = position_1st.instruction
+    if hasattr(position_1st, "source"):
+        kwargs["src"] = position_1st.source
+    if hasattr(position_1st, "source_lang"):
        kwargs["source_lang"] = position_1st.source_lang
+    if hasattr(position_1st, "target_lang"):
        kwargs["target_lang"] = position_1st.target_lang
+    if hasattr(position_1st, "task"):
+        kwargs["task"] = position_1st.task

    completed_prompt = prompt_obj.complete_prompt(**kwargs)
    # print(completed_prompt[-1]["content"])
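The block above builds the keyword arguments that complete_prompt substitutes into the judge prompt: every field of the first match position is copied, the two candidate outputs are renamed to out_a / out_b, and template-facing aliases such as inst / src are added when present. A minimal sketch of that flow with string.Template; the position object and field names below are illustrative, not the repo's exact schema:

# Minimal sketch of the kwargs -> template substitution flow, assuming a simple
# namespace-like "position" object.
from string import Template
from types import SimpleNamespace

position_1st = SimpleNamespace(instruction="Summarize the text.", source="...", generated="Summary A")
position_2nd = SimpleNamespace(instruction="Summarize the text.", source="...", generated="Summary B")

kwargs = vars(position_1st).copy()        # start from every field of the first position
kwargs["out_a"] = position_1st.generated  # rename the two outputs for the judge prompt
kwargs["out_b"] = position_2nd.generated
kwargs.pop("generated", None)             # 'generated' is now carried by out_a / out_b
kwargs["inst"] = position_1st.instruction # a key the prompt template expects

user_turn = Template("Instruction:\n${inst}\n\nOutput (a):\n${out_a}\n\nOutput (b):\n${out_b}")
print(user_turn.substitute(**kwargs))     # unused extra keys are simply ignored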
varco_arena/varco_arena_core/prompts/base_prompt.py
CHANGED
@@ -45,15 +45,35 @@ class ComparisonPromptBase:
        sampling_parameters = raw_d["sampling_parameters"]

        decision_tokens = raw_d["decision_tokens"]
-        prompt_template = [
-        … (8 more removed lines truncated)
+        prompt_template = []
+        for message in raw_d["prompt_template"]:
+            content = message["content"]
+            if "${task}" in content:
+                raise ValueError(
+                    f"The prompt template in {yamlf} contains '${{task}}'. "
+                    "The 'task' field is reserved for prompt branching logic "
+                    "and cannot be used as a template variable."
+                )
+            if "${model_id}" in content:
+                raise ValueError(
+                    f"The prompt template in {yamlf} contains '${{model_id}}'. "
+                    "The 'model_id' field is reserved for match logic "
+                    "and cannot be used as a template variable."
+                )
+            if "${generated}" in content:
+                raise ValueError(
+                    f"The prompt template in {yamlf} contains '${{generated}}'. "
+                    "The 'generated' field is reserved for match logic "
+                    "and cannot be used as a template variable."
+                )
+            prompt_template.append(
+                {
+                    "role": message["role"],
+                    "content": Template(content)
+                    if get_fields(content)
+                    else content,
+                }
+            )

        expected_generation_str = raw_d["expected_generation_str"]

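The new loop rejects reserved placeholders (task, model_id, generated) and wraps a message's content in string.Template only when it actually contains ${...} fields. A sketch of how such a loop and a get_fields helper might look; get_fields here is a stand-in for the repo's own helper and may not match its implementation:

# Sketch of the template handling above; helper names are stand-ins.
import re
from string import Template

RESERVED = ("task", "model_id", "generated")


def get_fields(content: str) -> list:
    """Return ${...} placeholder names appearing in a template string."""
    return re.findall(r"\$\{([_a-zA-Z][_a-zA-Z0-9]*)\}", content)


def build_prompt_template(messages: list, yamlf: str = "<prompt.yaml>") -> list:
    prompt_template = []
    for message in messages:
        content = message["content"]
        for field in RESERVED:
            if f"${{{field}}}" in content:
                raise ValueError(
                    f"The prompt template in {yamlf} contains '${{{field}}}', "
                    "which is reserved and cannot be used as a template variable."
                )
        prompt_template.append(
            {
                "role": message["role"],
                # only wrap in Template when there is something to substitute
                "content": Template(content) if get_fields(content) else content,
            }
        )
    return prompt_template


msgs = [
    {"role": "system", "content": "You are a strict judge."},
    {"role": "user", "content": "Instruction: ${inst}\n(a): ${out_a}\n(b): ${out_b}"},
]
tmpl = build_prompt_template(msgs)
print(type(tmpl[0]["content"]), type(tmpl[1]["content"]))  # plain str vs Template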
varco_arena/varco_arena_core/prompts/llmbar.py
CHANGED
@@ -64,88 +64,88 @@ class LLMBarPrompt(ComparisonPromptBase):

        return res_tok

-    def prob_calibrated_output(self, response) -> Dict[str, float]:
-    … (40 more removed lines truncated)
+    # def prob_calibrated_output(self, response) -> Dict[str, float]:
+    #     """
+    #     ChatCompletionTokenLogProb:
+    #         token: str
+    #         logprob: float
+    #         top_logprobs: List[
+    #             TopLogprob:
+    #                 token: str
+    #                 logprob: float
+    #         ]
+    #     or
+    #     vllm response object (logprob struct differs)
+
+    #     """
+
+    #     # focus to the token of interest
+    #     # NOTE: res_tok is not guaranteed to follow the tokenization of the model, it just checks whether our output follows the expected format
+    #     res_tok = self.parsed_output(response)
+    #     if res_tok == None:  # if not found, return None
+    #         return None
+
+    #     isopenai: bool = is_model_from_openai(response=response)
+    #     found_tokens: list = []
+    #     if isopenai:
+    #         try:
+    #             logprobs = response.choices[0].logprobs.content
+    #             top_logprob_list = find_logprob_of_a_token_openai(
+    #                 logprobs=logprobs, token=res_tok
+    #             ).top_logprobs
+    #         except Exception as e:
+    #             print(f"(Warning) logprob method not implemented for this prompt")
+    #             print(f"use token result from parsed_output: {res_tok}")
+    #             probs_normalized = {
+    #                 "prefer_1st": 1.0 if "a" in res_tok.lower() else 0.0,
+    #                 "prefer_2nd": 1.0 if "b" in res_tok.lower() else 0.0,
+    #                 "target_tokens_found": [res_tok],
+    #                 "no_target_tokens_found": False,
+    #                 "model": response.model,
+    #                 "decision_by": "parsed_output",
+    #             }
+    #             return probs_normalized

-    … (40 removed lines truncated)
+    #         # explore the `top_logprobs_list` list and then gather
+    #         probs_of_interest_pos = dict()
+    #         for k, tok in self.decision_tokens.items():
+    #             lp_obj = find_logprob_of_a_token_openai(
+    #                 logprobs=top_logprob_list, token=tok
+    #             )
+
+    #             if lp_obj is not None:
+    #                 logp = lp_obj.logprob
+    #                 found_tokens.append(tok)
+
+    #                 p = np.exp(logp)
+    #                 probs_of_interest_pos[k] = p  # prefer_1st = a_prob
+    #     else:
+    #         # vllm/openai=0.4.2, 0.5.5
+    #         probs_of_interest_pos = dict()
+    #         top_logprobs_d = top_logprob_of_a_token_vllm(
+    #             response=response, token=res_tok
+    #         )
+    #         for k, tok in self.decision_tokens.items():
+    #             if tok in top_logprobs_d.keys():
+    #                 probs_of_interest_pos[k] = np.exp(top_logprobs_d[tok])
+    #                 found_tokens.append(tok)
+
+    #     # normalize
+    #     norm = sum(
+    #         probs_of_interest_pos.values()
+    #     )  # even for empty dict, this wouldn't cause divbyzero error
+    #     probs_normalized = {k: v / norm for k, v in probs_of_interest_pos.items()}
+    #     probs_normalized["target_tokens_found"] = found_tokens
+    #     probs_normalized["no_target_tokens_found"] = not bool(found_tokens)
+    #     probs_normalized["model"] = response.model
+
+    #     if not found_tokens:  # empty dict, no toks in self.decision_tokens.values()
+    #         # for the ease of downstream processings: set values as exact .5 (tie case)
+    #         probs_normalized["prefer_1st"] = 0.5
+    #         probs_normalized["prefer_2nd"] = 0.5
+    #     probs_normalized["decision_by"] = "normalized likelihood"
+
+    #     return probs_normalized

    def complete_prompt(
        self,
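The method commented out above implemented a "normalized likelihood" decision: look up the logprobs of the decision tokens (e.g. "a" / "b") at the answer position, exponentiate, and normalize them into preference probabilities, falling back to a 0.5 / 0.5 tie when neither token appears. A small standalone sketch of that idea, assuming top logprobs were requested from an OpenAI-style chat completion; this is not the repo's exact code:

# Standalone sketch of the "normalized likelihood" decision removed above.
import numpy as np


def normalized_preference(top_logprobs: dict, decision_tokens: dict) -> dict:
    """top_logprobs maps token -> logprob at the decision position;
    decision_tokens maps e.g. {"prefer_1st": "a", "prefer_2nd": "b"}."""
    probs, found = {}, []
    for key, tok in decision_tokens.items():
        if tok in top_logprobs:
            probs[key] = float(np.exp(top_logprobs[tok]))
            found.append(tok)

    if not found:
        # no decision token among the top logprobs: fall back to a tie
        return {"prefer_1st": 0.5, "prefer_2nd": 0.5, "no_target_tokens_found": True}

    norm = sum(probs.values())
    out = {k: v / norm for k, v in probs.items()}
    out.setdefault("prefer_1st", 0.0)
    out.setdefault("prefer_2nd", 0.0)
    out["no_target_tokens_found"] = False
    return out


# e.g. logprobs gathered from response.choices[0].logprobs.content[i].top_logprobs
print(normalized_preference({"a": -0.22, "b": -1.61}, {"prefer_1st": "a", "prefer_2nd": "b"}))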