sonsus committed
Commit 1eadcf1 · Parent: 5bdfe24

removed prob_calibrated_output logic, modified complete_prompt, polished docs

varco_arena/main.py CHANGED
@@ -75,9 +75,9 @@ def main(
         return
 
     # prompt user whether to continue
-    flag = input("[*] Run Varco Arena? (y/n) : ")
+    flag = input("[*] Run Arena-Lite? (y/n) : ")
     if not flag.lower() == "y" and not flag.lower() == "yes":
-        print("[-] Varco Arena Stopped")
+        print("[-] Arena-Lite Stopped")
         return
 
     manager = Manager(

varco_arena/setup.py CHANGED
@@ -19,7 +19,7 @@ setup(
     version="1.0.0",  # 패키지의 버전
     author="Text AI Lab, Generation Model Team",  # 작성자 이름
     author_email="ncsoft_generationmodelteamd@ncsoft.com",  # 작성자 이메일
-    description="VARCO Arena는 각 모델 별로 생성된 결과를 기반으로 어떤 모델이 가장 좋은 모델인지 판별해주는 프로젝트입니다.",  # 패키지에 대한 간단한 설명
+    description="Arena-Lite는 각 모델 별로 생성된 결과를 기반으로 어떤 모델이 가장 좋은 모델인지 판별해주는 프로젝트입니다.",  # 패키지에 대한 간단한 설명
     packages=find_packages(),  # 패키지 폴더를 자동으로 찾아 추가
     install_requires=required_packages,  # 패키지 설치에 필요한 다른 패키지들
     dependency_links=["git+https://github.com/shobrook/openlimit.git#egg=openlimit"],

varco_arena/varco_arena_core/data_utils.py CHANGED
@@ -108,9 +108,9 @@ def _fill_mcguffin_field(df: pd.DataFrame) -> pd.DataFrame:
 
 
 def num_uniq_queries(dataset: List[Dict]) -> int:
-    """use this function to check if the input jsonl files are suitable to intended use of Varco Arena
+    """use this function to check if the input jsonl files are suitable to intended use of Arena-Lite
 
-    Varco Arena: Let {n_models} of LLMs compete each other on {len_file} number of instructions
+    Arena-Lite: Let {n_models} of LLMs compete each other on {len_file} number of instructions
     --> dataset.instruction + dataset.source shouldn't change by adding another model output file, if it does, it is would be a buggy input.
     """
     df = pd.DataFrame(dataset)
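
For context, the check described in this docstring boils down to counting unique (instruction, source) pairs across all model output files. A minimal illustrative sketch, not the repository's implementation (the function body beyond the DataFrame construction is not shown in this hunk):

from typing import Dict, List

import pandas as pd


def count_unique_queries(dataset: List[Dict]) -> int:
    # a "query" here is the (instruction, source) pair shared across model output files
    df = pd.DataFrame(dataset)
    return len(df[["instruction", "source"]].drop_duplicates())


rows_model_a = [{"instruction": "Summarize:", "source": "doc 1", "generated": "A1"}]
rows_model_b = [{"instruction": "Summarize:", "source": "doc 1", "generated": "B1"}]
print(count_unique_queries(rows_model_a + rows_model_b))  # 1 -> the inputs are aligned
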
varco_arena/varco_arena_core/elo.py CHANGED
@@ -26,7 +26,7 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
     Y = np.zeros(n)
     Y[df["winner"] == "A"] = 1.0
 
-    WARNING = "elo.py:L{L} compute_mle_elo() // Warning: Seeing this message indicates the regression result for elo is unreliable. You should be test-running the Varco Arena or something odd (perfect one-sided wins) is happening\n\nto avoid logistic regressor error, manually putting other class"
+    WARNING = "elo.py:L{L} compute_mle_elo() // Warning: Seeing this message indicates the regression result for elo is unreliable. You should be test-running the Arena-Lite or something odd (perfect one-sided wins) is happening\n\nto avoid logistic regressor error, manually putting other class"
     if (Y == 0).all():
         print(WARNING.format(L=32))
         Y[-1] = 1.0
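
The guard this warning accompanies exists because a single-class outcome vector breaks a logistic-regression fit. A minimal sketch of the failure mode, assuming (not shown in this hunk) that compute_mle_elo fits a scikit-learn LogisticRegression:

import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.random.rand(10, 3)
Y = np.zeros(10)  # perfectly one-sided wins -> only one class present

try:
    LogisticRegression().fit(X, Y)
except ValueError as e:
    print(f"degenerate fit: {e}")  # solver needs samples of at least 2 classes

Y[-1] = 1.0                      # the guard above flips one label so the fit can proceed,
LogisticRegression().fit(X, Y)   # at the cost of the (warned-about) unreliable Elo estimate
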
varco_arena/varco_arena_core/eval_utils.py CHANGED
@@ -195,18 +195,34 @@ async def async_eval_w_prompt(
     # prob_calibration: bool = True, # default True.
 ):
     # complete the prompt
-    kwargs = dict(
-        inst=position_1st.instruction,
-        src=position_1st.source,
-        out_a=position_1st.generated,
-        out_b=position_2nd.generated,
-        task=position_1st.task,
-    )
-
-    if isinstance(prompt_obj, TranslationPairPrompt) or \
-        isinstance(prompt_obj, TranslationNewPrompt):
+    kwargs = vars(position_1st).copy()
+    kwargs["out_a"] = position_1st.generated
+    kwargs["out_b"] = position_2nd.generated
+    # Remove 'generated' from kwargs if it was copied from position_1st, as it's now 'out_a'
+    if "generated" in kwargs:
+        del kwargs["generated"]
+    # Remove 'instruction', 'source', 'task' if they are not needed as separate keys
+    # and are already covered by the vars(position_1st).copy()
+    # For now, keep them as they might be explicitly used in some templates.
+    # If 'inst', 'src', 'task' are expected as specific keys in templates,
+    # ensure they are present.
+    # Assuming 'instruction' -> 'inst', 'source' -> 'src' etc. is handled by prompt templates.
+    # If not, explicit mapping might be needed.
+    # For now, let's assume the template uses the original field names from input.jsonl.
+    # If the template expects 'inst' but input.jsonl has 'instruction',
+    # then a mapping is required.
+    # Based on the original code, it seems 'inst', 'src', 'task' are the expected keys.
+    # Let's ensure these are explicitly set if they exist in position_1st.
+    if hasattr(position_1st, "instruction"):
+        kwargs["inst"] = position_1st.instruction
+    if hasattr(position_1st, "source"):
+        kwargs["src"] = position_1st.source
+    if hasattr(position_1st, "source_lang"):
         kwargs["source_lang"] = position_1st.source_lang
+    if hasattr(position_1st, "target_lang"):
         kwargs["target_lang"] = position_1st.target_lang
+    if hasattr(position_1st, "task"):
+        kwargs["task"] = position_1st.task
 
     completed_prompt = prompt_obj.complete_prompt(**kwargs)
     # print(completed_prompt[-1]["content"])
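
The net effect of the new kwargs construction is easiest to see on a toy object; the Match dataclass below is a hypothetical stand-in that only mirrors the fields referenced in this hunk, not the repository's actual type:

from dataclasses import dataclass


@dataclass
class Match:
    instruction: str
    source: str
    generated: str
    task: str


position_1st = Match("Summarize:", "some text", "summary A", "summarization")
position_2nd = Match("Summarize:", "some text", "summary B", "summarization")

kwargs = vars(position_1st).copy()          # instruction, source, generated, task
kwargs["out_a"] = position_1st.generated
kwargs["out_b"] = position_2nd.generated
del kwargs["generated"]                     # carried as out_a instead
kwargs["inst"] = position_1st.instruction   # aliases the templates expect
kwargs["src"] = position_1st.source

print(sorted(kwargs))
# ['inst', 'instruction', 'out_a', 'out_b', 'source', 'src', 'task']
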
varco_arena/varco_arena_core/prompts/base_prompt.py CHANGED
@@ -45,15 +45,35 @@ class ComparisonPromptBase:
         sampling_parameters = raw_d["sampling_parameters"]
 
         decision_tokens = raw_d["decision_tokens"]
-        prompt_template = [
-            {
-                "role": message["role"],
-                "content": Template(message["content"])
-                if get_fields(message["content"])
-                else message["content"],
-            }
-            for message in raw_d["prompt_template"]
-        ]
+        prompt_template = []
+        for message in raw_d["prompt_template"]:
+            content = message["content"]
+            if "${task}" in content:
+                raise ValueError(
+                    f"The prompt template in {yamlf} contains '${{task}}'. "
+                    "The 'task' field is reserved for prompt branching logic "
+                    "and cannot be used as a template variable."
+                )
+            if "${model_id}" in content:
+                raise ValueError(
+                    f"The prompt template in {yamlf} contains '${{model_id}}'. "
+                    "The 'model_id' field is reserved for match logic "
+                    "and cannot be used as a template variable."
+                )
+            if "${generated}" in content:
+                raise ValueError(
+                    f"The prompt template in {yamlf} contains '${{generated}}'. "
+                    "The 'generated' field is reserved for match logic "
+                    "and cannot be used as a template variable."
+                )
+            prompt_template.append(
+                {
+                    "role": message["role"],
+                    "content": Template(content)
+                    if get_fields(content)
+                    else content,
+                }
+            )
 
         expected_generation_str = raw_d["expected_generation_str"]
 
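The added checks reject templates that reference reserved fields before wrapping the content in a Template. A rough sketch of the same idea, assuming Template is string.Template and approximating get_fields as a $-placeholder scanner (the repository's helper is not shown in this hunk):

from string import Template

RESERVED = {"task", "model_id", "generated"}


def get_fields(text):
    # hypothetical stand-in for the repo's get_fields(): list $name / ${name} placeholders
    fields = []
    for m in Template.pattern.finditer(text):
        name = m.group("named") or m.group("braced")
        if name:
            fields.append(name)
    return fields


content = "Compare the two answers.\nA: ${out_a}\nB: ${out_b}"
reserved_used = set(get_fields(content)) & RESERVED
if reserved_used:
    raise ValueError(f"reserved template variable(s) in prompt: {reserved_used}")

template = Template(content) if get_fields(content) else content
print(template.substitute(out_a="first answer", out_b="second answer"))
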
varco_arena/varco_arena_core/prompts/llmbar.py CHANGED
@@ -64,88 +64,88 @@ class LLMBarPrompt(ComparisonPromptBase):
 
         return res_tok
 
-    def prob_calibrated_output(self, response) -> Dict[str, float]:
-        """
-        ChatCompletionTokenLogProb:
-            token: str
-            logprob: float
-            top_logprobs: List[
-                TopLogprob:
-                token: str
-                logprob: float
-            ]
-        or
-        vllm response object (logprob struct differs)
-
-        """
-
-        # focus to the token of interest
-        # NOTE: res_tok is not guaranteed to follow the tokenization of the model, it just checks whether our output follows the expected format
-        res_tok = self.parsed_output(response)
-        if res_tok == None:  # if not found, return None
-            return None
-
-        isopenai: bool = is_model_from_openai(response=response)
-        found_tokens: list = []
-        if isopenai:
-            try:
-                logprobs = response.choices[0].logprobs.content
-                top_logprob_list = find_logprob_of_a_token_openai(
-                    logprobs=logprobs, token=res_tok
-                ).top_logprobs
-            except Exception as e:
-                print(f"(Warning) logprob method not implemented for this prompt")
-                print(f"use token result from parsed_output: {res_tok}")
-                probs_normalized = {
-                    "prefer_1st": 1.0 if "a" in res_tok.lower() else 0.0,
-                    "prefer_2nd": 1.0 if "b" in res_tok.lower() else 0.0,
-                    "target_tokens_found": [res_tok],
-                    "no_target_tokens_found": False,
-                    "model": response.model,
-                    "decision_by": "parsed_output",
-                }
-                return probs_normalized
+    # def prob_calibrated_output(self, response) -> Dict[str, float]:
+    #     """
+    #     ChatCompletionTokenLogProb:
+    #         token: str
+    #         logprob: float
+    #         top_logprobs: List[
+    #             TopLogprob:
+    #             token: str
+    #             logprob: float
+    #         ]
+    #     or
+    #     vllm response object (logprob struct differs)
+
+    #     """
+
+    #     # focus to the token of interest
+    #     # NOTE: res_tok is not guaranteed to follow the tokenization of the model, it just checks whether our output follows the expected format
+    #     res_tok = self.parsed_output(response)
+    #     if res_tok == None:  # if not found, return None
+    #         return None
+
+    #     isopenai: bool = is_model_from_openai(response=response)
+    #     found_tokens: list = []
+    #     if isopenai:
+    #         try:
+    #             logprobs = response.choices[0].logprobs.content
+    #             top_logprob_list = find_logprob_of_a_token_openai(
+    #                 logprobs=logprobs, token=res_tok
+    #             ).top_logprobs
+    #         except Exception as e:
+    #             print(f"(Warning) logprob method not implemented for this prompt")
+    #             print(f"use token result from parsed_output: {res_tok}")
+    #             probs_normalized = {
+    #                 "prefer_1st": 1.0 if "a" in res_tok.lower() else 0.0,
+    #                 "prefer_2nd": 1.0 if "b" in res_tok.lower() else 0.0,
+    #                 "target_tokens_found": [res_tok],
+    #                 "no_target_tokens_found": False,
+    #                 "model": response.model,
+    #                 "decision_by": "parsed_output",
+    #             }
+    #             return probs_normalized
 
-            # explore the `top_logprobs_list` list and then gather
-            probs_of_interest_pos = dict()
-            for k, tok in self.decision_tokens.items():
-                lp_obj = find_logprob_of_a_token_openai(
-                    logprobs=top_logprob_list, token=tok
-                )
-
-                if lp_obj is not None:
-                    logp = lp_obj.logprob
-                    found_tokens.append(tok)
-
-                    p = np.exp(logp)
-                    probs_of_interest_pos[k] = p  # prefer_1st = a_prob
-        else:
-            # vllm/openai=0.4.2, 0.5.5
-            probs_of_interest_pos = dict()
-            top_logprobs_d = top_logprob_of_a_token_vllm(
-                response=response, token=res_tok
-            )
-            for k, tok in self.decision_tokens.items():
-                if tok in top_logprobs_d.keys():
-                    probs_of_interest_pos[k] = np.exp(top_logprobs_d[tok])
-                    found_tokens.append(tok)
-
-        # normalize
-        norm = sum(
-            probs_of_interest_pos.values()
-        )  # even for empty dict, this wouldn't cause divbyzero error
-        probs_normalized = {k: v / norm for k, v in probs_of_interest_pos.items()}
-        probs_normalized["target_tokens_found"] = found_tokens
-        probs_normalized["no_target_tokens_found"] = not bool(found_tokens)
-        probs_normalized["model"] = response.model
-
-        if not found_tokens:  # empty dict, no toks in self.decision_tokens.values()
-            # for the ease of downstream processings: set values as exact .5 (tie case)
-            probs_normalized["prefer_1st"] = 0.5
-            probs_normalized["prefer_2nd"] = 0.5
-        probs_normalized["decision_by"] = "normalized likelihood"
-
-        return probs_normalized
+    #         # explore the `top_logprobs_list` list and then gather
+    #         probs_of_interest_pos = dict()
+    #         for k, tok in self.decision_tokens.items():
+    #             lp_obj = find_logprob_of_a_token_openai(
+    #                 logprobs=top_logprob_list, token=tok
+    #             )
+
+    #             if lp_obj is not None:
+    #                 logp = lp_obj.logprob
+    #                 found_tokens.append(tok)
+
+    #                 p = np.exp(logp)
+    #                 probs_of_interest_pos[k] = p  # prefer_1st = a_prob
+    #     else:
+    #         # vllm/openai=0.4.2, 0.5.5
+    #         probs_of_interest_pos = dict()
+    #         top_logprobs_d = top_logprob_of_a_token_vllm(
+    #             response=response, token=res_tok
+    #         )
+    #         for k, tok in self.decision_tokens.items():
+    #             if tok in top_logprobs_d.keys():
+    #                 probs_of_interest_pos[k] = np.exp(top_logprobs_d[tok])
+    #                 found_tokens.append(tok)
+
+    #     # normalize
+    #     norm = sum(
+    #         probs_of_interest_pos.values()
+    #     )  # even for empty dict, this wouldn't cause divbyzero error
+    #     probs_normalized = {k: v / norm for k, v in probs_of_interest_pos.items()}
+    #     probs_normalized["target_tokens_found"] = found_tokens
+    #     probs_normalized["no_target_tokens_found"] = not bool(found_tokens)
+    #     probs_normalized["model"] = response.model
+
+    #     if not found_tokens:  # empty dict, no toks in self.decision_tokens.values()
+    #         # for the ease of downstream processings: set values as exact .5 (tie case)
+    #         probs_normalized["prefer_1st"] = 0.5
+    #         probs_normalized["prefer_2nd"] = 0.5
+    #     probs_normalized["decision_by"] = "normalized likelihood"
+
+    #     return probs_normalized
 
     def complete_prompt(
         self,
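
With prob_calibrated_output disabled, the judgment presumably relies on parsed_output alone. For reference, a condensed sketch of the normalization the commented-out method performed over the decision-token probabilities, with made-up decision tokens and logprob values (not the repository's helpers):

import numpy as np

decision_tokens = {"prefer_1st": "a", "prefer_2nd": "b"}
top_logprobs = {"a": -0.3, "b": -1.5}  # hypothetical logprobs observed at the decision token

probs = {k: np.exp(top_logprobs[tok])
         for k, tok in decision_tokens.items()
         if tok in top_logprobs}
norm = sum(probs.values())
probs_normalized = {k: v / norm for k, v in probs.items()}
print(probs_normalized)  # roughly {'prefer_1st': 0.77, 'prefer_2nd': 0.23}
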