Commit a88ccc4
Author: freeman.genie777@gmail.com
Parent(s): ba378b3

95% CI added (rebuttal)

Files changed:
- app.py +2 -0
- streamlit_app_local/app.py +2 -0
- streamlit_app_local/view_utils.py +42 -2
- varco_arena/varco_arena_core/elo.py +41 -1
- varco_arena/varco_arena_core/prompts/llmbar.py +1 -0
- varco_arena/varco_arena_core/visualization.py +5 -4
- view_utils.py +42 -2
app.py
CHANGED
@@ -205,6 +205,7 @@ def run_varco_arena(
     last_update_time = time.time()
     terminal_output = st.empty()
     full_output = f"{command}\n"
+    to_show = full_output
     while True:
         # Check if we have output to read
         if select.select([process.stdout], [], [], 0)[0]:
@@ -229,6 +230,7 @@ def run_varco_arena(
         # Check if the process has finished
         if process.poll() is not None:
             # Read any remaining output
+            os.set_blocking(process.stdout.fileno(), True)
             remaining_output = process.stdout.read()
             if remaining_output:
                 lines = remaining_output.split("\n")
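The two added lines here (mirrored in streamlit_app_local/app.py below) address a streaming pitfall: the `os.set_blocking(..., True)` call suggests the stdout pipe is polled in non-blocking mode during the UI loop, so once the child exits it must be switched back to blocking before the final `read()`, or trailing output can be dropped. A minimal, self-contained sketch of the pattern, with a made-up child command:

import os
import select
import subprocess

# Hypothetical child process; the real app launches its arena command here.
process = subprocess.Popen(
    ["bash", "-c", "echo one; sleep 0.2; echo two"],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
)
os.set_blocking(process.stdout.fileno(), False)  # poll without stalling the UI

full_output = ""
while True:
    # Drain only what is currently readable (timeout=0 -> pure poll).
    if select.select([process.stdout], [], [], 0)[0]:
        line = process.stdout.readline()
        if line:
            full_output += line
    if process.poll() is not None:
        # Child is done: restore blocking mode so read() reliably
        # returns everything still buffered in the pipe.
        os.set_blocking(process.stdout.fileno(), True)
        full_output += process.stdout.read()
        break
print(full_output, end="")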
streamlit_app_local/app.py
CHANGED
@@ -106,6 +106,7 @@ def run_varco_arena(
     last_update_time = time.time()
     terminal_output = st.empty()
     full_output = f"{command}\n"
+    to_show = full_output
     while True:
         # Check if we have output to read
        if select.select([process.stdout], [], [], 0)[0]:
@@ -130,6 +131,7 @@ def run_varco_arena(
         # Check if the process has finished
         if process.poll() is not None:
             # Read any remaining output
+            os.set_blocking(process.stdout.fileno(), True)
             remaining_output = process.stdout.read()
             if remaining_output:
                 lines = remaining_output.split("\n")
streamlit_app_local/view_utils.py
CHANGED
@@ -154,6 +154,44 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
     return df
 
 
+def compute_elo_with_ci(df, n_bootstrap=1000):
+    """
+    Compute ELO ratings with 95% confidence intervals using bootstrapping.
+    """
+    if isinstance(df, list):
+        df = pd.DataFrame(df)
+
+    bootstrap_elo_scores = []
+    for i in range(n_bootstrap):
+        # Sample with replacement
+        sample_df = df.sample(n=len(df), replace=True)
+        elo_scores = compute_mle_elo(sample_df)
+        elo_scores = elo_scores.set_index("Model")["Elo rating"]
+        bootstrap_elo_scores.append(elo_scores)
+
+    bootstrap_df = pd.DataFrame(bootstrap_elo_scores)
+
+    # Compute the confidence interval
+    ci_lower = bootstrap_df.quantile(0.025)
+    ci_upper = bootstrap_df.quantile(0.975)
+
+    # Compute Elo scores on the original data
+    main_elo_df = compute_mle_elo(df)
+    main_elo_df = main_elo_df.set_index("Model")
+
+    # Merge the results
+    result_df = main_elo_df.copy()
+    result_df["95% CI_lower"] = ci_lower
+    result_df["95% CI_upper"] = ci_upper
+
+    result_df = result_df.sort_values("Elo rating", ascending=False)
+    result_df = result_df.reset_index()
+    result_df.index = result_df.index + 1
+
+    return result_df
+
+
+
 def fill_missing_values(df, default_value=0):
     """
     This is used for completing pivot table
@@ -378,14 +416,16 @@ def visualization(results, is_overall=False):
     figure_dict["fraction_of_model_a_wins_for_all_a_vs_b_matches"] = fig
 
     # Elo Rating
-    elo = compute_mle_elo(results)
+    elo = compute_elo_with_ci(results)
     elo_wr = compute_relative_winrate_to_1st(elo)
     # beautify
     elo_wr["Elo rating"] = elo_wr["Elo rating"].astype(int)
     elo_wr["winrate_vs_1st"] = elo_wr["winrate_vs_1st"].round(3)
     elo_wr.index.name = "Rank"
+    elo_wr["95% CI"] = elo_wr.apply(lambda row: f"({row['95% CI_upper']:.1f}, {row['95% CI_lower']:.1f})", axis=1)
+    elo_wr = elo_wr.rename(columns={"95% CI": "95% CI (UB, LB)"})
 
-    figure_dict["elo_rating"] = elo_wr
+    figure_dict["elo_rating"] = elo_wr[["Model", "Elo rating", "95% CI (UB, LB)", "winrate_vs_1st"]]
 
     # Elo Rating by Task: Radar chart
     if is_overall:
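For orientation, a hedged usage sketch of the new helper. The battle records are invented, and the column schema (model_a / model_b / winner) is an assumption about what compute_mle_elo expects:

import pandas as pd

# Invented battle log; the real schema is whatever compute_mle_elo expects
# (assumed here: model_a / model_b / winner, as in common MLE-Elo code).
battles = pd.DataFrame(
    {
        "model_a": ["m1", "m1", "m2", "m3"],
        "model_b": ["m2", "m3", "m3", "m1"],
        "winner": ["model_a", "model_a", "model_b", "model_b"],
    }
)

# Each of the n_bootstrap iterations resamples the battles with replacement
# and refits Elo; the 2.5%/97.5% quantiles of those refits form the 95% CI.
leaderboard = compute_elo_with_ci(battles, n_bootstrap=200)
print(leaderboard[["Model", "Elo rating", "95% CI_lower", "95% CI_upper"]])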
varco_arena/varco_arena_core/elo.py
CHANGED
@@ -50,6 +50,7 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
         .reset_index(drop=True)
     )
     df.index = df.index + 1
+    df["Elo rating"] = df["Elo rating"]
 
     return df
 
@@ -69,7 +70,7 @@ def compute_relative_winrate_to_1st(elo_df):
 
     rating1st = elo_df["Elo rating"].max()
     win_rate_to_1st = partial(elo_to_winrate, rating_b=rating1st)
-    elo_df["winrate_vs_1st"] = elo_df["Elo rating"].apply(win_rate_to_1st)
+    elo_df["winrate_vs_1st"] = elo_df["Elo rating"].apply(win_rate_to_1st).round(3)
     print(elo_df)
 
     return elo_df
@@ -80,3 +81,42 @@ def elo_to_winrate(rating_a: float = None, rating_b: float = None) -> float:
     rate_diff = rating_a - rating_b
     win_rate = 1 / (1 + 10 ** (-rate_diff / 400))
     return win_rate
+
+
+def compute_elo_with_ci(df, n_bootstrap=1000):
+    """
+    Compute ELO ratings with 95% confidence intervals using bootstrapping.
+    """
+    if isinstance(df, list):
+        df = pd.DataFrame(df)
+
+    bootstrap_elo_scores = []
+    for i in range(n_bootstrap):
+        # Sample with replacement
+        sample_df = df.sample(n=len(df), replace=True)
+        elo_scores = compute_mle_elo(sample_df)
+        elo_scores = elo_scores.set_index("Model")["Elo rating"]
+        bootstrap_elo_scores.append(elo_scores)
+
+    bootstrap_df = pd.DataFrame(bootstrap_elo_scores)
+
+    # Compute the confidence interval
+    ci_lower = bootstrap_df.quantile(0.025).round(1)
+    ci_upper = bootstrap_df.quantile(0.975).round(1)
+
+    # Compute Elo scores on the original data
+    main_elo_df = compute_mle_elo(df)
+    main_elo_df = main_elo_df.set_index("Model")
+
+    # Merge the results
+    result_df = main_elo_df.copy()
+
+    result_df["95% CI_lower"] = ci_lower
+    result_df["95% CI_upper"] = ci_upper
+
+    result_df = result_df.sort_values("Elo rating", ascending=False)
+    result_df["Elo rating"] = result_df["Elo rating"].round(1)
+    result_df = result_df.reset_index()
+    result_df.index = result_df.index + 1
+
+    return result_df
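As a quick sanity check on elo_to_winrate, the formula win_rate = 1 / (1 + 10 ** (-(rating_a - rating_b) / 400)) gives:

print(elo_to_winrate(1000, 1000))  # equal ratings      -> 0.5
print(elo_to_winrate(1100, 1000))  # +100 Elo advantage -> ~0.640
print(elo_to_winrate(1400, 1000))  # +400 Elo advantage -> ~0.909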
varco_arena/varco_arena_core/prompts/llmbar.py
CHANGED
@@ -155,6 +155,7 @@ class LLMBarPrompt(ComparisonPromptBase):
         out_b: str = None,
         task: Optional[str] = None,
         criteria_questions: Optional[str] = None,
+        **kwargs,
     ) -> List[Dict]:
         if (criteria_questions and task) or criteria_questions is None and task is None:
             raise ValueError(
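The added **kwargs is presumably there so the prompt builder tolerates extra keyword arguments from callers without a TypeError; a minimal illustration with a hypothetical stand-in function:

from typing import Dict, List, Optional

def complete_prompt(
    out_b: str = None,
    task: Optional[str] = None,
    criteria_questions: Optional[str] = None,
    **kwargs,  # absorbs extra fields the caller may pass along
) -> List[Dict]:
    return [{"role": "user", "content": f"task={task}"}]

# Without **kwargs, the unexpected 'position' keyword would raise TypeError.
complete_prompt(out_b="candidate output", task="qa", position=1)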
varco_arena/varco_arena_core/visualization.py
CHANGED
@@ -2,7 +2,7 @@ import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
 
-from .elo import compute_mle_elo
+from .elo import compute_elo_with_ci, compute_mle_elo
 
 
 def fill_missing_values(df, default_value=0):
@@ -137,7 +137,7 @@ def visualization(results, is_overall=False):
     figure_dict["fraction_of_model_a_wins_for_all_a_vs_b_matches"] = fig
 
     # Elo Rating
-    elo = compute_mle_elo(results)
+    elo = compute_elo_with_ci(results)
 
     char_width = 16
     header_char_width = 20
@@ -152,7 +152,7 @@ def visualization(results, is_overall=False):
         data=[
             go.Table(
                 header=dict(
-                    values=["<b>Rank</b>", "<b>Model</b>", "<b>Elo rating</b>"],
+                    values=["<b>Rank</b>", "<b>Model</b>", "<b>Elo rating</b>", "<b>95% CI</b>"],
                     fill_color="paleturquoise",
                     align="left",
                     font=dict(size=16),
@@ -161,7 +161,8 @@ def visualization(results, is_overall=False):
                     values=[
                         list(range(1, len(elo) + 1)),
                         elo["Model"],
-                        elo["Elo rating"],
+                        elo["Elo rating"].round(2),
+                        [f"{lower:.2f} - {upper:.2f}" for lower, upper in zip(elo['95% CI_lower'], elo['95% CI_upper'])]
                     ],
                     align="left",
                     font=dict(size=16),
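A self-contained sketch of the resulting leaderboard table, using made-up ratings and the same column layout as the diff:

import pandas as pd
import plotly.graph_objects as go

# Made-up leaderboard with bootstrap CI bounds, mirroring the diff's layout.
elo = pd.DataFrame(
    {
        "Model": ["m1", "m2", "m3"],
        "Elo rating": [1105.3, 1012.8, 981.9],
        "95% CI_lower": [1080.1, 990.4, 955.2],
        "95% CI_upper": [1131.6, 1034.9, 1008.3],
    }
)

fig = go.Figure(
    data=[
        go.Table(
            header=dict(
                values=["<b>Rank</b>", "<b>Model</b>", "<b>Elo rating</b>", "<b>95% CI</b>"],
                fill_color="paleturquoise",
                align="left",
            ),
            cells=dict(
                values=[
                    list(range(1, len(elo) + 1)),
                    elo["Model"],
                    elo["Elo rating"].round(2),
                    [
                        f"{lower:.2f} - {upper:.2f}"
                        for lower, upper in zip(elo["95% CI_lower"], elo["95% CI_upper"])
                    ],
                ],
                align="left",
            ),
        )
    ]
)
fig.show()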
view_utils.py
CHANGED
@@ -154,6 +154,44 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
     return df
 
 
+def compute_elo_with_ci(df, n_bootstrap=1000):
+    """
+    Compute ELO ratings with 95% confidence intervals using bootstrapping.
+    """
+    if isinstance(df, list):
+        df = pd.DataFrame(df)
+
+    bootstrap_elo_scores = []
+    for i in range(n_bootstrap):
+        # Sample with replacement
+        sample_df = df.sample(n=len(df), replace=True)
+        elo_scores = compute_mle_elo(sample_df)
+        elo_scores = elo_scores.set_index("Model")["Elo rating"]
+        bootstrap_elo_scores.append(elo_scores)
+
+    bootstrap_df = pd.DataFrame(bootstrap_elo_scores)
+
+    # Compute the confidence interval
+    ci_lower = bootstrap_df.quantile(0.025)
+    ci_upper = bootstrap_df.quantile(0.975)
+
+    # Compute Elo scores on the original data
+    main_elo_df = compute_mle_elo(df)
+    main_elo_df = main_elo_df.set_index("Model")
+
+    # Merge the results
+    result_df = main_elo_df.copy()
+    result_df["95% CI_lower"] = ci_lower
+    result_df["95% CI_upper"] = ci_upper
+
+    result_df = result_df.sort_values("Elo rating", ascending=False)
+    result_df = result_df.reset_index()
+    result_df.index = result_df.index + 1
+
+    return result_df
+
+
+
 def fill_missing_values(df, default_value=0):
     """
     This is used for completing pivot table
@@ -378,14 +416,16 @@ def visualization(results, is_overall=False):
     figure_dict["fraction_of_model_a_wins_for_all_a_vs_b_matches"] = fig
 
     # Elo Rating
-    elo = compute_mle_elo(results)
+    elo = compute_elo_with_ci(results)
     elo_wr = compute_relative_winrate_to_1st(elo)
     # beautify
     elo_wr["Elo rating"] = elo_wr["Elo rating"].astype(int)
     elo_wr["winrate_vs_1st"] = elo_wr["winrate_vs_1st"].round(3)
     elo_wr.index.name = "Rank"
+    elo_wr["95% CI"] = elo_wr.apply(lambda row: f"({row['95% CI_upper']:.1f}, {row['95% CI_lower']:.1f})", axis=1)
+    elo_wr = elo_wr.rename(columns={"95% CI": "95% CI (UB, LB)"})
 
-    figure_dict["elo_rating"] = elo_wr
+    figure_dict["elo_rating"] = elo_wr[["Model", "Elo rating", "95% CI (UB, LB)", "winrate_vs_1st"]]
 
     # Elo Rating by Task: Radar chart
     if is_overall: