freeman.genie777@gmail.com committed on
Commit a88ccc4 · 1 Parent(s): ba378b3

95% CI added (rebuttal)

app.py CHANGED
@@ -205,6 +205,7 @@ def run_varco_arena(
     last_update_time = time.time()
     terminal_output = st.empty()
     full_output = f"{command}\n"
+    to_show = full_output
     while True:
         # Check if we have output to read
         if select.select([process.stdout], [], [], 0)[0]:
@@ -229,6 +230,7 @@ def run_varco_arena(
         # Check if the process has finished
         if process.poll() is not None:
             # Read any remaining output
+            os.set_blocking(process.stdout.fileno(), True)
            remaining_output = process.stdout.read()
            if remaining_output:
                lines = remaining_output.split("\n")
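
Both added lines touch the output-streaming loop: stdout is polled with select() while the subprocess runs, which only works if the pipe was put in non-blocking mode earlier in the function, and a non-blocking read() after process exit can return a partial buffer. Restoring blocking mode before the final read() guarantees the pipe is fully drained. Below is a condensed sketch of that pattern outside Streamlit; the command is hypothetical and the non-blocking setup line is inferred from the fix, not shown in this diff.

import os
import select
import subprocess

# Hypothetical stand-in for the varco_arena command.
cmd = ["python", "-u", "long_job.py"]

process = subprocess.Popen(
    cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
)
# Inferred setup: non-blocking reads so the polling loop never stalls.
os.set_blocking(process.stdout.fileno(), False)

full_output = ""
while True:
    # Read only when select() reports data (zero-timeout poll).
    if select.select([process.stdout], [], [], 0)[0]:
        chunk = process.stdout.read()
        if chunk:
            full_output += chunk
    if process.poll() is not None:
        # The commit's fix: switch back to blocking so the final
        # read() drains everything left in the pipe.
        os.set_blocking(process.stdout.fileno(), True)
        remaining = process.stdout.read()
        if remaining:
            full_output += remaining
        break

print(full_output)

Note that select() on pipe file objects is Unix-only, which matches the app's existing polling design.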
streamlit_app_local/app.py CHANGED
@@ -106,6 +106,7 @@ def run_varco_arena(
     last_update_time = time.time()
     terminal_output = st.empty()
     full_output = f"{command}\n"
+    to_show = full_output
     while True:
         # Check if we have output to read
         if select.select([process.stdout], [], [], 0)[0]:
@@ -130,6 +131,7 @@ def run_varco_arena(
         # Check if the process has finished
         if process.poll() is not None:
             # Read any remaining output
+            os.set_blocking(process.stdout.fileno(), True)
            remaining_output = process.stdout.read()
            if remaining_output:
                lines = remaining_output.split("\n")
streamlit_app_local/view_utils.py CHANGED
@@ -154,6 +154,44 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
     return df
 
 
+def compute_elo_with_ci(df, n_bootstrap=1000):
+    """
+    Compute ELO ratings with 95% confidence intervals using bootstrapping.
+    """
+    if isinstance(df, list):
+        df = pd.DataFrame(df)
+
+    bootstrap_elo_scores = []
+    for i in range(n_bootstrap):
+        # Sample with replacement (bootstrap resampling)
+        sample_df = df.sample(n=len(df), replace=True)
+        elo_scores = compute_mle_elo(sample_df)
+        elo_scores = elo_scores.set_index("Model")["Elo rating"]
+        bootstrap_elo_scores.append(elo_scores)
+
+    bootstrap_df = pd.DataFrame(bootstrap_elo_scores)
+
+    # Compute the 95% confidence interval bounds
+    ci_lower = bootstrap_df.quantile(0.025)
+    ci_upper = bootstrap_df.quantile(0.975)
+
+    # Compute ELO scores on the original data
+    main_elo_df = compute_mle_elo(df)
+    main_elo_df = main_elo_df.set_index("Model")
+
+    # Merge the results
+    result_df = main_elo_df.copy()
+    result_df["95% CI_lower"] = ci_lower
+    result_df["95% CI_upper"] = ci_upper
+
+    result_df = result_df.sort_values("Elo rating", ascending=False)
+    result_df = result_df.reset_index()
+    result_df.index = result_df.index + 1
+
+    return result_df
+
+
+
 def fill_missing_values(df, default_value=0):
     """
     This is used for completing pivot table
@@ -378,14 +416,16 @@ def visualization(results, is_overall=False):
     figure_dict["fraction_of_model_a_wins_for_all_a_vs_b_matches"] = fig
 
     # Elo Rating
-    elo = compute_mle_elo(results)
+    elo = compute_elo_with_ci(results)
     elo_wr = compute_relative_winrate_to_1st(elo)
     # beautify
     elo_wr["Elo rating"] = elo_wr["Elo rating"].astype(int)
     elo_wr["winrate_vs_1st"] = elo_wr["winrate_vs_1st"].round(3)
     elo_wr.index.name = "Rank"
+    elo_wr["95% CI"] = elo_wr.apply(lambda row: f"({row['95% CI_upper']:.1f}, {row['95% CI_lower']:.1f})", axis=1)
+    elo_wr = elo_wr.rename(columns={"95% CI": "95% CI (UB, LB)"})
 
-    figure_dict["elo_rating"] = elo_wr
+    figure_dict["elo_rating"] = elo_wr[["Model", "Elo rating", "95% CI (UB, LB)", "winrate_vs_1st"]]
 
     # Elo Rating by Task: Radar chart
     if is_overall:
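
For orientation, a hypothetical usage sketch of the new function. The diff does not show the match-log schema that compute_mle_elo consumes, so the column names below (model_a, model_b, winner) are assumptions for illustration only:

import pandas as pd
from view_utils import compute_elo_with_ci  # assumed import path

# Hypothetical pairwise match log; the real schema is not shown in this diff.
matches = pd.DataFrame(
    {
        "model_a": ["m1", "m1", "m2", "m3"],
        "model_b": ["m2", "m3", "m3", "m1"],
        "winner": ["model_a", "model_a", "model_b", "model_b"],
    }
)

# Returns Model, Elo rating, 95% CI_lower, 95% CI_upper,
# sorted by rating with a 1-based rank index.
elo_table = compute_elo_with_ci(matches, n_bootstrap=1000)
print(elo_table)

One design note: each bootstrap iteration resamples whole matches with replacement, so a model that happens to drop out of a resample contributes NaN to that bootstrap row, and pandas' quantile() skips NaNs when computing the interval bounds.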
varco_arena/varco_arena_core/elo.py CHANGED
@@ -50,6 +50,7 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
         .reset_index(drop=True)
     )
     df.index = df.index + 1
+    df["Elo rating"] = df["Elo rating"]
 
     return df
 
@@ -69,7 +70,7 @@ def compute_relative_winrate_to_1st(elo_df):
 
     rating1st = elo_df["Elo rating"].max()
     win_rate_to_1st = partial(elo_to_winrate, rating_b=rating1st)
-    elo_df["winrate_vs_1st"] = elo_df["Elo rating"].apply(win_rate_to_1st)
+    elo_df["winrate_vs_1st"] = elo_df["Elo rating"].apply(win_rate_to_1st).round(3)
     print(elo_df)
 
     return elo_df
@@ -80,3 +81,42 @@ def elo_to_winrate(rating_a: float = None, rating_b: float = None) -> float:
     rate_diff = rating_a - rating_b
     win_rate = 1 / (1 + 10 ** (-rate_diff / 400))
     return win_rate
+
+
+def compute_elo_with_ci(df, n_bootstrap=1000):
+    """
+    Compute ELO ratings with 95% confidence intervals using bootstrapping.
+    """
+    if isinstance(df, list):
+        df = pd.DataFrame(df)
+
+    bootstrap_elo_scores = []
+    for i in range(n_bootstrap):
+        # Sample with replacement (bootstrap resampling)
+        sample_df = df.sample(n=len(df), replace=True)
+        elo_scores = compute_mle_elo(sample_df)
+        elo_scores = elo_scores.set_index("Model")["Elo rating"]
+        bootstrap_elo_scores.append(elo_scores)
+
+    bootstrap_df = pd.DataFrame(bootstrap_elo_scores)
+
+    # Compute the 95% confidence interval bounds
+    ci_lower = bootstrap_df.quantile(0.025).round(1)
+    ci_upper = bootstrap_df.quantile(0.975).round(1)
+
+    # Compute ELO scores on the original data
+    main_elo_df = compute_mle_elo(df)
+    main_elo_df = main_elo_df.set_index("Model")
+
+    # Merge the results
+    result_df = main_elo_df.copy()
+
+    result_df["95% CI_lower"] = ci_lower
+    result_df["95% CI_upper"] = ci_upper
+
+    result_df = result_df.sort_values("Elo rating", ascending=False)
+    result_df["Elo rating"] = result_df["Elo rating"].round(1)
+    result_df = result_df.reset_index()
+    result_df.index = result_df.index + 1
+
+    return result_df
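
The winrate conversion above is the standard logistic Elo expectation, so it is easy to sanity-check by hand (the numbers here are mine, not from the diff): a 100-point rating gap gives 1 / (1 + 10 ** (-100 / 400)) ≈ 0.640.

# Worked check of the elo_to_winrate formula used above.
def elo_to_winrate(rating_a: float, rating_b: float) -> float:
    rate_diff = rating_a - rating_b
    return 1 / (1 + 10 ** (-rate_diff / 400))

assert elo_to_winrate(1000, 1000) == 0.5               # equal ratings: coin flip
assert round(elo_to_winrate(1100, 1000), 3) == 0.64    # +100 Elo ≈ 64% wins
assert round(elo_to_winrate(1400, 1000), 3) == 0.909   # +400 Elo ≈ 10:1 odds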
varco_arena/varco_arena_core/prompts/llmbar.py CHANGED
@@ -155,6 +155,7 @@ class LLMBarPrompt(ComparisonPromptBase):
         out_b: str = None,
         task: Optional[str] = None,
         criteria_questions: Optional[str] = None,
+        **kwargs,
     ) -> List[Dict]:
         if (criteria_questions and task) or criteria_questions is None and task is None:
             raise ValueError(
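
The single added line makes the prompt-builder tolerant of extra keyword arguments, so a caller can forward one uniform set of fields to every prompt class and let each class ignore what it does not use. A minimal sketch of the pattern with hypothetical names:

from typing import Dict, List, Optional

def complete_prompt(
    out_a: str = None,
    out_b: str = None,
    task: Optional[str] = None,
    criteria_questions: Optional[str] = None,
    **kwargs,  # absorbs keys this prompt type does not use
) -> List[Dict]:
    return [{"role": "user", "content": f"Compare {out_a} vs {out_b} on {task}"}]

# Without **kwargs this call would raise:
# TypeError: complete_prompt() got an unexpected keyword argument 'source'
complete_prompt(out_a="A", out_b="B", task="summarize", source="news")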
varco_arena/varco_arena_core/visualization.py CHANGED
@@ -2,7 +2,7 @@ import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
 
-from .elo import compute_mle_elo
+from .elo import compute_elo_with_ci, compute_mle_elo
 
 
 def fill_missing_values(df, default_value=0):
@@ -137,7 +137,7 @@ def visualization(results, is_overall=False):
     figure_dict["fraction_of_model_a_wins_for_all_a_vs_b_matches"] = fig
 
     # Elo Rating
-    elo = compute_mle_elo(results)
+    elo = compute_elo_with_ci(results)
 
     char_width = 16
     header_char_width = 20
@@ -152,7 +152,7 @@ def visualization(results, is_overall=False):
         data=[
             go.Table(
                 header=dict(
-                    values=["<b>Rank</b>", "<b>Model</b>", "<b>Elo rating</b>"],
+                    values=["<b>Rank</b>", "<b>Model</b>", "<b>Elo rating</b>", "<b>95% CI</b>"],
                     fill_color="paleturquoise",
                     align="left",
                     font=dict(size=16),
@@ -161,7 +161,8 @@ def visualization(results, is_overall=False):
                 values=[
                     list(range(1, len(elo) + 1)),
                     elo["Model"],
-                    elo["Elo rating"],
+                    elo["Elo rating"].round(2),
+                    [f"{lower:.2f} - {upper:.2f}" for lower, upper in zip(elo['95% CI_lower'], elo['95% CI_upper'])]
                 ],
                 align="left",
                 font=dict(size=16),
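
A self-contained sketch of how the resulting Plotly table renders with the new CI column; the ratings below are made-up placeholders, not results from this repository:

import pandas as pd
import plotly.graph_objects as go

# Illustrative numbers only; real values come from compute_elo_with_ci.
elo = pd.DataFrame(
    {
        "Model": ["model-x", "model-y", "model-z"],
        "Elo rating": [1082.41, 1001.73, 915.86],
        "95% CI_lower": [1051.2, 968.3, 880.5],
        "95% CI_upper": [1110.8, 1034.6, 949.2],
    }
)

fig = go.Figure(
    data=[
        go.Table(
            header=dict(
                values=["<b>Rank</b>", "<b>Model</b>", "<b>Elo rating</b>", "<b>95% CI</b>"],
                fill_color="paleturquoise",
                align="left",
            ),
            cells=dict(
                values=[
                    list(range(1, len(elo) + 1)),
                    elo["Model"],
                    elo["Elo rating"].round(2),
                    [f"{lo:.2f} - {hi:.2f}" for lo, hi in zip(elo["95% CI_lower"], elo["95% CI_upper"])],
                ],
                align="left",
            ),
        )
    ]
)
fig.show()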
view_utils.py CHANGED
@@ -154,6 +154,44 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
     return df
 
 
+def compute_elo_with_ci(df, n_bootstrap=1000):
+    """
+    Compute ELO ratings with 95% confidence intervals using bootstrapping.
+    """
+    if isinstance(df, list):
+        df = pd.DataFrame(df)
+
+    bootstrap_elo_scores = []
+    for i in range(n_bootstrap):
+        # Sample with replacement (bootstrap resampling)
+        sample_df = df.sample(n=len(df), replace=True)
+        elo_scores = compute_mle_elo(sample_df)
+        elo_scores = elo_scores.set_index("Model")["Elo rating"]
+        bootstrap_elo_scores.append(elo_scores)
+
+    bootstrap_df = pd.DataFrame(bootstrap_elo_scores)
+
+    # Compute the 95% confidence interval bounds
+    ci_lower = bootstrap_df.quantile(0.025)
+    ci_upper = bootstrap_df.quantile(0.975)
+
+    # Compute ELO scores on the original data
+    main_elo_df = compute_mle_elo(df)
+    main_elo_df = main_elo_df.set_index("Model")
+
+    # Merge the results
+    result_df = main_elo_df.copy()
+    result_df["95% CI_lower"] = ci_lower
+    result_df["95% CI_upper"] = ci_upper
+
+    result_df = result_df.sort_values("Elo rating", ascending=False)
+    result_df = result_df.reset_index()
+    result_df.index = result_df.index + 1
+
+    return result_df
+
+
+
 def fill_missing_values(df, default_value=0):
     """
     This is used for completing pivot table
@@ -378,14 +416,16 @@ def visualization(results, is_overall=False):
     figure_dict["fraction_of_model_a_wins_for_all_a_vs_b_matches"] = fig
 
     # Elo Rating
-    elo = compute_mle_elo(results)
+    elo = compute_elo_with_ci(results)
     elo_wr = compute_relative_winrate_to_1st(elo)
     # beautify
     elo_wr["Elo rating"] = elo_wr["Elo rating"].astype(int)
     elo_wr["winrate_vs_1st"] = elo_wr["winrate_vs_1st"].round(3)
     elo_wr.index.name = "Rank"
+    elo_wr["95% CI"] = elo_wr.apply(lambda row: f"({row['95% CI_upper']:.1f}, {row['95% CI_lower']:.1f})", axis=1)
+    elo_wr = elo_wr.rename(columns={"95% CI": "95% CI (UB, LB)"})
 
-    figure_dict["elo_rating"] = elo_wr
+    figure_dict["elo_rating"] = elo_wr[["Model", "Elo rating", "95% CI (UB, LB)", "winrate_vs_1st"]]
 
     # Elo Rating by Task: Radar chart
     if is_overall: