Yuxuan-Zhang-Dexter committed on
Commit
3856741
·
1 Parent(s): 8290468

update agent and model leaderboard two tabs

Browse files
app.py CHANGED
@@ -14,7 +14,6 @@ from leaderboard_utils import (
14
  get_sokoban_leaderboard,
15
  get_2048_leaderboard,
16
  get_candy_leaderboard,
17
- get_tetris_leaderboard,
18
  get_tetris_planning_leaderboard,
19
  get_ace_attorney_leaderboard,
20
  get_combined_leaderboard,
@@ -22,11 +21,7 @@ from leaderboard_utils import (
22
  )
23
  from data_visualization import (
24
  get_combined_leaderboard_with_group_bar,
25
- create_organization_radar_chart,
26
- create_top_players_radar_chart,
27
- create_player_radar_chart,
28
  create_horizontal_bar_chart,
29
- normalize_values,
30
  get_combined_leaderboard_with_single_radar
31
  )
32
  from gallery_tab import create_video_gallery
@@ -46,27 +41,31 @@ TIME_POINTS = {
46
  with open(TIME_POINTS["03/25/2025"], "r") as f:
47
  rank_data = json.load(f)
48
 
 
 
 
 
49
  # Add leaderboard state at the top level
50
  leaderboard_state = {
51
  "current_game": None,
52
  "previous_overall": {
53
  # "Super Mario Bros": True, # Commented out
54
- "Super Mario Bros (planning only)": True,
55
  "Sokoban": True,
56
  "2048": True,
57
  "Candy Crush": True,
58
- # "Tetris (complete)", # Commented out
59
- "Tetris (planning only)": True,
60
  "Ace Attorney": True
61
  },
62
  "previous_details": {
63
  # "Super Mario Bros": False, # Commented out
64
- "Super Mario Bros (planning only)": False,
65
  "Sokoban": False,
66
  "2048": False,
67
  "Candy Crush": False,
68
- # "Tetris (complete)": False, # Commented out
69
- "Tetris (planning only)": False,
70
  "Ace Attorney": False
71
  }
72
  }
@@ -184,29 +183,34 @@ def update_leaderboard(# mario_overall, mario_details, # Commented out
184
  candy_overall, candy_details,
185
  # tetris_overall, tetris_details, # Commented out
186
  tetris_plan_overall, tetris_plan_details,
187
- ace_attorney_overall, ace_attorney_details):
 
 
188
  global leaderboard_state
189
 
 
 
 
190
  # Convert current checkbox states to dictionary for easier comparison
191
  current_overall = {
192
  # "Super Mario Bros": mario_overall, # Commented out
193
- "Super Mario Bros (planning only)": mario_plan_overall,
194
  "Sokoban": sokoban_overall,
195
  "2048": _2048_overall,
196
  "Candy Crush": candy_overall,
197
- # "Tetris (complete)": tetris_overall, # Commented out
198
- "Tetris (planning only)": tetris_plan_overall,
199
  "Ace Attorney": ace_attorney_overall
200
  }
201
 
202
  current_details = {
203
  # "Super Mario Bros": mario_details, # Commented out
204
- "Super Mario Bros (planning only)": mario_plan_details,
205
  "Sokoban": sokoban_details,
206
  "2048": _2048_details,
207
  "Candy Crush": candy_details,
208
- # "Tetris (complete)": tetris_details, # Commented out
209
- "Tetris (planning only)": tetris_plan_details,
210
  "Ace Attorney": ace_attorney_details
211
  }
212
 
@@ -289,12 +293,12 @@ def update_leaderboard(# mario_overall, mario_details, # Commented out
289
  # Build dictionary for selected games
290
  selected_games = {
291
  # "Super Mario Bros": current_overall["Super Mario Bros"], # Commented out
292
- "Super Mario Bros (planning only)": current_overall["Super Mario Bros (planning only)"],
293
  "Sokoban": current_overall["Sokoban"],
294
  "2048": current_overall["2048"],
295
  "Candy Crush": current_overall["Candy Crush"],
296
- # "Tetris (complete)": current_overall["Tetris (complete)"], # Commented out
297
- "Tetris (planning only)": current_overall["Tetris (planning only)"],
298
  "Ace Attorney": current_overall["Ace Attorney"]
299
  }
300
 
@@ -302,19 +306,19 @@ def update_leaderboard(# mario_overall, mario_details, # Commented out
302
  if leaderboard_state["current_game"]:
303
  # For detailed view
304
  # if leaderboard_state["current_game"] == "Super Mario Bros": # Commented out
305
- # df = get_mario_leaderboard(rank_data)
306
- if leaderboard_state["current_game"] == "Super Mario Bros (planning only)":
307
- df = get_mario_planning_leaderboard(rank_data)
308
  elif leaderboard_state["current_game"] == "Sokoban":
309
- df = get_sokoban_leaderboard(rank_data)
310
  elif leaderboard_state["current_game"] == "2048":
311
- df = get_2048_leaderboard(rank_data)
312
  elif leaderboard_state["current_game"] == "Candy Crush":
313
- df = get_candy_leaderboard(rank_data)
314
- elif leaderboard_state["current_game"] == "Tetris (planning only)":
315
- df = get_tetris_planning_leaderboard(rank_data)
316
  elif leaderboard_state["current_game"] == "Ace Attorney":
317
- df = get_ace_attorney_leaderboard(rank_data)
318
  else: # Should not happen if current_game is one of the known games
319
  df = pd.DataFrame() # Empty df
320
 
@@ -324,18 +328,18 @@ def update_leaderboard(# mario_overall, mario_details, # Commented out
324
  group_bar_chart = chart
325
  else:
326
  # For overall view
327
- df, group_bar_chart = get_combined_leaderboard_with_group_bar(rank_data, selected_games)
328
  display_df = prepare_dataframe_for_display(df)
329
- _, radar_chart = get_combined_leaderboard_with_single_radar(rank_data, selected_games)
330
  chart = radar_chart # In overall view, the 'detailed' chart can be the radar chart
331
 
332
  # Return values, including all four plot placeholders
333
  return (update_df_with_height(display_df), chart, radar_chart, group_bar_chart,
334
- current_overall["Super Mario Bros (planning only)"], current_details["Super Mario Bros (planning only)"],
335
  current_overall["Sokoban"], current_details["Sokoban"],
336
  current_overall["2048"], current_details["2048"],
337
  current_overall["Candy Crush"], current_details["Candy Crush"],
338
- current_overall["Tetris (planning only)"], current_details["Tetris (planning only)"],
339
  current_overall["Ace Attorney"], current_details["Ace Attorney"])
340
 
341
  def update_leaderboard_with_time(time_point, # mario_overall, mario_details, # Commented out
@@ -352,7 +356,7 @@ def update_leaderboard_with_time(time_point, # mario_overall, mario_details, # C
352
  if new_rank_data is not None:
353
  rank_data = new_rank_data
354
 
355
- # Use the existing update_leaderboard function, including Super Mario (planning only)
356
  return update_leaderboard(# mario_overall, mario_details, # Commented out
357
  mario_plan_overall, mario_plan_details, # Added
358
  sokoban_overall, sokoban_details,
@@ -362,47 +366,63 @@ def update_leaderboard_with_time(time_point, # mario_overall, mario_details, # C
362
  tetris_plan_overall, tetris_plan_details,
363
  ace_attorney_overall, ace_attorney_details)
364
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
  def get_initial_state():
366
  """Get the initial state for the leaderboard"""
367
  return {
368
  "current_game": None,
369
  "previous_overall": {
370
  # "Super Mario Bros": True, # Commented out
371
- "Super Mario Bros (planning only)": True,
372
  "Sokoban": True,
373
  "2048": True,
374
  "Candy Crush": True,
375
- # "Tetris (complete)", # Commented out
376
- "Tetris (planning only)": True,
377
  "Ace Attorney": True
378
  },
379
  "previous_details": {
380
  # "Super Mario Bros": False, # Commented out
381
- "Super Mario Bros (planning only)": False,
382
  "Sokoban": False,
383
  "2048": False,
384
  "Candy Crush": False,
385
- # "Tetris (complete)": False, # Commented out
386
- "Tetris (planning only)": False,
387
  "Ace Attorney": False
388
  }
389
  }
390
 
391
- def clear_filters():
392
  global leaderboard_state
393
 
 
 
 
394
  selected_games = {
395
- "Super Mario Bros (planning only)": True,
396
  "Sokoban": True,
397
  "2048": True,
398
  "Candy Crush": True,
399
- "Tetris (planning only)": True,
400
  "Ace Attorney": True
401
  }
402
 
403
- df, group_bar_chart = get_combined_leaderboard_with_group_bar(rank_data, selected_games)
404
  display_df = prepare_dataframe_for_display(df)
405
- _, radar_chart = get_combined_leaderboard_with_single_radar(rank_data, selected_games)
406
 
407
  leaderboard_state = get_initial_state()
408
 
@@ -412,7 +432,7 @@ def clear_filters():
412
  True, False, # sokoban
413
  True, False, # 2048
414
  True, False, # candy
415
- True, False, # tetris plan
416
  True, False) # ace attorney
417
 
418
  def create_timeline_slider():
@@ -527,7 +547,7 @@ def build_app():
527
  with gr.Blocks(css="""
528
  /* Fix for scrolling issues */
529
  html, body {
530
- overflow-y: hidden !important;
531
  overflow-x: hidden !important;
532
  width: 100% !important;
533
  height: 100% !important;
@@ -750,18 +770,18 @@ def build_app():
750
 
751
  let newContent = header.innerHTML;
752
 
753
- // Format Super Mario Bros header
754
  if (text.includes('Super Mario Bros')) {
755
  newContent = newContent.replace(/Super\s+Mario\s+Bros/g, 'Super<br>Mario Bros');
756
  }
757
 
758
- // Format Tetris headers
759
- if (text.includes('Tetris (complete)')) {
760
  newContent = newContent.replace(/Tetris\s+\(complete\)/g, 'Tetris<br>(complete)');
761
  }
762
 
763
- if (text.includes('Tetris (planning only)')) {
764
- newContent = newContent.replace(/Tetris\s+\(planning\s+only\)/g, 'Tetris<br>(planning)');
765
  }
766
 
767
  // Format Candy Crush header
@@ -853,7 +873,7 @@ def build_app():
853
  """)
854
 
855
  with gr.Tabs():
856
- with gr.Tab("🏆 Leaderboard"):
857
  # Visualization section
858
  with gr.Row():
859
  gr.Markdown("### 📊 Data Visualization")
@@ -879,6 +899,17 @@ def build_app():
879
  )
880
  # Comment out the Group Bar Chart tab
881
  with gr.Tab("📊 Group Bar Chart"):
 
 
 
 
 
 
 
 
 
 
 
882
  group_bar_visualization = gr.Plot(
883
  label="Comparative Analysis (Group Bar Chart)",
884
  elem_classes="visualization-container"
@@ -892,14 +923,14 @@ def build_app():
892
  with gr.Row():
893
  gr.Markdown("### 🎮 Game Selection")
894
  with gr.Row():
895
- # with gr.Column(): # Commented out Super Mario Bros UI
896
  # gr.Markdown("**🎮 Super Mario Bros**")
897
- # mario_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
898
- # mario_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
899
- with gr.Column(): # Added Super Mario Bros (planning only) UI
900
- gr.Markdown("**📝 Super Mario Bros (planning only)**")
901
- mario_plan_overall = gr.Checkbox(label="Super Mario Bros (planning only) Score", value=True)
902
- mario_plan_details = gr.Checkbox(label="Super Mario Bros (planning only) Details", value=False)
903
  with gr.Column(): # Sokoban is now after mario_plan
904
  gr.Markdown("**📦 Sokoban**")
905
  sokoban_overall = gr.Checkbox(label="Sokoban Score", value=True)
@@ -912,14 +943,14 @@ def build_app():
912
  gr.Markdown("**🍬 Candy Crush**")
913
  candy_overall = gr.Checkbox(label="Candy Crush Score", value=True)
914
  candy_details = gr.Checkbox(label="Candy Crush Details", value=False)
915
- # with gr.Column(): # Commented out Tetris (complete) UI
916
- # gr.Markdown("**🎯 Tetris (complete)**")
917
- # tetris_overall = gr.Checkbox(label="Tetris (complete) Score", value=True)
918
- # tetris_details = gr.Checkbox(label="Tetris (complete) Details", value=False)
919
  with gr.Column():
920
- gr.Markdown("**📋 Tetris (planning)**")
921
- tetris_plan_overall = gr.Checkbox(label="Tetris (planning) Score", value=True)
922
- tetris_plan_details = gr.Checkbox(label="Tetris (planning) Details", value=False)
923
  with gr.Column():
924
  gr.Markdown("**⚖️ Ace Attorney**")
925
  ace_attorney_overall = gr.Checkbox(label="Ace Attorney Score", value=True)
@@ -945,12 +976,12 @@ def build_app():
945
  # Get initial leaderboard dataframe
946
  initial_df = get_combined_leaderboard(rank_data, {
947
  # "Super Mario Bros": True, # Commented out
948
- "Super Mario Bros (planning only)": True,
949
  "Sokoban": True,
950
  "2048": True,
951
  "Candy Crush": True,
952
- # "Tetris (complete)": True, # Commented out
953
- "Tetris (planning only)": True,
954
  "Ace Attorney": True
955
  })
956
 
@@ -985,7 +1016,7 @@ def build_app():
985
  with gr.Row():
986
  score_note = add_score_note()
987
 
988
- # List of all checkboxes, including Super Mario Bros (planning only)
989
  checkbox_list = [
990
  # mario_overall, mario_details, # Commented out
991
  mario_plan_overall, mario_plan_details,
@@ -1000,13 +1031,13 @@ def build_app():
1000
  # Update visualizations when checkboxes change
1001
  def update_visualizations(*checkbox_states):
1002
  # Check if any details checkbox is selected
1003
- # Adjusted indices due to addition of Super Mario (planning only)
1004
  is_details_view = any([
1005
  checkbox_states[1], # Mario Plan details
1006
  checkbox_states[3], # Sokoban details
1007
  checkbox_states[5], # 2048 details
1008
  checkbox_states[7], # Candy Crush details
1009
- checkbox_states[9], # Tetris (planning only) details
1010
  checkbox_states[11] # Ace Attorney details
1011
  ])
1012
 
@@ -1027,40 +1058,252 @@ def build_app():
1027
  # Update leaderboard and visualizations when checkboxes change
1028
  for checkbox in checkbox_list:
1029
  checkbox.change(
1030
- update_leaderboard,
1031
- inputs=checkbox_list,
1032
  outputs=[
1033
  leaderboard_df,
1034
  detailed_visualization,
1035
  radar_visualization,
1036
- group_bar_visualization # RESTORED
1037
  ] + checkbox_list
1038
  )
1039
 
 
 
 
 
 
 
 
 
 
 
 
 
1040
  # Update when clear button is clicked
1041
  clear_btn.click(
1042
- clear_filters,
1043
- inputs=[],
1044
  outputs=[
1045
  leaderboard_df,
1046
  detailed_visualization,
1047
  radar_visualization,
1048
- group_bar_visualization # RESTORED
1049
  ] + checkbox_list
1050
  )
1051
 
1052
  # Initialize the app
1053
  demo.load(
1054
- fn=clear_filters,
1055
  inputs=[],
1056
  outputs=[
1057
  leaderboard_df,
1058
  detailed_visualization,
1059
  radar_visualization,
1060
- group_bar_visualization # RESTORED
1061
  ] + checkbox_list
1062
  )
1063
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1064
  with gr.Tab("🎥 Gallery"):
1065
  video_gallery = create_video_gallery()
1066
 
 
14
  get_sokoban_leaderboard,
15
  get_2048_leaderboard,
16
  get_candy_leaderboard,
 
17
  get_tetris_planning_leaderboard,
18
  get_ace_attorney_leaderboard,
19
  get_combined_leaderboard,
 
21
  )
22
  from data_visualization import (
23
  get_combined_leaderboard_with_group_bar,
 
 
 
24
  create_horizontal_bar_chart,
 
25
  get_combined_leaderboard_with_single_radar
26
  )
27
  from gallery_tab import create_video_gallery
 
41
  with open(TIME_POINTS["03/25/2025"], "r") as f:
42
  rank_data = json.load(f)
43
 
44
+ # Load the model leaderboard data
45
+ with open("rank_single_model_03_25_2025.json", "r") as f:
46
+ model_rank_data = json.load(f)
47
+
48
  # Add leaderboard state at the top level
49
  leaderboard_state = {
50
  "current_game": None,
51
  "previous_overall": {
52
  # "Super Mario Bros": True, # Commented out
53
+ "Super Mario Bros": True,
54
  "Sokoban": True,
55
  "2048": True,
56
  "Candy Crush": True,
57
+ # "Tetris(complete)", # Commented out
58
+ "Tetris": True,
59
  "Ace Attorney": True
60
  },
61
  "previous_details": {
62
  # "Super Mario Bros": False, # Commented out
63
+ "Super Mario Bros": False,
64
  "Sokoban": False,
65
  "2048": False,
66
  "Candy Crush": False,
67
+ # "Tetris(complete)": False, # Commented out
68
+ "Tetris": False,
69
  "Ace Attorney": False
70
  }
71
  }
 
183
  candy_overall, candy_details,
184
  # tetris_overall, tetris_details, # Commented out
185
  tetris_plan_overall, tetris_plan_details,
186
+ ace_attorney_overall, ace_attorney_details,
187
+ top_n=10,
188
+ data_source=None):
189
  global leaderboard_state
190
 
191
+ # Use provided data source or default to rank_data
192
+ data = data_source if data_source is not None else rank_data
193
+
194
  # Convert current checkbox states to dictionary for easier comparison
195
  current_overall = {
196
  # "Super Mario Bros": mario_overall, # Commented out
197
+ "Super Mario Bros": mario_plan_overall,
198
  "Sokoban": sokoban_overall,
199
  "2048": _2048_overall,
200
  "Candy Crush": candy_overall,
201
+ # "Tetris(complete)": tetris_overall, # Commented out
202
+ "Tetris": tetris_plan_overall,
203
  "Ace Attorney": ace_attorney_overall
204
  }
205
 
206
  current_details = {
207
  # "Super Mario Bros": mario_details, # Commented out
208
+ "Super Mario Bros": mario_plan_details,
209
  "Sokoban": sokoban_details,
210
  "2048": _2048_details,
211
  "Candy Crush": candy_details,
212
+ # "Tetris(complete)": tetris_details, # Commented out
213
+ "Tetris": tetris_plan_details,
214
  "Ace Attorney": ace_attorney_details
215
  }
216
 
 
293
  # Build dictionary for selected games
294
  selected_games = {
295
  # "Super Mario Bros": current_overall["Super Mario Bros"], # Commented out
296
+ "Super Mario Bros": current_overall["Super Mario Bros"],
297
  "Sokoban": current_overall["Sokoban"],
298
  "2048": current_overall["2048"],
299
  "Candy Crush": current_overall["Candy Crush"],
300
+ # "Tetris(complete)": current_overall["Tetris(complete)"], # Commented out
301
+ "Tetris": current_overall["Tetris"],
302
  "Ace Attorney": current_overall["Ace Attorney"]
303
  }
304
 
 
306
  if leaderboard_state["current_game"]:
307
  # For detailed view
308
  # if leaderboard_state["current_game"] == "Super Mario Bros": # Commented out
309
+ # df = get_mario_leaderboard(data)
310
+ if leaderboard_state["current_game"] == "Super Mario Bros":
311
+ df = get_mario_planning_leaderboard(data)
312
  elif leaderboard_state["current_game"] == "Sokoban":
313
+ df = get_sokoban_leaderboard(data)
314
  elif leaderboard_state["current_game"] == "2048":
315
+ df = get_2048_leaderboard(data)
316
  elif leaderboard_state["current_game"] == "Candy Crush":
317
+ df = get_candy_leaderboard(data)
318
+ elif leaderboard_state["current_game"] == "Tetris":
319
+ df = get_tetris_planning_leaderboard(data)
320
  elif leaderboard_state["current_game"] == "Ace Attorney":
321
+ df = get_ace_attorney_leaderboard(data)
322
  else: # Should not happen if current_game is one of the known games
323
  df = pd.DataFrame() # Empty df
324
 
 
328
  group_bar_chart = chart
329
  else:
330
  # For overall view
331
+ df, group_bar_chart = get_combined_leaderboard_with_group_bar(data, selected_games, top_n)
332
  display_df = prepare_dataframe_for_display(df)
333
+ _, radar_chart = get_combined_leaderboard_with_single_radar(data, selected_games)
334
  chart = radar_chart # In overall view, the 'detailed' chart can be the radar chart
335
 
336
  # Return values, including all four plot placeholders
337
  return (update_df_with_height(display_df), chart, radar_chart, group_bar_chart,
338
+ current_overall["Super Mario Bros"], current_details["Super Mario Bros"],
339
  current_overall["Sokoban"], current_details["Sokoban"],
340
  current_overall["2048"], current_details["2048"],
341
  current_overall["Candy Crush"], current_details["Candy Crush"],
342
+ current_overall["Tetris"], current_details["Tetris"],
343
  current_overall["Ace Attorney"], current_details["Ace Attorney"])
344
 
345
  def update_leaderboard_with_time(time_point, # mario_overall, mario_details, # Commented out
 
356
  if new_rank_data is not None:
357
  rank_data = new_rank_data
358
 
359
+ # Use the existing update_leaderboard function, including Super Mario
360
  return update_leaderboard(# mario_overall, mario_details, # Commented out
361
  mario_plan_overall, mario_plan_details, # Added
362
  sokoban_overall, sokoban_details,
 
366
  tetris_plan_overall, tetris_plan_details,
367
  ace_attorney_overall, ace_attorney_details)
368
 
369
+ def get_total_model_count(data_source):
370
+ """Get the total number of unique models in the data"""
371
+ selected_games = {
372
+ "Super Mario Bros": True,
373
+ "Sokoban": True,
374
+ "2048": True,
375
+ "Candy Crush": True,
376
+ "Tetris": True,
377
+ "Ace Attorney": True
378
+ }
379
+ df = get_combined_leaderboard(data_source, selected_games)
380
+ return len(df["Player"].unique())
381
+
382
  def get_initial_state():
383
  """Get the initial state for the leaderboard"""
384
  return {
385
  "current_game": None,
386
  "previous_overall": {
387
  # "Super Mario Bros": True, # Commented out
388
+ "Super Mario Bros": True,
389
  "Sokoban": True,
390
  "2048": True,
391
  "Candy Crush": True,
392
+ # "Tetris(complete)", # Commented out
393
+ "Tetris": True,
394
  "Ace Attorney": True
395
  },
396
  "previous_details": {
397
  # "Super Mario Bros": False, # Commented out
398
+ "Super Mario Bros": False,
399
  "Sokoban": False,
400
  "2048": False,
401
  "Candy Crush": False,
402
+ # "Tetris(complete)": False, # Commented out
403
+ "Tetris": False,
404
  "Ace Attorney": False
405
  }
406
  }
407
 
408
+ def clear_filters(top_n=10, data_source=None):
409
  global leaderboard_state
410
 
411
+ # Use provided data source or default to rank_data
412
+ data = data_source if data_source is not None else rank_data
413
+
414
  selected_games = {
415
+ "Super Mario Bros": True,
416
  "Sokoban": True,
417
  "2048": True,
418
  "Candy Crush": True,
419
+ "Tetris": True,
420
  "Ace Attorney": True
421
  }
422
 
423
+ df, group_bar_chart = get_combined_leaderboard_with_group_bar(data, selected_games, top_n)
424
  display_df = prepare_dataframe_for_display(df)
425
+ _, radar_chart = get_combined_leaderboard_with_single_radar(data, selected_games)
426
 
427
  leaderboard_state = get_initial_state()
428
 
 
432
  True, False, # sokoban
433
  True, False, # 2048
434
  True, False, # candy
435
+ True, False, # Tetris plan
436
  True, False) # ace attorney
437
 
438
  def create_timeline_slider():
 
547
  with gr.Blocks(css="""
548
  /* Fix for scrolling issues */
549
  html, body {
550
+ overflow-y: auto !important;
551
  overflow-x: hidden !important;
552
  width: 100% !important;
553
  height: 100% !important;
 
770
 
771
  let newContent = header.innerHTML;
772
 
773
+ // Format Super Mario Bros header
774
  if (text.includes('Super Mario Bros')) {
775
  newContent = newContent.replace(/Super\s+Mario\s+Bros/g, 'Super<br>Mario Bros');
776
  }
777
 
778
+ // Format Tetris headers
779
+ if (text.includes('Tetris(complete)')) {
780
  newContent = newContent.replace(/Tetris\s+\(complete\)/g, 'Tetris<br>(complete)');
781
  }
782
 
783
+ if (text.includes('Tetris')) {
784
+ newContent = newContent.replace(/Tetris\s+\(planning\s+only\)/g, 'Tetris');
785
  }
786
 
787
  // Format Candy Crush header
 
873
  """)
874
 
875
  with gr.Tabs():
876
+ with gr.Tab("🏆 Agent Leaderboard"):
877
  # Visualization section
878
  with gr.Row():
879
  gr.Markdown("### 📊 Data Visualization")
 
899
  )
900
  # Comment out the Group Bar Chart tab
901
  with gr.Tab("📊 Group Bar Chart"):
902
+ with gr.Row():
903
+ # Calculate dynamic maximum based on total models
904
+ agent_max_models = get_total_model_count(rank_data)
905
+ top_n_slider = gr.Slider(
906
+ minimum=1,
907
+ maximum=agent_max_models,
908
+ step=1,
909
+ value=min(10, agent_max_models),
910
+ label=f"Number of Top Models to Display (max: {agent_max_models})",
911
+ elem_classes="top-n-slider"
912
+ )
913
  group_bar_visualization = gr.Plot(
914
  label="Comparative Analysis (Group Bar Chart)",
915
  elem_classes="visualization-container"
 
923
  with gr.Row():
924
  gr.Markdown("### 🎮 Game Selection")
925
  with gr.Row():
926
+ # with gr.Column(): # Commented out Super Mario Bros UI
927
  # gr.Markdown("**🎮 Super Mario Bros**")
928
+ # mario_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
929
+ # mario_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
930
+ with gr.Column(): # Added Super Mario Bros UI
931
+ gr.Markdown("**🎮 Super Mario Bros**")
932
+ mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
933
+ mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
934
  with gr.Column(): # Sokoban is now after mario_plan
935
  gr.Markdown("**📦 Sokoban**")
936
  sokoban_overall = gr.Checkbox(label="Sokoban Score", value=True)
 
943
  gr.Markdown("**🍬 Candy Crush**")
944
  candy_overall = gr.Checkbox(label="Candy Crush Score", value=True)
945
  candy_details = gr.Checkbox(label="Candy Crush Details", value=False)
946
+ # with gr.Column(): # Commented out Tetris (complete) UI
947
+ # gr.Markdown("**🎯 Tetris (complete)**")
948
+ # tetris_overall = gr.Checkbox(label="Tetris (complete) Score", value=True)
949
+ # tetris_details = gr.Checkbox(label="Tetris (complete) Details", value=False)
950
  with gr.Column():
951
+ gr.Markdown("**🎯 Tetris**")
952
+ tetris_plan_overall = gr.Checkbox(label="Tetris Score", value=True)
953
+ tetris_plan_details = gr.Checkbox(label="Tetris Details", value=False)
954
  with gr.Column():
955
  gr.Markdown("**⚖️ Ace Attorney**")
956
  ace_attorney_overall = gr.Checkbox(label="Ace Attorney Score", value=True)
 
976
  # Get initial leaderboard dataframe
977
  initial_df = get_combined_leaderboard(rank_data, {
978
  # "Super Mario Bros": True, # Commented out
979
+ "Super Mario Bros": True,
980
  "Sokoban": True,
981
  "2048": True,
982
  "Candy Crush": True,
983
+ # "Tetris(complete)": True, # Commented out
984
+ "Tetris": True,
985
  "Ace Attorney": True
986
  })
987
 
 
1016
  with gr.Row():
1017
  score_note = add_score_note()
1018
 
1019
+ # List of all checkboxes, including Super Mario Bros
1020
  checkbox_list = [
1021
  # mario_overall, mario_details, # Commented out
1022
  mario_plan_overall, mario_plan_details,
 
1031
  # Update visualizations when checkboxes change
1032
  def update_visualizations(*checkbox_states):
1033
  # Check if any details checkbox is selected
1034
+ # Adjusted indices due to addition of Super Mario
1035
  is_details_view = any([
1036
  checkbox_states[1], # Mario Plan details
1037
  checkbox_states[3], # Sokoban details
1038
  checkbox_states[5], # 2048 details
1039
  checkbox_states[7], # Candy Crush details
1040
+ checkbox_states[9], # Tetris details
1041
  checkbox_states[11] # Ace Attorney details
1042
  ])
1043
 
 
1058
  # Update leaderboard and visualizations when checkboxes change
1059
  for checkbox in checkbox_list:
1060
  checkbox.change(
1061
+ lambda *args: update_leaderboard(*args, data_source=rank_data),
1062
+ inputs=checkbox_list + [top_n_slider],
1063
  outputs=[
1064
  leaderboard_df,
1065
  detailed_visualization,
1066
  radar_visualization,
1067
+ group_bar_visualization
1068
  ] + checkbox_list
1069
  )
1070
 
1071
+ # Update when top_n_slider changes
1072
+ top_n_slider.change(
1073
+ lambda *args: update_leaderboard(*args, data_source=rank_data),
1074
+ inputs=checkbox_list + [top_n_slider],
1075
+ outputs=[
1076
+ leaderboard_df,
1077
+ detailed_visualization,
1078
+ radar_visualization,
1079
+ group_bar_visualization
1080
+ ] + checkbox_list
1081
+ )
1082
+
1083
  # Update when clear button is clicked
1084
  clear_btn.click(
1085
+ lambda *args: clear_filters(*args, data_source=rank_data),
1086
+ inputs=[top_n_slider],
1087
  outputs=[
1088
  leaderboard_df,
1089
  detailed_visualization,
1090
  radar_visualization,
1091
+ group_bar_visualization
1092
  ] + checkbox_list
1093
  )
1094
 
1095
  # Initialize the app
1096
  demo.load(
1097
+ lambda: clear_filters(data_source=rank_data),
1098
  inputs=[],
1099
  outputs=[
1100
  leaderboard_df,
1101
  detailed_visualization,
1102
  radar_visualization,
1103
+ group_bar_visualization
1104
  ] + checkbox_list
1105
  )
1106
 
1107
+ with gr.Tab("🤖 Model Leaderboard"):
1108
+ # Visualization section
1109
+ with gr.Row():
1110
+ gr.Markdown("### 📊 Data Visualization")
1111
+
1112
+ # Detailed view visualization (single chart)
1113
+ model_detailed_visualization = gr.Plot(
1114
+ label="Performance Visualization",
1115
+ visible=False,
1116
+ elem_classes="visualization-container"
1117
+ )
1118
+
1119
+ with gr.Column(visible=True) as model_overall_visualizations:
1120
+ with gr.Tabs():
1121
+ with gr.Tab("📈 Radar Chart"):
1122
+ model_radar_visualization = gr.Plot(
1123
+ label="Comparative Analysis (Radar Chart)",
1124
+ elem_classes="visualization-container"
1125
+ )
1126
+ gr.Markdown(
1127
+ "*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*",
1128
+ elem_classes="radar-tip"
1129
+ )
1130
+ with gr.Tab("📊 Group Bar Chart"):
1131
+ with gr.Row():
1132
+ # Calculate dynamic maximum based on total models
1133
+ model_max_models = get_total_model_count(model_rank_data)
1134
+ model_top_n_slider = gr.Slider(
1135
+ minimum=1,
1136
+ maximum=model_max_models,
1137
+ step=1,
1138
+ value=min(10, model_max_models),
1139
+ label=f"Number of Top Models to Display (max: {model_max_models})",
1140
+ elem_classes="top-n-slider"
1141
+ )
1142
+ model_group_bar_visualization = gr.Plot(
1143
+ label="Comparative Analysis (Group Bar Chart)",
1144
+ elem_classes="visualization-container"
1145
+ )
1146
+
1147
+ # Game selection section
1148
+ with gr.Row():
1149
+ gr.Markdown("### 🎮 Game Selection")
1150
+ with gr.Row():
1151
+ with gr.Column():
1152
+ gr.Markdown("**🎮 Super Mario Bros**")
1153
+ model_mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
1154
+ model_mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
1155
+ with gr.Column():
1156
+ gr.Markdown("**📦 Sokoban**")
1157
+ model_sokoban_overall = gr.Checkbox(label="Sokoban Score", value=True)
1158
+ model_sokoban_details = gr.Checkbox(label="Sokoban Details", value=False)
1159
+ with gr.Column():
1160
+ gr.Markdown("**🔢 2048**")
1161
+ model_2048_overall = gr.Checkbox(label="2048 Score", value=True)
1162
+ model_2048_details = gr.Checkbox(label="2048 Details", value=False)
1163
+ with gr.Column():
1164
+ gr.Markdown("**🍬 Candy Crush**")
1165
+ model_candy_overall = gr.Checkbox(label="Candy Crush Score", value=True)
1166
+ model_candy_details = gr.Checkbox(label="Candy Crush Details", value=False)
1167
+ with gr.Column():
1168
+ gr.Markdown("**🎯 Tetris**")
1169
+ model_tetris_plan_overall = gr.Checkbox(label="Tetris Score", value=True)
1170
+ model_tetris_plan_details = gr.Checkbox(label="Tetris Details", value=False)
1171
+ with gr.Column():
1172
+ gr.Markdown("**⚖️ Ace Attorney**")
1173
+ model_ace_attorney_overall = gr.Checkbox(label="Ace Attorney Score", value=True)
1174
+ model_ace_attorney_details = gr.Checkbox(label="Ace Attorney Details", value=False)
1175
+
1176
+ # Controls
1177
+ with gr.Row():
1178
+ with gr.Column(scale=2):
1179
+ gr.Markdown("**⏰ Time Tracker**")
1180
+ model_timeline = create_timeline_slider()
1181
+ with gr.Column(scale=1):
1182
+ gr.Markdown("**🔄 Controls**")
1183
+ model_clear_btn = gr.Button("Reset Filters", variant="secondary")
1184
+
1185
+ # Leaderboard table
1186
+ with gr.Row():
1187
+ gr.Markdown("### 📋 Detailed Results")
1188
+
1189
+ # Get initial leaderboard dataframe
1190
+ model_initial_df = get_combined_leaderboard(model_rank_data, {
1191
+ "Super Mario Bros": True,
1192
+ "Sokoban": True,
1193
+ "2048": True,
1194
+ "Candy Crush": True,
1195
+ "Tetris": True,
1196
+ "Ace Attorney": True
1197
+ })
1198
+
1199
+ # Format the DataFrame for display
1200
+ model_initial_display_df = prepare_dataframe_for_display(model_initial_df)
1201
+
1202
+ # Create a standard DataFrame component with enhanced styling
1203
+ with gr.Row():
1204
+ model_leaderboard_df = gr.DataFrame(
1205
+ value=model_initial_display_df,
1206
+ interactive=True,
1207
+ elem_id="model-leaderboard-table",
1208
+ elem_classes="table-container",
1209
+ wrap=True,
1210
+ show_row_numbers=True,
1211
+ show_fullscreen_button=True,
1212
+ line_breaks=True,
1213
+ max_height=1000,
1214
+ show_search="search",
1215
+ column_widths=col_widths
1216
+ )
1217
+
1218
+ # Add the score note below the table
1219
+ with gr.Row():
1220
+ model_score_note = add_score_note()
1221
+
1222
+ # List of all checkboxes for model leaderboard
1223
+ model_checkbox_list = [
1224
+ model_mario_plan_overall, model_mario_plan_details,
1225
+ model_sokoban_overall, model_sokoban_details,
1226
+ model_2048_overall, model_2048_details,
1227
+ model_candy_overall, model_candy_details,
1228
+ model_tetris_plan_overall, model_tetris_plan_details,
1229
+ model_ace_attorney_overall, model_ace_attorney_details
1230
+ ]
1231
+
1232
+ # Update visualizations when checkboxes change
1233
+ def update_model_visualizations(*checkbox_states):
1234
+ # Check if any details checkbox is selected
1235
+ is_details_view = any([
1236
+ checkbox_states[1], # Mario Plan details
1237
+ checkbox_states[3], # Sokoban details
1238
+ checkbox_states[5], # 2048 details
1239
+ checkbox_states[7], # Candy Crush details
1240
+ checkbox_states[9], # Tetris details
1241
+ checkbox_states[11] # Ace Attorney details
1242
+ ])
1243
+
1244
+ # Update visibility of visualization blocks
1245
+ return {
1246
+ model_detailed_visualization: gr.update(visible=is_details_view),
1247
+ model_overall_visualizations: gr.update(visible=not is_details_view)
1248
+ }
1249
+
1250
+ # Add change event to all checkboxes
1251
+ for checkbox in model_checkbox_list:
1252
+ checkbox.change(
1253
+ update_model_visualizations,
1254
+ inputs=model_checkbox_list,
1255
+ outputs=[model_detailed_visualization, model_overall_visualizations]
1256
+ )
1257
+
1258
+ # Update leaderboard and visualizations when checkboxes change
1259
+ for checkbox in model_checkbox_list:
1260
+ checkbox.change(
1261
+ lambda *args: update_leaderboard(*args, data_source=model_rank_data),
1262
+ inputs=model_checkbox_list + [model_top_n_slider],
1263
+ outputs=[
1264
+ model_leaderboard_df,
1265
+ model_detailed_visualization,
1266
+ model_radar_visualization,
1267
+ model_group_bar_visualization
1268
+ ] + model_checkbox_list
1269
+ )
1270
+
1271
+ # Update when model top_n_slider changes
1272
+ model_top_n_slider.change(
1273
+ lambda *args: update_leaderboard(*args, data_source=model_rank_data),
1274
+ inputs=model_checkbox_list + [model_top_n_slider],
1275
+ outputs=[
1276
+ model_leaderboard_df,
1277
+ model_detailed_visualization,
1278
+ model_radar_visualization,
1279
+ model_group_bar_visualization
1280
+ ] + model_checkbox_list
1281
+ )
1282
+
1283
+ # Update when clear button is clicked
1284
+ model_clear_btn.click(
1285
+ lambda *args: clear_filters(*args, data_source=model_rank_data),
1286
+ inputs=[model_top_n_slider],
1287
+ outputs=[
1288
+ model_leaderboard_df,
1289
+ model_detailed_visualization,
1290
+ model_radar_visualization,
1291
+ model_group_bar_visualization
1292
+ ] + model_checkbox_list
1293
+ )
1294
+
1295
+ # Initialize the model leaderboard
1296
+ demo.load(
1297
+ lambda: clear_filters(data_source=model_rank_data),
1298
+ inputs=[],
1299
+ outputs=[
1300
+ model_leaderboard_df,
1301
+ model_detailed_visualization,
1302
+ model_radar_visualization,
1303
+ model_group_bar_visualization
1304
+ ] + model_checkbox_list
1305
+ )
1306
+
1307
  with gr.Tab("🎥 Gallery"):
1308
  video_gallery = create_video_gallery()
1309
 
assets/model_color.json CHANGED
@@ -3,12 +3,16 @@
3
  "claude-3-7-sonnet-20250219 (thinking)": "#2E5C8A",
4
  "claude-3-5-haiku-20241022": "#7FB5E6",
5
  "claude-3-5-sonnet-20241022": "#1A4C7C",
 
 
6
  "gemini-2.0-flash": "#FF4081",
7
  "gemini-2.0-flash-thinking-exp-1219": "#C2185B",
8
  "gemini-2.5-pro-exp-03-25": "#FF80AB",
9
  "gemini-2.5-flash-preview-04-17": "#F06292",
10
  "gemini-2.5-flash-preview-04-17 (thinking)": "#E91E63",
 
11
  "gemini-2.5-pro-preview-05-06 (thinking)": "#AD1457",
 
12
  "gpt-4o-2024-11-20": "#00BFA5",
13
  "gpt-4.5-preview-2025-02-27": "#00796B",
14
  "gpt-4.1-2025-04-14": "#00897B",
@@ -21,7 +25,39 @@
21
  "grok-3-mini-beta": "#FF8A65",
22
  "grok-3-mini-beta (thinking)": "#F57C00",
23
  "deepseek-v3": "#FFC107",
24
- "deepseek-r1": "#FFA000",
 
25
  "llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA",
26
- "Random (x30)": "#9E9E9E"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  }
 
3
  "claude-3-7-sonnet-20250219 (thinking)": "#2E5C8A",
4
  "claude-3-5-haiku-20241022": "#7FB5E6",
5
  "claude-3-5-sonnet-20241022": "#1A4C7C",
6
+ "claude-opus-4-20250514": "#3A80D2",
7
+ "claude-sonnet-4-20250514": "#5A9FE2",
8
  "gemini-2.0-flash": "#FF4081",
9
  "gemini-2.0-flash-thinking-exp-1219": "#C2185B",
10
  "gemini-2.5-pro-exp-03-25": "#FF80AB",
11
  "gemini-2.5-flash-preview-04-17": "#F06292",
12
  "gemini-2.5-flash-preview-04-17 (thinking)": "#E91E63",
13
+ "gemini-2.5-flash-preview-05-20": "#F8BBD9",
14
  "gemini-2.5-pro-preview-05-06 (thinking)": "#AD1457",
15
+ "gemini-2.5-pro-preview-06-05": "#EC407A",
16
  "gpt-4o-2024-11-20": "#00BFA5",
17
  "gpt-4.5-preview-2025-02-27": "#00796B",
18
  "gpt-4.1-2025-04-14": "#00897B",
 
25
  "grok-3-mini-beta": "#FF8A65",
26
  "grok-3-mini-beta (thinking)": "#F57C00",
27
  "deepseek-v3": "#FFC107",
28
+ "deepseek-r1-0120": "#FFA000",
29
+ "deepseek-r1-0528": "#FFB300",
30
  "llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA",
31
+ "qwen3-235B-A22B-fp8": "#6A1B9A",
32
+ "random (x30)": "#9E9E9E",
33
+ "gamingagent + claude-3-7-sonnet-20250219": "#4A90E2",
34
+ "gamingagent + claude-3-7-sonnet-20250219 (thinking)": "#2E5C8A",
35
+ "gamingagent + claude-3-5-haiku-20241022": "#7FB5E6",
36
+ "gamingagent + claude-3-5-sonnet-20241022": "#1A4C7C",
37
+ "gamingagent + claude-opus-4-20250514": "#3A80D2",
38
+ "gamingagent + claude-sonnet-4-20250514": "#5A9FE2",
39
+ "gamingagent + gemini-2.0-flash": "#FF4081",
40
+ "gamingagent + gemini-2.0-flash-thinking-exp-1219": "#C2185B",
41
+ "gamingagent + gemini-2.5-pro-exp-03-25": "#FF80AB",
42
+ "gamingagent + gemini-2.5-flash-preview-04-17": "#F06292",
43
+ "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)": "#E91E63",
44
+ "gamingagent + gemini-2.5-flash-preview-05-20": "#F8BBD9",
45
+ "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)": "#AD1457",
46
+ "gamingagent + gemini-2.5-pro-preview-06-05": "#EC407A",
47
+ "gamingagent + gpt-4o-2024-11-20": "#00BFA5",
48
+ "gamingagent + gpt-4.5-preview-2025-02-27": "#00796B",
49
+ "gamingagent + gpt-4.1-2025-04-14": "#00897B",
50
+ "gamingagent + o1-2024-12-17": "#4DB6AC",
51
+ "gamingagent + o1-mini-2024-09-12": "#26A69A",
52
+ "gamingagent + o3-mini-2025-01-31(medium)": "#80CBC4",
53
+ "gamingagent + o3-2025-04-16": "#26C6DA",
54
+ "gamingagent + o4-mini-2025-04-16": "#00ACC1",
55
+ "gamingagent + grok-3-beta": "#FF7043",
56
+ "gamingagent + grok-3-mini-beta": "#FF8A65",
57
+ "gamingagent + grok-3-mini-beta (thinking)": "#F57C00",
58
+ "gamingagent + deepseek-v3": "#FFC107",
59
+ "gamingagent + deepseek-r1-0120": "#FFA000",
60
+ "gamingagent + deepseek-r1-0528": "#FFB300",
61
+ "gamingagent + llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA",
62
+ "gamingagent + qwen3-235B-A22B-fp8": "#6A1B9A"
63
  }
data_visualization.py CHANGED
@@ -3,13 +3,6 @@ import numpy as np
3
  import pandas as pd
4
  import json
5
  from leaderboard_utils import (
6
- get_organization,
7
- get_mario_leaderboard,
8
- get_sokoban_leaderboard,
9
- get_2048_leaderboard,
10
- get_candy_leaderboard,
11
- get_tetris_leaderboard,
12
- get_tetris_planning_leaderboard,
13
  get_combined_leaderboard,
14
  GAME_ORDER
15
  )
@@ -186,7 +179,7 @@ def get_combined_leaderboard_with_radar(rank_data, selected_games):
186
  df_viz = df.copy()
187
  return df, create_radar_charts(df_viz)
188
 
189
- def create_group_bar_chart(df):
190
  game_cols = {}
191
  for game in GAME_ORDER:
192
  col = f"{game} Score"
@@ -231,56 +224,89 @@ def create_group_bar_chart(df):
231
  # Create mapping from original to formatted names
232
  game_display_map = dict(zip(sorted_games, formatted_games))
233
 
234
- # Group models by prefix, then sort alphabetically
235
- model_groups = {}
236
- for player in df["Player"].unique():
237
- prefix = player.split('-')[0]
238
- model_groups.setdefault(prefix, []).append(player)
239
-
240
- ordered_players = []
241
- for prefix in sorted(model_groups):
242
- ordered_players.extend(sorted(model_groups[prefix]))
243
-
244
- # Create one trace per player
245
  fig = go.Figure()
246
- for player in ordered_players:
247
- row = df[df["Player"] == player]
248
- if row.empty:
249
- continue
250
- row = row.iloc[0]
251
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  y_vals = []
253
- has_data = False
254
  for game in sorted_games:
255
- col = f"norm_{game} Score"
256
- val = row.get(col, np.nan)
257
- if not np.isnan(val):
258
- has_data = True
259
- y_vals.append(val if not np.isnan(val) else 0)
260
-
261
- if not has_data:
262
- continue
263
 
264
- fig.add_trace(go.Bar(
265
- name=row["Player"],
266
- x=[game_display_map[game] for game in sorted_games],
267
- y=y_vals,
268
- marker_color=MODEL_COLORS.get(player, '#808080'),
269
- hovertemplate="<b>%{fullData.name}</b><br>Score: %{y:.1f}<extra></extra>"
270
- ))
 
 
 
 
 
271
 
272
  fig.update_layout(
273
- autosize=False,
274
- width=1000,
275
- height=800,
276
- margin=dict(l=200, r=200, t=20, b=20),
277
- title=dict(text="Grouped Bar Chart of AI Models (Consistent Trace Grouping)", pad=dict(t=10)),
278
- xaxis_title="Games",
279
  yaxis_title="Normalized Score",
280
  xaxis=dict(
281
  categoryorder='array',
282
- categoryarray=[game_display_map[g] for g in sorted_games],
283
- tickangle=0 # Keep text horizontal since we're using line breaks
 
 
284
  ),
285
  barmode='group',
286
  bargap=0.2, # Gap between game categories
@@ -303,11 +329,11 @@ def create_group_bar_chart(df):
303
 
304
 
305
 
306
- def get_combined_leaderboard_with_group_bar(rank_data, selected_games):
307
  df = get_combined_leaderboard(rank_data, selected_games)
308
  # Create a copy for visualization to avoid modifying the original
309
  df_viz = df.copy()
310
- return df, create_group_bar_chart(df_viz)
311
 
312
  def hex_to_rgba(hex_color, alpha=0.2):
313
  hex_color = hex_color.lstrip('#')
@@ -324,10 +350,8 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
324
  # Format game names
325
  formatted_games = []
326
  for game in selected_games:
327
- if game == 'Super Mario Bros (planning only)':
328
- formatted_games.append('Super Mario') # Simplified name
329
- elif game == 'Tetris (planning only)':
330
- formatted_games.append('Tetris')
331
  else:
332
  formatted_games.append(game) # Keep other names as is
333
 
@@ -387,10 +411,9 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
387
  ))
388
 
389
  fig.update_layout(
390
- autosize=False,
391
- width=1000,
392
- height=700, # Increased height to accommodate legend
393
- margin=dict(l=400, r=200, t=20, b=20),
394
  title=dict(
395
  text="AI Normalized Performance Across Games",
396
  x=0.5,
 
3
  import pandas as pd
4
  import json
5
  from leaderboard_utils import (
 
 
 
 
 
 
 
6
  get_combined_leaderboard,
7
  GAME_ORDER
8
  )
 
179
  df_viz = df.copy()
180
  return df, create_radar_charts(df_viz)
181
 
182
+ def create_group_bar_chart(df, top_n=10):
183
  game_cols = {}
184
  for game in GAME_ORDER:
185
  col = f"{game} Score"
 
224
  # Create mapping from original to formatted names
225
  game_display_map = dict(zip(sorted_games, formatted_games))
226
 
227
+ # For each game, get top performers and create combined x-axis categories
 
 
 
 
 
 
 
 
 
 
228
  fig = go.Figure()
229
+ all_x_categories = []
230
+ all_players = set()
231
+ unique_x_labels = []
232
+
233
+ # First pass: collect all players and create x-axis categories
234
+ game_rankings = {}
235
+ for game in sorted_games:
236
+ col = f"norm_{game} Score"
237
+ # Get valid scores for this game and sort by score (highest first)
238
+ game_data = df[df[col].notna()].copy()
239
+ game_data = game_data.sort_values(by=col, ascending=False)
240
+
241
+ # Store rankings for this game (limit to top_n)
242
+ game_rankings[game] = []
243
+ for i, (_, row) in enumerate(game_data.iterrows()):
244
+ if i >= top_n: # Limit to top_n performers
245
+ break
246
+
247
+ player = row["Player"]
248
+ score = row[col]
249
+ rank = i + 1
250
+ x_category = f"{game_display_map[game]}<br>#{rank}"
251
+ game_rankings[game].append({
252
+ 'player': player,
253
+ 'score': score,
254
+ 'x_category': x_category,
255
+ 'rank': rank
256
+ })
257
+ all_x_categories.append(x_category)
258
+ all_players.add(player)
259
+
260
+ # Show label at the middle position based on number of models
261
+ middle_position = (top_n + 1) // 2
262
+ if rank == middle_position:
263
+ # Special case for Super Mario Bros (planning only)
264
+ if game == "Super Mario Bros":
265
+ unique_x_labels.append("SMB")
266
+ else:
267
+ unique_x_labels.append(game_display_map[game]) # Show just game name without rank
268
+ else:
269
+ unique_x_labels.append("") # Empty string for other ranks
270
+
271
+ # Second pass: create traces for each player
272
+ for player in sorted(all_players):
273
+ x_vals = []
274
  y_vals = []
275
+
276
  for game in sorted_games:
277
+ # Find this player's data for this game
278
+ player_data = None
279
+ for data in game_rankings[game]:
280
+ if data['player'] == player:
281
+ player_data = data
282
+ break
 
 
283
 
284
+ if player_data:
285
+ x_vals.append(player_data['x_category'])
286
+ y_vals.append(player_data['score'])
287
+
288
+ if x_vals: # Only add trace if player has data
289
+ fig.add_trace(go.Bar(
290
+ name=player,
291
+ x=x_vals,
292
+ y=y_vals,
293
+ marker_color=MODEL_COLORS.get(player, '#808080'),
294
+ hovertemplate="<b>%{fullData.name}</b><br>Score: %{y:.1f}<extra></extra>"
295
+ ))
296
 
297
  fig.update_layout(
298
+ autosize=True,
299
+ height=550,
300
+ margin=dict(l=50, r=50, t=20, b=20),
301
+ title=dict(text=f"Grouped Bar Chart - Top {top_n} Performers by Game", pad=dict(t=10)),
302
+ xaxis_title="Games (Ranked by Performance)",
 
303
  yaxis_title="Normalized Score",
304
  xaxis=dict(
305
  categoryorder='array',
306
+ categoryarray=all_x_categories,
307
+ tickangle=0, # Keep text horizontal since we're using line breaks
308
+ ticktext=unique_x_labels, # Show labels only for first occurrence
309
+ tickvals=all_x_categories
310
  ),
311
  barmode='group',
312
  bargap=0.2, # Gap between game categories
 
329
 
330
 
331
 
332
+ def get_combined_leaderboard_with_group_bar(rank_data, selected_games, top_n=10):
333
  df = get_combined_leaderboard(rank_data, selected_games)
334
  # Create a copy for visualization to avoid modifying the original
335
  df_viz = df.copy()
336
+ return df, create_group_bar_chart(df_viz, top_n)
337
 
338
  def hex_to_rgba(hex_color, alpha=0.2):
339
  hex_color = hex_color.lstrip('#')
 
350
  # Format game names
351
  formatted_games = []
352
  for game in selected_games:
353
+ if game == 'Super Mario Bros':
354
+ formatted_games.append('SMB') # Clean name without planning only
 
 
355
  else:
356
  formatted_games.append(game) # Keep other names as is
357
 
 
411
  ))
412
 
413
  fig.update_layout(
414
+ autosize=True,
415
+ height=550, # Reduced height for better proportion with legend
416
+ margin=dict(l=400, r=100, t=20, b=20),
 
417
  title=dict(
418
  text="AI Normalized Performance Across Games",
419
  x=0.5,
leaderboard_utils.py CHANGED
@@ -5,12 +5,12 @@ import numpy as np
5
  # Define game order
6
  GAME_ORDER = [
7
  # "Super Mario Bros", # Commented out
8
- "Super Mario Bros (planning only)",
9
  "Sokoban",
10
  "2048",
11
  "Candy Crush",
12
  # "Tetris (complete)", # Commented out
13
- "Tetris (planning only)",
14
  "Ace Attorney"
15
  ]
16
 
@@ -31,20 +31,6 @@ def get_organization(model_name):
31
  else:
32
  return "unknown"
33
 
34
- def get_mario_leaderboard(rank_data):
35
- data = rank_data.get("Super Mario Bros", {}).get("results", [])
36
- df = pd.DataFrame(data)
37
- df = df.rename(columns={
38
- "model": "Player",
39
- "progress": "Progress (current/total)",
40
- "score": "Score",
41
- "time_s": "Time (s)"
42
- })
43
- df["Organization"] = df["Player"].apply(get_organization)
44
- df = df[["Player", "Organization", "Progress (current/total)", "Score", "Time (s)"]]
45
- if "Score" in df.columns:
46
- df = df.sort_values("Score", ascending=False)
47
- return df
48
 
49
  def get_sokoban_leaderboard(rank_data):
50
  data = rank_data.get("Sokoban", {}).get("results", [])
@@ -143,20 +129,8 @@ def get_candy_leaderboard(rank_data):
143
  df = df.sort_values("Score", ascending=False)
144
  return df
145
 
146
- def get_tetris_leaderboard(rank_data):
147
- data = rank_data.get("Tetris (complete)", {}).get("results", [])
148
- df = pd.DataFrame(data)
149
- df = df.rename(columns={
150
- "model": "Player",
151
- "score": "Score",
152
- "steps_blocks": "Steps"
153
- })
154
- df["Organization"] = df["Player"].apply(get_organization)
155
- df = df[["Player", "Organization", "Score", "Steps"]]
156
- return df
157
-
158
  def get_tetris_planning_leaderboard(rank_data):
159
- data = rank_data.get("Tetris (planning only)", {}).get("results", [])
160
  df = pd.DataFrame(data)
161
  df = df.rename(columns={
162
  "model": "Player",
@@ -181,13 +155,12 @@ def get_ace_attorney_leaderboard(rank_data):
181
  df = df.rename(columns={
182
  "model": "Player",
183
  "score": "Score",
184
- "progress": "Progress",
185
- "evaluator result": "Evaluator Result"
186
  })
187
  df["Organization"] = df["Player"].apply(get_organization)
188
 
189
- # Define columns to keep, including Evaluator Result
190
- columns_to_keep = ["Player", "Organization", "Score", "Progress", "Evaluator Result"]
191
  # Filter to only columns that actually exist in the DataFrame after renaming
192
  df_columns = [col for col in columns_to_keep if col in df.columns]
193
  df = df[df_columns]
@@ -198,7 +171,7 @@ def get_ace_attorney_leaderboard(rank_data):
198
  return df
199
 
200
  def get_mario_planning_leaderboard(rank_data):
201
- data = rank_data.get("Super Mario Bros (planning only)", {}).get("results", [])
202
  df = pd.DataFrame(data)
203
  df = df.rename(columns={
204
  "model": "Player",
@@ -224,8 +197,8 @@ def calculate_rank_and_completeness(rank_data, selected_games):
224
  # Get DataFrames for selected games
225
  # if selected_games.get("Super Mario Bros"): # Commented out
226
  # game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
227
- if selected_games.get("Super Mario Bros (planning only)"):
228
- game_dfs["Super Mario Bros (planning only)"] = get_mario_planning_leaderboard(rank_data)
229
  if selected_games.get("Sokoban"):
230
  game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
231
  if selected_games.get("2048"):
@@ -234,8 +207,8 @@ def calculate_rank_and_completeness(rank_data, selected_games):
234
  game_dfs["Candy Crush"] = get_candy_leaderboard(rank_data)
235
  # if selected_games.get("Tetris (complete)"): # Commented out
236
  # game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
237
- if selected_games.get("Tetris (planning only)"):
238
- game_dfs["Tetris (planning only)"] = get_tetris_planning_leaderboard(rank_data)
239
  if selected_games.get("Ace Attorney"):
240
  game_dfs["Ace Attorney"] = get_ace_attorney_leaderboard(rank_data)
241
 
@@ -265,7 +238,7 @@ def calculate_rank_and_completeness(rank_data, selected_games):
265
  # if game == "Super Mario Bros": # Commented out
266
  # player_score = df[df["Player"] == player]["Score"].iloc[0]
267
  # rank = len(df[df["Score"] > player_score]) + 1
268
- if game == "Super Mario Bros (planning only)":
269
  player_score = df[df["Player"] == player]["Score"].iloc[0]
270
  rank = len(df[df["Score"] > player_score]) + 1
271
  elif game == "Sokoban":
@@ -277,7 +250,7 @@ def calculate_rank_and_completeness(rank_data, selected_games):
277
  elif game == "Candy Crush":
278
  player_score = df[df["Player"] == player]["Score"].iloc[0]
279
  rank = len(df[df["Score"] > player_score]) + 1
280
- elif game in ["Tetris (planning only)"]:
281
  player_score = df[df["Player"] == player]["Score"].iloc[0]
282
  rank = len(df[df["Score"] > player_score]) + 1
283
  elif game == "Ace Attorney":
@@ -329,8 +302,8 @@ def get_combined_leaderboard(rank_data, selected_games):
329
  # Get DataFrames for selected games
330
  # if selected_games.get("Super Mario Bros"): # Commented out
331
  # game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
332
- if selected_games.get("Super Mario Bros (planning only)"):
333
- game_dfs["Super Mario Bros (planning only)"] = get_mario_planning_leaderboard(rank_data)
334
  if selected_games.get("Sokoban"):
335
  game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
336
  if selected_games.get("2048"):
@@ -339,8 +312,8 @@ def get_combined_leaderboard(rank_data, selected_games):
339
  game_dfs["Candy Crush"] = get_candy_leaderboard(rank_data)
340
  # if selected_games.get("Tetris (complete)"): # Commented out
341
  # game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
342
- if selected_games.get("Tetris (planning only)"):
343
- game_dfs["Tetris (planning only)"] = get_tetris_planning_leaderboard(rank_data)
344
  if selected_games.get("Ace Attorney"):
345
  game_dfs["Ace Attorney"] = get_ace_attorney_leaderboard(rank_data)
346
 
@@ -365,7 +338,7 @@ def get_combined_leaderboard(rank_data, selected_games):
365
  if player in df["Player"].values:
366
  # if game == "Super Mario Bros": # Commented out
367
  # player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
368
- if game == "Super Mario Bros (planning only)":
369
  player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
370
  elif game == "Sokoban":
371
  player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
@@ -373,7 +346,7 @@ def get_combined_leaderboard(rank_data, selected_games):
373
  player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
374
  elif game == "Candy Crush":
375
  player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
376
- elif game in ["Tetris (planning only)"]:
377
  player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
378
  elif game == "Ace Attorney":
379
  player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
 
5
  # Define game order
6
  GAME_ORDER = [
7
  # "Super Mario Bros", # Commented out
8
+ "Super Mario Bros",
9
  "Sokoban",
10
  "2048",
11
  "Candy Crush",
12
  # "Tetris (complete)", # Commented out
13
+ "Tetris",
14
  "Ace Attorney"
15
  ]
16
 
 
31
  else:
32
  return "unknown"
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  def get_sokoban_leaderboard(rank_data):
36
  data = rank_data.get("Sokoban", {}).get("results", [])
 
129
  df = df.sort_values("Score", ascending=False)
130
  return df
131
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  def get_tetris_planning_leaderboard(rank_data):
133
+ data = rank_data.get("Tetris", {}).get("results", [])
134
  df = pd.DataFrame(data)
135
  df = df.rename(columns={
136
  "model": "Player",
 
155
  df = df.rename(columns={
156
  "model": "Player",
157
  "score": "Score",
158
+ "progress": "Progress"
 
159
  })
160
  df["Organization"] = df["Player"].apply(get_organization)
161
 
162
+ # Define columns to keep
163
+ columns_to_keep = ["Player", "Organization", "Score", "Progress"]
164
  # Filter to only columns that actually exist in the DataFrame after renaming
165
  df_columns = [col for col in columns_to_keep if col in df.columns]
166
  df = df[df_columns]
 
171
  return df
172
 
173
  def get_mario_planning_leaderboard(rank_data):
174
+ data = rank_data.get("Super Mario Bros", {}).get("results", [])
175
  df = pd.DataFrame(data)
176
  df = df.rename(columns={
177
  "model": "Player",
 
197
  # Get DataFrames for selected games
198
  # if selected_games.get("Super Mario Bros"): # Commented out
199
  # game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
200
+ if selected_games.get("Super Mario Bros"):
201
+ game_dfs["Super Mario Bros"] = get_mario_planning_leaderboard(rank_data)
202
  if selected_games.get("Sokoban"):
203
  game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
204
  if selected_games.get("2048"):
 
207
  game_dfs["Candy Crush"] = get_candy_leaderboard(rank_data)
208
  # if selected_games.get("Tetris (complete)"): # Commented out
209
  # game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
210
+ if selected_games.get("Tetris"):
211
+ game_dfs["Tetris"] = get_tetris_planning_leaderboard(rank_data)
212
  if selected_games.get("Ace Attorney"):
213
  game_dfs["Ace Attorney"] = get_ace_attorney_leaderboard(rank_data)
214
 
 
238
  # if game == "Super Mario Bros": # Commented out
239
  # player_score = df[df["Player"] == player]["Score"].iloc[0]
240
  # rank = len(df[df["Score"] > player_score]) + 1
241
+ if game == "Super Mario Bros":
242
  player_score = df[df["Player"] == player]["Score"].iloc[0]
243
  rank = len(df[df["Score"] > player_score]) + 1
244
  elif game == "Sokoban":
 
250
  elif game == "Candy Crush":
251
  player_score = df[df["Player"] == player]["Score"].iloc[0]
252
  rank = len(df[df["Score"] > player_score]) + 1
253
+ elif game in ["Tetris"]:
254
  player_score = df[df["Player"] == player]["Score"].iloc[0]
255
  rank = len(df[df["Score"] > player_score]) + 1
256
  elif game == "Ace Attorney":
 
302
  # Get DataFrames for selected games
303
  # if selected_games.get("Super Mario Bros"): # Commented out
304
  # game_dfs["Super Mario Bros"] = get_mario_leaderboard(rank_data)
305
+ if selected_games.get("Super Mario Bros"):
306
+ game_dfs["Super Mario Bros"] = get_mario_planning_leaderboard(rank_data)
307
  if selected_games.get("Sokoban"):
308
  game_dfs["Sokoban"] = get_sokoban_leaderboard(rank_data)
309
  if selected_games.get("2048"):
 
312
  game_dfs["Candy Crush"] = get_candy_leaderboard(rank_data)
313
  # if selected_games.get("Tetris (complete)"): # Commented out
314
  # game_dfs["Tetris (complete)"] = get_tetris_leaderboard(rank_data)
315
+ if selected_games.get("Tetris"):
316
+ game_dfs["Tetris"] = get_tetris_planning_leaderboard(rank_data)
317
  if selected_games.get("Ace Attorney"):
318
  game_dfs["Ace Attorney"] = get_ace_attorney_leaderboard(rank_data)
319
 
 
338
  if player in df["Player"].values:
339
  # if game == "Super Mario Bros": # Commented out
340
  # player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
341
+ if game == "Super Mario Bros":
342
  player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
343
  elif game == "Sokoban":
344
  player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
 
346
  player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
347
  elif game == "Candy Crush":
348
  player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
349
+ elif game in ["Tetris"]:
350
  player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
351
  elif game == "Ace Attorney":
352
  player_data[f"{game} Score"] = df[df["Player"] == player]["Score"].iloc[0]
rank_data_03_25_2025.json CHANGED
@@ -1,112 +1,71 @@
1
  {
2
  "Super Mario Bros": {
3
- "runs": 5,
4
- "results": [
5
- {
6
- "model": "claude-3-7-sonnet-20250219",
7
- "score": 710,
8
- "progress": "1-1",
9
- "time_s": 64.2
10
- },
11
- {
12
- "model": "gpt-4.1-2025-04-14",
13
- "score": 740,
14
- "progress": "1-1",
15
- "time_s": 68.6
16
- },
17
- {
18
- "model": "gpt-4o-2024-11-20",
19
- "score": 560,
20
- "progress": "1-1",
21
- "time_s": 58.6
22
- },
23
- {
24
- "model": "gemini-2.0-flash",
25
- "score": 320,
26
- "progress": "1-1",
27
- "time_s": 51.8
28
- },
29
- {
30
- "model": "claude-3-5-haiku-20241022",
31
- "score": 140,
32
- "progress": "1-1",
33
- "time_s": 76.4
34
- },
35
- {
36
- "model": "gpt-4.5-preview-2025-02-27",
37
- "score": 160,
38
- "progress": "1-1",
39
- "time_s": 62.8
40
- }
41
- ]
42
- },
43
- "Super Mario Bros (planning only)": {
44
  "runs": 3,
45
  "results": [
46
  {
47
- "model": "claude-3-5-sonnet-20241022",
48
  "score": 1267.7,
49
- "detail_data": "709;1532;1562",
50
  "progress": "1-1"
51
  },
52
  {
53
- "model": "claude-3-7-sonnet-20250219 (thinking)",
54
  "score": 1418.7,
55
- "detail_data": "2015;709;1532",
56
  "progress": "1-1"
57
  },
58
  {
59
- "model": "gemini-2.5-flash-preview-04-17 (thinking)",
60
  "score": 1385.0,
61
- "detail_data": "1672;1266;1247",
62
  "progress": "1-1"
63
  },
64
  {
65
- "model": "gemini-2.5-pro-preview-05-06 (thinking)",
66
  "score": 1498.3,
67
- "detail_data": "1561;1271;1663",
68
  "progress": "1-1"
69
  },
70
  {
71
- "model": "llama-4-maverick-17b-128e-instruct-fp8",
72
  "score": 1468.7,
73
- "detail_data": "898;2008;1500",
74
  "progress": "1-1"
75
  },
76
  {
77
- "model": "gpt-4.1-2025-04-14",
78
  "score": 2126.3,
79
- "detail_data": "1531;722;4126",
80
  "progress": "1-1"
81
  },
82
  {
83
- "model": "gpt-4o-2024-11-20",
84
  "score": 2047.3,
85
- "detail_data": "2017;2590;1535",
86
  "progress": "1-1"
87
  },
88
  {
89
- "model": "o1-2024-12-17",
90
  "score": 855,
91
- "detail_data": "855",
92
  "progress": "1-1"
93
  },
94
  {
95
- "model": "o3-2025-04-16",
96
  "score": 3445,
97
- "detail_data": "3445",
98
  "progress": "1-1"
99
  },
100
  {
101
- "model": "o4-mini-2025-04-16",
102
  "score": 1448.0,
103
- "detail_data": "1525;1263;1556",
104
  "progress": "1-1"
105
  },
106
  {
107
- "model": "Random (x30)",
108
  "score": 986.97,
109
- "detail_data": "986.97",
110
  "progress": "1-1"
111
  }
112
  ]
@@ -115,192 +74,207 @@
115
  "runs": 3,
116
  "results": [
117
  {
118
- "model": "claude-3-5-sonnet-20241022",
119
- "score": 108.2,
120
- "details": "1352;2860;1532",
121
- "highest_tail": 128
122
  },
123
  {
124
- "model": "claude-3-7-sonnet-20250219 (thinking)",
125
- "score": 113.3,
126
- "details": "2560;3224;2088",
127
  "highest_tail": 256
128
  },
129
  {
130
- "model": "deepseek-r1",
131
- "score": 105.2,
132
- "details": "700;1240;3680",
133
- "highest_tail": 128
134
  },
135
  {
136
- "model": "gemini-2.5-flash-preview-04-17 (thinking)",
137
- "score": 106.6,
138
- "details": "1304;1316;2472",
139
  "highest_tail": 256
140
  },
141
  {
142
- "model": "gemini-2.5-pro-preview-05-06 (thinking)",
143
- "score": 117.3,
144
- "details": "5300;2400;3060",
145
- "highest_tail": 256
146
  },
147
  {
148
- "model": "grok-3-mini-beta (thinking)",
149
- "score": 118.6,
150
- "details": "6412;2492;3204",
151
- "highest_tail": 256
152
  },
153
  {
154
- "model": "llama-4-maverick-17b-128e-instruct-fp8",
155
- "score": 106,
156
- "details": "1404;1272;2084",
157
  "highest_tail": 128
158
  },
159
  {
160
- "model": "gpt-4.1-2025-04-14",
161
- "score": 105.7,
162
- "details": "1156;2664;1148",
163
- "highest_tail": 128
164
  },
165
  {
166
- "model": "gpt-4o-2024-11-20",
167
- "score": 106.7,
168
- "details": "1604;1284;2080",
169
  "highest_tail": 256
170
  },
171
  {
172
- "model": "o1-2024-12-17",
173
- "score": 128.9,
174
- "details": "3132;2004;3136",
175
  "highest_tail": 512
176
  },
177
  {
178
- "model": "o1-mini-2024-09-12",
179
- "score": 114.0,
180
- "details": "21;86;37",
181
  "highest_tail": 256
182
  },
183
  {
184
- "model": "o3-2025-04-16",
185
- "score": 128.0,
186
  "details": "7120",
187
  "highest_tail": 512
188
  },
189
  {
190
- "model": "o4-mini-2025-04-16",
191
- "score": 120.6,
192
- "details": "4928;5456;2912",
193
- "highest_tail": 256
194
  },
195
  {
196
- "model": "Random (x30)",
197
- "score": 100.4,
198
  "details": "",
199
  "highest_tail": 128
200
- }
201
- ]
202
- },
203
- "Tetris (complete)": {
204
- "runs": 3,
205
- "results": [
206
  {
207
- "model": "claude-3-7-sonnet-20250219",
208
- "score": 95,
209
- "steps_blocks": 27,
210
- "rank": 1
211
  },
212
  {
213
- "model": "claude-3-5-haiku-20241022",
214
- "score": 90,
215
- "steps_blocks": 25,
216
- "rank": 2
217
  },
218
  {
219
- "model": "gemini-2.0-flash",
220
- "score": 82,
221
- "steps_blocks": 23,
222
- "rank": 3
223
  },
224
  {
225
- "model": "gpt-4o-2024-11-20",
226
- "score": 54,
227
- "steps_blocks": 19,
228
- "rank": 4
229
  }
230
  ]
231
  },
232
- "Tetris (planning only)": {
233
  "runs": 3,
234
  "results": [
235
  {
236
- "model": "claude-3-5-sonnet-20241022",
237
  "score": 14.7,
238
- "details": "16;14;14"
239
  },
240
  {
241
- "model": "claude-3-7-sonnet-20250219 (thinking)",
242
  "score": 16.3,
243
- "details": "19;15;15"
244
  },
245
  {
246
- "model": "deepseek-r1",
247
  "score": 14.3,
248
- "details": "15;14;14"
249
  },
250
  {
251
- "model": "gemini-2.5-flash-preview-04-17 (thinking)",
252
  "score": 16.3,
253
- "details": "20;14;15"
254
  },
255
  {
256
- "model": "gemini-2.5-pro-preview-05-06 (thinking)",
257
  "score": 23.3,
258
- "details": "23;23;24"
259
  },
260
  {
261
- "model": "grok-3-mini-beta (thinking)",
262
  "score": 21.3,
263
- "details": "20;15;29"
264
  },
265
  {
266
- "model": "llama-4-maverick-17b-128e-instruct-fp8",
267
  "score": 10.3,
268
- "details": "9;10;12"
269
  },
270
  {
271
- "model": "gpt-4.1-2025-04-14",
272
  "score": 13.7,
273
- "details": "13;14;14"
274
  },
275
  {
276
- "model": "gpt-4o-2024-11-20",
277
  "score": 14,
278
- "details": "18;11;13"
279
  },
280
  {
281
- "model": "o1-2024-12-17",
282
  "score": 35,
283
  "details": "35"
284
  },
285
  {
286
- "model": "o1-mini-2024-09-12",
287
  "score": 11.7,
288
- "details": "11;11;13"
289
  },
290
  {
291
- "model": "o3-2025-04-16",
292
  "score": 42,
293
  "details": "42"
294
  },
295
  {
296
- "model": "o4-mini-2025-04-16",
297
  "score": 25.3,
298
- "details": "22;35;19"
299
  },
300
  {
301
- "model": "Random (x30)",
302
  "score": 10.2,
303
  "details": ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
  }
305
  ]
306
  },
@@ -308,74 +282,94 @@
308
  "runs": 3,
309
  "results": [
310
  {
311
- "model": "claude-3-5-sonnet-20241022",
312
  "score": 106,
313
- "details": "92;165;61"
314
  },
315
  {
316
- "model": "claude-3-7-sonnet-20250219 (thinking)",
317
  "score": 484,
318
- "details": "535;428;489"
319
  },
320
  {
321
- "model": "deepseek-r1",
322
  "score": 447.3,
323
- "details": "409;436;497"
324
  },
325
  {
326
- "model": "gemini-2.5-flash-preview-04-17 (thinking)",
327
  "score": 334.7,
328
- "details": "259;372;373"
329
  },
330
  {
331
- "model": "gemini-2.5-pro-preview-05-06 (thinking)",
332
  "score": 416.3,
333
- "details": "411;414;424"
334
  },
335
  {
336
- "model": "grok-3-mini-beta (thinking)",
337
  "score": 254,
338
- "details": "299;332;131"
339
  },
340
  {
341
- "model": "llama-4-maverick-17b-128e-instruct-fp8",
342
  "score": 128.7,
343
- "details": "67;139;180"
344
  },
345
  {
346
- "model": "gpt-4.1-2025-04-14",
347
  "score": 182,
348
- "details": "163;215;168"
349
  },
350
  {
351
- "model": "gpt-4o-2024-11-20",
352
  "score": 147.3,
353
- "details": "131;104;207"
354
  },
355
  {
356
- "model": "o1-2024-12-17",
357
  "score": 159,
358
  "details": "159"
359
  },
360
  {
361
- "model": "o1-mini-2024-09-12",
362
  "score": 48,
363
- "details": "21;86;37"
364
  },
365
  {
366
- "model": "o3-2025-04-16",
367
  "score": 647,
368
  "details": "647"
369
  },
370
  {
371
- "model": "o4-mini-2025-04-16",
372
  "score": 487.3,
373
- "details": "259;591;612"
374
  },
375
  {
376
- "model": "Random (x30)",
377
  "score": 116.5,
378
  "details": ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
379
  }
380
  ]
381
  },
@@ -383,88 +377,108 @@
383
  "runs": 3,
384
  "results": [
385
  {
386
- "model": "claude-3-5-sonnet-20241022",
387
  "score": 0,
388
- "detail_box_on_target": "0;0;0",
389
- "cracked_levels": "0;0;0"
390
  },
391
  {
392
- "model": "claude-3-7-sonnet-20250219 (thinking)",
393
  "score": 2.33,
394
- "detail_box_on_target": "2;4;1",
395
- "cracked_levels": "1;2;0"
396
  },
397
  {
398
- "model": "deepseek-r1",
399
  "score": 1.33,
400
- "detail_box_on_target": "2;0;2",
401
- "cracked_levels": "1;0;1"
402
  },
403
  {
404
- "model": "gemini-2.5-flash-preview-04-17 (thinking)",
405
  "score": 1.67,
406
- "detail_box_on_target": "3;0;2",
407
- "cracked_levels": "2;0;1"
408
  },
409
  {
410
- "model": "gemini-2.5-pro-preview-05-06 (thinking)",
411
  "score": 4.33,
412
- "detail_box_on_target": "4;4;5",
413
- "cracked_levels": "2;2;3"
414
  },
415
  {
416
- "model": "grok-3-mini-beta (thinking)",
417
  "score": 5.67,
418
- "detail_box_on_target": "5;6;6",
419
- "cracked_levels": "3;3;3"
420
  },
421
  {
422
- "model": "llama-4-maverick-17b-128e-instruct-fp8",
423
  "score": 0,
424
- "detail_box_on_target": "0;0;0",
425
- "cracked_levels": "0;0;0"
426
  },
427
  {
428
- "model": "gpt-4.1-2025-04-14",
429
  "score": 0,
430
- "detail_box_on_target": "0;0;0",
431
- "cracked_levels": "0;0;0"
432
  },
433
  {
434
- "model": "gpt-4o-2024-11-20",
435
  "score": 0,
436
- "detail_box_on_target": "0;0;0",
437
- "cracked_levels": "0;0;0"
438
  },
439
  {
440
- "model": "o1-2024-12-17",
441
  "score": 2.33,
442
- "detail_box_on_target": "2;2;3",
443
- "cracked_levels": "1;1;2"
444
  },
445
  {
446
- "model": "o1-mini-2024-09-12",
447
  "score": 1.33,
448
- "detail_box_on_target": "1;2;1",
449
- "cracked_levels": "0;1;0"
450
  },
451
  {
452
- "model": "o3-2025-04-16",
453
  "score": 8,
454
- "detail_box_on_target": "10;6",
455
- "cracked_levels": "5;3"
456
  },
457
  {
458
- "model": "o4-mini-2025-04-16",
459
  "score": 5.33,
460
- "detail_box_on_target": "4;6;6",
461
- "cracked_levels": "2;2;3"
462
  },
463
  {
464
- "model": "Random (x30)",
465
  "score": 0,
466
- "detail_box_on_target": "0,0,0",
467
  "cracked_levels": "0,0,0"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
468
  }
469
  ]
470
  },
@@ -472,88 +486,103 @@
472
  "runs": 1,
473
  "results": [
474
  {
475
- "model": "claude-3-5-sonnet-20241022",
476
  "score": 2,
477
  "progress": "1:2/5",
478
  "evaluator result": "1/3"
479
  },
480
  {
481
- "model": "claude-3-7-sonnet-20250219 (thinking)",
482
  "score": 7,
483
  "progress": "2:2/9",
484
  "evaluator result": "5/11"
485
  },
486
  {
487
- "model": "deepseek-r1",
488
  "score": 0,
489
  "progress": "0",
490
  "evaluator result": "1/5"
491
  },
492
  {
493
- "model": "gemini-2.5-flash-preview-04-17 (thinking)",
494
  "score": 4,
495
  "progress": "1:4/5",
496
  "evaluator result": "1/7"
497
  },
498
  {
499
- "model": "gemini-2.5-pro-preview-05-06 (thinking)",
500
  "score": 7,
501
  "progress": "2:2/9",
502
  "evaluator result": "2/3"
503
  },
504
  {
505
- "model": "grok-3-mini-beta (thinking)",
506
  "score": 0,
507
  "progress": "0",
508
  "evaluator result": "0"
509
  },
510
  {
511
- "model": "llama-4-maverick-17b-128e-instruct-fp8",
512
  "score": 0,
513
  "progress": "0",
514
  "evaluator result": "0"
515
  },
516
  {
517
- "model": "gpt-4.1-2025-04-14",
518
  "score": 2,
519
  "progress": "1:2/5",
520
  "evaluator result": "2/3"
521
  },
522
  {
523
- "model": "gpt-4o-2024-11-20",
524
  "score": 0,
525
  "progress": "0",
526
  "evaluator result": "0"
527
  },
528
  {
529
- "model": "o1-2024-12-17",
530
  "score": 16,
531
  "progress": "3: 2/8",
532
  "evaluator result": "6/11"
533
  },
534
  {
535
- "model": "o1-mini-2024-09-12",
536
  "score": 0,
537
  "progress": "0",
538
  "evaluator result": "1/5"
539
  },
540
  {
541
- "model": "o3-2025-04-16",
542
  "score": 16,
543
  "progress": "3: 2/8",
544
  "evaluator result": "1/2"
545
  },
546
  {
547
- "model": "o4-mini-2025-04-16",
548
  "score": 4,
549
  "progress": "1:4/5",
550
  "evaluator result": "2/5"
551
  },
552
  {
553
- "model": "Random (x30)",
554
  "score": 0,
555
  "progress": "0",
556
  "evaluator result": "0"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
557
  }
558
  ]
559
  }
 
1
  {
2
  "Super Mario Bros": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  "runs": 3,
4
  "results": [
5
  {
6
+ "model": "gamingagent + claude-3-5-sonnet-20241022",
7
  "score": 1267.7,
8
+ "detail_data":"709,1532,1562",
9
  "progress": "1-1"
10
  },
11
  {
12
+ "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
13
  "score": 1418.7,
14
+ "detail_data":"2015,709,1532",
15
  "progress": "1-1"
16
  },
17
  {
18
+ "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
19
  "score": 1385.0,
20
+ "detail_data":"1672,1266,1247",
21
  "progress": "1-1"
22
  },
23
  {
24
+ "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
25
  "score": 1498.3,
26
+ "detail_data":"1561,1271,1663",
27
  "progress": "1-1"
28
  },
29
  {
30
+ "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
31
  "score": 1468.7,
32
+ "detail_data":"898,2008,1500",
33
  "progress": "1-1"
34
  },
35
  {
36
+ "model": "gamingagent + gpt-4.1-2025-04-14",
37
  "score": 2126.3,
38
+ "detail_data":"1531,722,4126",
39
  "progress": "1-1"
40
  },
41
  {
42
+ "model": "gamingagent + gpt-4o-2024-11-20",
43
  "score": 2047.3,
44
+ "detail_data":"2017,2590,1535",
45
  "progress": "1-1"
46
  },
47
  {
48
+ "model": "gamingagent + o1-2024-12-17",
49
  "score": 855,
50
+ "detail_data":"855",
51
  "progress": "1-1"
52
  },
53
  {
54
+ "model": "gamingagent + o3-2025-04-16",
55
  "score": 3445,
56
+ "detail_data":"3445",
57
  "progress": "1-1"
58
  },
59
  {
60
+ "model": "gamingagent + o4-mini-2025-04-16",
61
  "score": 1448.0,
62
+ "detail_data":"1525,1263,1556",
63
  "progress": "1-1"
64
  },
65
  {
66
+ "model": "random (x30)",
67
  "score": 986.97,
68
+ "detail_data":"986.97",
69
  "progress": "1-1"
70
  }
71
  ]
 
74
  "runs": 3,
75
  "results": [
76
  {
77
+ "model": "gamingagent + claude-3-5-sonnet-20241022",
78
+ "score": 1914.67,
79
+ "details": "1352,2860,1532",
80
+ "highest_tail": 256
81
  },
82
  {
83
+ "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
84
+ "score": 2624,
85
+ "details": "2560,3224,2088",
86
  "highest_tail": 256
87
  },
88
  {
89
+ "model": "gamingagent + deepseek-r1-0120",
90
+ "score": 1873.33,
91
+ "details": "700,1240,3680",
92
+ "highest_tail": 256
93
  },
94
  {
95
+ "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
96
+ "score": 1697.33,
97
+ "details": "1304,1316,2472",
98
  "highest_tail": 256
99
  },
100
  {
101
+ "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
102
+ "score": 3586.67,
103
+ "details": "5300,2400,3060",
104
+ "highest_tail": 512
105
  },
106
  {
107
+ "model": "gamingagent + grok-3-mini-beta (thinking)",
108
+ "score": 4036,
109
+ "details": "6412,2492,3204",
110
+ "highest_tail": 512
111
  },
112
  {
113
+ "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
114
+ "score": 1586.67,
115
+ "details": "1404,1272,2084",
116
  "highest_tail": 128
117
  },
118
  {
119
+ "model": "gamingagent + gpt-4.1-2025-04-14",
120
+ "score": 1656,
121
+ "details": "1156,2664,1148",
122
+ "highest_tail": 256
123
  },
124
  {
125
+ "model": "gamingagent + gpt-4o-2024-11-20",
126
+ "score": 1656,
127
+ "details": "1604,1284,2080",
128
  "highest_tail": 256
129
  },
130
  {
131
+ "model": "gamingagent + o1-2024-12-17",
132
+ "score": 7580,
133
+ "details": "7580",
134
  "highest_tail": 512
135
  },
136
  {
137
+ "model": "gamingagent + o1-mini-2024-09-12",
138
+ "score": 2757.33,
139
+ "details": "3132,2004,3136",
140
  "highest_tail": 256
141
  },
142
  {
143
+ "model": "gamingagent + o3-2025-04-16",
144
+ "score": 7120,
145
  "details": "7120",
146
  "highest_tail": 512
147
  },
148
  {
149
+ "model": "gamingagent + o4-mini-2025-04-16",
150
+ "score": 4432.0,
151
+ "details": "4928,5456,2912",
152
+ "highest_tail": 512
153
  },
154
  {
155
+ "model": "random (x30)",
156
+ "score": 1213.33,
157
  "details": "",
158
  "highest_tail": 128
159
+ },
 
 
 
 
 
160
  {
161
+ "model": "gamingagent + claude-opus-4-20250514",
162
+ "score": 3036.0,
163
+ "details": "3036.0",
164
+ "highest_tail": 256
165
  },
166
  {
167
+ "model": "gamingagent + claude-sonnet-4-20250514",
168
+ "score": 3136,
169
+ "details": "2148,2360,4900",
170
+ "highest_tail": 256
171
  },
172
  {
173
+ "model": "gamingagent + deepseek-r1-0528",
174
+ "score": 3330.0,
175
+ "details": "3260,3400",
176
+ "highest_tail": 256
177
  },
178
  {
179
+ "model": "gamingagent + qwen3-235B-A22B-fp8",
180
+ "score": 2144.0,
181
+ "details": "1436,2556,2440",
182
+ "highest_tail": 256
183
  }
184
  ]
185
  },
186
+ "Tetris": {
187
  "runs": 3,
188
  "results": [
189
  {
190
+ "model": "gamingagent + claude-3-5-sonnet-20241022",
191
  "score": 14.7,
192
+ "details": "16,14,14"
193
  },
194
  {
195
+ "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
196
  "score": 16.3,
197
+ "details": "19,15,15"
198
  },
199
  {
200
+ "model": "gamingagent + deepseek-r1-0120",
201
  "score": 14.3,
202
+ "details": "15,14,14"
203
  },
204
  {
205
+ "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
206
  "score": 16.3,
207
+ "details": "20,14,15"
208
  },
209
  {
210
+ "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
211
  "score": 23.3,
212
+ "details": "23,23,24"
213
  },
214
  {
215
+ "model": "gamingagent + grok-3-mini-beta (thinking)",
216
  "score": 21.3,
217
+ "details": "20,15,29"
218
  },
219
  {
220
+ "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
221
  "score": 10.3,
222
+ "details": "9,10,12"
223
  },
224
  {
225
+ "model": "gamingagent + gpt-4.1-2025-04-14",
226
  "score": 13.7,
227
+ "details": "13,14,14"
228
  },
229
  {
230
+ "model": "gamingagent + gpt-4o-2024-11-20",
231
  "score": 14,
232
+ "details": "18,11,13"
233
  },
234
  {
235
+ "model": "gamingagent + o1-2024-12-17",
236
  "score": 35,
237
  "details": "35"
238
  },
239
  {
240
+ "model": "gamingagent + o1-mini-2024-09-12",
241
  "score": 11.7,
242
+ "details": "11,11,13"
243
  },
244
  {
245
+ "model": "gamingagent + o3-2025-04-16",
246
  "score": 42,
247
  "details": "42"
248
  },
249
  {
250
+ "model": "gamingagent + o4-mini-2025-04-16",
251
  "score": 25.3,
252
+ "details": "22,35,19"
253
  },
254
  {
255
+ "model": "random (x30)",
256
  "score": 10.2,
257
  "details": ""
258
+ },
259
+ {
260
+ "model": "gamingagent + claude-opus-4-20250514",
261
+ "score": 20,
262
+ "details": "17,18,25"
263
+ },
264
+ {
265
+ "model": "gamingagent + claude-sonnet-4-20250514",
266
+ "score": 19.33,
267
+ "details": "20,17,21"
268
+ },
269
+ {
270
+ "model": "gamingagent + deepseek-r1-0528",
271
+ "score": 33.67,
272
+ "details": "26,34,41"
273
+ },
274
+ {
275
+ "model": "gamingagent + qwen3-235B-A22B-fp8",
276
+ "score": 11.67,
277
+ "details": "13,14,8"
278
  }
279
  ]
280
  },
 
282
  "runs": 3,
283
  "results": [
284
  {
285
+ "model": "gamingagent + claude-3-5-sonnet-20241022",
286
  "score": 106,
287
+ "details": "92,165,61"
288
  },
289
  {
290
+ "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
291
  "score": 484,
292
+ "details": "535,428,489"
293
  },
294
  {
295
+ "model": "gamingagent + deepseek-r1-0120",
296
  "score": 447.3,
297
+ "details": "409,436,497"
298
  },
299
  {
300
+ "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
301
  "score": 334.7,
302
+ "details": "259,372,373"
303
  },
304
  {
305
+ "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
306
  "score": 416.3,
307
+ "details": "411,414,424"
308
  },
309
  {
310
+ "model": "gamingagent + grok-3-mini-beta (thinking)",
311
  "score": 254,
312
+ "details": "299,332,131"
313
  },
314
  {
315
+ "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
316
  "score": 128.7,
317
+ "details": "67,139,180"
318
  },
319
  {
320
+ "model": "gamingagent + gpt-4.1-2025-04-14",
321
  "score": 182,
322
+ "details": "163,215,168"
323
  },
324
  {
325
+ "model": "gamingagent + gpt-4o-2024-11-20",
326
  "score": 147.3,
327
+ "details": "131,104,207"
328
  },
329
  {
330
+ "model": "gamingagent + o1-2024-12-17",
331
  "score": 159,
332
  "details": "159"
333
  },
334
  {
335
+ "model": "gamingagent + o1-mini-2024-09-12",
336
  "score": 48,
337
+ "details": "21,86,37"
338
  },
339
  {
340
+ "model": "gamingagent + o3-2025-04-16",
341
  "score": 647,
342
  "details": "647"
343
  },
344
  {
345
+ "model": "gamingagent + o4-mini-2025-04-16",
346
  "score": 487.3,
347
+ "details": "259,591,612"
348
  },
349
  {
350
+ "model": "random (x30)",
351
  "score": 116.5,
352
  "details": ""
353
+ },
354
+ {
355
+ "model": "gamingagent + claude-opus-4-20250514",
356
+ "score": 464,
357
+ "details": "593,406,393"
358
+ },
359
+ {
360
+ "model": "gamingagent + claude-sonnet-4-20250514",
361
+ "score": 478.33,
362
+ "details": "545,468,422"
363
+ },
364
+ {
365
+ "model": "gamingagent + deepseek-r1-0528",
366
+ "score": 491.67,
367
+ "details": "464,463,548"
368
+ },
369
+ {
370
+ "model": "gamingagent + qwen3-235B-A22B-fp8",
371
+ "score": 363.33,
372
+ "details": "365,372,353"
373
  }
374
  ]
375
  },
 
377
  "runs": 3,
378
  "results": [
379
  {
380
+ "model": "gamingagent + claude-3-5-sonnet-20241022",
381
  "score": 0,
382
+ "detail_box_on_target":"0,0,0",
383
+ "cracked_levels": "0,0,0"
384
  },
385
  {
386
+ "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
387
  "score": 2.33,
388
+ "detail_box_on_target":"2,4,1",
389
+ "cracked_levels": "1,2,0"
390
  },
391
  {
392
+ "model": "gamingagent + deepseek-r1-0120",
393
  "score": 1.33,
394
+ "detail_box_on_target":"2,0,2",
395
+ "cracked_levels": "1,0,1"
396
  },
397
  {
398
+ "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
399
  "score": 1.67,
400
+ "detail_box_on_target":"3,0,2",
401
+ "cracked_levels": "2,0,1"
402
  },
403
  {
404
+ "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
405
  "score": 4.33,
406
+ "detail_box_on_target":"4,4,5",
407
+ "cracked_levels": "2,2,3"
408
  },
409
  {
410
+ "model": "gamingagent + grok-3-mini-beta (thinking)",
411
  "score": 5.67,
412
+ "detail_box_on_target":"5,6,6",
413
+ "cracked_levels": "3,3,3"
414
  },
415
  {
416
+ "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
417
  "score": 0,
418
+ "detail_box_on_target":"0,0,0",
419
+ "cracked_levels": "0,0,0"
420
  },
421
  {
422
+ "model": "gamingagent + gpt-4.1-2025-04-14",
423
  "score": 0,
424
+ "detail_box_on_target":"0,0,0",
425
+ "cracked_levels": "0,0,0"
426
  },
427
  {
428
+ "model": "gamingagent + gpt-4o-2024-11-20",
429
  "score": 0,
430
+ "detail_box_on_target":"0,0,0",
431
+ "cracked_levels": "0,0,0"
432
  },
433
  {
434
+ "model": "gamingagent + o1-2024-12-17",
435
  "score": 2.33,
436
+ "detail_box_on_target":"2,2,3",
437
+ "cracked_levels": "1,1,2"
438
  },
439
  {
440
+ "model": "gamingagent + o1-mini-2024-09-12",
441
  "score": 1.33,
442
+ "detail_box_on_target":"1,2,1",
443
+ "cracked_levels": "0,1,0"
444
  },
445
  {
446
+ "model": "gamingagent + o3-2025-04-16",
447
  "score": 8,
448
+ "detail_box_on_target":"10,6",
449
+ "cracked_levels": "5,3"
450
  },
451
  {
452
+ "model": "gamingagent + o4-mini-2025-04-16",
453
  "score": 5.33,
454
+ "detail_box_on_target":"4,6,6",
455
+ "cracked_levels": "2,2,3"
456
  },
457
  {
458
+ "model": "random (x30)",
459
  "score": 0,
460
+ "detail_box_on_target":"0,0,0",
461
  "cracked_levels": "0,0,0"
462
+ },
463
+ {
464
+ "model": "gamingagent + claude-opus-4-20250514",
465
+ "score": 4,
466
+ "details": "4,4,4"
467
+ },
468
+ {
469
+ "model": "gamingagent + claude-sonnet-4-20250514",
470
+ "score": 3,
471
+ "details": "2,2,5"
472
+ },
473
+ {
474
+ "model": "gamingagent + deepseek-r1-0528",
475
+ "score": 4.67,
476
+ "details": "4,4,6"
477
+ },
478
+ {
479
+ "model": "gamingagent + qwen3-235B-A22B-fp8",
480
+ "score": 2.33,
481
+ "details": "1,2,4"
482
  }
483
  ]
484
  },
 
486
  "runs": 1,
487
  "results": [
488
  {
489
+ "model": "gamingagent + claude-3-5-sonnet-20241022",
490
  "score": 2,
491
  "progress": "1:2/5",
492
  "evaluator result": "1/3"
493
  },
494
  {
495
+ "model": "gamingagent + claude-3-7-sonnet-20250219 (thinking)",
496
  "score": 7,
497
  "progress": "2:2/9",
498
  "evaluator result": "5/11"
499
  },
500
  {
501
+ "model": "gamingagent + deepseek-r1-0120",
502
  "score": 0,
503
  "progress": "0",
504
  "evaluator result": "1/5"
505
  },
506
  {
507
+ "model": "gamingagent + gemini-2.5-flash-preview-04-17 (thinking)",
508
  "score": 4,
509
  "progress": "1:4/5",
510
  "evaluator result": "1/7"
511
  },
512
  {
513
+ "model": "gamingagent + gemini-2.5-pro-preview-05-06 (thinking)",
514
  "score": 7,
515
  "progress": "2:2/9",
516
  "evaluator result": "2/3"
517
  },
518
  {
519
+ "model": "gamingagent + grok-3-mini-beta (thinking)",
520
  "score": 0,
521
  "progress": "0",
522
  "evaluator result": "0"
523
  },
524
  {
525
+ "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
526
  "score": 0,
527
  "progress": "0",
528
  "evaluator result": "0"
529
  },
530
  {
531
+ "model": "gamingagent + gpt-4.1-2025-04-14",
532
  "score": 2,
533
  "progress": "1:2/5",
534
  "evaluator result": "2/3"
535
  },
536
  {
537
+ "model": "gamingagent + gpt-4o-2024-11-20",
538
  "score": 0,
539
  "progress": "0",
540
  "evaluator result": "0"
541
  },
542
  {
543
+ "model": "gamingagent + o1-2024-12-17",
544
  "score": 16,
545
  "progress": "3: 2/8",
546
  "evaluator result": "6/11"
547
  },
548
  {
549
+ "model": "gamingagent + o1-mini-2024-09-12",
550
  "score": 0,
551
  "progress": "0",
552
  "evaluator result": "1/5"
553
  },
554
  {
555
+ "model": "gamingagent + o3-2025-04-16",
556
  "score": 16,
557
  "progress": "3: 2/8",
558
  "evaluator result": "1/2"
559
  },
560
  {
561
+ "model": "gamingagent + o4-mini-2025-04-16",
562
  "score": 4,
563
  "progress": "1:4/5",
564
  "evaluator result": "2/5"
565
  },
566
  {
567
+ "model": "random (x30)",
568
  "score": 0,
569
  "progress": "0",
570
  "evaluator result": "0"
571
+ },
572
+ {
573
+ "model": "gamingagent + claude-opus-4-20250514",
574
+ "score": 6,
575
+ "details": "6"
576
+ },
577
+ {
578
+ "model": "gamingagent + claude-sonnet-4-20250514",
579
+ "score": 3.67,
580
+ "details": "3,4,4"
581
+ },
582
+ {
583
+ "model": "gamingagent + gemini-2.5-flash-preview-05-20",
584
+ "score": 4.33,
585
+ "details": "3,4,6"
586
  }
587
  ]
588
  }
rank_single_model_03_25_2025.json ADDED
@@ -0,0 +1,473 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Super Mario Bros": {
3
+ "runs": 3,
4
+ "results": [
5
+ {
6
+ "model": "claude-3-5-sonnet-20241022",
7
+ "score": 1540.0,
8
+ "detail_data":"1551,1515,1554",
9
+ "progress": "1-1"
10
+ },
11
+ {
12
+ "model": "claude-3-7-sonnet-20250219 (thinking)",
13
+ "score": 1430.0,
14
+ "detail_data":"1532,1515,1243",
15
+ "progress": "1-1"
16
+ },
17
+ {
18
+ "model": "gemini-2.5-flash-preview-04-17 (thinking)",
19
+ "score": 1540.7,
20
+ "detail_data":"1794,1270,1558",
21
+ "progress": "1-1"
22
+ },
23
+ {
24
+ "model": "gemini-2.5-pro-preview-05-06 (thinking)",
25
+ "score": 1025.3,
26
+ "detail_data":"820,1534,722",
27
+ "progress": "1-1"
28
+ },
29
+ {
30
+ "model": "llama-4-maverick-17b-128e-instruct-fp8",
31
+ "score": 786.0,
32
+ "detail_data":"837,300,1221",
33
+ "progress": "1-1"
34
+ },
35
+ {
36
+ "model": "gpt-4.1-2025-04-14",
37
+ "score": 1991.3,
38
+ "detail_data":"1563,1257,3154",
39
+ "progress": "1-1"
40
+ },
41
+ {
42
+ "model": "gpt-4o-2024-11-20",
43
+ "score": 1028.3,
44
+ "detail_data":"1565,297,1223",
45
+ "progress": "1-1"
46
+ },
47
+ {
48
+ "model": "o1-2024-12-17",
49
+ "score": 1434.0,
50
+ "detail_data":"1434",
51
+ "progress": "1-1"
52
+ },
53
+ {
54
+ "model": "o3-2025-04-16",
55
+ "score": 1955.0,
56
+ "detail_data":"1955",
57
+ "progress": "1-1"
58
+ },
59
+ {
60
+ "model": "o4-mini-2025-04-16",
61
+ "score": 1348.3,
62
+ "detail_data":"1554,1245,1246",
63
+ "progress": "1-1"
64
+ },
65
+ {
66
+ "model": "random (x30)",
67
+ "score": 986.97,
68
+ "detail_data":"986.97",
69
+ "progress": "1-1"
70
+ }
71
+ ]
72
+ },
73
+ "2048": {
74
+ "runs": 3,
75
+ "results": [
76
+ {
77
+ "model": "claude-3-5-sonnet-20241022",
78
+ "score": 17.0,
79
+ "details": "188,20,44",
80
+ "highest_tail": 32
81
+ },
82
+ {
83
+ "model": "claude-3-7-sonnet-20250219 (thinking)",
84
+ "score": 126.3,
85
+ "details": "1596,4256,3008",
86
+ "highest_tail": 512
87
+ },
88
+ {
89
+ "model": "gemini-2.5-flash-preview-04-17 (thinking)",
90
+ "score": 97.7,
91
+ "details": "2228,1424,1564",
92
+ "highest_tail": 256
93
+ },
94
+ {
95
+ "model": "gemini-2.5-pro-preview-05-06 (thinking)",
96
+ "score": 120.5,
97
+ "details": "5784,3544,3704",
98
+ "highest_tail": 512
99
+ },
100
+ {
101
+ "model": "llama-4-maverick-17b-128e-instruct-fp8",
102
+ "score": 44.6,
103
+ "details": "16,56,12",
104
+ "highest_tail": 64
105
+ },
106
+ {
107
+ "model": "gpt-4.1-2025-04-14",
108
+ "score": 94.5,
109
+ "details": "264,500,2576",
110
+ "highest_tail": 256
111
+ },
112
+ {
113
+ "model": "gpt-4o-2024-11-20",
114
+ "score": 70.4,
115
+ "details": "292,196,40",
116
+ "highest_tail": 32
117
+ },
118
+ {
119
+ "model": "o1-2024-12-17",
120
+ "score": 128.1,
121
+ "details": "7176",
122
+ "highest_tail": 512
123
+ },
124
+ {
125
+ "model": "o3-2025-04-16",
126
+ "score": 128.2,
127
+ "details": "7220",
128
+ "highest_tail": 512
129
+ },
130
+ {
131
+ "model": "o4-mini-2025-04-16",
132
+ "score": 97.6,
133
+ "details": "3004,84,2560",
134
+ "highest_tail": 256
135
+ },
136
+ {
137
+ "model": "random (x30)",
138
+ "score": 100.4,
139
+ "details": "",
140
+ "highest_tail": 128
141
+ },
142
+ {
143
+ "model": "gemini-2.5-flash-preview-05-20",
144
+ "score": 2750,
145
+ "details": "3128, 2758, 2364",
146
+ "highest_tail": 128
147
+ },
148
+ {
149
+ "model": "claude-sonnet-4-20250514",
150
+ "score": 3844,
151
+ "details": "3280,5024,3228",
152
+ "highest_tail": 512
153
+ },
154
+ {
155
+ "model": "gemini-2.5-pro-preview-06-05",
156
+ "score": 1,
157
+ "details": "3232,x,1628",
158
+ "highest_tail": 128
159
+ }
160
+ ]
161
+ },
162
+ "Tetris": {
163
+ "runs": 3,
164
+ "results": [
165
+ {
166
+ "model": "claude-3-5-sonnet-20241022",
167
+ "score": 12.3,
168
+ "details": "10,15,12"
169
+ },
170
+ {
171
+ "model": "claude-3-7-sonnet-20250219 (thinking)",
172
+ "score": 13.0,
173
+ "details": "13,13,13"
174
+ },
175
+ {
176
+ "model": "gemini-2.5-flash-preview-04-17 (thinking)",
177
+ "score": 19.0,
178
+ "details": "15,18,24"
179
+ },
180
+ {
181
+ "model": "gemini-2.5-pro-preview-05-06 (thinking)",
182
+ "score": 12.3,
183
+ "details": "15,9,13"
184
+ },
185
+ {
186
+ "model": "llama-4-maverick-17b-128e-instruct-fp8",
187
+ "score": 11.7,
188
+ "details": "13,11,11"
189
+ },
190
+ {
191
+ "model": "gpt-4.1-2025-04-14",
192
+ "score": 13.0,
193
+ "details": "11,14,14"
194
+ },
195
+ {
196
+ "model": "gpt-4o-2024-11-20",
197
+ "score": 14.7,
198
+ "details": "14,17,13"
199
+ },
200
+ {
201
+ "model": "o1-2024-12-17",
202
+ "score": 13.0,
203
+ "details": "13.0"
204
+ },
205
+ {
206
+ "model": "o3-2025-04-16",
207
+ "score": 31.0,
208
+ "details": "31.0"
209
+ },
210
+ {
211
+ "model": "o4-mini-2025-04-16",
212
+ "score": 15.0,
213
+ "details": "14,12,19"
214
+ },
215
+ {
216
+ "model": "random (x30)",
217
+ "score": 10.2,
218
+ "details": ""
219
+ },
220
+ {
221
+ "model": "gemini-2.5-flash-preview-05-20",
222
+ "score": 16,
223
+ "details": "15,15,18"
224
+ },
225
+ {
226
+ "model": "claude-sonnet-4-20250514",
227
+ "score": 13.67,
228
+ "details": "15,12,14"
229
+ },
230
+ {
231
+ "model": "gemini-2.5-pro-preview-06-05",
232
+ "score": 12,
233
+ "details": "12"
234
+ }
235
+ ]
236
+ },
237
+ "Candy Crush": {
238
+ "runs": 3,
239
+ "results": [
240
+ {
241
+ "model": "claude-3-5-sonnet-20241022",
242
+ "score": 17.0,
243
+ "details": "15,36,0"
244
+ },
245
+ {
246
+ "model": "claude-3-7-sonnet-20250219 (thinking)",
247
+ "score": 126.3,
248
+ "details": "148,182,49"
249
+ },
250
+ {
251
+ "model": "gemini-2.5-flash-preview-04-17 (thinking)",
252
+ "score": 97.7,
253
+ "details": "60,101,132"
254
+ },
255
+ {
256
+ "model": "gemini-2.5-pro-preview-05-06 (thinking)",
257
+ "score": 177.3,
258
+ "details": "117,169,246"
259
+ },
260
+ {
261
+ "model": "llama-4-maverick-17b-128e-instruct-fp8",
262
+ "score": 32.3,
263
+ "details": "18,79,0"
264
+ },
265
+ {
266
+ "model": "gpt-4.1-2025-04-14",
267
+ "score": 101.0,
268
+ "details": "13,238,52"
269
+ },
270
+ {
271
+ "model": "gpt-4o-2024-11-20",
272
+ "score": 59.0,
273
+ "details": "29,122,26"
274
+ },
275
+ {
276
+ "model": "o1-2024-12-17",
277
+ "score": 90.0,
278
+ "details": "90.0"
279
+ },
280
+ {
281
+ "model": "o3-2025-04-16",
282
+ "score": 106.0,
283
+ "details": "106.0"
284
+ },
285
+ {
286
+ "model": "o4-mini-2025-04-16",
287
+ "score": 110.7,
288
+ "details": "259,591,612"
289
+ },
290
+ {
291
+ "model": "random (x30)",
292
+ "score": 116.5,
293
+ "details": ""
294
+ },
295
+ {
296
+ "model": "gemini-2.5-flash-preview-05-20",
297
+ "score": 254,
298
+ "details": "276,191,295"
299
+ },
300
+ {
301
+ "model": "claude-sonnet-4-20250514",
302
+ "score": 557.67,
303
+ "details": "503, 557, 613"
304
+ },
305
+ {
306
+ "model": "gemini-2.5-pro-preview-06-05",
307
+ "score": 496,
308
+ "details": "461,556,471"
309
+ }
310
+ ]
311
+ },
312
+ "Sokoban": {
313
+ "runs": 3,
314
+ "results": [
315
+ {
316
+ "model": "claude-3-5-sonnet-20241022",
317
+ "score": 0,
318
+ "detail_box_on_target":"0,0,0",
319
+ "cracked_levels": "0,0,0"
320
+ },
321
+ {
322
+ "model": "claude-3-7-sonnet-20250219 (thinking)",
323
+ "score": 0,
324
+ "detail_box_on_target":"0,0,0",
325
+ "cracked_levels": "0,0,0"
326
+ },
327
+ {
328
+ "model": "gemini-2.5-flash-preview-04-17 (thinking)",
329
+ "score": 0,
330
+ "detail_box_on_target":"0,0,0",
331
+ "cracked_levels": "0,0,0"
332
+ },
333
+ {
334
+ "model": "gemini-2.5-pro-preview-05-06 (thinking)",
335
+ "score": 1,
336
+ "detail_box_on_target":"1,1,1",
337
+ "cracked_levels": "0,0,0"
338
+ },
339
+ {
340
+ "model": "llama-4-maverick-17b-128e-instruct-fp8",
341
+ "score": 0,
342
+ "detail_box_on_target":"0,0,0",
343
+ "cracked_levels": "0,0,0"
344
+ },
345
+ {
346
+ "model": "gpt-4.1-2025-04-14",
347
+ "score": 0,
348
+ "detail_box_on_target":"0,0,0",
349
+ "cracked_levels": "0,0,0"
350
+ },
351
+ {
352
+ "model": "gpt-4o-2024-11-20",
353
+ "score": 0,
354
+ "detail_box_on_target":"0,0,0",
355
+ "cracked_levels": "0,0,0"
356
+ },
357
+ {
358
+ "model": "o1-2024-12-17",
359
+ "score": 0,
360
+ "detail_box_on_target":"0",
361
+ "cracked_levels": "0"
362
+ },
363
+ {
364
+ "model": "o3-2025-04-16",
365
+ "score": 2,
366
+ "detail_box_on_target":"2",
367
+ "cracked_levels": "1"
368
+ },
369
+ {
370
+ "model": "o4-mini-2025-04-16",
371
+ "score": 1.33,
372
+ "detail_box_on_target":"1,2,1",
373
+ "cracked_levels": "0,1,0"
374
+ },
375
+ {
376
+ "model": "random (x30)",
377
+ "score": 0,
378
+ "detail_box_on_target":"0,0,0",
379
+ "cracked_levels": "0,0,0"
380
+ },
381
+ {
382
+ "model": "claude-sonnet-4-20250514",
383
+ "score": 0,
384
+ "detail_box_on_target":"0,0,0",
385
+ "cracked_levels": "0,0,0"
386
+ },
387
+ {
388
+ "model": "gemini-2.5-pro-preview-06-05",
389
+ "score": 0.33,
390
+ "detail_box_on_target": "0,0,1",
391
+ "cracked_levels": "0,0,0"
392
+ },
393
+ {
394
+ "model": "gemini-2.5-flash-preview-05-20",
395
+ "score": 0,
396
+ "detail_box_on_target": "0,0,0",
397
+ "cracked_levels": "0,0,0"
398
+ }
399
+ ]
400
+ },
401
+ "Ace Attorney": {
402
+ "runs": 1,
403
+ "results": [
404
+ {
405
+ "model": "claude-3-5-sonnet-20241022",
406
+ "score": 1,
407
+ "progress": "1:1/5"
408
+ },
409
+ {
410
+ "model": "claude-3-7-sonnet-20250219 (thinking)",
411
+ "score": 3,
412
+ "progress": "1:3/5"
413
+ },
414
+ {
415
+ "model": "gemini-2.5-flash-preview-04-17 (thinking)",
416
+ "score": 1,
417
+ "progress": "1:1/5"
418
+ },
419
+ {
420
+ "model": "gemini-2.5-pro-preview-05-06 (thinking)",
421
+ "score": 8,
422
+ "progress": "2:3/9"
423
+ },
424
+ {
425
+ "model": "llama-4-maverick-17b-128e-instruct-fp8",
426
+ "score": 0,
427
+ "progress": "0"
428
+ },
429
+ {
430
+ "model": "gpt-4.1-2025-04-14",
431
+ "score": 0,
432
+ "progress": "0"
433
+ },
434
+ {
435
+ "model": "gpt-4o-2024-11-20",
436
+ "score": 0,
437
+ "progress": "0"
438
+ },
439
+ {
440
+ "model": "o1-2024-12-17",
441
+ "score": 3,
442
+ "progress": "1:3/5"
443
+ },
444
+ {
445
+ "model": "o3-2025-04-16",
446
+ "score": 8,
447
+ "progress": "2:3/9"
448
+ },
449
+ {
450
+ "model": "o4-mini-2025-04-16",
451
+ "score": 2,
452
+ "progress": "1:2/5"
453
+ },
454
+ {
455
+ "model": "random (x30)",
456
+ "score": 0,
457
+ "progress": "0"
458
+ },
459
+ {
460
+ "model": "gemini-2.5-flash-preview-05-20",
461
+ "score": 2.33,
462
+ "details": "1,4,2",
463
+ "progress": "1:4/5"
464
+ },
465
+ {
466
+ "model": "claude-sonnet-4-20250514",
467
+ "score": 1.33,
468
+ "details": "0,2,2",
469
+ "progress": "1:2/5"
470
+ }
471
+ ]
472
+ }
473
+ }