Yuxuan-Zhang-Dexter committed on
Commit
0d20259
·
1 Parent(s): b3f1539

switch agent and model leaderboard tabs and rename instances in the agent leaderboard

Browse files
Files changed (3) hide show
  1. app.py +250 -210
  2. assets/model_color.json +27 -27
  3. rank_data_03_25_2025.json +94 -94
app.py CHANGED
@@ -935,187 +935,166 @@ def build_app():
935
  """)
936
 
937
  with gr.Tabs():
938
- with gr.Tab("๐Ÿ† Agent Leaderboard"):
939
- # Visualization section
940
-
941
  with gr.Row():
942
  gr.Markdown("""
943
  **๐ŸŽฎ Welcome to LMGame Bench!**
944
 
945
- We welcome everyone to implement their own gaming agents by replacing our baseAgent in `customer_runner.py` and test them on our benchmark. Join the competition and see how your agent performs!
946
  """, elem_classes="welcome-message")
947
-
 
948
  with gr.Row():
949
  gr.Markdown("### ๐Ÿ“Š Data Visualization")
950
 
951
  # Detailed view visualization (single chart)
952
- detailed_visualization = gr.Plot(
953
  label="Performance Visualization",
954
  visible=False,
955
  elem_classes="visualization-container"
956
  )
957
- # with gr.Row():
958
- # # Calculate dynamic maximum based on total models
959
- # agent_max_models = get_total_model_count(rank_data)
960
- # top_n_slider = gr.Slider(
961
- # minimum=1,
962
- # maximum=agent_max_models,
963
- # step=1,
964
- # value=min(3, agent_max_models),
965
- # label=f"Number of Top Models to Display in All Views (max: {agent_max_models})",
966
- # elem_classes="top-n-slider"
967
- # )
968
 
 
 
 
 
 
 
 
 
 
 
 
969
 
 
970
 
971
- with gr.Column(visible=True) as overall_visualizations:
972
  with gr.Tabs():
973
  with gr.Tab("๐Ÿ“ˆ Radar Chart"):
974
-
975
- radar_visualization = gr.Plot(
976
  label="Comparative Analysis (Radar Chart)",
977
  elem_classes="visualization-container"
978
  )
979
  gr.Markdown(
980
- "*๐Ÿ’ก Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*\n\n*๐ŸŽฎ GamingAgent - Our specialized gaming agents*",
981
  elem_classes="radar-tip"
982
  )
983
  with gr.Tab("๐Ÿ“Š Group Bar Chart"):
984
- group_bar_visualization = gr.Plot(
985
  label="Comparative Analysis (Group Bar Chart)",
986
  elem_classes="visualization-container"
987
  )
988
  gr.Markdown(
989
- "*๐Ÿ’ก Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*\n\n*๐ŸŽฎ GamingAgent - Our specialized gaming agents*",
990
  elem_classes="radar-tip"
991
  )
992
 
993
- # Hidden placeholder for group bar visualization (to maintain code references)
994
- # group_bar_visualization = gr.Plot(visible=False)
995
-
996
  # Game selection section
997
  with gr.Row():
998
  gr.Markdown("### ๐Ÿ•น๏ธ Game Selection")
999
  with gr.Row():
1000
- # with gr.Column(): # Commented out Super Mario BrosUI
1001
- # gr.Markdown("**๐ŸŽฎ Super Mario Bros**")
1002
- # mario_overall = gr.Checkbox(label="Super Mario BrosScore", value=True)
1003
- # mario_details = gr.Checkbox(label="Super Mario BrosDetails", value=False)
1004
- with gr.Column(): # Added Super Mario BrosUI
1005
  gr.Markdown("**๐Ÿ„ Super Mario Bros**")
1006
- mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
1007
- mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
1008
- with gr.Column(): # Sokoban is now after mario_plan
1009
  gr.Markdown("**๐Ÿ“ฆ Sokoban**")
1010
- sokoban_overall = gr.Checkbox(label="Sokoban Score", value=True)
1011
- sokoban_details = gr.Checkbox(label="Sokoban Details", value=False)
1012
  with gr.Column():
1013
  gr.Markdown("**๐Ÿ”ข 2048**")
1014
- _2048_overall = gr.Checkbox(label="2048 Score", value=True)
1015
- _2048_details = gr.Checkbox(label="2048 Details", value=False)
1016
  with gr.Column():
1017
  gr.Markdown("**๐Ÿฌ Candy Crush**")
1018
- candy_overall = gr.Checkbox(label="Candy Crush Score", value=True)
1019
- candy_details = gr.Checkbox(label="Candy Crush Details", value=False)
1020
- # with gr.Column(): # Commented out Tetris(complete) UI
1021
- # gr.Markdown("**๐ŸŽฏ Tetris(complete)**")
1022
- # tetris_overall = gr.Checkbox(label="Tetris(complete) Score", value=True)
1023
- # tetris_details = gr.Checkbox(label="Tetris(complete) Details", value=False)
1024
  with gr.Column():
1025
  gr.Markdown("**๐ŸŽฏ Tetris**")
1026
- tetris_plan_overall = gr.Checkbox(label="Tetris Score", value=True)
1027
- tetris_plan_details = gr.Checkbox(label="Tetris Details", value=False)
1028
  with gr.Column():
1029
  gr.Markdown("**โš–๏ธ Ace Attorney**")
1030
- ace_attorney_overall = gr.Checkbox(label="Ace Attorney Score", value=True)
1031
- ace_attorney_details = gr.Checkbox(label="Ace Attorney Details", value=False)
1032
 
1033
  # Controls
1034
  with gr.Row():
1035
  with gr.Column(scale=2):
1036
  gr.Markdown("**โฐ Time Tracker**")
1037
- timeline = create_timeline_slider()
1038
  with gr.Column(scale=1):
1039
  gr.Markdown("**๐Ÿ”„ Controls**")
1040
- clear_btn = gr.Button("Reset Filters", variant="secondary")
1041
 
1042
  # Leaderboard table
1043
  with gr.Row():
1044
  gr.Markdown("### ๐Ÿ“‹ Detailed Results")
1045
  with gr.Row():
1046
- gr.Markdown("*๐ŸŽฎ GamingAgent - Our specialized gaming agents*", elem_classes="radar-tip")
1047
-
1048
- # Welcome message for custom gaming agents
1049
-
1050
- # Add reference to Jupyter notebook
1051
- with gr.Row():
1052
- gr.Markdown("*All data analysis can be replicated by checking [this Jupyter notebook](https://colab.research.google.com/drive/1CYFiJGm3EoBXXI8vICPVR82J9qrmmRvc#scrollTo=qft1Oald-21J)*")
1053
 
1054
- # Get initial leaderboard dataframe (limited by default slider value for agent leaderboard)
1055
- initial_df = get_combined_leaderboard(rank_data, {
1056
- # "Super Mario Bros": True, # Commented out
1057
  "Super Mario Bros": True,
1058
  "Sokoban": True,
1059
  "2048": True,
1060
  "Candy Crush": True,
1061
- # "Tetris(complete)": True, # Commented out
1062
  "Tetris": True,
1063
  "Ace Attorney": True
1064
- }, limit_to_top_n=5)
1065
 
1066
  # Format the DataFrame for display
1067
- initial_display_df = prepare_dataframe_for_display(initial_df)
1068
 
1069
- # Custom column widths including row numbers
1070
- col_widths = ["40px"] # Row number column width
1071
- col_widths.append("230px") # Player column - reduced by 20px
1072
- col_widths.append("120px") # Organization column
1073
 
1074
  # Check if there's an Avg Normalized Score column
1075
- if any('Avg Normalized' in col for col in initial_display_df.columns):
1076
- col_widths.append("140px") # Avg Normalized Score column - slightly wider
1077
 
1078
  # Add game score columns
1079
- remaining_cols = len(initial_display_df.columns) - len(col_widths) + 1 # +1 because we subtracted row number column
1080
  for _ in range(remaining_cols):
1081
- col_widths.append("120px")
1082
 
1083
  # Create a standard DataFrame component with enhanced styling
1084
  with gr.Row():
1085
- leaderboard_df = gr.DataFrame(
1086
- value=initial_display_df,
1087
  interactive=True,
1088
- elem_id="leaderboard-table",
1089
  elem_classes="table-container",
1090
  wrap=True,
1091
  show_row_numbers=True,
1092
  show_fullscreen_button=True,
1093
  line_breaks=True,
1094
- max_height=1000, # Set a larger fixed height
1095
  show_search="search",
1096
- column_widths=col_widths
1097
  )
1098
 
1099
  # Add the score note below the table
1100
  with gr.Row():
1101
- score_note = add_score_note()
1102
 
1103
- # List of all checkboxes, including Super Mario Bros
1104
- checkbox_list = [
1105
- # mario_overall, mario_details, # Commented out
1106
- mario_plan_overall, mario_plan_details,
1107
- sokoban_overall, sokoban_details,
1108
- _2048_overall, _2048_details,
1109
- candy_overall, candy_details,
1110
- # tetris_overall, tetris_details, # Commented out
1111
- tetris_plan_overall, tetris_plan_details,
1112
- ace_attorney_overall, ace_attorney_details
1113
  ]
1114
 
1115
  # Update visualizations when checkboxes change
1116
- def update_visualizations(*checkbox_states):
1117
  # Check if any details checkbox is selected
1118
- # Adjusted indices due to addition of Super Mario
1119
  is_details_view = any([
1120
  checkbox_states[1], # Mario Plan details
1121
  checkbox_states[3], # Sokoban details
@@ -1127,192 +1106,253 @@ def build_app():
1127
 
1128
  # Update visibility of visualization blocks
1129
  return {
1130
- detailed_visualization: gr.update(visible=is_details_view),
1131
- overall_visualizations: gr.update(visible=not is_details_view)
1132
  }
1133
 
1134
  # Add change event to all checkboxes
1135
- for checkbox in checkbox_list:
1136
  checkbox.change(
1137
- update_visualizations,
1138
- inputs=checkbox_list,
1139
- outputs=[detailed_visualization, overall_visualizations]
1140
  )
1141
 
1142
  # Update leaderboard and visualizations when checkboxes change
1143
- for checkbox in checkbox_list:
1144
  checkbox.change(
1145
- lambda *args: update_leaderboard(*args, top_n=5, data_source=rank_data),
1146
- inputs=checkbox_list,
1147
  outputs=[
1148
- leaderboard_df,
1149
- detailed_visualization,
1150
- radar_visualization,
1151
- group_bar_visualization
1152
- ] + checkbox_list
1153
  )
1154
 
 
 
 
 
 
 
 
 
 
 
 
 
1155
  # Update when clear button is clicked
1156
- clear_btn.click(
1157
- lambda: clear_filters(top_n=5, data_source=rank_data),
1158
- inputs=[],
1159
  outputs=[
1160
- leaderboard_df,
1161
- detailed_visualization,
1162
- radar_visualization,
1163
- group_bar_visualization
1164
- ] + checkbox_list
1165
  )
1166
 
1167
- # Initialize the agent leaderboard (with top 5 limit)
1168
  demo.load(
1169
- lambda: clear_filters(top_n=5, data_source=rank_data),
1170
  inputs=[],
1171
  outputs=[
1172
- leaderboard_df,
1173
- detailed_visualization,
1174
- radar_visualization,
1175
- group_bar_visualization
1176
- ] + checkbox_list
1177
  )
1178
 
1179
- with gr.Tab("๐Ÿค– Model Leaderboard"):
1180
  # Visualization section
 
 
 
 
 
 
 
 
 
 
 
 
 
1181
  with gr.Row():
1182
  gr.Markdown("### ๐Ÿ“Š Data Visualization")
1183
 
1184
  # Detailed view visualization (single chart)
1185
- model_detailed_visualization = gr.Plot(
1186
  label="Performance Visualization",
1187
  visible=False,
1188
  elem_classes="visualization-container"
1189
  )
 
 
 
 
 
 
 
 
 
 
 
1190
 
1191
- with gr.Row():
1192
- # Calculate dynamic maximum based on total models
1193
- model_max_models = get_total_model_count(model_rank_data)
1194
- model_top_n_slider = gr.Slider(
1195
- minimum=1,
1196
- maximum=model_max_models,
1197
- step=1,
1198
- value=model_max_models,
1199
- label=f"Number of Top Models to Display in All Views (max: {model_max_models})",
1200
- elem_classes="top-n-slider"
1201
- )
1202
 
1203
-
1204
 
1205
- with gr.Column(visible=True) as model_overall_visualizations:
1206
  with gr.Tabs():
1207
  with gr.Tab("๐Ÿ“ˆ Radar Chart"):
1208
- model_radar_visualization = gr.Plot(
 
1209
  label="Comparative Analysis (Radar Chart)",
1210
  elem_classes="visualization-container"
1211
  )
1212
  gr.Markdown(
1213
- "*๐Ÿ’ก Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*",
1214
  elem_classes="radar-tip"
1215
  )
1216
  with gr.Tab("๐Ÿ“Š Group Bar Chart"):
1217
- model_group_bar_visualization = gr.Plot(
1218
  label="Comparative Analysis (Group Bar Chart)",
1219
  elem_classes="visualization-container"
1220
  )
1221
  gr.Markdown(
1222
- "*๐Ÿ’ก Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*",
1223
  elem_classes="radar-tip"
1224
  )
1225
 
 
 
 
1226
  # Game selection section
1227
  with gr.Row():
1228
  gr.Markdown("### ๐Ÿ•น๏ธ Game Selection")
1229
  with gr.Row():
1230
- with gr.Column():
 
 
 
 
1231
  gr.Markdown("**๐Ÿ„ Super Mario Bros**")
1232
- model_mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
1233
- model_mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
1234
- with gr.Column():
1235
  gr.Markdown("**๐Ÿ“ฆ Sokoban**")
1236
- model_sokoban_overall = gr.Checkbox(label="Sokoban Score", value=True)
1237
- model_sokoban_details = gr.Checkbox(label="Sokoban Details", value=False)
1238
  with gr.Column():
1239
  gr.Markdown("**๐Ÿ”ข 2048**")
1240
- model_2048_overall = gr.Checkbox(label="2048 Score", value=True)
1241
- model_2048_details = gr.Checkbox(label="2048 Details", value=False)
1242
  with gr.Column():
1243
  gr.Markdown("**๐Ÿฌ Candy Crush**")
1244
- model_candy_overall = gr.Checkbox(label="Candy Crush Score", value=True)
1245
- model_candy_details = gr.Checkbox(label="Candy Crush Details", value=False)
 
 
 
 
1246
  with gr.Column():
1247
  gr.Markdown("**๐ŸŽฏ Tetris**")
1248
- model_tetris_plan_overall = gr.Checkbox(label="Tetris Score", value=True)
1249
- model_tetris_plan_details = gr.Checkbox(label="Tetris Details", value=False)
1250
  with gr.Column():
1251
  gr.Markdown("**โš–๏ธ Ace Attorney**")
1252
- model_ace_attorney_overall = gr.Checkbox(label="Ace Attorney Score", value=True)
1253
- model_ace_attorney_details = gr.Checkbox(label="Ace Attorney Details", value=False)
1254
 
1255
  # Controls
1256
  with gr.Row():
1257
  with gr.Column(scale=2):
1258
  gr.Markdown("**โฐ Time Tracker**")
1259
- model_timeline = create_timeline_slider()
1260
  with gr.Column(scale=1):
1261
  gr.Markdown("**๐Ÿ”„ Controls**")
1262
- model_clear_btn = gr.Button("Reset Filters", variant="secondary")
1263
 
1264
  # Leaderboard table
1265
  with gr.Row():
1266
  gr.Markdown("### ๐Ÿ“‹ Detailed Results")
1267
  with gr.Row():
1268
- gr.Markdown("*๐Ÿ’ก The slider above controls how many top models are shown in the radar chart, bar chart, and data table.*", elem_classes="radar-tip")
1269
 
1270
- # Get initial leaderboard dataframe (limited by default slider value for model leaderboard)
1271
- model_initial_df = get_combined_leaderboard(model_rank_data, {
 
 
 
 
 
 
 
1272
  "Super Mario Bros": True,
1273
  "Sokoban": True,
1274
  "2048": True,
1275
  "Candy Crush": True,
 
1276
  "Tetris": True,
1277
  "Ace Attorney": True
1278
- }, limit_to_top_n=None)
1279
 
1280
  # Format the DataFrame for display
1281
- model_initial_display_df = prepare_dataframe_for_display(model_initial_df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1282
 
1283
  # Create a standard DataFrame component with enhanced styling
1284
  with gr.Row():
1285
- model_leaderboard_df = gr.DataFrame(
1286
- value=model_initial_display_df,
1287
  interactive=True,
1288
- elem_id="model-leaderboard-table",
1289
  elem_classes="table-container",
1290
  wrap=True,
1291
  show_row_numbers=True,
1292
  show_fullscreen_button=True,
1293
  line_breaks=True,
1294
- max_height=1000,
1295
  show_search="search",
1296
  column_widths=col_widths
1297
  )
1298
 
1299
  # Add the score note below the table
1300
  with gr.Row():
1301
- model_score_note = add_score_note()
1302
 
1303
- # List of all checkboxes for model leaderboard
1304
- model_checkbox_list = [
1305
- model_mario_plan_overall, model_mario_plan_details,
1306
- model_sokoban_overall, model_sokoban_details,
1307
- model_2048_overall, model_2048_details,
1308
- model_candy_overall, model_candy_details,
1309
- model_tetris_plan_overall, model_tetris_plan_details,
1310
- model_ace_attorney_overall, model_ace_attorney_details
 
 
1311
  ]
1312
 
1313
  # Update visualizations when checkboxes change
1314
- def update_model_visualizations(*checkbox_states):
1315
  # Check if any details checkbox is selected
 
1316
  is_details_view = any([
1317
  checkbox_states[1], # Mario Plan details
1318
  checkbox_states[3], # Sokoban details
@@ -1324,65 +1364,65 @@ def build_app():
1324
 
1325
  # Update visibility of visualization blocks
1326
  return {
1327
- model_detailed_visualization: gr.update(visible=is_details_view),
1328
- model_overall_visualizations: gr.update(visible=not is_details_view)
1329
  }
1330
 
1331
  # Add change event to all checkboxes
1332
- for checkbox in model_checkbox_list:
1333
  checkbox.change(
1334
- update_model_visualizations,
1335
- inputs=model_checkbox_list,
1336
- outputs=[model_detailed_visualization, model_overall_visualizations]
1337
  )
1338
 
1339
  # Update leaderboard and visualizations when checkboxes change
1340
- for checkbox in model_checkbox_list:
1341
  checkbox.change(
1342
- lambda *args: update_leaderboard(*args, data_source=model_rank_data),
1343
- inputs=model_checkbox_list + [model_top_n_slider],
1344
  outputs=[
1345
- model_leaderboard_df,
1346
- model_detailed_visualization,
1347
- model_radar_visualization,
1348
- model_group_bar_visualization
1349
- ] + model_checkbox_list
1350
  )
1351
 
1352
- # Update when model top_n_slider changes
1353
- model_top_n_slider.change(
1354
- lambda *args: update_leaderboard(*args, data_source=model_rank_data),
1355
- inputs=model_checkbox_list + [model_top_n_slider],
1356
  outputs=[
1357
- model_leaderboard_df,
1358
- model_detailed_visualization,
1359
- model_radar_visualization,
1360
- model_group_bar_visualization
1361
- ] + model_checkbox_list
1362
  )
1363
 
1364
  # Update when clear button is clicked
1365
- model_clear_btn.click(
1366
- lambda *args: clear_filters(*args, data_source=model_rank_data),
1367
- inputs=[model_top_n_slider],
1368
  outputs=[
1369
- model_leaderboard_df,
1370
- model_detailed_visualization,
1371
- model_radar_visualization,
1372
- model_group_bar_visualization
1373
- ] + model_checkbox_list
1374
  )
1375
 
1376
- # Initialize the model leaderboard (with all models shown by default)
1377
  demo.load(
1378
- lambda: clear_filters(top_n=get_total_model_count(model_rank_data), data_source=model_rank_data),
1379
  inputs=[],
1380
  outputs=[
1381
- model_leaderboard_df,
1382
- model_detailed_visualization,
1383
- model_radar_visualization,
1384
- model_group_bar_visualization
1385
- ] + model_checkbox_list
1386
  )
1387
 
1388
  with gr.Tab("๐ŸŽฅ Gallery"):
 
935
  """)
936
 
937
  with gr.Tabs():
938
+
939
+
940
+ with gr.Tab("๐Ÿค– Model Leaderboard"):
941
  with gr.Row():
942
  gr.Markdown("""
943
  **๐ŸŽฎ Welcome to LMGame Bench!**
944
 
945
+ We invite developers to implement their own gaming agents by replacing our `baseAgent` in `customer_runner.py` and evaluate them on our comprehensive benchmark. Visit our repository at https://github.com/lmgame-org/GamingAgent to get started and join the competition to see how your agent performs!
946
  """, elem_classes="welcome-message")
947
+
948
+ # Visualization section
949
  with gr.Row():
950
  gr.Markdown("### ๐Ÿ“Š Data Visualization")
951
 
952
  # Detailed view visualization (single chart)
953
+ model_detailed_visualization = gr.Plot(
954
  label="Performance Visualization",
955
  visible=False,
956
  elem_classes="visualization-container"
957
  )
 
 
 
 
 
 
 
 
 
 
 
958
 
959
+ with gr.Row():
960
+ # Calculate dynamic maximum based on total models
961
+ model_max_models = get_total_model_count(model_rank_data)
962
+ model_top_n_slider = gr.Slider(
963
+ minimum=1,
964
+ maximum=model_max_models,
965
+ step=1,
966
+ value=model_max_models,
967
+ label=f"Number of Top Models to Display in All Views (max: {model_max_models})",
968
+ elem_classes="top-n-slider"
969
+ )
970
 
971
+
972
 
973
+ with gr.Column(visible=True) as model_overall_visualizations:
974
  with gr.Tabs():
975
  with gr.Tab("๐Ÿ“ˆ Radar Chart"):
976
+ model_radar_visualization = gr.Plot(
 
977
  label="Comparative Analysis (Radar Chart)",
978
  elem_classes="visualization-container"
979
  )
980
  gr.Markdown(
981
+ "*๐Ÿ’ก Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*",
982
  elem_classes="radar-tip"
983
  )
984
  with gr.Tab("๐Ÿ“Š Group Bar Chart"):
985
+ model_group_bar_visualization = gr.Plot(
986
  label="Comparative Analysis (Group Bar Chart)",
987
  elem_classes="visualization-container"
988
  )
989
  gr.Markdown(
990
+ "*๐Ÿ’ก Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*",
991
  elem_classes="radar-tip"
992
  )
993
 
 
 
 
994
  # Game selection section
995
  with gr.Row():
996
  gr.Markdown("### ๐Ÿ•น๏ธ Game Selection")
997
  with gr.Row():
998
+ with gr.Column():
 
 
 
 
999
  gr.Markdown("**๐Ÿ„ Super Mario Bros**")
1000
+ model_mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
1001
+ model_mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
1002
+ with gr.Column():
1003
  gr.Markdown("**๐Ÿ“ฆ Sokoban**")
1004
+ model_sokoban_overall = gr.Checkbox(label="Sokoban Score", value=True)
1005
+ model_sokoban_details = gr.Checkbox(label="Sokoban Details", value=False)
1006
  with gr.Column():
1007
  gr.Markdown("**๐Ÿ”ข 2048**")
1008
+ model_2048_overall = gr.Checkbox(label="2048 Score", value=True)
1009
+ model_2048_details = gr.Checkbox(label="2048 Details", value=False)
1010
  with gr.Column():
1011
  gr.Markdown("**๐Ÿฌ Candy Crush**")
1012
+ model_candy_overall = gr.Checkbox(label="Candy Crush Score", value=True)
1013
+ model_candy_details = gr.Checkbox(label="Candy Crush Details", value=False)
 
 
 
 
1014
  with gr.Column():
1015
  gr.Markdown("**๐ŸŽฏ Tetris**")
1016
+ model_tetris_plan_overall = gr.Checkbox(label="Tetris Score", value=True)
1017
+ model_tetris_plan_details = gr.Checkbox(label="Tetris Details", value=False)
1018
  with gr.Column():
1019
  gr.Markdown("**โš–๏ธ Ace Attorney**")
1020
+ model_ace_attorney_overall = gr.Checkbox(label="Ace Attorney Score", value=True)
1021
+ model_ace_attorney_details = gr.Checkbox(label="Ace Attorney Details", value=False)
1022
 
1023
  # Controls
1024
  with gr.Row():
1025
  with gr.Column(scale=2):
1026
  gr.Markdown("**โฐ Time Tracker**")
1027
+ model_timeline = create_timeline_slider()
1028
  with gr.Column(scale=1):
1029
  gr.Markdown("**๐Ÿ”„ Controls**")
1030
+ model_clear_btn = gr.Button("Reset Filters", variant="secondary")
1031
 
1032
  # Leaderboard table
1033
  with gr.Row():
1034
  gr.Markdown("### ๐Ÿ“‹ Detailed Results")
1035
  with gr.Row():
1036
+ gr.Markdown("*๐Ÿ’ก The slider above controls how many top models are shown in the radar chart, bar chart, and data table.*", elem_classes="radar-tip")
 
 
 
 
 
 
1037
 
1038
+ # Get initial leaderboard dataframe (limited by default slider value for model leaderboard)
1039
+ model_initial_df = get_combined_leaderboard(model_rank_data, {
 
1040
  "Super Mario Bros": True,
1041
  "Sokoban": True,
1042
  "2048": True,
1043
  "Candy Crush": True,
 
1044
  "Tetris": True,
1045
  "Ace Attorney": True
1046
+ }, limit_to_top_n=None)
1047
 
1048
  # Format the DataFrame for display
1049
+ model_initial_display_df = prepare_dataframe_for_display(model_initial_df)
1050
 
1051
+ # Custom column widths including row numbers for model leaderboard
1052
+ model_col_widths = ["40px"] # Row number column width
1053
+ model_col_widths.append("230px") # Player column - reduced by 20px
1054
+ model_col_widths.append("120px") # Organization column
1055
 
1056
  # Check if there's an Avg Normalized Score column
1057
+ if any('Avg Normalized' in col for col in model_initial_display_df.columns):
1058
+ model_col_widths.append("140px") # Avg Normalized Score column - slightly wider
1059
 
1060
  # Add game score columns
1061
+ remaining_cols = len(model_initial_display_df.columns) - len(model_col_widths) + 1 # +1 because we subtracted row number column
1062
  for _ in range(remaining_cols):
1063
+ model_col_widths.append("120px")
1064
 
1065
  # Create a standard DataFrame component with enhanced styling
1066
  with gr.Row():
1067
+ model_leaderboard_df = gr.DataFrame(
1068
+ value=model_initial_display_df,
1069
  interactive=True,
1070
+ elem_id="model-leaderboard-table",
1071
  elem_classes="table-container",
1072
  wrap=True,
1073
  show_row_numbers=True,
1074
  show_fullscreen_button=True,
1075
  line_breaks=True,
1076
+ max_height=1000,
1077
  show_search="search",
1078
+ column_widths=model_col_widths
1079
  )
1080
 
1081
  # Add the score note below the table
1082
  with gr.Row():
1083
+ model_score_note = add_score_note()
1084
 
1085
+ # List of all checkboxes for model leaderboard
1086
+ model_checkbox_list = [
1087
+ model_mario_plan_overall, model_mario_plan_details,
1088
+ model_sokoban_overall, model_sokoban_details,
1089
+ model_2048_overall, model_2048_details,
1090
+ model_candy_overall, model_candy_details,
1091
+ model_tetris_plan_overall, model_tetris_plan_details,
1092
+ model_ace_attorney_overall, model_ace_attorney_details
 
 
1093
  ]
1094
 
1095
  # Update visualizations when checkboxes change
1096
+ def update_model_visualizations(*checkbox_states):
1097
  # Check if any details checkbox is selected
 
1098
  is_details_view = any([
1099
  checkbox_states[1], # Mario Plan details
1100
  checkbox_states[3], # Sokoban details
 
1106
 
1107
  # Update visibility of visualization blocks
1108
  return {
1109
+ model_detailed_visualization: gr.update(visible=is_details_view),
1110
+ model_overall_visualizations: gr.update(visible=not is_details_view)
1111
  }
1112
 
1113
  # Add change event to all checkboxes
1114
+ for checkbox in model_checkbox_list:
1115
  checkbox.change(
1116
+ update_model_visualizations,
1117
+ inputs=model_checkbox_list,
1118
+ outputs=[model_detailed_visualization, model_overall_visualizations]
1119
  )
1120
 
1121
  # Update leaderboard and visualizations when checkboxes change
1122
+ for checkbox in model_checkbox_list:
1123
  checkbox.change(
1124
+ lambda *args: update_leaderboard(*args, data_source=model_rank_data),
1125
+ inputs=model_checkbox_list + [model_top_n_slider],
1126
  outputs=[
1127
+ model_leaderboard_df,
1128
+ model_detailed_visualization,
1129
+ model_radar_visualization,
1130
+ model_group_bar_visualization
1131
+ ] + model_checkbox_list
1132
  )
1133
 
1134
+ # Update when model top_n_slider changes
1135
+ model_top_n_slider.change(
1136
+ lambda *args: update_leaderboard(*args, data_source=model_rank_data),
1137
+ inputs=model_checkbox_list + [model_top_n_slider],
1138
+ outputs=[
1139
+ model_leaderboard_df,
1140
+ model_detailed_visualization,
1141
+ model_radar_visualization,
1142
+ model_group_bar_visualization
1143
+ ] + model_checkbox_list
1144
+ )
1145
+
1146
  # Update when clear button is clicked
1147
+ model_clear_btn.click(
1148
+ lambda *args: clear_filters(*args, data_source=model_rank_data),
1149
+ inputs=[model_top_n_slider],
1150
  outputs=[
1151
+ model_leaderboard_df,
1152
+ model_detailed_visualization,
1153
+ model_radar_visualization,
1154
+ model_group_bar_visualization
1155
+ ] + model_checkbox_list
1156
  )
1157
 
1158
+ # Initialize the model leaderboard (with all models shown by default)
1159
  demo.load(
1160
+ lambda: clear_filters(top_n=get_total_model_count(model_rank_data), data_source=model_rank_data),
1161
  inputs=[],
1162
  outputs=[
1163
+ model_leaderboard_df,
1164
+ model_detailed_visualization,
1165
+ model_radar_visualization,
1166
+ model_group_bar_visualization
1167
+ ] + model_checkbox_list
1168
  )
1169
 
1170
+ with gr.Tab("๐Ÿ† Agent Leaderboard"):
1171
  # Visualization section
1172
+
1173
+ with gr.Row():
1174
+ # Calculate dynamic maximum based on total models
1175
+ agent_max_models = get_total_model_count(rank_data)
1176
+ agent_top_n_slider = gr.Slider(
1177
+ minimum=1,
1178
+ maximum=agent_max_models,
1179
+ step=1,
1180
+ value=5,
1181
+ label=f"Number of Top Models to Display in All Views (max: {agent_max_models})",
1182
+ elem_classes="top-n-slider"
1183
+ )
1184
+
1185
  with gr.Row():
1186
  gr.Markdown("### ๐Ÿ“Š Data Visualization")
1187
 
1188
  # Detailed view visualization (single chart)
1189
+ detailed_visualization = gr.Plot(
1190
  label="Performance Visualization",
1191
  visible=False,
1192
  elem_classes="visualization-container"
1193
  )
1194
+ # with gr.Row():
1195
+ # # Calculate dynamic maximum based on total models
1196
+ # agent_max_models = get_total_model_count(rank_data)
1197
+ # top_n_slider = gr.Slider(
1198
+ # minimum=1,
1199
+ # maximum=agent_max_models,
1200
+ # step=1,
1201
+ # value=min(3, agent_max_models),
1202
+ # label=f"Number of Top Models to Display in All Views (max: {agent_max_models})",
1203
+ # elem_classes="top-n-slider"
1204
+ # )
1205
 
 
 
 
 
 
 
 
 
 
 
 
1206
 
 
1207
 
1208
+ with gr.Column(visible=True) as overall_visualizations:
1209
  with gr.Tabs():
1210
  with gr.Tab("๐Ÿ“ˆ Radar Chart"):
1211
+
1212
+ radar_visualization = gr.Plot(
1213
  label="Comparative Analysis (Radar Chart)",
1214
  elem_classes="visualization-container"
1215
  )
1216
  gr.Markdown(
1217
+ "*๐Ÿ’ก Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*\n\n*๐ŸŽฎ Model Name (GamingAgent) - Our specialized gaming agents*",
1218
  elem_classes="radar-tip"
1219
  )
1220
  with gr.Tab("๐Ÿ“Š Group Bar Chart"):
1221
+ group_bar_visualization = gr.Plot(
1222
  label="Comparative Analysis (Group Bar Chart)",
1223
  elem_classes="visualization-container"
1224
  )
1225
  gr.Markdown(
1226
+ "*๐Ÿ’ก Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*\n\n*๐ŸŽฎ Model Name (GamingAgent) - Our specialized gaming agents*",
1227
  elem_classes="radar-tip"
1228
  )
1229
 
1230
+ # Hidden placeholder for group bar visualization (to maintain code references)
1231
+ # group_bar_visualization = gr.Plot(visible=False)
1232
+
1233
  # Game selection section
1234
  with gr.Row():
1235
  gr.Markdown("### ๐Ÿ•น๏ธ Game Selection")
1236
  with gr.Row():
1237
+ # with gr.Column(): # Commented out Super Mario BrosUI
1238
+ # gr.Markdown("**๐ŸŽฎ Super Mario Bros**")
1239
+ # mario_overall = gr.Checkbox(label="Super Mario BrosScore", value=True)
1240
+ # mario_details = gr.Checkbox(label="Super Mario BrosDetails", value=False)
1241
+ with gr.Column(): # Added Super Mario BrosUI
1242
  gr.Markdown("**๐Ÿ„ Super Mario Bros**")
1243
+ mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
1244
+ mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
1245
+ with gr.Column(): # Sokoban is now after mario_plan
1246
  gr.Markdown("**๐Ÿ“ฆ Sokoban**")
1247
+ sokoban_overall = gr.Checkbox(label="Sokoban Score", value=True)
1248
+ sokoban_details = gr.Checkbox(label="Sokoban Details", value=False)
1249
  with gr.Column():
1250
  gr.Markdown("**๐Ÿ”ข 2048**")
1251
+ _2048_overall = gr.Checkbox(label="2048 Score", value=True)
1252
+ _2048_details = gr.Checkbox(label="2048 Details", value=False)
1253
  with gr.Column():
1254
  gr.Markdown("**๐Ÿฌ Candy Crush**")
1255
+ candy_overall = gr.Checkbox(label="Candy Crush Score", value=True)
1256
+ candy_details = gr.Checkbox(label="Candy Crush Details", value=False)
1257
+ # with gr.Column(): # Commented out Tetris(complete) UI
1258
+ # gr.Markdown("**๐ŸŽฏ Tetris(complete)**")
1259
+ # tetris_overall = gr.Checkbox(label="Tetris(complete) Score", value=True)
1260
+ # tetris_details = gr.Checkbox(label="Tetris(complete) Details", value=False)
1261
  with gr.Column():
1262
  gr.Markdown("**๐ŸŽฏ Tetris**")
1263
+ tetris_plan_overall = gr.Checkbox(label="Tetris Score", value=True)
1264
+ tetris_plan_details = gr.Checkbox(label="Tetris Details", value=False)
1265
  with gr.Column():
1266
  gr.Markdown("**โš–๏ธ Ace Attorney**")
1267
+ ace_attorney_overall = gr.Checkbox(label="Ace Attorney Score", value=True)
1268
+ ace_attorney_details = gr.Checkbox(label="Ace Attorney Details", value=False)
1269
 
1270
  # Controls
1271
  with gr.Row():
1272
  with gr.Column(scale=2):
1273
  gr.Markdown("**โฐ Time Tracker**")
1274
+ timeline = create_timeline_slider()
1275
  with gr.Column(scale=1):
1276
  gr.Markdown("**๐Ÿ”„ Controls**")
1277
+ clear_btn = gr.Button("Reset Filters", variant="secondary")
1278
 
1279
  # Leaderboard table
1280
  with gr.Row():
1281
  gr.Markdown("### ๐Ÿ“‹ Detailed Results")
1282
  with gr.Row():
1283
+ gr.Markdown("*๐ŸŽฎ Model Name (GamingAgent) - Our specialized gaming agents*", elem_classes="radar-tip")
1284
 
1285
+ # Welcome message for custom gaming agents
1286
+
1287
+ # Add reference to Jupyter notebook
1288
+ with gr.Row():
1289
+ gr.Markdown("*All data analysis can be replicated by checking [this Jupyter notebook](https://colab.research.google.com/drive/1CYFiJGm3EoBXXI8vICPVR82J9qrmmRvc#scrollTo=qft1Oald-21J)*")
1290
+
1291
+ # Get initial leaderboard dataframe (limited by default slider value for agent leaderboard)
1292
+ initial_df = get_combined_leaderboard(rank_data, {
1293
+ # "Super Mario Bros": True, # Commented out
1294
  "Super Mario Bros": True,
1295
  "Sokoban": True,
1296
  "2048": True,
1297
  "Candy Crush": True,
1298
+ # "Tetris(complete)": True, # Commented out
1299
  "Tetris": True,
1300
  "Ace Attorney": True
1301
+ }, limit_to_top_n=5)
1302
 
1303
  # Format the DataFrame for display
1304
+ initial_display_df = prepare_dataframe_for_display(initial_df)
1305
+
1306
+ # Custom column widths including row numbers
1307
+ col_widths = ["40px"] # Row number column width
1308
+ col_widths.append("230px") # Player column - reduced by 20px
1309
+ col_widths.append("120px") # Organization column
1310
+
1311
+ # Check if there's an Avg Normalized Score column
1312
+ if any('Avg Normalized' in col for col in initial_display_df.columns):
1313
+ col_widths.append("140px") # Avg Normalized Score column - slightly wider
1314
+
1315
+ # Add game score columns
1316
+ remaining_cols = len(initial_display_df.columns) - len(col_widths) + 1 # +1: col_widths also holds the row-number width, which has no matching DataFrame column
1317
+ for _ in range(remaining_cols):
1318
+ col_widths.append("120px")
1319
 
1320
  # Create a standard DataFrame component with enhanced styling
1321
  with gr.Row():
1322
+ leaderboard_df = gr.DataFrame(
1323
+ value=initial_display_df,
1324
  interactive=True,
1325
+ elem_id="leaderboard-table",
1326
  elem_classes="table-container",
1327
  wrap=True,
1328
  show_row_numbers=True,
1329
  show_fullscreen_button=True,
1330
  line_breaks=True,
1331
+ max_height=1000, # Set a larger fixed height
1332
  show_search="search",
1333
  column_widths=col_widths
1334
  )
1335
 
1336
  # Add the score note below the table
1337
  with gr.Row():
1338
+ score_note = add_score_note()
1339
 
1340
+ # List of all checkboxes, including Super Mario Bros
1341
+ checkbox_list = [
1342
+ # mario_overall, mario_details, # Commented out
1343
+ mario_plan_overall, mario_plan_details,
1344
+ sokoban_overall, sokoban_details,
1345
+ _2048_overall, _2048_details,
1346
+ candy_overall, candy_details,
1347
+ # tetris_overall, tetris_details, # Commented out
1348
+ tetris_plan_overall, tetris_plan_details,
1349
+ ace_attorney_overall, ace_attorney_details
1350
  ]
1351
 
1352
  # Update visualizations when checkboxes change
1353
+ def update_visualizations(*checkbox_states):
1354
  # Check if any details checkbox is selected
1355
+ # Adjusted indices due to addition of Super Mario
1356
  is_details_view = any([
1357
  checkbox_states[1], # Mario Plan details
1358
  checkbox_states[3], # Sokoban details
 
1364
 
1365
  # Update visibility of visualization blocks
1366
  return {
1367
+ detailed_visualization: gr.update(visible=is_details_view),
1368
+ overall_visualizations: gr.update(visible=not is_details_view)
1369
  }
1370
 
1371
  # Add change event to all checkboxes
1372
+ for checkbox in checkbox_list:
1373
  checkbox.change(
1374
+ update_visualizations,
1375
+ inputs=checkbox_list,
1376
+ outputs=[detailed_visualization, overall_visualizations]
1377
  )
1378
 
1379
  # Update leaderboard and visualizations when checkboxes change
1380
+ for checkbox in checkbox_list:
1381
  checkbox.change(
1382
+ lambda *args: update_leaderboard(*args, data_source=rank_data),
1383
+ inputs=checkbox_list + [agent_top_n_slider],
1384
  outputs=[
1385
+ leaderboard_df,
1386
+ detailed_visualization,
1387
+ radar_visualization,
1388
+ group_bar_visualization
1389
+ ] + checkbox_list
1390
  )
1391
 
1392
+ # Update when agent top_n_slider changes
1393
+ agent_top_n_slider.change(
1394
+ lambda *args: update_leaderboard(*args, data_source=rank_data),
1395
+ inputs=checkbox_list + [agent_top_n_slider],
1396
  outputs=[
1397
+ leaderboard_df,
1398
+ detailed_visualization,
1399
+ radar_visualization,
1400
+ group_bar_visualization
1401
+ ] + checkbox_list
1402
  )
1403
 
1404
  # Update when clear button is clicked
1405
+ clear_btn.click(
1406
+ lambda *args: clear_filters(*args, data_source=rank_data),
1407
+ inputs=[agent_top_n_slider],
1408
  outputs=[
1409
+ leaderboard_df,
1410
+ detailed_visualization,
1411
+ radar_visualization,
1412
+ group_bar_visualization
1413
+ ] + checkbox_list
1414
  )
1415
 
1416
+ # Initialize the agent leaderboard (with top 5 limit)
1417
  demo.load(
1418
+ lambda: clear_filters(top_n=5, data_source=rank_data),
1419
  inputs=[],
1420
  outputs=[
1421
+ leaderboard_df,
1422
+ detailed_visualization,
1423
+ radar_visualization,
1424
+ group_bar_visualization
1425
+ ] + checkbox_list
1426
  )
1427
 
1428
  with gr.Tab("🎥 Gallery"):
assets/model_color.json CHANGED
@@ -27,31 +27,31 @@
27
  "llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA",
28
  "qwen3-235B-A22B-fp8": "#6A1B9A",
29
  "random (x30)": "#9E9E9E",
30
- "๐ŸŽฎ GamingAgent (claude-3-7-sonnet-20250219)": "#4A90E2",
31
- "๐ŸŽฎ GamingAgent (claude-3-5-haiku-20241022)": "#7FB5E6",
32
- "๐ŸŽฎ GamingAgent (claude-3-5-sonnet-20241022)": "#1A4C7C",
33
- "๐ŸŽฎ GamingAgent (claude-opus-4-20250514)": "#3A80D2",
34
- "๐ŸŽฎ GamingAgent (claude-sonnet-4-20250514)": "#5A9FE2",
35
- "๐ŸŽฎ GamingAgent (gemini-2.0-flash)": "#FF4081",
36
- "๐ŸŽฎ GamingAgent (gemini-2.0-flash-thinking-exp-1219)": "#C2185B",
37
- "๐ŸŽฎ GamingAgent (gemini-2.5-pro-exp-03-25)": "#FF80AB",
38
- "๐ŸŽฎ GamingAgent (gemini-2.5-flash-preview-04-17)": "#F06292",
39
- "๐ŸŽฎ GamingAgent (gemini-2.5-flash-preview-05-20)": "#F8BBD9",
40
- "๐ŸŽฎ GamingAgent (gemini-2.5-pro-preview-05-06)": "#AD1457",
41
- "๐ŸŽฎ GamingAgent (gemini-2.5-pro-preview-06-05)": "#EC407A",
42
- "๐ŸŽฎ GamingAgent (gpt-4o-2024-11-20)": "#00BFA5",
43
- "๐ŸŽฎ GamingAgent (gpt-4.5-preview-2025-02-27)": "#00796B",
44
- "๐ŸŽฎ GamingAgent (gpt-4.1-2025-04-14)": "#00897B",
45
- "๐ŸŽฎ GamingAgent (o1-2024-12-17)": "#4DB6AC",
46
- "๐ŸŽฎ GamingAgent (o1-mini-2024-09-12)": "#26A69A",
47
- "๐ŸŽฎ GamingAgent (o3-mini-2025-01-31(medium))": "#80CBC4",
48
- "๐ŸŽฎ GamingAgent (o3-2025-04-16)": "#26C6DA",
49
- "๐ŸŽฎ GamingAgent (o4-mini-2025-04-16)": "#00ACC1",
50
- "๐ŸŽฎ GamingAgent (grok-3-beta)": "#FF7043",
51
- "๐ŸŽฎ GamingAgent (grok-3-mini-beta)": "#FF8A65",
52
- "๐ŸŽฎ GamingAgent (deepseek-v3)": "#FFC107",
53
- "๐ŸŽฎ GamingAgent (deepseek-r1-0120)": "#FFA000",
54
- "๐ŸŽฎ GamingAgent (deepseek-r1-0528)": "#FFB300",
55
- "๐ŸŽฎ GamingAgent (llama-4-maverick-17b-128e-instruct-fp8)": "#8E24AA",
56
- "๐ŸŽฎ GamingAgent (qwen3-235B-A22B-fp8)": "#6A1B9A"
57
  }
 
27
  "llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA",
28
  "qwen3-235B-A22B-fp8": "#6A1B9A",
29
  "random (x30)": "#9E9E9E",
30
+ "๐ŸŽฎ claude-3-7-sonnet-20250219 (GamingAgent)": "#4A90E2",
31
+ "๐ŸŽฎ claude-3-5-haiku-20241022 (GamingAgent)": "#7FB5E6",
32
+ "๐ŸŽฎ claude-3-5-sonnet-20241022 (GamingAgent)": "#1A4C7C",
33
+ "๐ŸŽฎ claude-opus-4-20250514 (GamingAgent)": "#3A80D2",
34
+ "๐ŸŽฎ claude-sonnet-4-20250514 (GamingAgent)": "#5A9FE2",
35
+ "๐ŸŽฎ gemini-2.0-flash (GamingAgent)": "#FF4081",
36
+ "๐ŸŽฎ gemini-2.0-flash-thinking-exp-1219 (GamingAgent)": "#C2185B",
37
+ "๐ŸŽฎ gemini-2.5-pro-exp-03-25 (GamingAgent)": "#FF80AB",
38
+ "๐ŸŽฎ gemini-2.5-flash-preview-04-17 (GamingAgent)": "#F06292",
39
+ "๐ŸŽฎ gemini-2.5-flash-preview-05-20 (GamingAgent)": "#F8BBD9",
40
+ "๐ŸŽฎ gemini-2.5-pro-preview-05-06 (GamingAgent)": "#AD1457",
41
+ "๐ŸŽฎ gemini-2.5-pro-preview-06-05 (GamingAgent)": "#EC407A",
42
+ "๐ŸŽฎ gpt-4o-2024-11-20 (GamingAgent)": "#00BFA5",
43
+ "๐ŸŽฎ gpt-4.5-preview-2025-02-27 (GamingAgent)": "#00796B",
44
+ "๐ŸŽฎ gpt-4.1-2025-04-14 (GamingAgent)": "#00897B",
45
+ "๐ŸŽฎ o1-2024-12-17 (GamingAgent)": "#4DB6AC",
46
+ "๐ŸŽฎ o1-mini-2024-09-12 (GamingAgent)": "#26A69A",
47
+ "๐ŸŽฎ o3-mini-2025-01-31(medium) (GamingAgent)": "#80CBC4",
48
+ "๐ŸŽฎ o3-2025-04-16 (GamingAgent)": "#26C6DA",
49
+ "๐ŸŽฎ o4-mini-2025-04-16 (GamingAgent)": "#00ACC1",
50
+ "๐ŸŽฎ grok-3-beta (GamingAgent)": "#FF7043",
51
+ "๐ŸŽฎ grok-3-mini-beta (GamingAgent)": "#FF8A65",
52
+ "๐ŸŽฎ deepseek-v3 (GamingAgent)": "#FFC107",
53
+ "๐ŸŽฎ deepseek-r1-0120 (GamingAgent)": "#FFA000",
54
+ "๐ŸŽฎ deepseek-r1-0528 (GamingAgent)": "#FFB300",
55
+ "๐ŸŽฎ llama-4-maverick-17b-128e-instruct-fp8 (GamingAgent)": "#8E24AA",
56
+ "๐ŸŽฎ qwen3-235B-A22B-fp8 (GamingAgent)": "#6A1B9A"
57
  }
rank_data_03_25_2025.json CHANGED
@@ -3,61 +3,61 @@
3
  "runs": 3,
4
  "results": [
5
  {
6
- "model": "๐ŸŽฎ GamingAgent (claude-3-5-sonnet-20241022)",
7
  "score": 1267.7,
8
  "detail_data": "709,1532,1562",
9
  "progress": "1-1"
10
  },
11
  {
12
- "model": "๐ŸŽฎ GamingAgent (claude-3-7-sonnet-20250219)",
13
  "score": 1418.7,
14
  "detail_data": "2015,709,1532",
15
  "progress": "1-1"
16
  },
17
  {
18
- "model": "๐ŸŽฎ GamingAgent (gemini-2.5-flash-preview-04-17)",
19
  "score": 1385.0,
20
  "detail_data": "1672,1266,1247",
21
  "progress": "1-1"
22
  },
23
  {
24
- "model": "๐ŸŽฎ GamingAgent (gemini-2.5-pro-preview-05-06)",
25
  "score": 1498.3,
26
  "detail_data": "1561,1271,1663",
27
  "progress": "1-1"
28
  },
29
  {
30
- "model": "๐ŸŽฎ GamingAgent (llama-4-maverick-17b-128e-instruct-fp8)",
31
  "score": 1468.7,
32
  "detail_data": "898,2008,1500",
33
  "progress": "1-1"
34
  },
35
  {
36
- "model": "๐ŸŽฎ GamingAgent (gpt-4.1-2025-04-14)",
37
  "score": 2126.3,
38
  "detail_data": "1531,722,4126",
39
  "progress": "1-1"
40
  },
41
  {
42
- "model": "๐ŸŽฎ GamingAgent (gpt-4o-2024-11-20)",
43
  "score": 2047.3,
44
  "detail_data": "2017,2590,1535",
45
  "progress": "1-1"
46
  },
47
  {
48
- "model": "๐ŸŽฎ GamingAgent (o1-2024-12-17)",
49
  "score": 855,
50
  "detail_data": "855",
51
  "progress": "1-1"
52
  },
53
  {
54
- "model": "๐ŸŽฎ GamingAgent (o3-2025-04-16)",
55
  "score": 3445,
56
  "detail_data": "3445",
57
  "progress": "1-1"
58
  },
59
  {
60
- "model": "๐ŸŽฎ GamingAgent (o4-mini-2025-04-16)",
61
  "score": 1448.0,
62
  "detail_data": "1525,1263,1556",
63
  "progress": "1-1"
@@ -74,79 +74,79 @@
74
  "runs": 3,
75
  "results": [
76
  {
77
- "model": "๐ŸŽฎ GamingAgent (claude-3-5-sonnet-20241022)",
78
  "score": 1914.67,
79
  "details": "1352,2860,1532",
80
  "highest_tail": 256
81
  },
82
  {
83
- "model": "๐ŸŽฎ GamingAgent (claude-3-7-sonnet-20250219)",
84
  "score": 2624,
85
  "details": "2560,3224,2088",
86
  "highest_tail": 256
87
  },
88
  {
89
- "model": "๐ŸŽฎ GamingAgent (deepseek-r1-0120)",
90
  "score": 1873.33,
91
  "details": "700,1240,3680",
92
  "highest_tail": 256
93
  },
94
  {
95
- "model": "๐ŸŽฎ GamingAgent (gemini-2.5-flash-preview-04-17)",
96
  "score": 1697.33,
97
  "details": "1304,1316,2472",
98
  "highest_tail": 256
99
  },
100
  {
101
- "model": "๐ŸŽฎ GamingAgent (gemini-2.5-pro-preview-05-06)",
102
  "score": 3586.67,
103
  "details": "5300,2400,3060",
104
  "highest_tail": 512
105
  },
106
  {
107
- "model": "๐ŸŽฎ GamingAgent (grok-3-mini-beta)",
108
  "score": 4036,
109
  "details": "6412,2492,3204",
110
  "highest_tail": 512
111
  },
112
  {
113
- "model": "๐ŸŽฎ GamingAgent (llama-4-maverick-17b-128e-instruct-fp8)",
114
  "score": 1586.67,
115
  "details": "1404,1272,2084",
116
  "highest_tail": 128
117
  },
118
  {
119
- "model": "๐ŸŽฎ GamingAgent (gpt-4.1-2025-04-14)",
120
  "score": 1656,
121
  "details": "1156,2664,1148",
122
  "highest_tail": 256
123
  },
124
  {
125
- "model": "๐ŸŽฎ GamingAgent (gpt-4o-2024-11-20)",
126
  "score": 1656,
127
  "details": "1604,1284,2080",
128
  "highest_tail": 256
129
  },
130
  {
131
- "model": "๐ŸŽฎ GamingAgent (o1-2024-12-17)",
132
  "score": 7580,
133
  "details": "7580",
134
  "highest_tail": 512
135
  },
136
  {
137
- "model": "๐ŸŽฎ GamingAgent (o1-mini-2024-09-12)",
138
  "score": 2757.33,
139
  "details": "3132,2004,3136",
140
  "highest_tail": 256
141
  },
142
  {
143
- "model": "๐ŸŽฎ GamingAgent (o3-2025-04-16)",
144
  "score": 7120,
145
  "details": "7120",
146
  "highest_tail": 512
147
  },
148
  {
149
- "model": "๐ŸŽฎ GamingAgent (o4-mini-2025-04-16)",
150
  "score": 4432.0,
151
  "details": "4928,5456,2912",
152
  "highest_tail": 512
@@ -158,25 +158,25 @@
158
  "highest_tail": 128
159
  },
160
  {
161
- "model": "๐ŸŽฎ GamingAgent (claude-opus-4-20250514)",
162
  "score": 3036.0,
163
  "details": "3036.0",
164
  "highest_tail": 256
165
  },
166
  {
167
- "model": "๐ŸŽฎ GamingAgent (claude-sonnet-4-20250514)",
168
  "score": 3136,
169
  "details": "2148,2360,4900",
170
  "highest_tail": 256
171
  },
172
  {
173
- "model": "๐ŸŽฎ GamingAgent (deepseek-r1-0528)",
174
  "score": 3330.0,
175
  "details": "3260,3400",
176
  "highest_tail": 256
177
  },
178
  {
179
- "model": "๐ŸŽฎ GamingAgent (qwen3-235B-A22B-fp8)",
180
  "score": 2144.0,
181
  "details": "1436,2556,2440",
182
  "highest_tail": 256
@@ -187,67 +187,67 @@
187
  "runs": 3,
188
  "results": [
189
  {
190
- "model": "๐ŸŽฎ GamingAgent (claude-3-5-sonnet-20241022)",
191
  "score": 14.7,
192
  "details": "16,14,14"
193
  },
194
  {
195
- "model": "๐ŸŽฎ GamingAgent (claude-3-7-sonnet-20250219)",
196
  "score": 16.3,
197
  "details": "19,15,15"
198
  },
199
  {
200
- "model": "๐ŸŽฎ GamingAgent (deepseek-r1-0120)",
201
  "score": 14.3,
202
  "details": "15,14,14"
203
  },
204
  {
205
- "model": "๐ŸŽฎ GamingAgent (gemini-2.5-flash-preview-04-17)",
206
  "score": 16.3,
207
  "details": "20,14,15"
208
  },
209
  {
210
- "model": "๐ŸŽฎ GamingAgent (gemini-2.5-pro-preview-05-06)",
211
  "score": 23.3,
212
  "details": "23,23,24"
213
  },
214
  {
215
- "model": "๐ŸŽฎ GamingAgent (grok-3-mini-beta)",
216
  "score": 21.3,
217
  "details": "20,15,29"
218
  },
219
  {
220
- "model": "๐ŸŽฎ GamingAgent (llama-4-maverick-17b-128e-instruct-fp8)",
221
  "score": 10.3,
222
  "details": "9,10,12"
223
  },
224
  {
225
- "model": "๐ŸŽฎ GamingAgent (gpt-4.1-2025-04-14)",
226
  "score": 13.7,
227
  "details": "13,14,14"
228
  },
229
  {
230
- "model": "๐ŸŽฎ GamingAgent (gpt-4o-2024-11-20)",
231
  "score": 14,
232
  "details": "18,11,13"
233
  },
234
  {
235
- "model": "๐ŸŽฎ GamingAgent (o1-2024-12-17)",
236
  "score": 35,
237
  "details": "35"
238
  },
239
  {
240
- "model": "๐ŸŽฎ GamingAgent (o1-mini-2024-09-12)",
241
  "score": 11.7,
242
  "details": "11,11,13"
243
  },
244
  {
245
- "model": "๐ŸŽฎ GamingAgent (o3-2025-04-16)",
246
  "score": 42,
247
  "details": "42"
248
  },
249
  {
250
- "model": "๐ŸŽฎ GamingAgent (o4-mini-2025-04-16)",
251
  "score": 25.3,
252
  "details": "22,35,19"
253
  },
@@ -257,22 +257,22 @@
257
  "details": ""
258
  },
259
  {
260
- "model": "๐ŸŽฎ GamingAgent (claude-opus-4-20250514)",
261
  "score": 20,
262
  "details": "17,18,25"
263
  },
264
  {
265
- "model": "๐ŸŽฎ GamingAgent (claude-sonnet-4-20250514)",
266
  "score": 19.33,
267
  "details": "20,17,21"
268
  },
269
  {
270
- "model": "๐ŸŽฎ GamingAgent (deepseek-r1-0528)",
271
  "score": 33.67,
272
  "details": "26,34,41"
273
  },
274
  {
275
- "model": "๐ŸŽฎ GamingAgent (qwen3-235B-A22B-fp8)",
276
  "score": 11.67,
277
  "details": "13,14,8"
278
  }
@@ -282,67 +282,67 @@
282
  "runs": 3,
283
  "results": [
284
  {
285
- "model": "๐ŸŽฎ GamingAgent (claude-3-5-sonnet-20241022)",
286
  "score": 106,
287
  "details": "92,165,61"
288
  },
289
  {
290
- "model": "๐ŸŽฎ GamingAgent (claude-3-7-sonnet-20250219)",
291
  "score": 484,
292
  "details": "535,428,489"
293
  },
294
  {
295
- "model": "๐ŸŽฎ GamingAgent (deepseek-r1-0120)",
296
  "score": 447.3,
297
  "details": "409,436,497"
298
  },
299
  {
300
- "model": "๐ŸŽฎ GamingAgent (gemini-2.5-flash-preview-04-17)",
301
  "score": 334.7,
302
  "details": "259,372,373"
303
  },
304
  {
305
- "model": "๐ŸŽฎ GamingAgent (gemini-2.5-pro-preview-05-06)",
306
  "score": 416.3,
307
  "details": "411,414,424"
308
  },
309
  {
310
- "model": "๐ŸŽฎ GamingAgent (grok-3-mini-beta)",
311
  "score": 254,
312
  "details": "299,332,131"
313
  },
314
  {
315
- "model": "๐ŸŽฎ GamingAgent (llama-4-maverick-17b-128e-instruct-fp8)",
316
  "score": 128.7,
317
  "details": "67,139,180"
318
  },
319
  {
320
- "model": "๐ŸŽฎ GamingAgent (gpt-4.1-2025-04-14)",
321
  "score": 182,
322
  "details": "163,215,168"
323
  },
324
  {
325
- "model": "๐ŸŽฎ GamingAgent (gpt-4o-2024-11-20)",
326
  "score": 147.3,
327
  "details": "131,104,207"
328
  },
329
  {
330
- "model": "๐ŸŽฎ GamingAgent (o1-2024-12-17)",
331
  "score": 159,
332
  "details": "159"
333
  },
334
  {
335
- "model": "๐ŸŽฎ GamingAgent (o1-mini-2024-09-12)",
336
  "score": 48,
337
  "details": "21,86,37"
338
  },
339
  {
340
- "model": "๐ŸŽฎ GamingAgent (o3-2025-04-16)",
341
  "score": 647,
342
  "details": "647"
343
  },
344
  {
345
- "model": "๐ŸŽฎ GamingAgent (o4-mini-2025-04-16)",
346
  "score": 487.3,
347
  "details": "259,591,612"
348
  },
@@ -352,22 +352,22 @@
352
  "details": ""
353
  },
354
  {
355
- "model": "๐ŸŽฎ GamingAgent (claude-opus-4-20250514)",
356
  "score": 464,
357
  "details": "593,406,393"
358
  },
359
  {
360
- "model": "๐ŸŽฎ GamingAgent (claude-sonnet-4-20250514)",
361
  "score": 478.33,
362
  "details": "545,468,422"
363
  },
364
  {
365
- "model": "๐ŸŽฎ GamingAgent (deepseek-r1-0528)",
366
  "score": 491.67,
367
  "details": "464,463,548"
368
  },
369
  {
370
- "model": "๐ŸŽฎ GamingAgent (qwen3-235B-A22B-fp8)",
371
  "score": 363.33,
372
  "details": "365,372,353"
373
  }
@@ -377,79 +377,79 @@
377
  "runs": 3,
378
  "results": [
379
  {
380
- "model": "๐ŸŽฎ GamingAgent (claude-3-5-sonnet-20241022)",
381
  "score": 0,
382
  "detail_box_on_target": "0,0,0",
383
  "cracked_levels": "0,0,0"
384
  },
385
  {
386
- "model": "๐ŸŽฎ GamingAgent (claude-3-7-sonnet-20250219)",
387
  "score": 2.33,
388
  "detail_box_on_target": "2,4,1",
389
  "cracked_levels": "1,2,0"
390
  },
391
  {
392
- "model": "๐ŸŽฎ GamingAgent (deepseek-r1-0120)",
393
  "score": 1.33,
394
  "detail_box_on_target": "2,0,2",
395
  "cracked_levels": "1,0,1"
396
  },
397
  {
398
- "model": "๐ŸŽฎ GamingAgent (gemini-2.5-flash-preview-04-17)",
399
  "score": 1.67,
400
  "detail_box_on_target": "3,0,2",
401
  "cracked_levels": "2,0,1"
402
  },
403
  {
404
- "model": "๐ŸŽฎ GamingAgent (gemini-2.5-pro-preview-05-06)",
405
  "score": 4.33,
406
  "detail_box_on_target": "4,4,5",
407
  "cracked_levels": "2,2,3"
408
  },
409
  {
410
- "model": "๐ŸŽฎ GamingAgent (grok-3-mini-beta)",
411
  "score": 5.67,
412
  "detail_box_on_target": "5,6,6",
413
  "cracked_levels": "3,3,3"
414
  },
415
  {
416
- "model": "๐ŸŽฎ GamingAgent (llama-4-maverick-17b-128e-instruct-fp8)",
417
  "score": 0,
418
  "detail_box_on_target": "0,0,0",
419
  "cracked_levels": "0,0,0"
420
  },
421
  {
422
- "model": "๐ŸŽฎ GamingAgent (gpt-4.1-2025-04-14)",
423
  "score": 0,
424
  "detail_box_on_target": "0,0,0",
425
  "cracked_levels": "0,0,0"
426
  },
427
  {
428
- "model": "๐ŸŽฎ GamingAgent (gpt-4o-2024-11-20)",
429
  "score": 0,
430
  "detail_box_on_target": "0,0,0",
431
  "cracked_levels": "0,0,0"
432
  },
433
  {
434
- "model": "๐ŸŽฎ GamingAgent (o1-2024-12-17)",
435
  "score": 2.33,
436
  "detail_box_on_target": "2,2,3",
437
  "cracked_levels": "1,1,2"
438
  },
439
  {
440
- "model": "๐ŸŽฎ GamingAgent (o1-mini-2024-09-12)",
441
  "score": 1.33,
442
  "detail_box_on_target": "1,2,1",
443
  "cracked_levels": "0,1,0"
444
  },
445
  {
446
- "model": "๐ŸŽฎ GamingAgent (o3-2025-04-16)",
447
  "score": 8,
448
  "detail_box_on_target": "10,6",
449
  "cracked_levels": "5,3"
450
  },
451
  {
452
- "model": "๐ŸŽฎ GamingAgent (o4-mini-2025-04-16)",
453
  "score": 5.33,
454
  "detail_box_on_target": "4,6,6",
455
  "cracked_levels": "2,2,3"
@@ -461,22 +461,22 @@
461
  "cracked_levels": "0,0,0"
462
  },
463
  {
464
- "model": "๐ŸŽฎ GamingAgent (claude-opus-4-20250514)",
465
  "score": 4,
466
  "details": "4,4,4"
467
  },
468
  {
469
- "model": "๐ŸŽฎ GamingAgent (claude-sonnet-4-20250514)",
470
  "score": 3,
471
  "details": "2,2,5"
472
  },
473
  {
474
- "model": "๐ŸŽฎ GamingAgent (deepseek-r1-0528)",
475
  "score": 4.67,
476
  "details": "4,4,6"
477
  },
478
  {
479
- "model": "๐ŸŽฎ GamingAgent (qwen3-235B-A22B-fp8)",
480
  "score": 2.33,
481
  "details": "1,2,4"
482
  }
@@ -486,79 +486,79 @@
486
  "runs": 1,
487
  "results": [
488
  {
489
- "model": "๐ŸŽฎ GamingAgent (claude-3-5-sonnet-20241022)",
490
  "score": 2,
491
  "progress": "1:2/5",
492
  "evaluator result": "1/3"
493
  },
494
  {
495
- "model": "๐ŸŽฎ GamingAgent (claude-3-7-sonnet-20250219)",
496
  "score": 7,
497
  "progress": "2:2/9",
498
  "evaluator result": "5/11"
499
  },
500
  {
501
- "model": "๐ŸŽฎ GamingAgent (deepseek-r1-0120)",
502
  "score": 0,
503
  "progress": "0",
504
  "evaluator result": "1/5"
505
  },
506
  {
507
- "model": "๐ŸŽฎ GamingAgent (gemini-2.5-flash-preview-04-17)",
508
  "score": 4,
509
  "progress": "1:4/5",
510
  "evaluator result": "1/7"
511
  },
512
  {
513
- "model": "๐ŸŽฎ GamingAgent (gemini-2.5-pro-preview-05-06)",
514
  "score": 7,
515
  "progress": "2:2/9",
516
  "evaluator result": "2/3"
517
  },
518
  {
519
- "model": "๐ŸŽฎ GamingAgent (grok-3-mini-beta)",
520
  "score": 0,
521
  "progress": "0",
522
  "evaluator result": "0"
523
  },
524
  {
525
- "model": "๐ŸŽฎ GamingAgent (llama-4-maverick-17b-128e-instruct-fp8)",
526
  "score": 0,
527
  "progress": "0",
528
  "evaluator result": "0"
529
  },
530
  {
531
- "model": "๐ŸŽฎ GamingAgent (gpt-4.1-2025-04-14)",
532
  "score": 2,
533
  "progress": "1:2/5",
534
  "evaluator result": "2/3"
535
  },
536
  {
537
- "model": "๐ŸŽฎ GamingAgent (gpt-4o-2024-11-20)",
538
  "score": 0,
539
  "progress": "0",
540
  "evaluator result": "0"
541
  },
542
  {
543
- "model": "๐ŸŽฎ GamingAgent (o1-2024-12-17)",
544
  "score": 16,
545
  "progress": "3: 2/8",
546
  "evaluator result": "6/11"
547
  },
548
  {
549
- "model": "๐ŸŽฎ GamingAgent (o1-mini-2024-09-12)",
550
  "score": 0,
551
  "progress": "0",
552
  "evaluator result": "1/5"
553
  },
554
  {
555
- "model": "๐ŸŽฎ GamingAgent (o3-2025-04-16)",
556
  "score": 16,
557
  "progress": "3: 2/8",
558
  "evaluator result": "1/2"
559
  },
560
  {
561
- "model": "๐ŸŽฎ GamingAgent (o4-mini-2025-04-16)",
562
  "score": 4,
563
  "progress": "1:4/5",
564
  "evaluator result": "2/5"
@@ -570,17 +570,17 @@
570
  "evaluator result": "0"
571
  },
572
  {
573
- "model": "๐ŸŽฎ GamingAgent (claude-opus-4-20250514)",
574
  "score": 6,
575
  "details": "6"
576
  },
577
  {
578
- "model": "๐ŸŽฎ GamingAgent (claude-sonnet-4-20250514)",
579
  "score": 3.67,
580
  "details": "3,4,4"
581
  },
582
  {
583
- "model": "๐ŸŽฎ GamingAgent (gemini-2.5-flash-preview-05-20)",
584
  "score": 4.33,
585
  "details": "3,4,6"
586
  }
 
3
  "runs": 3,
4
  "results": [
5
  {
6
+ "model": "๐ŸŽฎ claude-3-5-sonnet-20241022 (GamingAgent)",
7
  "score": 1267.7,
8
  "detail_data": "709,1532,1562",
9
  "progress": "1-1"
10
  },
11
  {
12
+ "model": "๐ŸŽฎ claude-3-7-sonnet-20250219 (GamingAgent)",
13
  "score": 1418.7,
14
  "detail_data": "2015,709,1532",
15
  "progress": "1-1"
16
  },
17
  {
18
+ "model": "๐ŸŽฎ gemini-2.5-flash-preview-04-17 (GamingAgent)",
19
  "score": 1385.0,
20
  "detail_data": "1672,1266,1247",
21
  "progress": "1-1"
22
  },
23
  {
24
+ "model": "๐ŸŽฎ gemini-2.5-pro-preview-05-06 (GamingAgent)",
25
  "score": 1498.3,
26
  "detail_data": "1561,1271,1663",
27
  "progress": "1-1"
28
  },
29
  {
30
+ "model": "๐ŸŽฎ llama-4-maverick-17b-128e-instruct-fp8 (GamingAgent)",
31
  "score": 1468.7,
32
  "detail_data": "898,2008,1500",
33
  "progress": "1-1"
34
  },
35
  {
36
+ "model": "๐ŸŽฎ gpt-4.1-2025-04-14 (GamingAgent)",
37
  "score": 2126.3,
38
  "detail_data": "1531,722,4126",
39
  "progress": "1-1"
40
  },
41
  {
42
+ "model": "๐ŸŽฎ gpt-4o-2024-11-20 (GamingAgent)",
43
  "score": 2047.3,
44
  "detail_data": "2017,2590,1535",
45
  "progress": "1-1"
46
  },
47
  {
48
+ "model": "๐ŸŽฎ o1-2024-12-17 (GamingAgent)",
49
  "score": 855,
50
  "detail_data": "855",
51
  "progress": "1-1"
52
  },
53
  {
54
+ "model": "๐ŸŽฎ o3-2025-04-16 (GamingAgent)",
55
  "score": 3445,
56
  "detail_data": "3445",
57
  "progress": "1-1"
58
  },
59
  {
60
+ "model": "๐ŸŽฎ o4-mini-2025-04-16 (GamingAgent)",
61
  "score": 1448.0,
62
  "detail_data": "1525,1263,1556",
63
  "progress": "1-1"
 
74
  "runs": 3,
75
  "results": [
76
  {
77
+ "model": "๐ŸŽฎ claude-3-5-sonnet-20241022 (GamingAgent)",
78
  "score": 1914.67,
79
  "details": "1352,2860,1532",
80
  "highest_tail": 256
81
  },
82
  {
83
+ "model": "๐ŸŽฎ claude-3-7-sonnet-20250219 (GamingAgent)",
84
  "score": 2624,
85
  "details": "2560,3224,2088",
86
  "highest_tail": 256
87
  },
88
  {
89
+ "model": "๐ŸŽฎ deepseek-r1-0120 (GamingAgent)",
90
  "score": 1873.33,
91
  "details": "700,1240,3680",
92
  "highest_tail": 256
93
  },
94
  {
95
+ "model": "๐ŸŽฎ gemini-2.5-flash-preview-04-17 (GamingAgent)",
96
  "score": 1697.33,
97
  "details": "1304,1316,2472",
98
  "highest_tail": 256
99
  },
100
  {
101
+ "model": "๐ŸŽฎ gemini-2.5-pro-preview-05-06 (GamingAgent)",
102
  "score": 3586.67,
103
  "details": "5300,2400,3060",
104
  "highest_tail": 512
105
  },
106
  {
107
+ "model": "๐ŸŽฎ grok-3-mini-beta (GamingAgent)",
108
  "score": 4036,
109
  "details": "6412,2492,3204",
110
  "highest_tail": 512
111
  },
112
  {
113
+ "model": "๐ŸŽฎ llama-4-maverick-17b-128e-instruct-fp8 (GamingAgent)",
114
  "score": 1586.67,
115
  "details": "1404,1272,2084",
116
  "highest_tail": 128
117
  },
118
  {
119
+ "model": "๐ŸŽฎ gpt-4.1-2025-04-14 (GamingAgent)",
120
  "score": 1656,
121
  "details": "1156,2664,1148",
122
  "highest_tail": 256
123
  },
124
  {
125
+ "model": "๐ŸŽฎ gpt-4o-2024-11-20 (GamingAgent)",
126
  "score": 1656,
127
  "details": "1604,1284,2080",
128
  "highest_tail": 256
129
  },
130
  {
131
+ "model": "๐ŸŽฎ o1-2024-12-17 (GamingAgent)",
132
  "score": 7580,
133
  "details": "7580",
134
  "highest_tail": 512
135
  },
136
  {
137
+ "model": "๐ŸŽฎ o1-mini-2024-09-12 (GamingAgent)",
138
  "score": 2757.33,
139
  "details": "3132,2004,3136",
140
  "highest_tail": 256
141
  },
142
  {
143
+ "model": "๐ŸŽฎ o3-2025-04-16 (GamingAgent)",
144
  "score": 7120,
145
  "details": "7120",
146
  "highest_tail": 512
147
  },
148
  {
149
+ "model": "๐ŸŽฎ o4-mini-2025-04-16 (GamingAgent)",
150
  "score": 4432.0,
151
  "details": "4928,5456,2912",
152
  "highest_tail": 512
 
158
  "highest_tail": 128
159
  },
160
  {
161
+ "model": "๐ŸŽฎ claude-opus-4-20250514 (GamingAgent)",
162
  "score": 3036.0,
163
  "details": "3036.0",
164
  "highest_tail": 256
165
  },
166
  {
167
+ "model": "๐ŸŽฎ claude-sonnet-4-20250514 (GamingAgent)",
168
  "score": 3136,
169
  "details": "2148,2360,4900",
170
  "highest_tail": 256
171
  },
172
  {
173
+ "model": "๐ŸŽฎ deepseek-r1-0528 (GamingAgent)",
174
  "score": 3330.0,
175
  "details": "3260,3400",
176
  "highest_tail": 256
177
  },
178
  {
179
+ "model": "๐ŸŽฎ qwen3-235B-A22B-fp8 (GamingAgent)",
180
  "score": 2144.0,
181
  "details": "1436,2556,2440",
182
  "highest_tail": 256
 
187
  "runs": 3,
188
  "results": [
189
  {
190
+ "model": "๐ŸŽฎ claude-3-5-sonnet-20241022 (GamingAgent)",
191
  "score": 14.7,
192
  "details": "16,14,14"
193
  },
194
  {
195
+ "model": "๐ŸŽฎ claude-3-7-sonnet-20250219 (GamingAgent)",
196
  "score": 16.3,
197
  "details": "19,15,15"
198
  },
199
  {
200
+ "model": "๐ŸŽฎ deepseek-r1-0120 (GamingAgent)",
201
  "score": 14.3,
202
  "details": "15,14,14"
203
  },
204
  {
205
+ "model": "๐ŸŽฎ gemini-2.5-flash-preview-04-17 (GamingAgent)",
206
  "score": 16.3,
207
  "details": "20,14,15"
208
  },
209
  {
210
+ "model": "๐ŸŽฎ gemini-2.5-pro-preview-05-06 (GamingAgent)",
211
  "score": 23.3,
212
  "details": "23,23,24"
213
  },
214
  {
215
+ "model": "๐ŸŽฎ grok-3-mini-beta (GamingAgent)",
216
  "score": 21.3,
217
  "details": "20,15,29"
218
  },
219
  {
220
+ "model": "๐ŸŽฎ llama-4-maverick-17b-128e-instruct-fp8 (GamingAgent)",
221
  "score": 10.3,
222
  "details": "9,10,12"
223
  },
224
  {
225
+ "model": "๐ŸŽฎ gpt-4.1-2025-04-14 (GamingAgent)",
226
  "score": 13.7,
227
  "details": "13,14,14"
228
  },
229
  {
230
+ "model": "๐ŸŽฎ gpt-4o-2024-11-20 (GamingAgent)",
231
  "score": 14,
232
  "details": "18,11,13"
233
  },
234
  {
235
+ "model": "๐ŸŽฎ o1-2024-12-17 (GamingAgent)",
236
  "score": 35,
237
  "details": "35"
238
  },
239
  {
240
+ "model": "๐ŸŽฎ o1-mini-2024-09-12 (GamingAgent)",
241
  "score": 11.7,
242
  "details": "11,11,13"
243
  },
244
  {
245
+ "model": "๐ŸŽฎ o3-2025-04-16 (GamingAgent)",
246
  "score": 42,
247
  "details": "42"
248
  },
249
  {
250
+ "model": "๐ŸŽฎ o4-mini-2025-04-16 (GamingAgent)",
251
  "score": 25.3,
252
  "details": "22,35,19"
253
  },
 
257
  "details": ""
258
  },
259
  {
260
+ "model": "๐ŸŽฎ claude-opus-4-20250514 (GamingAgent)",
261
  "score": 20,
262
  "details": "17,18,25"
263
  },
264
  {
265
+ "model": "๐ŸŽฎ claude-sonnet-4-20250514 (GamingAgent)",
266
  "score": 19.33,
267
  "details": "20,17,21"
268
  },
269
  {
270
+ "model": "๐ŸŽฎ deepseek-r1-0528 (GamingAgent)",
271
  "score": 33.67,
272
  "details": "26,34,41"
273
  },
274
  {
275
+ "model": "๐ŸŽฎ qwen3-235B-A22B-fp8 (GamingAgent)",
276
  "score": 11.67,
277
  "details": "13,14,8"
278
  }
 
282
  "runs": 3,
283
  "results": [
284
  {
285
+ "model": "๐ŸŽฎ claude-3-5-sonnet-20241022 (GamingAgent)",
286
  "score": 106,
287
  "details": "92,165,61"
288
  },
289
  {
290
+ "model": "๐ŸŽฎ claude-3-7-sonnet-20250219 (GamingAgent)",
291
  "score": 484,
292
  "details": "535,428,489"
293
  },
294
  {
295
+ "model": "๐ŸŽฎ deepseek-r1-0120 (GamingAgent)",
296
  "score": 447.3,
297
  "details": "409,436,497"
298
  },
299
  {
300
+ "model": "๐ŸŽฎ gemini-2.5-flash-preview-04-17 (GamingAgent)",
301
  "score": 334.7,
302
  "details": "259,372,373"
303
  },
304
  {
305
+ "model": "๐ŸŽฎ gemini-2.5-pro-preview-05-06 (GamingAgent)",
306
  "score": 416.3,
307
  "details": "411,414,424"
308
  },
309
  {
310
+ "model": "๐ŸŽฎ grok-3-mini-beta (GamingAgent)",
311
  "score": 254,
312
  "details": "299,332,131"
313
  },
314
  {
315
+ "model": "๐ŸŽฎ llama-4-maverick-17b-128e-instruct-fp8 (GamingAgent)",
316
  "score": 128.7,
317
  "details": "67,139,180"
318
  },
319
  {
320
+ "model": "๐ŸŽฎ gpt-4.1-2025-04-14 (GamingAgent)",
321
  "score": 182,
322
  "details": "163,215,168"
323
  },
324
  {
325
+ "model": "๐ŸŽฎ gpt-4o-2024-11-20 (GamingAgent)",
326
  "score": 147.3,
327
  "details": "131,104,207"
328
  },
329
  {
330
+ "model": "๐ŸŽฎ o1-2024-12-17 (GamingAgent)",
331
  "score": 159,
332
  "details": "159"
333
  },
334
  {
335
+ "model": "๐ŸŽฎ o1-mini-2024-09-12 (GamingAgent)",
336
  "score": 48,
337
  "details": "21,86,37"
338
  },
339
  {
340
+ "model": "๐ŸŽฎ o3-2025-04-16 (GamingAgent)",
341
  "score": 647,
342
  "details": "647"
343
  },
344
  {
345
+ "model": "๐ŸŽฎ o4-mini-2025-04-16 (GamingAgent)",
346
  "score": 487.3,
347
  "details": "259,591,612"
348
  },
 
352
  "details": ""
353
  },
354
  {
355
+ "model": "๐ŸŽฎ claude-opus-4-20250514 (GamingAgent)",
356
  "score": 464,
357
  "details": "593,406,393"
358
  },
359
  {
360
+ "model": "๐ŸŽฎ claude-sonnet-4-20250514 (GamingAgent)",
361
  "score": 478.33,
362
  "details": "545,468,422"
363
  },
364
  {
365
+ "model": "๐ŸŽฎ deepseek-r1-0528 (GamingAgent)",
366
  "score": 491.67,
367
  "details": "464,463,548"
368
  },
369
  {
370
+ "model": "๐ŸŽฎ qwen3-235B-A22B-fp8 (GamingAgent)",
371
  "score": 363.33,
372
  "details": "365,372,353"
373
  }
 
377
  "runs": 3,
378
  "results": [
379
  {
380
+ "model": "๐ŸŽฎ claude-3-5-sonnet-20241022 (GamingAgent)",
381
  "score": 0,
382
  "detail_box_on_target": "0,0,0",
383
  "cracked_levels": "0,0,0"
384
  },
385
  {
386
+ "model": "๐ŸŽฎ claude-3-7-sonnet-20250219 (GamingAgent)",
387
  "score": 2.33,
388
  "detail_box_on_target": "2,4,1",
389
  "cracked_levels": "1,2,0"
390
  },
391
  {
392
+ "model": "๐ŸŽฎ deepseek-r1-0120 (GamingAgent)",
393
  "score": 1.33,
394
  "detail_box_on_target": "2,0,2",
395
  "cracked_levels": "1,0,1"
396
  },
397
  {
398
+ "model": "๐ŸŽฎ gemini-2.5-flash-preview-04-17 (GamingAgent)",
399
  "score": 1.67,
400
  "detail_box_on_target": "3,0,2",
401
  "cracked_levels": "2,0,1"
402
  },
403
  {
404
+ "model": "๐ŸŽฎ gemini-2.5-pro-preview-05-06 (GamingAgent)",
405
  "score": 4.33,
406
  "detail_box_on_target": "4,4,5",
407
  "cracked_levels": "2,2,3"
408
  },
409
  {
410
+ "model": "๐ŸŽฎ grok-3-mini-beta (GamingAgent)",
411
  "score": 5.67,
412
  "detail_box_on_target": "5,6,6",
413
  "cracked_levels": "3,3,3"
414
  },
415
  {
416
+ "model": "๐ŸŽฎ llama-4-maverick-17b-128e-instruct-fp8 (GamingAgent)",
417
  "score": 0,
418
  "detail_box_on_target": "0,0,0",
419
  "cracked_levels": "0,0,0"
420
  },
421
  {
422
+ "model": "๐ŸŽฎ gpt-4.1-2025-04-14 (GamingAgent)",
423
  "score": 0,
424
  "detail_box_on_target": "0,0,0",
425
  "cracked_levels": "0,0,0"
426
  },
427
  {
428
+ "model": "๐ŸŽฎ gpt-4o-2024-11-20 (GamingAgent)",
429
  "score": 0,
430
  "detail_box_on_target": "0,0,0",
431
  "cracked_levels": "0,0,0"
432
  },
433
  {
434
+ "model": "๐ŸŽฎ o1-2024-12-17 (GamingAgent)",
435
  "score": 2.33,
436
  "detail_box_on_target": "2,2,3",
437
  "cracked_levels": "1,1,2"
438
  },
439
  {
440
+ "model": "๐ŸŽฎ o1-mini-2024-09-12 (GamingAgent)",
441
  "score": 1.33,
442
  "detail_box_on_target": "1,2,1",
443
  "cracked_levels": "0,1,0"
444
  },
445
  {
446
+ "model": "๐ŸŽฎ o3-2025-04-16 (GamingAgent)",
447
  "score": 8,
448
  "detail_box_on_target": "10,6",
449
  "cracked_levels": "5,3"
450
  },
451
  {
452
+ "model": "๐ŸŽฎ o4-mini-2025-04-16 (GamingAgent)",
453
  "score": 5.33,
454
  "detail_box_on_target": "4,6,6",
455
  "cracked_levels": "2,2,3"
 
461
  "cracked_levels": "0,0,0"
462
  },
463
  {
464
+ "model": "๐ŸŽฎ claude-opus-4-20250514 (GamingAgent)",
465
  "score": 4,
466
  "details": "4,4,4"
467
  },
468
  {
469
+ "model": "๐ŸŽฎ claude-sonnet-4-20250514 (GamingAgent)",
470
  "score": 3,
471
  "details": "2,2,5"
472
  },
473
  {
474
+ "model": "๐ŸŽฎ deepseek-r1-0528 (GamingAgent)",
475
  "score": 4.67,
476
  "details": "4,4,6"
477
  },
478
  {
479
+ "model": "๐ŸŽฎ qwen3-235B-A22B-fp8 (GamingAgent)",
480
  "score": 2.33,
481
  "details": "1,2,4"
482
  }
 
486
  "runs": 1,
487
  "results": [
488
  {
489
+ "model": "๐ŸŽฎ claude-3-5-sonnet-20241022 (GamingAgent)",
490
  "score": 2,
491
  "progress": "1:2/5",
492
  "evaluator result": "1/3"
493
  },
494
  {
495
+ "model": "๐ŸŽฎ claude-3-7-sonnet-20250219 (GamingAgent)",
496
  "score": 7,
497
  "progress": "2:2/9",
498
  "evaluator result": "5/11"
499
  },
500
  {
501
+ "model": "๐ŸŽฎ deepseek-r1-0120 (GamingAgent)",
502
  "score": 0,
503
  "progress": "0",
504
  "evaluator result": "1/5"
505
  },
506
  {
507
+ "model": "๐ŸŽฎ gemini-2.5-flash-preview-04-17 (GamingAgent)",
508
  "score": 4,
509
  "progress": "1:4/5",
510
  "evaluator result": "1/7"
511
  },
512
  {
513
+ "model": "๐ŸŽฎ gemini-2.5-pro-preview-05-06 (GamingAgent)",
514
  "score": 7,
515
  "progress": "2:2/9",
516
  "evaluator result": "2/3"
517
  },
518
  {
519
+ "model": "๐ŸŽฎ grok-3-mini-beta (GamingAgent)",
520
  "score": 0,
521
  "progress": "0",
522
  "evaluator result": "0"
523
  },
524
  {
525
+ "model": "๐ŸŽฎ llama-4-maverick-17b-128e-instruct-fp8 (GamingAgent)",
526
  "score": 0,
527
  "progress": "0",
528
  "evaluator result": "0"
529
  },
530
  {
531
+ "model": "๐ŸŽฎ gpt-4.1-2025-04-14 (GamingAgent)",
532
  "score": 2,
533
  "progress": "1:2/5",
534
  "evaluator result": "2/3"
535
  },
536
  {
537
+ "model": "๐ŸŽฎ gpt-4o-2024-11-20 (GamingAgent)",
538
  "score": 0,
539
  "progress": "0",
540
  "evaluator result": "0"
541
  },
542
  {
543
+ "model": "๐ŸŽฎ o1-2024-12-17 (GamingAgent)",
544
  "score": 16,
545
  "progress": "3: 2/8",
546
  "evaluator result": "6/11"
547
  },
548
  {
549
+ "model": "๐ŸŽฎ o1-mini-2024-09-12 (GamingAgent)",
550
  "score": 0,
551
  "progress": "0",
552
  "evaluator result": "1/5"
553
  },
554
  {
555
+ "model": "๐ŸŽฎ o3-2025-04-16 (GamingAgent)",
556
  "score": 16,
557
  "progress": "3: 2/8",
558
  "evaluator result": "1/2"
559
  },
560
  {
561
+ "model": "๐ŸŽฎ o4-mini-2025-04-16 (GamingAgent)",
562
  "score": 4,
563
  "progress": "1:4/5",
564
  "evaluator result": "2/5"
 
570
  "evaluator result": "0"
571
  },
572
  {
573
+ "model": "๐ŸŽฎ claude-opus-4-20250514 (GamingAgent)",
574
  "score": 6,
575
  "details": "6"
576
  },
577
  {
578
+ "model": "๐ŸŽฎ claude-sonnet-4-20250514 (GamingAgent)",
579
  "score": 3.67,
580
  "details": "3,4,4"
581
  },
582
  {
583
+ "model": "๐ŸŽฎ gemini-2.5-flash-preview-05-20 (GamingAgent)",
584
  "score": 4.33,
585
  "details": "3,4,6"
586
  }