o
e @ sl d dl mZ dZdZde dZdZdZdZd d
ddd
dddddddddddZg dZ dZ
dZdZdZ
dS ) )Pathg333333?z3https://allenai.github.io/WildBench/gray_banner.pngzC
u 🦁 AI2 WildBench Leaderboard zaw @misc{wildbench2024,
title = {WildBench: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild},
author = {Bill Yuchen Lin and Khyathi Chandu and Faeze Brahman and Yuntian Deng and Abhilasha Ravichander and Valentina Pyatkin and Ronan Le Bras and Yejin Choi},
year = 2024,
url = {https://huggingface.co/spaces/allenai/WildBench},
}
ModelzOverall EloZInfoSekZCrtWrtCodeZReasonZEditMathZPlanZBrnstrmZRolPlyZAdvSekZDataAnaMisczTask-Avg Elo)zmodel name zelo overallInformation seekingCreative WritingCoding & Debugging ReasoningEditingr Planning
BrainstormingRole playingAdvice seeking
Data AnalysisOthersaverage)r r r r
r r r r
r r r r z
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'light') {
url.searchParams.set('__theme', 'light');
window.location.href = url.href;
}
}
a4
function scroll_top() {
console.log("Hello from Gradio!");
const bubbles = document.querySelectorAll('.bubble-wrap');
bubbles.forEach((bubble, index) => {
setTimeout(() => {
bubble.scrollTop = 0;
}, index * 100); // Delay of 100ms between each iteration
});
}
a **Tasks**: Info seeking (**InfoSek**), Creative Writing (**CrtWrt**), Coding&Debugging (**Code**), Reasoning (**Reason**), Editing (**Edit**), **Math**, Planning (**Plan**), Brainstorming (**Brnstrm**), Role playing (**RolPly**), Advice seeking (**AdvSek**), Data Analysis (**DataAna**)ai
code {
font-size: large;
}
footer {visibility: hidden}
.top-left-LP{
margin-top: 6px;
margin-left: 5px;
}
.markdown-text{font-size: 14pt}
.markdown-text-small{font-size: 13pt}
.markdown-text-tiny{font-size: 12pt}
.markdown-text-tiny-red{
font-size: 12pt;
color: red;
background-color: yellow;
font-color: red;
font-weight: bold;
}
th {
text-align: center;
font-size: 17px; /* Adjust the font size as needed */
}
td {
font-size: 15px; /* Adjust the font size as needed */
text-align: center;
}
.sample_button{
border: 1px solid #000000;
border-radius: 5px;
padding: 5px;
font-size: 15pt;
font-weight: bold;
margin: 5px;
}
.chat-common{
height: auto;
max-height: 400px;
min-height: 100px;
}
.chat-specific{
height: auto;
max-height: 600px;
min-height: 200px;
}
#od-benchmark-tab-table-button{
font-size: 15pt;
font-weight: bold;
}
.btn_boderline{
border: 1px solid #000000;
border-radius: 5px;
padding: 5px;
margin: 5px;
font-size: 15pt;
font-weight: bold;
}
.btn_boderline_next{
border: 0.1px solid #000000;
border-radius: 5px;
padding: 5px;
margin: 5px;
font-size: 15pt;
font-weight: bold;
}
.btn_boderline_gray{
border: 0.5px solid gray;
border-radius: 5px;
padding: 5px;
margin: 5px;
font-size: 15pt;
font-weight: italic;
}
.btn_boderline_selected{
border: 2px solid purple;
background-color: #f2f2f2;
border-radius: 5px;
padding: 5px;
margin: 5px;
font-size: 15pt;
font-weight: bold;
}
.accordion-label button span{
font-size: 14pt;
font-weight: bold;
}
#select-models span{
font-size: 10pt;
}
#select-tasks span{
font-size: 10pt;
}
.markdown-text-details{
margin: 10px;
padding: 10px;
}
button.selected[role="tab"][aria-selected="true"] {
font-size: 18px; /* or any other size you prefer */
font-weight: bold;
}
#od-benchmark-tab-table-ablation-button {
font-size: larger; /* Adjust the font size as needed */
}
.plotly-plot{
height: auto;
max-height: 600px;
min-height: 600px;
}
N)pathlibr
DEFAULT_LPZ
banner_urlBANNERZTITLEWINRATE_HEATMAP
CITATION_TEXTcolumn_namesall_task_typesjs_lightjs_code
TASK_TYPE_STRcss r r /home/day/WildBench/constants.py s8