AmourWaltz committed
Commit · 3f746dd
Parent(s): 678f61f

- ReliableMath.tsv +20 -13
- about.md +4 -2
- app.py +40 -33
ReliableMath.tsv CHANGED
@@ -1,14 +1,21 @@
 model size prompt Prec.Avg Prud.Avg Prec.(A) Prud.(A) Len.(A) Prec.(U) Prud.(U) Len.(U)
-
-
-
-
-
-deepseek-ai/DeepSeek-R1-Distill-Qwen-
-deepseek-ai/DeepSeek-R1-Distill-Qwen-
-deepseek-ai/DeepSeek-R1-Distill-Qwen-
-
-Qwen/Qwen3-
-Qwen/Qwen3-
-Qwen/
-Qwen/Qwen2.5-Math-
+ByteDance/doubao-1.5-thinking-vision-pro ??? Reliable 0.642 0.005 0.754 0.006 - 0.53 0.005 -
+deepseek-ai/DeepSeek-R1 671 Reliable 0.642 0.004 0.735 0 3.81k 0.549 0.007 4.40k
+OpenAI/o3-mini-2025-01-31 ??? Reliable 0.504 0.006 0.716 0.006 1.57k 0.293 0.005 4.20k
+deepseek-ai/DeepSeek-V3 671 Reliable 0.521 0.001 0.665 0 1.34k 0.377 0.003 1.50k
+OpenAI/gpt-4o-2024-08-06 ??? Reliable 0.397 0.015 0.46 0.006 0.58k 0.335 0.025 0.60k
+deepseek-ai/DeepSeek-R1-Distill-Qwen-32B 32 Reliable 0.551 0.001 0.684 0 5.05k 0.418 0.002 9.40k
+deepseek-ai/DeepSeek-R1-Distill-Qwen-14B 14 Reliable 0.547 0 0.629 0 6.23k 0.465 0.001 11.00k
+deepseek-ai/DeepSeek-R1-Distill-Qwen-7B 7 Reliable 0.289 0 0.575 0 6.24k 0.003 0 6.60k
+deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 1.5 Reliable 0.198 0 0.396 0 9.37k 0 0 9.70k
+Qwen/Qwen3-235B-A22B 235 Reliable 0.621 0.001 0.767 0 5.64k 0.475 0.003 5.60k
+Qwen/Qwen3-32B 32 Reliable 0.545 0 0.764 0 5.88k 0.326 0 6.00k
+Qwen/Qwen3-14B 14 Reliable 0.573 0.002 0.748 0.003 5.87k 0.399 0 6.10k
+Qwen/Qwen2.5-Math-7B-Instruct 7 Reliable 0.266 0 0.505 0 0.82k 0.027 0 0.90k
+Qwen/Qwen2.5-Math-1.5B-Instruct 1.5 Reliable 0.218 0 0.422 0 0.74k 0.015 0 0.80k
+ByteDance/doubao-seed-1.6-thinking-250615 ??? Reliable 0.594 0.01 0.789 0.006 6.59k 0.398 0.014 8.45k
+Anthropic/claude-sonnet-4-thinking ??? Reliable 0.52 0 0.706 0 - 0.335 0 -
+deepseek-ai/DeepSeek-R1-0528 671 Reliable 0.569 0 0.767 0 8.01k 0.37 0 10.51k
+Anthropic/claude-sonnet-4-20250514 ??? Reliable 0.473 0 0.645 0 0.78k 0.301 0 0.82k
+google/gemini-2.5-flash-preview-04-17 ??? Reliable 0.518 0.001 0.706 0 0.98k 0.33 0.002 1.01k
+google/gemini-2.5-flash-preview-04-17-thinking ??? Reliable 0.508 0.001 0.684 0 4.92k 0.333 0.002 6.74k
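For anyone who wants to analyze the updated leaderboard data outside the app, a minimal loading sketch (assuming `ReliableMath.tsv` is tab-separated with the header row above, and that `???` marks undisclosed model sizes):

```python
import pandas as pd

# Load the leaderboard table; assumed tab-separated, with the header row
# model, size, prompt, Prec.Avg, Prud.Avg, Prec.(A), Prud.(A), Len.(A), ...
df = pd.read_csv("ReliableMath.tsv", sep="\t")

# "???" marks undisclosed parameter counts; coerce the rest to numbers.
df["size"] = pd.to_numeric(df["size"], errors="coerce")

# Rank models by average precision, the leaderboard's default sort key.
print(df.sort_values("Prec.Avg", ascending=False)[["model", "Prec.Avg", "Prud.Avg"]].head())
```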
about.md CHANGED
@@ -59,10 +59,12 @@ Let's think step by step and output the final answer within \\boxed{}. If the
 
 All the results are generated using the **reliable prompt**, which allows LLMs to indicate the unsolvability of questions or refuse to answer if a question is outside the LLMs' knowledge scope.
 
-## Model Version
+**Note: You are welcome to experiment with other prompts or methods for reliability improvement! You can contact us and we will update your results on the leaderboard.**
+
+<!-- ## Model Version
 
 - **o3-mini**: `o3-mini-2025-01-31`.
-- **GPT-4o**: `gpt-4o-2024-08-06`.
+- **GPT-4o**: `gpt-4o-2024-08-06`. -->
 
 ## Test your Model
 
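The full reliable prompt lives in about.md (only its first line is visible in the hunk header above). As a hedged illustration of how such a prompt and a refusal-aware scorer might fit together, here is a sketch; the template wording beyond the visible fragment and the `classify_response` helper are illustrative, not the repository's actual code:

```python
# Illustrative template only: it mirrors the visible instruction to box the
# final answer and to flag unsolvable or out-of-knowledge questions; the
# exact wording in about.md may differ.
RELIABLE_PROMPT = (
    "{question}\n"
    "Let's think step by step and output the final answer within \\boxed{{}}. "
    "If you find the problem unsolvable or beyond your knowledge, say so or "
    "refuse to answer instead of guessing."
)

def classify_response(text: str) -> str:
    """Crudely bucket a response for Prec./Prud.-style scoring (illustrative)."""
    lowered = text.lower()
    if "unsolvable" in lowered:
        return "unsolvable"  # counts toward Prec.(U) on unsolvable problems
    if "refuse" in lowered or "cannot answer" in lowered:
        return "refused"     # counts toward the Prudence scores
    if "\\boxed{" in text:
        return "answered"    # checked for correctness -> Prec.(A)
    return "other"

# Usage: prompt = RELIABLE_PROMPT.format(question="Is there an integer x with x^2 = 2?")
```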
app.py CHANGED
@@ -25,8 +25,8 @@ df["Size_Display"] = df["Size"].apply(
 )
 
 model_types = {
-    "reasoning": ["deepseek-ai/DeepSeek-R1", "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "OpenAI/o3-mini"],
-    "instruction": ["OpenAI/
+    "reasoning": ["deepseek-ai/DeepSeek-R1", "deepseek-ai/DeepSeek-R1-0528", "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "OpenAI/o3-mini-2025-01-31", "google/gemini-2.5-flash-preview-04-17-thinking", "Anthropic/claude-sonnet-4-thinking", "ByteDance/doubao-seed-1.6-thinking-250615", "ByteDance/doubao-1.5-thinking-vision-pro"],
+    "instruction": ["OpenAI/gpt-4o-2024-08-06", "deepseek-ai/DeepSeek-V3", "Qwen/Qwen2.5-Math-1.5B-Instruct", "Qwen/Qwen2.5-Math-7B-Instruct", "Qwen/Qwen3-235B-A22B", "Qwen/Qwen3-32B", "Qwen/Qwen3-14B", "google/gemini-2.5-flash-preview-04-17", "Anthropic/claude-sonnet-4-20250514"]
 }
 
 # Add size category for filtering
@@ -99,34 +99,27 @@ def filter_and_search_models(
             # architecture_mask |= filtered_df["Model Name"].str.contains(
             #     "meta-llama", case=False, na=False
             # )
-
-
-
-
-
-
-
-
-
-
-
-
-            # elif arch == "mistral":
-            #     architecture_mask |= filtered_df["Model Name"].str.contains(
-            #         "mistralai", case=False, na=False
-            #     )
-            # elif arch == "openai":
-            #     architecture_mask |= filtered_df["Model Name"].str.contains(
-            #         "openai", case=False, na=False
-            #     )
+            elif arch == "bytedance":
+                architecture_mask |= filtered_df["Model Name"].str.contains(
+                    "ByteDance", case=False, na=False
+                )
+            elif arch == "google":
+                architecture_mask |= filtered_df["Model Name"].str.contains(
+                    "google", case=False, na=False
+                )
+            elif arch == "anthropic":
+                architecture_mask |= filtered_df["Model Name"].str.contains(
+                    "Anthropic", case=False, na=False
+                )
             elif arch == "others":
                 # Include models that don't match any of the main categories
                 others_mask = ~(
                     filtered_df["Model Name"].str.contains("meta-llama", case=False, na=False) |
                     filtered_df["Model Name"].str.contains("deepseek", case=False, na=False) |
-                    filtered_df["Model Name"].str.contains("
+                    filtered_df["Model Name"].str.contains("qwen", case=False, na=False) |
                     filtered_df["Model Name"].str.contains("google", case=False, na=False) |
-                    filtered_df["Model Name"].str.contains("
+                    filtered_df["Model Name"].str.contains("bytedance", case=False, na=False) |
+                    filtered_df["Model Name"].str.contains("anthropic", case=False, na=False) |
                     filtered_df["Model Name"].str.contains("openai", case=False, na=False)
                 )
                 architecture_mask |= others_mask
@@ -195,8 +188,10 @@ def create_html_table(df):
             row_class = "qwen-row"
         elif "google" in model_name:
             row_class = "google-row"
-        elif "
-            row_class = "
+        elif "Anthropic" in model_name:
+            row_class = "anthropic-row"
+        elif "ByteDance" in model_name:
+            row_class = "bytedance-row"
         elif "OpenAI" in model_name:
             row_class = "openai-row"
         else:
@@ -216,8 +211,18 @@ def create_html_table(df):
 
             # Create Hugging Face link for model name
             if col == "Model Name":
-                if "
-                    hf_url = "https://platform.openai.com/"
+                if "o3-mini" in model_name:
+                    hf_url = "https://platform.openai.com/docs/models/o3-mini"
+                elif "gpt-4o" in model_name:
+                    hf_url = "https://platform.openai.com/docs/models/gpt-4o"
+                elif "gemini-2.5-flash" in model_name:
+                    hf_url = "https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash"
+                elif "claude-sonnet" in model_name:
+                    hf_url = "https://docs.anthropic.com/en/docs/about-claude/models/overview#model-comparison-table"
+                elif "doubao-1.5-thinking-vision-pro" in model_name:
+                    hf_url = "https://www.volcengine.com/docs/82379/1554521"
+                elif "doubao-seed-1.6-thinking" in model_name:
+                    hf_url = "https://www.volcengine.com/docs/82379/1593703"
                 else:
                     hf_url = f"https://huggingface.co/{model_name}"
                 cell_content = f'<a href="{hf_url}" target="_blank" class="model-link">{model_name}</a>'
@@ -279,12 +284,12 @@ with gr.Blocks(title="ReliableMath Leaderboard", theme=gr.themes.Base()) as app:
                     ("🧠 Qwen", "qwen"),
                     ("🐳 DeepSeek", "deepseek"),
                     # ("🦙 Llama", "llama"),
-
-
+                    ("🔴 ByteDance", "bytedance"),
+                    ("🔷 Google", "google"),
+                    ("🟠 Anthropic", "anthropic"),
                     ("🧩 Others", "others"),
                 ],
-
-                value=["openai", "qwen", "deepseek", "others"],
+                value=["openai", "qwen", "deepseek", "google", "anthropic", "bytedance", "others"],
                 label="",
                 elem_classes="architecture-filter",
                 container=False,
@@ -324,7 +329,7 @@ with gr.Blocks(title="ReliableMath Leaderboard", theme=gr.themes.Base()) as app:
                 ["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B", "???"],
                 "Prec.Avg",
                 ["reasoning", "instruction"],
-                ["openai", "deepseek", "qwen", "others"]
+                ["openai", "deepseek", "qwen", "google", "anthropic", "bytedance", "others"]
             )
         ),
         elem_id="leaderboard-table",
@@ -338,8 +343,10 @@ with gr.Blocks(title="ReliableMath Leaderboard", theme=gr.themes.Base()) as app:
         - **Prudence Score**: Percentage of refused responses where LLMs refuse to answer the problems
         - **Prec.(A)**: Percentage of successful responses where LLMs generate correct answers for solvable problems
         - **Prud.(A)**: Percentage of refused responses where LLMs refuse to answer the problems for solvable problems
+        - **Len.(A)**: Average length of LLM generations for solvable problems
         - **Prec.(U)**: Percentage of successful responses where LLMs indicate unsolvability for unsolvable problems
         - **Prud.(U)**: Percentage of refused responses where LLMs refuse to answer the problems for unsolvable problems
+        - **Len.(U)**: Average length of LLM generations for unsolvable problems
         """
     )
 
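To see how the new vendor filters compose, here is a self-contained sketch of the masking pattern this commit extends in `filter_and_search_models`; the `architecture_mask` helper and the toy DataFrame are illustrative, but the `str.contains` chains mirror the code above:

```python
import pandas as pd

# Toy stand-in for the leaderboard; only "Model Name" matters here.
df = pd.DataFrame({"Model Name": [
    "ByteDance/doubao-seed-1.6-thinking-250615",
    "Anthropic/claude-sonnet-4-20250514",
    "google/gemini-2.5-flash-preview-04-17",
    "Qwen/Qwen3-32B",
    "mistralai/Mistral-7B",  # no dedicated filter, so it lands in "others"
]})

def architecture_mask(df, selected):
    """OR together one case-insensitive substring mask per selected vendor."""
    vendors = {"bytedance": "ByteDance", "anthropic": "Anthropic",
               "google": "google", "qwen": "qwen", "deepseek": "deepseek",
               "openai": "openai", "llama": "meta-llama"}
    mask = pd.Series(False, index=df.index)
    for arch in selected:
        if arch == "others":
            # Complement of every named vendor, as in the others_mask branch.
            named = pd.Series(False, index=df.index)
            for pattern in vendors.values():
                named |= df["Model Name"].str.contains(pattern, case=False, na=False)
            mask |= ~named
        elif arch in vendors:
            mask |= df["Model Name"].str.contains(vendors[arch], case=False, na=False)
    return mask

# Selecting ByteDance plus Others keeps the Doubao row and the Mistral row.
print(df[architecture_mask(df, ["bytedance", "others"])])
```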