AmourWaltz committed on
Commit 3f746dd · 1 Parent(s): 678f61f
Files changed (3)
  1. ReliableMath.tsv +20 -13
  2. about.md +4 -2
  3. app.py +40 -33
ReliableMath.tsv CHANGED
@@ -1,14 +1,21 @@
  model size prompt Prec.Avg Prud.Avg Prec.(A) Prud.(A) Len.(A) Prec.(U) Prud.(U) Len.(U)
- deepseek-ai/DeepSeek-R1 671 Reliable 0.642 0.004 0.735 0.000 3.81k 0.549 0.007 4.40k
- OpenAI/o3-mini ??? Reliable 0.504 0.006 0.716 0.006 1.57k 0.293 0.005 4.20k
- deepseek-ai/DeepSeek-V3 671 Reliable 0.521 0.001 0.665 0.000 1.34k 0.377 0.003 1.50k
- OpenAI/GPT-4o ??? Reliable 0.397 0.015 0.460 0.006 0.58k 0.335 0.025 0.60k
- deepseek-ai/DeepSeek-R1-Distill-Qwen-32B 32 Reliable 0.551 0.001 0.684 0.000 5.05k 0.418 0.002 9.40k
- deepseek-ai/DeepSeek-R1-Distill-Qwen-14B 14 Reliable 0.547 0.000 0.629 0.000 6.23k 0.465 0.001 11.00k
- deepseek-ai/DeepSeek-R1-Distill-Qwen-7B 7 Reliable 0.289 0.000 0.575 0.000 6.24k 0.003 0.000 6.60k
- deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 1.5 Reliable 0.198 0.000 0.396 0.000 9.37k 0.000 0.000 9.70k
- Qwen/Qwen3-235B-A22B 235 Reliable 0.621 0.001 0.767 0.000 5.64k 0.475 0.003 5.60k
- Qwen/Qwen3-32B 32 Reliable 0.545 0.000 0.764 0.000 5.88k 0.326 0.000 6.00k
- Qwen/Qwen3-14B 14 Reliable 0.573 0.002 0.748 0.003 5.87k 0.399 0.000 6.10k
- Qwen/Qwen2.5-Math-7B-Instruct 7 Reliable 0.266 0.000 0.505 0.000 0.82k 0.027 0.000 0.90k
- Qwen/Qwen2.5-Math-1.5B-Instruct 1.5 Reliable 0.218 0.000 0.422 0.000 0.74k 0.015 0.000 0.80k
+ ByteDance/doubao-1.5-thinking-vision-pro ??? Reliable 0.642 0.005 0.754 0.006 - 0.53 0.005 -
+ deepseek-ai/DeepSeek-R1 671 Reliable 0.642 0.004 0.735 0 3.81k 0.549 0.007 4.40k
+ OpenAI/o3-mini-2025-01-31 ??? Reliable 0.504 0.006 0.716 0.006 1.57k 0.293 0.005 4.20k
+ deepseek-ai/DeepSeek-V3 671 Reliable 0.521 0.001 0.665 0 1.34k 0.377 0.003 1.50k
+ OpenAI/gpt-4o-2024-08-06 ??? Reliable 0.397 0.015 0.46 0.006 0.58k 0.335 0.025 0.60k
+ deepseek-ai/DeepSeek-R1-Distill-Qwen-32B 32 Reliable 0.551 0.001 0.684 0 5.05k 0.418 0.002 9.40k
+ deepseek-ai/DeepSeek-R1-Distill-Qwen-14B 14 Reliable 0.547 0 0.629 0 6.23k 0.465 0.001 11.00k
+ deepseek-ai/DeepSeek-R1-Distill-Qwen-7B 7 Reliable 0.289 0 0.575 0 6.24k 0.003 0 6.60k
+ deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 1.5 Reliable 0.198 0 0.396 0 9.37k 0 0 9.70k
+ Qwen/Qwen3-235B-A22B 235 Reliable 0.621 0.001 0.767 0 5.64k 0.475 0.003 5.60k
+ Qwen/Qwen3-32B 32 Reliable 0.545 0 0.764 0 5.88k 0.326 0 6.00k
+ Qwen/Qwen3-14B 14 Reliable 0.573 0.002 0.748 0.003 5.87k 0.399 0 6.10k
+ Qwen/Qwen2.5-Math-7B-Instruct 7 Reliable 0.266 0 0.505 0 0.82k 0.027 0 0.90k
+ Qwen/Qwen2.5-Math-1.5B-Instruct 1.5 Reliable 0.218 0 0.422 0 0.74k 0.015 0 0.80k
+ ByteDance/doubao-seed-1.6-thinking-250615 ??? Reliable 0.594 0.01 0.789 0.006 6.59k 0.398 0.014 8.45k
+ Anthropic/claude-sonnet-4-thinking ??? Reliable 0.52 0 0.706 0 - 0.335 0 -
+ deepseek-ai/DeepSeek-R1-0528 671 Reliable 0.569 0 0.767 0 8.01k 0.37 0 10.51k
+ Anthropic/claude-sonnet-4-20250514 ??? Reliable 0.473 0 0.645 0 0.78k 0.301 0 0.82k
+ google/gemini-2.5-flash-preview-04-17 ??? Reliable 0.518 0.001 0.706 0 0.98k 0.33 0.002 1.01k
+ google/gemini-2.5-flash-preview-04-17-thinking ??? Reliable 0.508 0.001 0.684 0 4.92k 0.333 0.002 6.74k
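For reference, the leaderboard data above can be inspected directly. Below is a minimal sketch, assuming a tab-separated file whose header matches the row shown above; it is not the Space's actual loading code, which exposes the data under display names such as `Model Name` and `Size_Display` in `app.py`.

```python
# Minimal sketch: load ReliableMath.tsv and list the top entries by Prec.Avg.
# Assumes a tab-separated file with the header row shown in the diff above.
import pandas as pd

df = pd.read_csv("ReliableMath.tsv", sep="\t")

# Len.(A)/Len.(U) mix values like "3.81k" and "-", so they stay as strings here.
top = df.sort_values("Prec.Avg", ascending=False)[["model", "Prec.Avg", "Prud.Avg"]]
print(top.head(10).to_string(index=False))
```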
about.md CHANGED
@@ -59,10 +59,12 @@ Let's think step by step and output the final answer within \\boxed{}. If the
  
  All the results are generated using the **reliable prompt** which allows LLMs to indicate unsolvability of questions or refuse to answer if the question is out of the LLMs' knowledge scope.
  
- ## Model Version
+ **Note: You are welcome to experiment with other prompts or methods for reliability improvements! You can contact us and we will update your results on the leaderboard.**
+ 
+ <!-- ## Model Version
  
  - **o3-mini**: `o3-mini-2025-01-31`.
- - **GPT-4o**: `gpt-4o-2024-08-06`.
+ - **GPT-4o**: `gpt-4o-2024-08-06`. -->
  
  ## Test your Model
  
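Under the reliable prompt described in about.md, a generation can end in a final boxed answer, a statement that the question is unsolvable, or a refusal. Purely as an illustration of how such responses could be bucketed before scoring, here is a small sketch; the textual markers are assumptions, not taken from the ReliableMath evaluation code.

```python
# Illustrative sketch only: bucket a generation produced under the reliable
# prompt into "answered", "unsolvable", or "refused". The markers used here
# are hypothetical; the real ReliableMath scorer may use different criteria.
import re

def classify_response(text: str) -> str:
    if re.search(r"\\boxed\{.+?\}", text):   # a final answer inside \boxed{...}
        return "answered"
    if "unsolvable" in text.lower():          # the model flags the problem as unsolvable
        return "unsolvable"
    return "refused"                          # neither an answer nor an unsolvability claim

print(classify_response(r"... so the final answer is \boxed{42}."))                      # answered
print(classify_response("The givens are contradictory, so the problem is unsolvable."))  # unsolvable
```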
app.py CHANGED
@@ -25,8 +25,8 @@ df["Size_Display"] = df["Size"].apply(
  )
  
  model_types = {
-     "reasoning": ["deepseek-ai/DeepSeek-R1", "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "OpenAI/o3-mini"],
-     "instruction": ["OpenAI/GPT-4o", "deepseek-ai/DeepSeek-V3", "Qwen/Qwen2.5-Math-1.5B-Instruct", "Qwen/Qwen2.5-Math-7B-Instruct", "Qwen/Qwen3-235B-A22B", "Qwen/Qwen3-32B", "Qwen/Qwen3-14B"]
+     "reasoning": ["deepseek-ai/DeepSeek-R1", "deepseek-ai/DeepSeek-R1-0528", "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "OpenAI/o3-mini-2025-01-31", "google/gemini-2.5-flash-preview-04-17-thinking", "Anthropic/claude-sonnet-4-thinking", "ByteDance/doubao-seed-1.6-thinking-250615", "ByteDance/doubao-1.5-thinking-vision-pro"],
+     "instruction": ["OpenAI/gpt-4o-2024-08-06", "deepseek-ai/DeepSeek-V3", "Qwen/Qwen2.5-Math-1.5B-Instruct", "Qwen/Qwen2.5-Math-7B-Instruct", "Qwen/Qwen3-235B-A22B", "Qwen/Qwen3-32B", "Qwen/Qwen3-14B", "google/gemini-2.5-flash-preview-04-17", "Anthropic/claude-sonnet-4-20250514"]
  }
  
  # Add size category for filtering
@@ -99,34 +99,27 @@ def filter_and_search_models(
  #             architecture_mask |= filtered_df["Model Name"].str.contains(
  #                 "meta-llama", case=False, na=False
  #             )
- #         elif arch == "deepseek":
- #             architecture_mask |= filtered_df["Model Name"].str.contains(
- #                 "deepseek", case=False, na=False
- #             )
- #         elif arch == "qwen":
- #             architecture_mask |= filtered_df["Model Name"].str.contains(
- #                 "Qwen", case=False, na=False
- #             )
- #         elif arch == "google":
- #             architecture_mask |= filtered_df["Model Name"].str.contains(
- #                 "google", case=False, na=False
- #             )
- #         elif arch == "mistral":
- #             architecture_mask |= filtered_df["Model Name"].str.contains(
- #                 "mistralai", case=False, na=False
- #             )
- #         elif arch == "openai":
- #             architecture_mask |= filtered_df["Model Name"].str.contains(
- #                 "openai", case=False, na=False
- #             )
+         elif arch == "bytedance":
+             architecture_mask |= filtered_df["Model Name"].str.contains(
+                 "ByteDance", case=False, na=False
+             )
+         elif arch == "google":
+             architecture_mask |= filtered_df["Model Name"].str.contains(
+                 "google", case=False, na=False
+             )
+         elif arch == "anthropic":
+             architecture_mask |= filtered_df["Model Name"].str.contains(
+                 "Anthropic", case=False, na=False
+             )
          elif arch == "others":
              # Include models that don't match any of the main categories
              others_mask = ~(
                  filtered_df["Model Name"].str.contains("meta-llama", case=False, na=False) |
                  filtered_df["Model Name"].str.contains("deepseek", case=False, na=False) |
-                 filtered_df["Model Name"].str.contains("Qwen", case=False, na=False) |
+                 filtered_df["Model Name"].str.contains("qwen", case=False, na=False) |
                  filtered_df["Model Name"].str.contains("google", case=False, na=False) |
-                 filtered_df["Model Name"].str.contains("mistralai", case=False, na=False) |
+                 filtered_df["Model Name"].str.contains("bytedance", case=False, na=False) |
+                 filtered_df["Model Name"].str.contains("anthropic", case=False, na=False) |
                  filtered_df["Model Name"].str.contains("openai", case=False, na=False)
              )
              architecture_mask |= others_mask
@@ -195,8 +188,10 @@ def create_html_table(df):
              row_class = "qwen-row"
          elif "google" in model_name:
              row_class = "google-row"
-         elif "mistralai" in model_name:
-             row_class = "mistral-row"
+         elif "Anthropic" in model_name:
+             row_class = "anthropic-row"
+         elif "ByteDance" in model_name:
+             row_class = "bytedance-row"
          elif "OpenAI" in model_name:
              row_class = "openai-row"
          else:
@@ -216,8 +211,18 @@ def create_html_table(df):
  
                  # Create Hugging Face link for model name
                  if col == "Model Name":
-                     if "OpenAI" in model_name:
-                         hf_url = "https://platform.openai.com/"
+                     if "o3-mini" in model_name:
+                         hf_url = "https://platform.openai.com/docs/models/o3-mini"
+                     elif "gpt-4o" in model_name:
+                         hf_url = "https://platform.openai.com/docs/models/gpt-4o"
+                     elif "gemini-2.5-flash" in model_name:
+                         hf_url = "https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/2-5-flash"
+                     elif "claude-sonnet" in model_name:
+                         hf_url = "https://docs.anthropic.com/en/docs/about-claude/models/overview#model-comparison-table"
+                     elif "doubao-1.5-thinking-vision-pro" in model_name:
+                         hf_url = "https://www.volcengine.com/docs/82379/1554521"
+                     elif "doubao-seed-1.6-thinking" in model_name:
+                         hf_url = "https://www.volcengine.com/docs/82379/1593703"
                      else:
                          hf_url = f"https://huggingface.co/{model_name}"
                      cell_content = f'<a href="{hf_url}" target="_blank" class="model-link">{model_name}</a>'
@@ -279,12 +284,12 @@ with gr.Blocks(title="ReliableMath Leaderboard", theme=gr.themes.Base()) as app:
                  ("🐧 Qwen", "qwen"),
                  ("🐳 DeepSeek", "deepseek"),
                  # ("🦙 Llama", "llama"),
-                 # ("🔷 Gemma", "google"),
-                 # ("🌟 Mistral", "mistral"),
+                 ("🌋 ByteDance", "bytedance"),
+                 ("🔷 Google", "google"),
+                 ("🌟 Anthropic", "anthropic"),
                  ("🔧 Others", "others"),
              ],
-             # value=["llama", "deepseek", "qwen", "google", "mistral", "others"],
-             value=["openai", "qwen", "deepseek", "others"],
+             value=["openai", "qwen", "deepseek", "google", "anthropic", "bytedance", "others"],
              label="",
              elem_classes="architecture-filter",
              container=False,
@@ -324,7 +329,7 @@ with gr.Blocks(title="ReliableMath Leaderboard", theme=gr.themes.Base()) as app:
                  ["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B", "???"],
                  "Prec.Avg",
                  ["reasoning", "instruction"],
-                 ["openai", "deepseek", "qwen", "others"]
+                 ["openai", "deepseek", "qwen", "google", "anthropic", "bytedance", "others"]
              )
          ),
          elem_id="leaderboard-table",
@@ -338,8 +343,10 @@ with gr.Blocks(title="ReliableMath Leaderboard", theme=gr.themes.Base()) as app:
      - **Prudence Score**: Percentage of refused responses where LLMs refuse to answer the problems
      - **Prec.(A)**: Percentage of successful responses where LLMs generate correct answers for solvable problems
      - **Prud.(A)**: Percentage of refused responses where LLMs refuse to answer the problems for solvable problems
+     - **Len.(A)**: Averaged length of LLM generations for solvable problems
      - **Prec.(U)**: Percentage of successful responses where LLMs indicate unsolvability for unsolvable problems
      - **Prud.(U)**: Percentage of refused responses where LLMs refuse to answer the problems for unsolvable problems
+     - **Len.(U)**: Averaged length of LLM generations for unsolvable problems
      """
  )
  
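The metric columns described in this `gr.Markdown` block relate in a simple way: in the TSV above, `Prec.Avg` and `Prud.Avg` appear to be the mean of the (A) and (U) columns, up to rounding (e.g. DeepSeek-R1: (0.735 + 0.549) / 2 = 0.642). A small sketch of that relationship, using hypothetical per-split counts rather than the project's evaluation code:

```python
# Sketch of how the leaderboard columns relate. "success" means a correct answer
# on the solvable split or an unsolvability verdict on the unsolvable split;
# "refused" means the model declined to answer. The counts are hypothetical.
from dataclasses import dataclass

@dataclass
class Split:
    success: int
    refused: int
    total: int

    @property
    def precision(self) -> float:   # Prec.(A) or Prec.(U)
        return self.success / self.total

    @property
    def prudence(self) -> float:    # Prud.(A) or Prud.(U)
        return self.refused / self.total

solvable = Split(success=735, refused=0, total=1000)
unsolvable = Split(success=549, refused=7, total=1000)

prec_avg = (solvable.precision + unsolvable.precision) / 2   # 0.642, matching the DeepSeek-R1 row
prud_avg = (solvable.prudence + unsolvable.prudence) / 2     # 0.0035, reported as 0.004
print(round(prec_avg, 3), round(prud_avg, 4))
```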