Hao Xu
committed on
Commit
·
219a6f1
1
Parent(s):
15e1734
update loading method for core benchmark
Browse files
app.py
CHANGED
@@ -1,14 +1,15 @@
|
|
1 |
import gradio as gr
|
2 |
import os
|
3 |
import json
|
4 |
-
import
|
5 |
-
from huggingface_hub import HfApi
|
6 |
from datasets import load_dataset
|
7 |
import requests
|
8 |
import datetime
|
9 |
|
10 |
TOKEN = os.environ.get("HF_TOKEN")
|
11 |
OWNER = os.environ.get("OWNER")
|
|
|
12 |
RESULTS_COMMUNITY = f"{OWNER}/benchmark_results"
|
13 |
api = HfApi()
|
14 |
|
@@ -17,8 +18,13 @@ URL = os.environ.get("URL")
|
|
17 |
|
18 |
def load_data(source, refresh=False):
|
19 |
if source == "core":
|
20 |
-
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
22 |
else:
|
23 |
if refresh:
|
24 |
ds = load_dataset(RESULTS_COMMUNITY, download_mode="force_redownload")
|
@@ -34,9 +40,21 @@ def build_table(source, refresh=False):
|
|
34 |
data = load_data(source, refresh)
|
35 |
|
36 |
if source == "core":
|
37 |
-
headers = ["Benchmark", "Category", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)"
|
38 |
else:
|
39 |
-
headers = ["Benchmark", "Contributor", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
html = """
|
42 |
<table id="benchmarkTable" style="border-collapse: collapse; width: 100%;">
|
@@ -63,10 +81,14 @@ def build_table(source, refresh=False):
|
|
63 |
row = {
|
64 |
"Benchmark": hyperlink,
|
65 |
"Pile-train Dirty (%)": entry.get("Pile Dirty", -1),
|
66 |
-
"DCLM-baseline Dirty (%)": entry.get("DCLM Dirty", -1)
|
67 |
-
"CC-2025-05 Dirty (%)": entry.get("CC202505 Dirty", -1),
|
68 |
-
"CC-2025-08 Dirty (%)": entry.get("CC202508 Dirty", -1)
|
69 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
|
71 |
if source == "core":
|
72 |
row["Category"] = entry.get("Category", "")
|
@@ -165,7 +187,7 @@ def record_submission(benchmark_name, contributor, jsonl_file, hf_path, hf_split
|
|
165 |
user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
|
166 |
creation_date = json.loads(user_data.content)["createdAt"]
|
167 |
if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=10):
|
168 |
-
return
|
169 |
|
170 |
if not benchmark_name or not benchmark_name.strip():
|
171 |
return "β Please provide a benchmark name."
|
|
|
1 |
import gradio as gr
|
2 |
import os
|
3 |
import json
|
4 |
+
import re
|
5 |
+
from huggingface_hub import HfApi
|
6 |
from datasets import load_dataset
|
7 |
import requests
|
8 |
import datetime
|
9 |
|
10 |
TOKEN = os.environ.get("HF_TOKEN")
|
11 |
OWNER = os.environ.get("OWNER")
|
12 |
+
RESULTS_CORE = f"{OWNER}/core_benchmark_results"
|
13 |
RESULTS_COMMUNITY = f"{OWNER}/benchmark_results"
|
14 |
api = HfApi()
|
15 |
|
|
|
18 |
|
19 |
def load_data(source, refresh=False):
|
20 |
if source == "core":
|
21 |
+
if refresh:
|
22 |
+
ds = load_dataset(RESULTS_CORE, download_mode="force_redownload")
|
23 |
+
else:
|
24 |
+
ds = load_dataset(RESULTS_CORE)
|
25 |
+
data = []
|
26 |
+
for entry in ds['train']:
|
27 |
+
data.append(entry)
|
28 |
else:
|
29 |
if refresh:
|
30 |
ds = load_dataset(RESULTS_COMMUNITY, download_mode="force_redownload")
|
|
|
40 |
data = load_data(source, refresh)
|
41 |
|
42 |
if source == "core":
|
43 |
+
headers = ["Benchmark", "Category", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)"]
|
44 |
else:
|
45 |
+
headers = ["Benchmark", "Contributor", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)"]
|
46 |
+
|
47 |
+
cc_columns = set()
|
48 |
+
for entry in data:
|
49 |
+
for key in entry.keys():
|
50 |
+
match = re.match(r'CC(\d{4})(\d{2}) Dirty', key)
|
51 |
+
if match:
|
52 |
+
year, crawl = match.groups()
|
53 |
+
formatted_key = f"CC-{year}-{crawl} Dirty (%)"
|
54 |
+
cc_columns.add((year, crawl, formatted_key))
|
55 |
+
|
56 |
+
for year, crawl, formatted_key in sorted(cc_columns):
|
57 |
+
headers.append(formatted_key)
|
58 |
|
59 |
html = """
|
60 |
<table id="benchmarkTable" style="border-collapse: collapse; width: 100%;">
|
|
|
81 |
row = {
|
82 |
"Benchmark": hyperlink,
|
83 |
"Pile-train Dirty (%)": entry.get("Pile Dirty", -1),
|
84 |
+
"DCLM-baseline Dirty (%)": entry.get("DCLM Dirty", -1)
|
|
|
|
|
85 |
}
|
86 |
+
for key, value in entry.items():
|
87 |
+
match = re.match(r'CC(\d{4})(\d{2}) Dirty', key)
|
88 |
+
if match:
|
89 |
+
year, crawl = match.groups()
|
90 |
+
formatted_key = f"CC-{year}-{crawl} Dirty (%)"
|
91 |
+
row[formatted_key] = value
|
92 |
|
93 |
if source == "core":
|
94 |
row["Category"] = entry.get("Category", "")
|
|
|
187 |
user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
|
188 |
creation_date = json.loads(user_data.content)["createdAt"]
|
189 |
if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=10):
|
190 |
+
return "β This account is not authorized to submit."
|
191 |
|
192 |
if not benchmark_name or not benchmark_name.strip():
|
193 |
return "β Please provide a benchmark name."
|
data.json
DELETED
@@ -1,30 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 13.2, "DCLM Dirty": 28.4, "CC202505 Dirty": 13.5, "CC202508 Dirty": 9.0, "URL": "https://huggingface.co/datasets/cais/mmlu"},
|
3 |
-
{"Benchmark": "MMLU-Pro", "Category": "Knowledge and Reasoning", "Pile Dirty": 5.5, "DCLM Dirty": 16.2, "CC202505 Dirty": 7.1, "CC202508 Dirty": 5.4, "URL": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro"},
|
4 |
-
{"Benchmark": "BBH", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.1, "CC202505 Dirty": 1.4, "CC202508 Dirty": 1.4, "URL": "https://github.com/suzgunmirac/BIG-Bench-Hard/tree/main/bbh"},
|
5 |
-
{"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.8, "DCLM Dirty": 3.1, "CC202505 Dirty": 2.7, "CC202508 Dirty": 3.6, "URL": "https://github.com/ruixiangcui/AGIEval/tree/main/data/v1_1"},
|
6 |
-
{"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.89, "CC202508 Dirty": 2.0, "URL": "https://huggingface.co/datasets/Idavidrein/gpqa"},
|
7 |
-
{"Benchmark": "HLE", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.3, "CC202505 Dirty": 0.1, "CC202508 Dirty": 0.0, "URL": "https://huggingface.co/datasets/cais/hle"},
|
8 |
-
|
9 |
-
{"Benchmark": "AIME_2024", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 10.0, "CC202508 Dirty": 3.3, "URL": "https://huggingface.co/datasets/Maxwell-Jia/AIME_2024"},
|
10 |
-
{"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.4, "CC202505 Dirty": 5.0, "CC202508 Dirty": 0.8, "URL": "https://huggingface.co/datasets/openai/gsm8k"},
|
11 |
-
{"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.6, "DCLM Dirty": 3.2, "CC202505 Dirty": 0.6, "CC202508 Dirty": 7.8, "URL": "https://huggingface.co/datasets/HuggingFaceH4/MATH-500"},
|
12 |
-
{"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 5.6, "CC202508 Dirty": 1.6, "URL": "https://huggingface.co/datasets/juletxara/mgsm"},
|
13 |
-
|
14 |
-
{"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "CC202508 Dirty": 0.6, "URL": "https://huggingface.co/datasets/openai/openai_humaneval"},
|
15 |
-
{"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "CC202508 Dirty": 0.6, "URL": "https://huggingface.co/datasets/evalplus/humanevalplus"},
|
16 |
-
{"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "CC202508 Dirty": 0.0, "URL": "https://huggingface.co/datasets/livecodebench/code_generation"},
|
17 |
-
{"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.2, "CC202508 Dirty": 0.2, "URL": "https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified"},
|
18 |
-
{"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.4, "CC202505 Dirty": 1.0, "CC202508 Dirty": 1.4, "URL": "https://huggingface.co/datasets/google-research-datasets/mbpp"},
|
19 |
-
|
20 |
-
{"Benchmark": "ARC-Challenge", "Category": "Commonsense Understanding", "Pile Dirty": 1.8, "DCLM Dirty": 34.1, "CC202505 Dirty": 11.9, "CC202508 Dirty": 4.0, "URL": "https://huggingface.co/datasets/allenai/ai2_arc"},
|
21 |
-
{"Benchmark": "ARC-Easy", "Category": "Commonsense Understanding", "Pile Dirty": 1.3, "DCLM Dirty": 31.7, "CC202505 Dirty": 5.4, "CC202508 Dirty": 9.5, "URL": "https://huggingface.co/datasets/allenai/ai2_arc"},
|
22 |
-
{"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.1, "DCLM Dirty": 1.0, "CC202505 Dirty": 0.1, "CC202508 Dirty": 0.1, "URL": "https://huggingface.co/datasets/tau/commonsense_qa"},
|
23 |
-
{"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "CC202508 Dirty": 0.0, "URL": "https://huggingface.co/datasets/Rowan/hellaswag"},
|
24 |
-
{"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.8, "DCLM Dirty": 15.6, "CC202505 Dirty": 14.6, "CC202508 Dirty": 30.2, "URL": "https://huggingface.co/datasets/allenai/openbookqa"},
|
25 |
-
{"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.5, "CC202505 Dirty": 0.2, "CC202508 Dirty": 4.4, "URL": "https://huggingface.co/datasets/allenai/social_i_qa"},
|
26 |
-
{"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "CC202508 Dirty": 0.0, "URL": "https://huggingface.co/datasets/allenai/winogrande"},
|
27 |
-
|
28 |
-
{"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.0, "DCLM Dirty": 18.4, "CC202505 Dirty": 7.4, "CC202508 Dirty": 8.8, "URL": "https://huggingface.co/datasets/stanfordnlp/coqa"},
|
29 |
-
{"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.8, "DCLM Dirty": 40.1, "CC202505 Dirty": 2.7, "CC202508 Dirty": 33.0, "URL": "https://huggingface.co/datasets/rajpurkar/squad"}
|
30 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|