Hao Xu committed on
Commit
219a6f1
·
1 Parent(s): 15e1734

update loading method for core benchmark

Browse files
Files changed (2) hide show
  1. app.py +32 -10
  2. data.json +0 -30
app.py CHANGED
@@ -1,14 +1,15 @@
1
  import gradio as gr
2
  import os
3
  import json
4
- import pandas as pd
5
- from huggingface_hub import HfApi, hf_hub_download
6
  from datasets import load_dataset
7
  import requests
8
  import datetime
9
 
10
  TOKEN = os.environ.get("HF_TOKEN")
11
  OWNER = os.environ.get("OWNER")
 
12
  RESULTS_COMMUNITY = f"{OWNER}/benchmark_results"
13
  api = HfApi()
14
 
@@ -17,8 +18,13 @@ URL = os.environ.get("URL")
17
 
18
  def load_data(source, refresh=False):
19
  if source == "core":
20
- with open("data.json", "r") as f:
21
- data = json.load(f)
 
 
 
 
 
22
  else:
23
  if refresh:
24
  ds = load_dataset(RESULTS_COMMUNITY, download_mode="force_redownload")
@@ -34,9 +40,21 @@ def build_table(source, refresh=False):
34
  data = load_data(source, refresh)
35
 
36
  if source == "core":
37
- headers = ["Benchmark", "Category", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)", "CC-2025-05 Dirty (%)", "CC-2025-08 Dirty (%)"]
38
  else:
39
- headers = ["Benchmark", "Contributor", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)", "CC-2025-05 Dirty (%)", "CC-2025-08 Dirty (%)"]
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
  html = """
42
  <table id="benchmarkTable" style="border-collapse: collapse; width: 100%;">
@@ -63,10 +81,14 @@ def build_table(source, refresh=False):
63
  row = {
64
  "Benchmark": hyperlink,
65
  "Pile-train Dirty (%)": entry.get("Pile Dirty", -1),
66
- "DCLM-baseline Dirty (%)": entry.get("DCLM Dirty", -1),
67
- "CC-2025-05 Dirty (%)": entry.get("CC202505 Dirty", -1),
68
- "CC-2025-08 Dirty (%)": entry.get("CC202508 Dirty", -1)
69
  }
 
 
 
 
 
 
70
 
71
  if source == "core":
72
  row["Category"] = entry.get("Category", "")
@@ -165,7 +187,7 @@ def record_submission(benchmark_name, contributor, jsonl_file, hf_path, hf_split
165
  user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
166
  creation_date = json.loads(user_data.content)["createdAt"]
167
  if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=10):
168
- return format_error("This account is not authorized to submit.")
169
 
170
  if not benchmark_name or not benchmark_name.strip():
171
  return "❌ Please provide a benchmark name."
 
1
  import gradio as gr
2
  import os
3
  import json
4
+ import re
5
+ from huggingface_hub import HfApi
6
  from datasets import load_dataset
7
  import requests
8
  import datetime
9
 
10
  TOKEN = os.environ.get("HF_TOKEN")
11
  OWNER = os.environ.get("OWNER")
12
+ RESULTS_CORE = f"{OWNER}/core_benchmark_results"
13
  RESULTS_COMMUNITY = f"{OWNER}/benchmark_results"
14
  api = HfApi()
15
 
 
18
 
19
  def load_data(source, refresh=False):
20
  if source == "core":
21
+ if refresh:
22
+ ds = load_dataset(RESULTS_CORE, download_mode="force_redownload")
23
+ else:
24
+ ds = load_dataset(RESULTS_CORE)
25
+ data = []
26
+ for entry in ds['train']:
27
+ data.append(entry)
28
  else:
29
  if refresh:
30
  ds = load_dataset(RESULTS_COMMUNITY, download_mode="force_redownload")
 
40
  data = load_data(source, refresh)
41
 
42
  if source == "core":
43
+ headers = ["Benchmark", "Category", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)"]
44
  else:
45
+ headers = ["Benchmark", "Contributor", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)"]
46
+
47
+ cc_columns = set()
48
+ for entry in data:
49
+ for key in entry.keys():
50
+ match = re.match(r'CC(\d{4})(\d{2}) Dirty', key)
51
+ if match:
52
+ year, crawl = match.groups()
53
+ formatted_key = f"CC-{year}-{crawl} Dirty (%)"
54
+ cc_columns.add((year, crawl, formatted_key))
55
+
56
+ for year, crawl, formatted_key in sorted(cc_columns):
57
+ headers.append(formatted_key)
58
 
59
  html = """
60
  <table id="benchmarkTable" style="border-collapse: collapse; width: 100%;">
 
81
  row = {
82
  "Benchmark": hyperlink,
83
  "Pile-train Dirty (%)": entry.get("Pile Dirty", -1),
84
+ "DCLM-baseline Dirty (%)": entry.get("DCLM Dirty", -1)
 
 
85
  }
86
+ for key, value in entry.items():
87
+ match = re.match(r'CC(\d{4})(\d{2}) Dirty', key)
88
+ if match:
89
+ year, crawl = match.groups()
90
+ formatted_key = f"CC-{year}-{crawl} Dirty (%)"
91
+ row[formatted_key] = value
92
 
93
  if source == "core":
94
  row["Category"] = entry.get("Category", "")
 
187
  user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
188
  creation_date = json.loads(user_data.content)["createdAt"]
189
  if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=10):
190
+ return "❌ This account is not authorized to submit."
191
 
192
  if not benchmark_name or not benchmark_name.strip():
193
  return "❌ Please provide a benchmark name."
data.json DELETED
@@ -1,30 +0,0 @@
1
- [
2
- {"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 13.2, "DCLM Dirty": 28.4, "CC202505 Dirty": 13.5, "CC202508 Dirty": 9.0, "URL": "https://huggingface.co/datasets/cais/mmlu"},
3
- {"Benchmark": "MMLU-Pro", "Category": "Knowledge and Reasoning", "Pile Dirty": 5.5, "DCLM Dirty": 16.2, "CC202505 Dirty": 7.1, "CC202508 Dirty": 5.4, "URL": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro"},
4
- {"Benchmark": "BBH", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.1, "CC202505 Dirty": 1.4, "CC202508 Dirty": 1.4, "URL": "https://github.com/suzgunmirac/BIG-Bench-Hard/tree/main/bbh"},
5
- {"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.8, "DCLM Dirty": 3.1, "CC202505 Dirty": 2.7, "CC202508 Dirty": 3.6, "URL": "https://github.com/ruixiangcui/AGIEval/tree/main/data/v1_1"},
6
- {"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.89, "CC202508 Dirty": 2.0, "URL": "https://huggingface.co/datasets/Idavidrein/gpqa"},
7
- {"Benchmark": "HLE", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.3, "CC202505 Dirty": 0.1, "CC202508 Dirty": 0.0, "URL": "https://huggingface.co/datasets/cais/hle"},
8
-
9
- {"Benchmark": "AIME_2024", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 10.0, "CC202508 Dirty": 3.3, "URL": "https://huggingface.co/datasets/Maxwell-Jia/AIME_2024"},
10
- {"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.4, "CC202505 Dirty": 5.0, "CC202508 Dirty": 0.8, "URL": "https://huggingface.co/datasets/openai/gsm8k"},
11
- {"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.6, "DCLM Dirty": 3.2, "CC202505 Dirty": 0.6, "CC202508 Dirty": 7.8, "URL": "https://huggingface.co/datasets/HuggingFaceH4/MATH-500"},
12
- {"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 5.6, "CC202508 Dirty": 1.6, "URL": "https://huggingface.co/datasets/juletxara/mgsm"},
13
-
14
- {"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "CC202508 Dirty": 0.6, "URL": "https://huggingface.co/datasets/openai/openai_humaneval"},
15
- {"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "CC202508 Dirty": 0.6, "URL": "https://huggingface.co/datasets/evalplus/humanevalplus"},
16
- {"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "CC202508 Dirty": 0.0, "URL": "https://huggingface.co/datasets/livecodebench/code_generation"},
17
- {"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.2, "CC202508 Dirty": 0.2, "URL": "https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified"},
18
- {"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.4, "CC202505 Dirty": 1.0, "CC202508 Dirty": 1.4, "URL": "https://huggingface.co/datasets/google-research-datasets/mbpp"},
19
-
20
- {"Benchmark": "ARC-Challenge", "Category": "Commonsense Understanding", "Pile Dirty": 1.8, "DCLM Dirty": 34.1, "CC202505 Dirty": 11.9, "CC202508 Dirty": 4.0, "URL": "https://huggingface.co/datasets/allenai/ai2_arc"},
21
- {"Benchmark": "ARC-Easy", "Category": "Commonsense Understanding", "Pile Dirty": 1.3, "DCLM Dirty": 31.7, "CC202505 Dirty": 5.4, "CC202508 Dirty": 9.5, "URL": "https://huggingface.co/datasets/allenai/ai2_arc"},
22
- {"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.1, "DCLM Dirty": 1.0, "CC202505 Dirty": 0.1, "CC202508 Dirty": 0.1, "URL": "https://huggingface.co/datasets/tau/commonsense_qa"},
23
- {"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "CC202508 Dirty": 0.0, "URL": "https://huggingface.co/datasets/Rowan/hellaswag"},
24
- {"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.8, "DCLM Dirty": 15.6, "CC202505 Dirty": 14.6, "CC202508 Dirty": 30.2, "URL": "https://huggingface.co/datasets/allenai/openbookqa"},
25
- {"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.5, "CC202505 Dirty": 0.2, "CC202508 Dirty": 4.4, "URL": "https://huggingface.co/datasets/allenai/social_i_qa"},
26
- {"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "CC202508 Dirty": 0.0, "URL": "https://huggingface.co/datasets/allenai/winogrande"},
27
-
28
- {"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.0, "DCLM Dirty": 18.4, "CC202505 Dirty": 7.4, "CC202508 Dirty": 8.8, "URL": "https://huggingface.co/datasets/stanfordnlp/coqa"},
29
- {"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.8, "DCLM Dirty": 40.1, "CC202505 Dirty": 2.7, "CC202508 Dirty": 33.0, "URL": "https://huggingface.co/datasets/rajpurkar/squad"}
30
- ]