Spaces:

infini-gram-mini
/

Benchmark-Contamination-Monitoring-System

Running

App Files Community

Hao Xu committed on Jul 3

Commit

219a6f1

1 Parent(s): 15e1734

update loading method for core benchmark

Browse files

Files changed (2) hide show

app.py +32 -10
data.json +0 -30

app.py CHANGED Viewed

@@ -1,14 +1,15 @@
 import gradio as gr
 import os
 import json
-import pandas as pd
-from huggingface_hub import HfApi, hf_hub_download
 from datasets import load_dataset
 import requests
 import datetime
 TOKEN = os.environ.get("HF_TOKEN")
 OWNER = os.environ.get("OWNER")
 RESULTS_COMMUNITY = f"{OWNER}/benchmark_results"
 api = HfApi()
@@ -17,8 +18,13 @@ URL = os.environ.get("URL")
 def load_data(source, refresh=False):
     if source == "core":
-        with open("data.json", "r") as f:
-            data = json.load(f)
     else:
         if refresh:
             ds = load_dataset(RESULTS_COMMUNITY, download_mode="force_redownload")
@@ -34,9 +40,21 @@ def build_table(source, refresh=False):
     data = load_data(source, refresh)
     if source == "core":
-        headers = ["Benchmark", "Category", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)", "CC-2025-05 Dirty (%)", "CC-2025-08 Dirty (%)"]
     else:
-        headers = ["Benchmark", "Contributor", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)", "CC-2025-05 Dirty (%)", "CC-2025-08 Dirty (%)"]
     html = """
     <table id="benchmarkTable" style="border-collapse: collapse; width: 100%;">
@@ -63,10 +81,14 @@ def build_table(source, refresh=False):
         row = {
             "Benchmark": hyperlink,
             "Pile-train Dirty (%)": entry.get("Pile Dirty", -1),
-            "DCLM-baseline Dirty (%)": entry.get("DCLM Dirty", -1),
-            "CC-2025-05 Dirty (%)": entry.get("CC202505 Dirty", -1),
-            "CC-2025-08 Dirty (%)": entry.get("CC202508 Dirty", -1)
         }
         if source == "core":
             row["Category"] = entry.get("Category", "")
@@ -165,7 +187,7 @@ def record_submission(benchmark_name, contributor, jsonl_file, hf_path, hf_split
     user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
     creation_date = json.loads(user_data.content)["createdAt"]
     if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=10):
-        return format_error("This account is not authorized to submit.")
     if not benchmark_name or not benchmark_name.strip():
         return "❌ Please provide a benchmark name."

 import gradio as gr
 import os
 import json
+import re
+from huggingface_hub import HfApi
 from datasets import load_dataset
 import requests
 import datetime
 TOKEN = os.environ.get("HF_TOKEN")
 OWNER = os.environ.get("OWNER")
+RESULTS_CORE = f"{OWNER}/core_benchmark_results"
 RESULTS_COMMUNITY = f"{OWNER}/benchmark_results"
 api = HfApi()
 def load_data(source, refresh=False):
     if source == "core":
+        if refresh:
+            ds = load_dataset(RESULTS_CORE, download_mode="force_redownload")
+        else:
+            ds = load_dataset(RESULTS_CORE)
+        data = []
+        for entry in ds['train']:
+            data.append(entry)
     else:
         if refresh:
             ds = load_dataset(RESULTS_COMMUNITY, download_mode="force_redownload")
     data = load_data(source, refresh)
     if source == "core":
+        headers = ["Benchmark", "Category", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)"]
     else:
+        headers = ["Benchmark", "Contributor", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)"]
+    cc_columns = set()
+    for entry in data:
+        for key in entry.keys():
+            match = re.match(r'CC(\d{4})(\d{2}) Dirty', key)
+            if match:
+                year, crawl = match.groups()
+                formatted_key = f"CC-{year}-{crawl} Dirty (%)"
+                cc_columns.add((year, crawl, formatted_key))
+    for year, crawl, formatted_key in sorted(cc_columns):
+        headers.append(formatted_key)
     html = """
     <table id="benchmarkTable" style="border-collapse: collapse; width: 100%;">
         row = {
             "Benchmark": hyperlink,
             "Pile-train Dirty (%)": entry.get("Pile Dirty", -1),
+            "DCLM-baseline Dirty (%)": entry.get("DCLM Dirty", -1)
         }
+        for key, value in entry.items():
+            match = re.match(r'CC(\d{4})(\d{2}) Dirty', key)
+            if match:
+                year, crawl = match.groups()
+                formatted_key = f"CC-{year}-{crawl} Dirty (%)"
+                row[formatted_key] = value
         if source == "core":
             row["Category"] = entry.get("Category", "")
     user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
     creation_date = json.loads(user_data.content)["createdAt"]
     if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=10):
+        return "❌ This account is not authorized to submit."
     if not benchmark_name or not benchmark_name.strip():
         return "❌ Please provide a benchmark name."

data.json DELETED Viewed

@@ -1,30 +0,0 @@
-[
-  {"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 13.2, "DCLM Dirty": 28.4, "CC202505 Dirty": 13.5, "CC202508 Dirty": 9.0, "URL": "https://huggingface.co/datasets/cais/mmlu"},
-  {"Benchmark": "MMLU-Pro", "Category": "Knowledge and Reasoning", "Pile Dirty": 5.5, "DCLM Dirty": 16.2, "CC202505 Dirty": 7.1, "CC202508 Dirty": 5.4, "URL": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro"},
-  {"Benchmark": "BBH", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.1, "CC202505 Dirty": 1.4, "CC202508 Dirty": 1.4, "URL": "https://github.com/suzgunmirac/BIG-Bench-Hard/tree/main/bbh"},
-  {"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.8, "DCLM Dirty": 3.1, "CC202505 Dirty": 2.7, "CC202508 Dirty": 3.6, "URL": "https://github.com/ruixiangcui/AGIEval/tree/main/data/v1_1"},
-  {"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.89, "CC202508 Dirty": 2.0, "URL": "https://huggingface.co/datasets/Idavidrein/gpqa"},
-  {"Benchmark": "HLE", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.3, "CC202505 Dirty": 0.1, "CC202508 Dirty": 0.0, "URL": "https://huggingface.co/datasets/cais/hle"},
-  {"Benchmark": "AIME_2024", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 10.0, "CC202508 Dirty": 3.3, "URL": "https://huggingface.co/datasets/Maxwell-Jia/AIME_2024"},
-  {"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.4, "CC202505 Dirty": 5.0, "CC202508 Dirty": 0.8, "URL": "https://huggingface.co/datasets/openai/gsm8k"},
-  {"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.6, "DCLM Dirty": 3.2, "CC202505 Dirty": 0.6, "CC202508 Dirty": 7.8, "URL": "https://huggingface.co/datasets/HuggingFaceH4/MATH-500"},
-  {"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 5.6, "CC202508 Dirty": 1.6, "URL": "https://huggingface.co/datasets/juletxara/mgsm"},
-  {"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "CC202508 Dirty": 0.6, "URL": "https://huggingface.co/datasets/openai/openai_humaneval"},
-  {"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "CC202508 Dirty": 0.6, "URL": "https://huggingface.co/datasets/evalplus/humanevalplus"},
-  {"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "CC202508 Dirty": 0.0, "URL": "https://huggingface.co/datasets/livecodebench/code_generation"},
-  {"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.2, "CC202508 Dirty": 0.2, "URL": "https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified"},
-  {"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.4, "CC202505 Dirty": 1.0, "CC202508 Dirty": 1.4, "URL": "https://huggingface.co/datasets/google-research-datasets/mbpp"},
-  {"Benchmark": "ARC-Challenge", "Category": "Commonsense Understanding", "Pile Dirty": 1.8, "DCLM Dirty": 34.1, "CC202505 Dirty": 11.9, "CC202508 Dirty": 4.0, "URL": "https://huggingface.co/datasets/allenai/ai2_arc"},
-  {"Benchmark": "ARC-Easy", "Category": "Commonsense Understanding", "Pile Dirty": 1.3, "DCLM Dirty": 31.7, "CC202505 Dirty": 5.4, "CC202508 Dirty": 9.5, "URL": "https://huggingface.co/datasets/allenai/ai2_arc"},
-  {"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.1, "DCLM Dirty": 1.0, "CC202505 Dirty": 0.1, "CC202508 Dirty": 0.1, "URL": "https://huggingface.co/datasets/tau/commonsense_qa"},
-  {"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "CC202508 Dirty": 0.0, "URL": "https://huggingface.co/datasets/Rowan/hellaswag"},
-  {"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.8, "DCLM Dirty": 15.6, "CC202505 Dirty": 14.6, "CC202508 Dirty": 30.2, "URL": "https://huggingface.co/datasets/allenai/openbookqa"},
-  {"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.5, "CC202505 Dirty": 0.2, "CC202508 Dirty": 4.4, "URL": "https://huggingface.co/datasets/allenai/social_i_qa"},
-  {"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "CC202508 Dirty": 0.0, "URL": "https://huggingface.co/datasets/allenai/winogrande"},
-  {"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.0, "DCLM Dirty": 18.4, "CC202505 Dirty": 7.4, "CC202508 Dirty": 8.8, "URL": "https://huggingface.co/datasets/stanfordnlp/coqa"},
-  {"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.8, "DCLM Dirty": 40.1, "CC202505 Dirty": 2.7, "CC202508 Dirty": 33.0, "URL": "https://huggingface.co/datasets/rajpurkar/squad"}
-]