import json
from datetime import datetime, timezone

import gradio as gr
import numpy as np
import pandas as pd

from constants import (
    BANNER,
    INTRODUCTION_TEXT,
    CITATION_TEXT,
    METRICS_TAB_TEXT,
    DIR_OUTPUT_REQUESTS,
    LEADERBOARD_CSS,
    EU_LANGUAGES,
    MULTILINGUAL_TAB_TEXT,
)
from init import is_model_on_hub, upload_file, load_all_info_from_dataset_hub
from utils_display import (
    AutoEvalColumn,
    MultilingualColumn,
    fields,
    make_clickable_model,
    styled_error,
    styled_message,
)

LAST_UPDATED = "Aug 15th 2025"

# Global state: per-model, per-language benchmark details (for the detailed
# view) and the set of languages whose columns are currently expanded.
benchmark_details = {}
expanded_languages = set()

column_names = {
    "MODEL": "Model",
    "Avg. WER": "Average WER ⬇️",
    "RTFx": "RTFx ⬆️️",
    "AMI WER": "AMI",
    "Earnings22 WER": "Earnings22",
    "Gigaspeech WER": "Gigaspeech",
    "LS Clean WER": "LS Clean",
    "LS Other WER": "LS Other",
    "SPGISpeech WER": "SPGISpeech",
    "Tedlium WER": "Tedlium",
    "Voxpopuli WER": "Voxpopuli",
}

eval_queue_repo, requested_models, csv_results, multilingual_csv_path = load_all_info_from_dataset_hub()

if not csv_results.exists():
    raise Exception(f"CSV file {csv_results} does not exist locally")

# Get the CSV with the English results and parse the columns
original_df = pd.read_csv(csv_results)

def formatter(x):
    """Format a cell for display: keep strings as-is, map -1 to "NA", round numbers."""
    if isinstance(x, str):
        return x
    if x == -1:
        return "NA"
    return round(x, 2)

for col in original_df.columns:
    if col == "model":
        original_df[col] = original_df[col].apply(make_clickable_model)
    else:
        original_df[col] = original_df[col].apply(formatter)  # numerical values

original_df.rename(columns=column_names, inplace=True)
original_df.sort_values(by="Average WER ⬇️", inplace=True)

COLS = [c.name for c in fields(AutoEvalColumn)]
TYPES = [c.type for c in fields(AutoEvalColumn)]

# Multilingual columns (dynamic based on the expansion state)
MULTILINGUAL_COLS = [c.name for c in fields(MultilingualColumn)]
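# Note on the multilingual table below (a summary of the code, not an enforced
# contract): the dataframe is rebuilt from the CSV on every language toggle,
# and `expanded_languages` decides whether a language is rendered as a single
# averaged column or as separate Avg/CoVoST/MLS/FLEURS columns.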
def _clean_score(score):
    """Map missing, zero, empty, or NaN benchmark values to None."""
    if score is None or pd.isna(score):
        return None
    if score == 0.0 or str(score).strip() in ("", "0"):
        return None
    return score

def create_multilingual_dataframe():
    """Create the multilingual dataframe with CoVoST, MLS, and FLEURS benchmark data."""
    global benchmark_details, expanded_languages

    if multilingual_csv_path is None or not multilingual_csv_path.exists():
        raise Exception("Multilingual CSV file not found")

    multilingual_raw_df = pd.read_csv(multilingual_csv_path)

    # Store detailed benchmark data for the click functionality
    benchmark_details = {}
    multilingual_data = []

    for _, row_data in multilingual_raw_df.iterrows():
        model_name = row_data["model"]
        model_details = {}
        row = {"Model": make_clickable_model(model_name)}

        # All individual dataset scores across all languages for this model
        all_datapoints = []

        for lang_code, lang_info in EU_LANGUAGES.items():
            # Read the individual benchmark scores from the CSV, using None for
            # missing values. Special cases: de has no MLS, pt has no CoVoST.
            covost_score = None if lang_code == "pt" else _clean_score(row_data.get(f"{lang_code}_covost", None))
            mls_score = None if lang_code == "de" else _clean_score(row_data.get(f"{lang_code}_mls", None))
            fleurs_score = _clean_score(row_data.get(f"{lang_code}_fleurs", None))

            # Per-language average over the available scores only (for display)
            available_scores = [s for s in (covost_score, mls_score, fleurs_score) if s is not None and s > 0]
            all_datapoints.extend(available_scores)
            avg_score = round(sum(available_scores) / len(available_scores), 2) if available_scores else None

            # Store individual scores for the detailed view, keeping only the
            # datasets that actually exist for this language
            lang_data = {"average": avg_score if avg_score is not None else "NA"}
            if covost_score is not None:
                lang_data["CoVoST"] = covost_score
            if mls_score is not None:
                lang_data["MLS"] = mls_score
            if fleurs_score is not None:
                lang_data["FLEURS"] = fleurs_score
            model_details[lang_code] = lang_data

        # Overall multilingual average over all individual datapoints
        row["Average WER ⬇️"] = round(np.mean(all_datapoints), 2) if all_datapoints else 0.0

        # RTFx is a single value per model; treat 0 and -1 as "NA", matching
        # the English leaderboard
        rtfx_value = row_data.get("rtfx", row_data.get("RTFx", 0.0))
        row["RTFx ⬆️️"] = "NA" if rtfx_value in (0, -1, "0", "0.0") else rtfx_value

        # Add the language columns based on the expansion state
        for lang_code, lang_info in EU_LANGUAGES.items():
            lang_col_name = f"{lang_info['flag']} {lang_info['name']}"
            model_data = model_details[lang_code]
            if lang_code in expanded_languages:
                # Show the average column AND one column per available dataset
                row[f"{lang_col_name} Avg"] = model_data["average"]
                for dataset in ("CoVoST", "MLS", "FLEURS"):
                    if dataset in model_data:
                        row[f"{lang_col_name} {dataset}"] = model_data[dataset]
            else:
                # Show only the average column
                row[lang_col_name] = model_data["average"]

        # Store the model details for the click functionality
        benchmark_details[model_name] = model_details
        multilingual_data.append(row)

    multilingual_df = pd.DataFrame(multilingual_data)
    return multilingual_df.sort_values(by="Average WER ⬇️")

def get_multilingual_datatypes(df):
    """Generate the datatypes for the multilingual dataframe columns."""
    # "markdown" lets the Model column render its clickable HTML link
    return ["markdown" if col == "Model" else "number" for col in df.columns]
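# Shape of the `benchmark_details` cache populated above (an illustrative
# sketch with dummy numbers, derived from the code rather than a schema
# defined elsewhere):
#   benchmark_details["org/model"] = {
#       "de": {"average": 7.12, "CoVoST": 6.9, "FLEURS": 7.3},  # de: no MLS
#       "pt": {"average": 9.40, "MLS": 9.1, "FLEURS": 9.7},     # pt: no CoVoST
#       ...
#   }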
def get_language_details(model, language_code):
    """Get the detailed breakdown for a specific model and language."""
    global benchmark_details

    if model not in benchmark_details or language_code not in benchmark_details[model]:
        return None

    language_info = EU_LANGUAGES.get(language_code, {})
    language_name = language_info.get("name", "Unknown")
    model_data = benchmark_details[model][language_code]

    # Use .get() here: the CoVoST and MLS keys are absent for pt and de
    return {
        "Language": f"{language_info.get('flag', '')} {language_name}",
        "Model": model,
        "CoVoST WER": model_data.get("CoVoST", "NA"),
        "MLS WER": model_data.get("MLS", "NA"),
        "FLEURS WER": model_data.get("FLEURS", "NA"),
        "Average WER": model_data["average"],
    }

def toggle_language_expansion(language_code):
    """Toggle the expansion of a language's columns when its button is clicked."""
    global expanded_languages

    if language_code in expanded_languages:
        expanded_languages.remove(language_code)
    else:
        expanded_languages.add(language_code)

    # Recreate the dataframe with the new expansion state
    updated_df = create_multilingual_dataframe()
    updated_datatypes = get_multilingual_datatypes(updated_df)
    return gr.update(value=updated_df, datatype=updated_datatypes)

# Initialize the multilingual dataframe
multilingual_df = create_multilingual_dataframe()
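# Request filename convention used by request_model below (derived from the
# code; the model id is a made-up example): "/" becomes "@" and "@@" separates
# the model id from the dataset list, e.g.
#   "org/some-model" + ["ESB Datasets tests only"]
#   -> "org@some-model@@ESB Datasets tests only.txt"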
def request_model(model_text, chbcoco2017):
    # Determine the selected checkboxes
    dataset_selection = []
    if chbcoco2017:
        dataset_selection.append("ESB Datasets tests only")

    if len(dataset_selection) == 0:
        return styled_error("You need to select at least one dataset")

    base_model_on_hub, error_msg = is_model_on_hub(model_text)
    if not base_model_on_hub:
        return styled_error(f"Base model '{model_text}' {error_msg}")

    # Construct the output dictionary
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    required_datasets = ", ".join(dataset_selection)
    eval_entry = {
        "date": current_time,
        "model": model_text,
        "datasets_selected": required_datasets,
    }

    # Prepare the file path
    DIR_OUTPUT_REQUESTS.mkdir(parents=True, exist_ok=True)
    fn_datasets = "@ ".join(dataset_selection)
    filename = model_text.replace("/", "@") + "@@" + fn_datasets
    if filename in requested_models:
        return styled_error(f"A request for this model '{model_text}' and dataset(s) was already made.")

    try:
        filename_ext = filename + ".txt"
        out_filepath = DIR_OUTPUT_REQUESTS / filename_ext

        # Write the request to a text file
        with open(out_filepath, "w") as f:
            f.write(json.dumps(eval_entry))

        upload_file(filename, out_filepath)

        # Include the file in the list of uploaded files
        requested_models.append(filename)

        # Remove the local file
        out_filepath.unlink()

        return styled_message("🤗 Your request has been submitted and will be evaluated soon!")
    except Exception:
        return styled_error("Error submitting request!")

def filter_main_table(show_proprietary=True):
    filtered_df = original_df.copy()

    # When proprietary models are hidden, keep only models with an "Open" license
    if not show_proprietary and "License" in filtered_df.columns:
        filtered_df = filtered_df[filtered_df["License"] == "Open"]

    return filtered_df

with gr.Blocks(css=LEADERBOARD_CSS) as demo:
    gr.HTML(BANNER, elem_id="banner")
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
            leaderboard_table = gr.components.Dataframe(
                value=original_df,
                datatype=TYPES,
                elem_id="leaderboard-table",
                interactive=False,
                visible=True,
            )
            with gr.Row():
                show_proprietary_checkbox = gr.Checkbox(
                    label="Show proprietary models",
                    value=True,
                    elem_id="show-proprietary-checkbox",
                )
            # Connect the checkbox to the filtering function
            show_proprietary_checkbox.change(
                filter_main_table,
                inputs=[show_proprietary_checkbox],
                outputs=leaderboard_table,
            )

        with gr.TabItem("🌍 Multilingual", elem_id="multilingual-benchmark-tab-table", id=1):
            gr.Markdown(MULTILINGUAL_TAB_TEXT, elem_classes="markdown-text")

            gr.Markdown("Click on a language button to show/hide detailed benchmark scores (CoVoST, MLS, FLEURS):")
            language_buttons = {}
            lang_codes = list(EU_LANGUAGES.keys())

            # Language toggle buttons: two rows of five
            for row_codes in (lang_codes[:5], lang_codes[5:]):
                with gr.Row():
                    for lang_code in row_codes:
                        lang_info = EU_LANGUAGES[lang_code]
                        button_label = f"{lang_info['flag']} {lang_info['name']}"
                        language_buttons[lang_code] = gr.Button(
                            button_label, variant="secondary", size="sm"
                        )

            multilingual_table = gr.components.Dataframe(
                value=multilingual_df,
                datatype=get_multilingual_datatypes(multilingual_df),
                elem_id="multilingual-table",
                interactive=False,
                visible=True,
            )

            # Connect the buttons to the expansion toggle. The factory function
            # pins `lang_code` per button; a bare lambda in the loop would close
            # over the loop variable and always toggle the last language.
            for lang_code, button in language_buttons.items():
                def create_toggle_func(code):
                    return lambda: toggle_language_expansion(code)
                button.click(create_toggle_func(lang_code), outputs=[multilingual_table])

        with gr.TabItem("📈 Metrics", elem_id="od-benchmark-tab-table", id=3):
            gr.Markdown(METRICS_TAB_TEXT, elem_classes="markdown-text")

        with gr.TabItem("✉️✨ Request a model here!", elem_id="od-benchmark-tab-table", id=4):
            with gr.Column():
                gr.Markdown("# ✉️✨ Request results for a new model here!", elem_classes="markdown-text")
            with gr.Column():
                gr.Markdown("Select a dataset:", elem_classes="markdown-text")
            with gr.Column():
                model_name_textbox = gr.Textbox(label="Model name (user_name/model_name)")
                chb_coco2017 = gr.Checkbox(label="COCO validation 2017 dataset", visible=False, value=True, interactive=False)
            with gr.Column():
                mdw_submission_result = gr.Markdown()
                btn_submit = gr.Button(value="🚀 Request")
                btn_submit.click(request_model, [model_name_textbox, chb_coco2017], mdw_submission_result)

        # Add an About section
        with gr.TabItem("🤗 About", elem_id="od-benchmark-tab-table", id=5):
            gr.Markdown("## About", elem_classes="markdown-text")
            gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            gr.Textbox(
                value=CITATION_TEXT,
                lines=7,
                label="Copy the BibTeX snippet to cite this source",
                elem_id="citation-button",
                show_copy_button=True,
            )

demo.launch(ssr_mode=False)
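# `get_language_details` is defined above but not wired into the UI. A minimal
# sketch of how it could be attached, assuming Gradio's Dataframe `.select`
# event and `gr.SelectData` (the column-to-language mapping is left as a
# placeholder and would need to account for the expansion state):
#
#     def on_table_select(evt: gr.SelectData):
#         row_idx, col_idx = evt.index
#         ...  # map row_idx/col_idx back to a model name and language code
#         return get_language_details(model_name, lang_code)
#
#     multilingual_table.select(on_table_select, outputs=[...])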