# app.py
import subprocess
import sys
import os
from pathlib import Path
import traceback
from datetime import datetime
from typing import Optional, Dict, Tuple, List


def setup_salt():
    """Clone and set up the SALT library, as in Colab."""
    try:
        import salt.dataset
        print("✅ SALT library already available")
        return True
    except ImportError:
        pass

    print("📥 Setting up SALT library...")
    try:
        salt_dir = Path("salt")
        if not salt_dir.exists():
            print("📂 Cloning SALT repository...")
            subprocess.check_call([
                "git", "clone", "https://github.com/sunbirdai/salt.git"
            ])
        else:
            print("📂 SALT repository already exists")

        salt_requirements = salt_dir / "requirements.txt"
        if salt_requirements.exists():
            print("📦 Installing SALT requirements...")
            subprocess.check_call([
                sys.executable, "-m", "pip", "install", "-q", "-r", str(salt_requirements)
            ])

        salt_path = str(salt_dir.absolute())
        if salt_path not in sys.path:
            sys.path.insert(0, salt_path)
            print(f"🔧 Added {salt_path} to Python path")

        # Verify that the import now works
        import salt.dataset
        print("✅ SALT library setup completed successfully")
        return True
    except Exception as e:
        print(f"❌ Failed to set up SALT: {e}")
        return False


# Set up SALT on startup
print("🚀 Starting SALT Translation Leaderboard...")
if not setup_salt():
    print("❌ Cannot continue without SALT library")
    sys.exit(1)
import gradio as gr
import pandas as pd
import json

# Import our modules
from src.test_set import (
    get_public_test_set,
    get_complete_test_set,
    create_test_set_download
)
from src.validation import validate_submission
from src.evaluation import evaluate_predictions, generate_evaluation_report
from src.leaderboard import (
    load_leaderboard,
    add_model_to_leaderboard,
    get_track_leaderboard,
    prepare_leaderboard_display
)
from src.plotting import (
    create_leaderboard_plot,
    create_language_pair_heatmap,
    create_performance_comparison_plot,
    create_language_pair_comparison_plot
)
from src.utils import sanitize_model_name, get_all_language_pairs
from config import *
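# The star import above is expected to provide, based on how they are used below:
# EVALUATION_TRACKS, MODEL_CATEGORIES, LANGUAGE_NAMES, GOOGLE_SUPPORTED_LANGUAGES
# and ALL_UG40_LANGUAGES, along with any other leaderboard settings in the config module.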
# Global variables for caching
current_leaderboard = None
public_test_set = None
complete_test_set = None


def initialize_data():
    """Initialize test sets and leaderboard data."""
    global public_test_set, complete_test_set, current_leaderboard
    try:
        print("📥 Loading test sets...")
        public_test_set = get_public_test_set()
        complete_test_set = get_complete_test_set()

        print("📊 Loading leaderboard...")
        current_leaderboard = load_leaderboard()

        # Debug leaderboard content
        print(f"Leaderboard loaded with {len(current_leaderboard)} entries")
        if not current_leaderboard.empty:
            print(f"Leaderboard columns: {list(current_leaderboard.columns)}")
            print(f"Sample row types: {current_leaderboard.dtypes.to_dict()}")
        else:
            print("Leaderboard is empty - will show empty interface")

        print("✅ Initialization complete!")
        print(f"  - Test set: {len(public_test_set):,} samples")
        print(f"  - Current models: {len(current_leaderboard)}")
        return True
    except Exception as e:
        print(f"❌ Initialization failed: {e}")
        traceback.print_exc()
        return False
def download_test_set() -> Tuple[str, str]:
    """Create downloadable test set and return file path and info."""
    try:
        global public_test_set
        if public_test_set is None:
            public_test_set = get_public_test_set()

        download_path, stats = create_test_set_download()

        info_msg = f"""
## 📥 SALT Test Set Downloaded Successfully!

### 📊 Dataset Statistics:
- **Total Samples**: {stats['total_samples']:,}
- **Languages**: {len(stats.get('languages', []))} ({', '.join(stats.get('languages', []))})
- **Google Comparable**: {stats.get('google_comparable_samples', 0):,} samples
- **Language Pairs**: {stats.get('language_pairs', 0)}

### 📈 Track Breakdown:
"""
        track_breakdown = stats.get('track_breakdown', {})
        for track_name, track_info in track_breakdown.items():
            info_msg += f"""
**{EVALUATION_TRACKS[track_name]['name']}**:
- Samples: {track_info.get('total_samples', 0):,}
- Language Pairs: {track_info.get('language_pairs', 0)}
"""

        info_msg += """
### 📄 File Format:
- `sample_id`: Unique identifier for each sample
- `source_text`: Text to be translated
- `source_language`: Source language code
- `target_language`: Target language code
- `domain`: Content domain (if available)
- `google_comparable`: Whether this pair can be compared with Google Translate

### 🚀 Next Steps:
1. **Run your model** on the source texts to generate translations
2. **Create a predictions file** with columns: `sample_id`, `prediction`
3. **Submit** your predictions using the submission tab
"""
        return download_path, info_msg
    except Exception as e:
        error_msg = f"❌ Error creating test set download: {str(e)}"
        return None, error_msg
def validate_submission_file(file, model_name: str, author: str, description: str) -> Tuple[str, Optional[pd.DataFrame], str]:
    """Validate uploaded prediction file."""
    try:
        if file is None:
            return "❌ Please upload a predictions file", None, "community"
        if not model_name.strip():
            return "❌ Please provide a model name", None, "community"

        # Handle different file input types
        if isinstance(file, bytes):
            file_content = file
        elif isinstance(file, str):
            if os.path.exists(file):
                with open(file, "rb") as f:
                    file_content = f.read()
            else:
                file_content = file.encode("utf-8")
        elif hasattr(file, "name") and os.path.exists(file.name):
            with open(file.name, "rb") as f:
                file_content = f.read()
        else:
            return "❌ Could not read uploaded file", None, "community"

        filename = getattr(file, "name", None) or getattr(file, "filename", None) or "predictions.csv"

        global complete_test_set
        if complete_test_set is None:
            complete_test_set = get_complete_test_set()

        validation_result = validate_submission(
            file_content, filename, complete_test_set, model_name, author, description
        )

        detected_category = validation_result.get("category", "community")
        if validation_result.get("can_evaluate", False):
            return validation_result["report"], validation_result["predictions"], detected_category
        else:
            return validation_result["report"], None, detected_category
    except Exception as e:
        return f"❌ Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}", None, "community"
def evaluate_submission(
    predictions_df: pd.DataFrame,
    model_name: str,
    author: str,
    description: str,
    detected_category: str,
) -> Tuple[str, pd.DataFrame, object, object]:
    """Evaluate validated predictions."""
    try:
        if predictions_df is None:
            return "❌ No valid predictions to evaluate", None, None, None

        global complete_test_set, current_leaderboard
        if complete_test_set is None:
            complete_test_set = get_complete_test_set()

        print(f"🔬 Starting evaluation for {model_name}...")
        evaluation_results = evaluate_predictions(predictions_df, complete_test_set, detected_category)

        if evaluation_results.get('error'):
            return f"❌ Evaluation error: {evaluation_results['error']}", None, None, None

        print("📊 Adding to leaderboard...")
        updated_leaderboard = add_model_to_leaderboard(
            model_name=sanitize_model_name(model_name),
            author=author or "Anonymous",
            evaluation_results=evaluation_results,
            model_category=detected_category,
            description=description or ""
        )
        current_leaderboard = updated_leaderboard

        report = generate_evaluation_report(evaluation_results, model_name)

        # Create visualizations
        summary_plot = create_performance_comparison_plot(updated_leaderboard, "google_comparable")
        google_leaderboard = get_track_leaderboard(updated_leaderboard, "google_comparable")
        display_leaderboard = prepare_leaderboard_display(google_leaderboard, "google_comparable")

        success_msg = f"""
## 🎉 Evaluation Complete!

### 📋 Model Information:
- **Model**: {model_name}
- **Category**: {MODEL_CATEGORIES.get(detected_category, {}).get('name', detected_category)}
- **Author**: {author or 'Anonymous'}

{report}
"""
        return success_msg, display_leaderboard, summary_plot, None
    except Exception as e:
        error_msg = f"❌ Evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        return error_msg, None, None, None
def refresh_track_leaderboard(track: str, search_query: str = "", category_filter: str = "all") -> Tuple[pd.DataFrame, object, object, str]:
    """Refresh leaderboard for a specific track with filters."""
    try:
        print(f"Refreshing {track} leaderboard...")

        global current_leaderboard
        if current_leaderboard is None:
            print("Loading leaderboard...")
            current_leaderboard = load_leaderboard()
        print(f"Leaderboard loaded with {len(current_leaderboard)} entries")

        # Get track leaderboard with robust error handling
        try:
            print(f"Getting track leaderboard for {track}...")
            track_leaderboard = get_track_leaderboard(current_leaderboard, track, category_filter=category_filter)
            print(f"Track leaderboard has {len(track_leaderboard)} entries")
        except Exception as e:
            print(f"Error getting track leaderboard: {e}")
            track_leaderboard = pd.DataFrame()

        # Apply search filter
        if search_query and not track_leaderboard.empty:
            try:
                print(f"Applying search filter: {search_query}")
                query_lower = search_query.lower()
                mask = (
                    track_leaderboard['model_name'].str.lower().str.contains(query_lower, na=False) |
                    track_leaderboard['author'].str.lower().str.contains(query_lower, na=False)
                )
                track_leaderboard = track_leaderboard[mask]
                print(f"After search filter: {len(track_leaderboard)} entries")
            except Exception as e:
                print(f"Error applying search filter: {e}")

        # Prepare display with error handling
        try:
            print("Preparing display...")
            display_df = prepare_leaderboard_display(track_leaderboard, track)
            print(f"Display prepared with {len(display_df)} rows")
        except Exception as e:
            print(f"Error preparing display: {e}")
            display_df = pd.DataFrame()

        # Create plots with error handling
        try:
            print("Creating ranking plot...")
            ranking_plot = create_leaderboard_plot(track_leaderboard, track)
        except Exception as e:
            print(f"Error creating ranking plot: {e}")
            ranking_plot = None

        try:
            print("Creating comparison plot...")
            comparison_plot = create_performance_comparison_plot(track_leaderboard, track)
        except Exception as e:
            print(f"Error creating comparison plot: {e}")
            comparison_plot = None

        # Generate stats text with safe formatting
        try:
            print("Generating stats...")
            track_config = EVALUATION_TRACKS[track]

            best_model = "None"
            best_score = 0.0
            if not track_leaderboard.empty:
                best_model = str(track_leaderboard.iloc[0]['model_name'])
                quality_col = f'{track}_quality'
                if quality_col in track_leaderboard.columns:
                    try:
                        score_val = track_leaderboard.iloc[0][quality_col]
                        best_score = float(score_val) if pd.notnull(score_val) else 0.0
                    except (ValueError, TypeError):
                        best_score = 0.0

            stats_text = f"""
### 📊 {track_config['name']} Statistics
- **Total Models**: {len(track_leaderboard)}
- **Best Model**: {best_model}
- **Best Score**: {best_score:.4f}

### 🔬 Track Information:
{track_config.get('description', 'No description available')}
"""
            print("Stats generated successfully")
        except Exception as e:
            print(f"Error generating stats: {e}")
            stats_text = f"Error loading {track} statistics: {str(e)}"

        print("Track refresh completed successfully")
        return display_df, ranking_plot, comparison_plot, stats_text

    except Exception as e:
        error_msg = f"Error loading {track} leaderboard: {str(e)}"
        print(f"MAIN ERROR: {error_msg}")
        traceback.print_exc()
        return pd.DataFrame(), None, None, error_msg
def get_language_pair_comparison(track: str) -> Tuple[pd.DataFrame, object]:
    """Get language pair comparison data and visualization."""
    try:
        global current_leaderboard
        if current_leaderboard is None:
            return pd.DataFrame(), None

        track_leaderboard = get_track_leaderboard(current_leaderboard, track)
        if track_leaderboard.empty:
            return pd.DataFrame(), None

        # Create language pair comparison table
        pairs_data = []
        track_languages = EVALUATION_TRACKS[track]["languages"]

        for src in track_languages:
            for tgt in track_languages:
                if src == tgt:
                    continue

                pair_key = f"{src}_to_{tgt}"
                pair_display = f"{LANGUAGE_NAMES.get(src, src)} → {LANGUAGE_NAMES.get(tgt, tgt)}"

                for _, model in track_leaderboard.iterrows():
                    # Extract detailed results if available
                    detailed_col = f'detailed_{track}'
                    if detailed_col in model and pd.notna(model[detailed_col]):
                        try:
                            detailed_results = json.loads(model[detailed_col])
                            pair_metrics = detailed_results.get('pair_metrics', {})
                            if pair_key in pair_metrics:
                                metrics = pair_metrics[pair_key]
                                pairs_data.append({
                                    'Language Pair': pair_display,
                                    'Model': model['model_name'],
                                    'Category': model['model_category'],
                                    'Quality Score': metrics.get('quality_score', {}).get('mean', 0),
                                    'BLEU': metrics.get('bleu', {}).get('mean', 0),
                                    'ChrF': metrics.get('chrf', {}).get('mean', 0),
                                    'Samples': metrics.get('sample_count', 0)
                                })
                        except (json.JSONDecodeError, KeyError):
                            continue

        pairs_df = pd.DataFrame(pairs_data)
        if pairs_df.empty:
            return pd.DataFrame(), None

        # Create visualization
        comparison_plot = create_language_pair_comparison_plot(pairs_df, track)
        return pairs_df, comparison_plot
    except Exception as e:
        print(f"Error in language pair comparison: {e}")
        return pd.DataFrame(), None
# Initialize data on startup
initialization_success = initialize_data()

# Create Gradio interface
with gr.Blocks(
    title="🏆 SALT Translation Leaderboard",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1600px !important;
        margin: 0 auto;
    }
    /* Force readable text in all themes */
    .markdown, .gr-markdown, .gr-html {
        color: var(--body-text-color) !important;
        background: var(--background-fill-primary) !important;
    }
    .markdown h1, .markdown h2, .markdown h3,
    .gr-markdown h1, .gr-markdown h2, .gr-markdown h3 {
        color: var(--body-text-color) !important;
    }
    .markdown p, .markdown li, .markdown strong,
    .gr-markdown p, .gr-markdown li, .gr-markdown strong {
        color: var(--body-text-color) !important;
    }
    /* Table styling */
    .dataframe, .gr-dataframe {
        color: var(--body-text-color) !important;
        background: var(--background-fill-primary) !important;
    }
    /* Button and input styling */
    .gr-button, .gr-textbox, .gr-dropdown {
        color: var(--body-text-color) !important;
    }
    /* Ensure plot backgrounds work in both themes */
    .plot-container {
        background: var(--background-fill-primary) !important;
    }
    """
) as demo:

    # Header
    gr.HTML("""
    <div style="text-align: center; margin-bottom: 2rem; padding: 2rem; background: linear-gradient(135deg, #1e3a8a 0%, #3730a3 50%, #1e40af 100%); color: white !important; border-radius: 10px;">
        <h1 style="color: white !important;">🏆 SALT Translation Leaderboard</h1>
        <p style="color: white !important;"><strong>Rigorous Evaluation of Translation Models on Ugandan Languages</strong></p>
        <p style="color: white !important;">Three-tier evaluation • Statistical confidence intervals • Research-grade analysis</p>
    </div>
    """)

    # Status indicator
    if initialization_success:
        status_msg = "✅ System initialized successfully"
    else:
        status_msg = "❌ System initialization failed - some features may not work"
    gr.Markdown(f"**System Status**: {status_msg}")
    with gr.Tabs():
        # Tab 1: Download Test Set
        with gr.Tab("📥 Download Test Set", id="download"):
            gr.Markdown("""
            ## 📥 Get the SALT Test Set
            Download our test set for translation model evaluation.
            """)

            download_btn = gr.Button("📥 Download Test Set", variant="primary", size="lg")

            with gr.Row():
                with gr.Column():
                    download_file = gr.File(label="📄 Test Set File", interactive=False)
                with gr.Column():
                    download_info = gr.Markdown()

        # Tab 2: Submit Predictions
        with gr.Tab("📤 Submit Predictions", id="submit"):
            gr.Markdown("""
            ## 🎯 Submit Your Model's Predictions
            Upload predictions for evaluation across all tracks.
            """)

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 📋 Model Information")
                    model_name_input = gr.Textbox(
                        label="🤖 Model Name",
                        placeholder="e.g., MyTranslator-v2.0",
                        info="Unique name for your model"
                    )
                    author_input = gr.Textbox(
                        label="👤 Author/Organization",
                        placeholder="Your name or organization",
                        value="Anonymous"
                    )
                    description_input = gr.Textbox(
                        label="📝 Model Description",
                        placeholder="Architecture, training data, special features...",
                        lines=4
                    )
                    predictions_file = gr.File(
                        label="📄 Predictions File",
                        file_types=[".csv", ".tsv", ".json"]
                    )
                    validate_btn = gr.Button("✅ Validate Submission", variant="secondary")
                    submit_btn = gr.Button("🚀 Submit for Evaluation", variant="primary", interactive=False)

                with gr.Column(scale=1):
                    validation_output = gr.Markdown()
                    gr.Markdown("### 📊 Evaluation Results")
                    evaluation_output = gr.Markdown()

            with gr.Row():
                with gr.Column():
                    submission_plot = gr.Plot(label="📈 Performance Analysis")
                with gr.Column():
                    results_table = gr.Dataframe(label="🏆 Updated Leaderboard", interactive=False)
        # Tab 3: Google-Comparable Track
        with gr.Tab("🤖 Google-Comparable Track", id="google_track"):
            gr.Markdown(f"""
            ## {EVALUATION_TRACKS['google_comparable']['name']}
            **{EVALUATION_TRACKS['google_comparable']['description']}**

            This track evaluates models on language pairs supported by Google Translate,
            enabling direct comparison with commercial baselines.
            """)

            with gr.Row():
                with gr.Column(scale=2):
                    google_search = gr.Textbox(label="🔍 Search Models", placeholder="Search by model name, author...")
                with gr.Column(scale=1):
                    google_category = gr.Dropdown(
                        label="🏷️ Category Filter",
                        choices=["all"] + list(MODEL_CATEGORIES.keys()),
                        value="all"
                    )
                with gr.Column(scale=1):
                    google_refresh = gr.Button("🔄 Refresh", variant="secondary")

            google_stats = gr.Markdown()

            with gr.Row():
                with gr.Column():
                    google_ranking_plot = gr.Plot(label="🏆 Rankings")
                with gr.Column():
                    google_comparison_plot = gr.Plot(label="📊 Performance Comparison")

            google_leaderboard = gr.Dataframe(label="🏆 Google-Comparable Leaderboard", interactive=False)
        # Tab 4: UG40-Complete Track
        with gr.Tab("🌍 UG40-Complete Track", id="ug40_track"):
            gr.Markdown(f"""
            ## {EVALUATION_TRACKS['ug40_complete']['name']}
            **{EVALUATION_TRACKS['ug40_complete']['description']}**

            This track evaluates models on all UG40 language pairs,
            providing a comprehensive assessment of Ugandan language translation capabilities.
            """)

            with gr.Row():
                with gr.Column(scale=2):
                    ug40_search = gr.Textbox(label="🔍 Search Models", placeholder="Search by model name, author...")
                with gr.Column(scale=1):
                    ug40_category = gr.Dropdown(
                        label="🏷️ Category Filter",
                        choices=["all"] + list(MODEL_CATEGORIES.keys()),
                        value="all"
                    )
                with gr.Column(scale=1):
                    ug40_refresh = gr.Button("🔄 Refresh", variant="secondary")

            ug40_stats = gr.Markdown()

            with gr.Row():
                with gr.Column():
                    ug40_ranking_plot = gr.Plot(label="🏆 Rankings")
                with gr.Column():
                    ug40_comparison_plot = gr.Plot(label="📊 Performance Comparison")

            ug40_leaderboard = gr.Dataframe(label="🏆 UG40-Complete Leaderboard", interactive=False)
        # Tab 5: Language Pair Analysis
        with gr.Tab("🔍 Language Pair Analysis", id="pairs_analysis"):
            gr.Markdown("""
            ## 🔍 Language Pair Performance Analysis
            Compare model performance across individual language pairs with detailed breakdowns.
            """)

            with gr.Row():
                with gr.Column(scale=1):
                    pairs_track_select = gr.Dropdown(
                        label="📊 Select Track",
                        choices=list(EVALUATION_TRACKS.keys()),
                        value="google_comparable"
                    )
                with gr.Column(scale=1):
                    pairs_refresh = gr.Button("🔍 Analyze Language Pairs", variant="primary")

            pairs_comparison_plot = gr.Plot(label="📊 Language Pair Comparison")
            pairs_table = gr.Dataframe(label="📋 Language Pair Performance", interactive=False)
        # Tab 6: Documentation
        with gr.Tab("📚 Documentation", id="docs"):
            gr.Markdown(f"""
            # 📚 SALT Translation Leaderboard Documentation

            ## 🎯 Overview
            The SALT Translation Leaderboard provides rigorous evaluation of translation models
            on Ugandan languages, using dedicated evaluation tracks for fair comparison.

            ## 🏆 Evaluation Tracks

            **1. 🤖 Google-Comparable Track**
            - **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
            - **Purpose**: Fair comparison with commercial translation systems
            - **Language Pairs**: {len(GOOGLE_SUPPORTED_LANGUAGES) * (len(GOOGLE_SUPPORTED_LANGUAGES) - 1)}

            **2. 🌍 UG40-Complete Track**
            - **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
            - **Purpose**: Comprehensive Ugandan language capability assessment
            - **Language Pairs**: {len(ALL_UG40_LANGUAGES) * (len(ALL_UG40_LANGUAGES) - 1)}

            ## 📊 Evaluation Metrics

            ### Primary Metrics
            - **Quality Score**: Composite metric (0-1) combining BLEU, ChrF, and error rates
            - **BLEU**: Bilingual Evaluation Understudy (0-100)
            - **ChrF**: Character-level F-score (0-1)

            ### Model Categories
            Models are automatically categorized for fair comparison:
            - **🏢 Commercial**: Production translation systems
            - **🔬 Research**: Academic and research institution models
            - **📊 Baseline**: Simple baseline and reference models
            - **👥 Community**: User-submitted models

            ## 📤 Submission Process

            ### Step 1: Download Test Set
            1. Click "Download Test Set" in the first tab
            2. Save the test set file

            ### Step 2: Generate Predictions
            1. Load the test set in your evaluation pipeline
            2. For each row, translate `source_text` from `source_language` to `target_language`
            3. Save the results as a CSV with columns `sample_id`, `prediction` (see the sketch under File Formats below)

            ### Step 3: Submit & Evaluate
            1. Fill in model information
            2. Upload your predictions file
            3. Review the validation report
            4. Submit for evaluation

            ## 📄 File Formats

            ### Test Set Format
            ```csv
            sample_id,source_text,source_language,target_language,domain,google_comparable
            salt_000001,"Hello world",eng,lug,general,true
            salt_000002,"How are you?",eng,ach,conversation,true
            ```

            ### Predictions Format
            ```csv
            sample_id,prediction
            salt_000001,"Amakuru ensi"
            salt_000002,"Ibino nining?"
            ```
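
            A minimal sketch of producing this file with pandas (the file names and the
            `my_model.translate` call are placeholders for your own paths and model API):

            ```python
            import pandas as pd

            test = pd.read_csv("salt_test_set.csv")  # the downloaded test set
            predictions = []
            for row in test.itertuples():
                # hypothetical model interface: adapt to your own system
                predictions.append(my_model.translate(row.source_text, row.source_language, row.target_language))
            test["prediction"] = predictions
            test[["sample_id", "prediction"]].to_csv("predictions.csv", index=False)
            ```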

            ## 🤝 Contributing
            This leaderboard is designed for the research community. When using results:
            1. Consider the appropriate track for your comparison
            2. Report confidence intervals when available
            3. Acknowledge the model category in comparisons

            ---
            *For questions, contact the team at research@sunbird.ai*
            """)
    # Event handlers
    predictions_validated = gr.State(value=None)
    detected_category_state = gr.State(value="community")

    # Download test set
    download_btn.click(
        fn=download_test_set,
        outputs=[download_file, download_info]
    )

    # Validate predictions
    def handle_validation(file, model_name, author, description):
        report, predictions, category = validate_submission_file(file, model_name, author, description)
        can_evaluate = predictions is not None
        if can_evaluate:
            button_status = "\n\n✅ **Ready to submit for evaluation!**"
        else:
            button_status = "\n\n❌ **Please fix the issues above before evaluation**"
        enhanced_report = report + button_status
        return (
            enhanced_report,
            predictions,
            category,
            gr.update(interactive=can_evaluate)
        )

    validate_btn.click(
        fn=handle_validation,
        inputs=[predictions_file, model_name_input, author_input, description_input],
        outputs=[validation_output, predictions_validated, detected_category_state, submit_btn]
    )

    # Submit for evaluation
    submit_btn.click(
        fn=evaluate_submission,
        inputs=[predictions_validated, model_name_input, author_input, description_input, detected_category_state],
        # The hidden Plot serves as a sink for the unused fourth return value of evaluate_submission
        outputs=[evaluation_output, results_table, submission_plot, gr.Plot(visible=False)]
    )

    # Track leaderboard refresh functions
    google_refresh.click(
        fn=lambda *args: refresh_track_leaderboard("google_comparable", *args),
        inputs=[google_search, google_category],
        outputs=[google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats]
    )
    ug40_refresh.click(
        fn=lambda *args: refresh_track_leaderboard("ug40_complete", *args),
        inputs=[ug40_search, ug40_category],
        outputs=[ug40_leaderboard, ug40_ranking_plot, ug40_comparison_plot, ug40_stats]
    )

    # Language pair analysis
    pairs_refresh.click(
        fn=get_language_pair_comparison,
        inputs=[pairs_track_select],
        outputs=[pairs_table, pairs_comparison_plot]
    )
    # Load initial data on app start
    def load_initial_data():
        try:
            print("Loading initial data...")
            global current_leaderboard

            # Make sure we have a leaderboard
            if current_leaderboard is None:
                current_leaderboard = load_leaderboard()
            print(f"Current leaderboard has {len(current_leaderboard)} entries")

            # Try to load Google track data
            try:
                google_data = refresh_track_leaderboard("google_comparable", "", "all")
                print("Successfully loaded Google track data")
                return google_data
            except Exception as e:
                print(f"Error loading Google track: {e}")
                # Return empty data if there's an error
                empty_df = pd.DataFrame()
                return (empty_df, None, None, "No data available")
        except Exception as e:
            print(f"Error in load_initial_data: {e}")
            empty_df = pd.DataFrame()
            return (empty_df, None, None, "Error loading data")

    demo.load(
        fn=load_initial_data,
        outputs=[google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats]
    )

# Launch the application
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )
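
# Note: listening on 0.0.0.0:7860 is the standard configuration for a Hugging Face Space.
# To run locally (assuming this Space's requirements.txt is installed), start the app with
# `python app.py` and open http://localhost:7860 in a browser.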