# app.py
import subprocess
import sys
import os
from pathlib import Path
import traceback
from datetime import datetime
from typing import Optional, Dict, Tuple, List


def setup_salt():
    """Clone and set up the SALT library, as in Colab."""
    try:
        import salt.dataset
        print("✅ SALT library already available")
        return True
    except ImportError:
        pass

    print("📥 Setting up SALT library...")
    try:
        salt_dir = Path("salt")
        if not salt_dir.exists():
            print("📂 Cloning SALT repository...")
            subprocess.check_call([
                "git", "clone", "https://github.com/sunbirdai/salt.git"
            ])
        else:
            print("📂 SALT repository already exists")

        salt_requirements = salt_dir / "requirements.txt"
        if salt_requirements.exists():
            print("📦 Installing SALT requirements...")
            subprocess.check_call([
                sys.executable, "-m", "pip", "install", "-q", "-r", str(salt_requirements)
            ])

        salt_path = str(salt_dir.absolute())
        if salt_path not in sys.path:
            sys.path.insert(0, salt_path)
            print(f"🔧 Added {salt_path} to Python path")

        # Verify that the import now works
        import salt.dataset
        print("✅ SALT library setup completed successfully")
        return True
    except Exception as e:
        print(f"❌ Failed to set up SALT: {e}")
        return False


# Set up SALT on startup
print("🚀 Starting SALT Translation Leaderboard...")
if not setup_salt():
    print("❌ Cannot continue without SALT library")
    sys.exit(1)
import gradio as gr
import pandas as pd
import json

# Import our modules
from src.test_set import (
    get_public_test_set,
    get_complete_test_set,
    create_test_set_download
)
from src.validation import validate_submission
from src.evaluation import evaluate_predictions, generate_evaluation_report
from src.leaderboard import (
    load_leaderboard,
    add_model_to_leaderboard,
    get_track_leaderboard,
    prepare_leaderboard_display
)
from src.plotting import (
    create_leaderboard_plot,
    create_language_pair_heatmap,
    create_performance_comparison_plot,
    create_language_pair_comparison_plot
)
from src.utils import sanitize_model_name, get_all_language_pairs
from config import *
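# The star import above is expected to provide, based on how they are used below:
# EVALUATION_TRACKS, MODEL_CATEGORIES, LANGUAGE_NAMES, GOOGLE_SUPPORTED_LANGUAGES
# and ALL_UG40_LANGUAGES, along with any other leaderboard settings in the config module.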
# Global variables for caching
current_leaderboard = None
public_test_set = None
complete_test_set = None


def initialize_data():
    """Initialize test sets and leaderboard data."""
    global public_test_set, complete_test_set, current_leaderboard
    try:
        print("📥 Loading test sets...")
        public_test_set = get_public_test_set()
        complete_test_set = get_complete_test_set()

        print("📊 Loading leaderboard...")
        current_leaderboard = load_leaderboard()

        # Debug leaderboard content
        print(f"Leaderboard loaded with {len(current_leaderboard)} entries")
        if not current_leaderboard.empty:
            print(f"Leaderboard columns: {list(current_leaderboard.columns)}")
            print(f"Sample row types: {current_leaderboard.dtypes.to_dict()}")
        else:
            print("Leaderboard is empty - will show empty interface")

        print("✅ Initialization complete!")
        print(f"  - Test set: {len(public_test_set):,} samples")
        print(f"  - Current models: {len(current_leaderboard)}")
        return True
    except Exception as e:
        print(f"❌ Initialization failed: {e}")
        traceback.print_exc()
        return False
def download_test_set() -> Tuple[str, str]:
    """Create downloadable test set and return file path and info."""
    try:
        global public_test_set
        if public_test_set is None:
            public_test_set = get_public_test_set()

        download_path, stats = create_test_set_download()

        info_msg = f"""
## 📥 SALT Test Set Downloaded Successfully!

### 📊 Dataset Statistics:
- **Total Samples**: {stats['total_samples']:,}
- **Languages**: {len(stats.get('languages', []))} ({', '.join(stats.get('languages', []))})
- **Google Comparable**: {stats.get('google_comparable_samples', 0):,} samples
- **Language Pairs**: {stats.get('language_pairs', 0)}

### 📈 Track Breakdown:
"""
        track_breakdown = stats.get('track_breakdown', {})
        for track_name, track_info in track_breakdown.items():
            info_msg += f"""
**{EVALUATION_TRACKS[track_name]['name']}**:
- Samples: {track_info.get('total_samples', 0):,}
- Language Pairs: {track_info.get('language_pairs', 0)}
"""

        info_msg += """
### 📄 File Format:
- `sample_id`: Unique identifier for each sample
- `source_text`: Text to be translated
- `source_language`: Source language code
- `target_language`: Target language code
- `domain`: Content domain (if available)
- `google_comparable`: Whether this pair can be compared with Google Translate

### 🚀 Next Steps:
1. **Run your model** on the source texts to generate translations
2. **Create a predictions file** with columns: `sample_id`, `prediction`
3. **Submit** your predictions using the submission tab
"""
        return download_path, info_msg
    except Exception as e:
        error_msg = f"❌ Error creating test set download: {str(e)}"
        return None, error_msg
def validate_submission_file(file, model_name: str, author: str, description: str) -> Tuple[str, Optional[pd.DataFrame], str]:
    """Validate uploaded prediction file."""
    try:
        if file is None:
            return "❌ Please upload a predictions file", None, "community"
        if not model_name.strip():
            return "❌ Please provide a model name", None, "community"

        # Handle different file input types
        if isinstance(file, bytes):
            file_content = file
        elif isinstance(file, str):
            if os.path.exists(file):
                with open(file, "rb") as f:
                    file_content = f.read()
            else:
                file_content = file.encode("utf-8")
        elif hasattr(file, "name") and os.path.exists(file.name):
            with open(file.name, "rb") as f:
                file_content = f.read()
        else:
            return "❌ Could not read uploaded file", None, "community"

        filename = getattr(file, "name", None) or getattr(file, "filename", None) or "predictions.csv"

        global complete_test_set
        if complete_test_set is None:
            complete_test_set = get_complete_test_set()

        validation_result = validate_submission(
            file_content, filename, complete_test_set, model_name, author, description
        )

        detected_category = validation_result.get("category", "community")
        if validation_result.get("can_evaluate", False):
            return validation_result["report"], validation_result["predictions"], detected_category
        else:
            return validation_result["report"], None, detected_category
    except Exception as e:
        return f"❌ Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}", None, "community"
def evaluate_submission(
    predictions_df: pd.DataFrame,
    model_name: str,
    author: str,
    description: str,
    detected_category: str,
) -> Tuple[str, pd.DataFrame, object, object]:
    """Evaluate validated predictions."""
    try:
        if predictions_df is None:
            return "❌ No valid predictions to evaluate", None, None, None

        global complete_test_set, current_leaderboard
        if complete_test_set is None:
            complete_test_set = get_complete_test_set()

        print(f"🔬 Starting evaluation for {model_name}...")
        evaluation_results = evaluate_predictions(predictions_df, complete_test_set, detected_category)

        if evaluation_results.get('error'):
            return f"❌ Evaluation error: {evaluation_results['error']}", None, None, None

        print("📊 Adding to leaderboard...")
        updated_leaderboard = add_model_to_leaderboard(
            model_name=sanitize_model_name(model_name),
            author=author or "Anonymous",
            evaluation_results=evaluation_results,
            model_category=detected_category,
            description=description or ""
        )
        current_leaderboard = updated_leaderboard

        report = generate_evaluation_report(evaluation_results, model_name)

        # Create visualizations
        summary_plot = create_performance_comparison_plot(updated_leaderboard, "google_comparable")
        google_leaderboard = get_track_leaderboard(updated_leaderboard, "google_comparable")
        display_leaderboard = prepare_leaderboard_display(google_leaderboard, "google_comparable")

        success_msg = f"""
## 🎉 Evaluation Complete!

### 📋 Model Information:
- **Model**: {model_name}
- **Category**: {MODEL_CATEGORIES.get(detected_category, {}).get('name', detected_category)}
- **Author**: {author or 'Anonymous'}

{report}
"""
        return success_msg, display_leaderboard, summary_plot, None
    except Exception as e:
        error_msg = f"❌ Evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        return error_msg, None, None, None
def refresh_track_leaderboard(track: str, search_query: str = "", category_filter: str = "all") -> Tuple[pd.DataFrame, object, object, str]:
    """Refresh leaderboard for a specific track with filters."""
    try:
        print(f"Refreshing {track} leaderboard...")

        global current_leaderboard
        if current_leaderboard is None:
            print("Loading leaderboard...")
            current_leaderboard = load_leaderboard()
        print(f"Leaderboard loaded with {len(current_leaderboard)} entries")

        # Get track leaderboard with robust error handling
        try:
            print(f"Getting track leaderboard for {track}...")
            track_leaderboard = get_track_leaderboard(current_leaderboard, track, category_filter=category_filter)
            print(f"Track leaderboard has {len(track_leaderboard)} entries")
        except Exception as e:
            print(f"Error getting track leaderboard: {e}")
            track_leaderboard = pd.DataFrame()

        # Apply search filter
        if search_query and not track_leaderboard.empty:
            try:
                print(f"Applying search filter: {search_query}")
                query_lower = search_query.lower()
                mask = (
                    track_leaderboard['model_name'].str.lower().str.contains(query_lower, na=False) |
                    track_leaderboard['author'].str.lower().str.contains(query_lower, na=False)
                )
                track_leaderboard = track_leaderboard[mask]
                print(f"After search filter: {len(track_leaderboard)} entries")
            except Exception as e:
                print(f"Error applying search filter: {e}")

        # Prepare display with error handling
        try:
            print("Preparing display...")
            display_df = prepare_leaderboard_display(track_leaderboard, track)
            print(f"Display prepared with {len(display_df)} rows")
        except Exception as e:
            print(f"Error preparing display: {e}")
            display_df = pd.DataFrame()

        # Create plots with error handling
        try:
            print("Creating ranking plot...")
            ranking_plot = create_leaderboard_plot(track_leaderboard, track)
        except Exception as e:
            print(f"Error creating ranking plot: {e}")
            ranking_plot = None

        try:
            print("Creating comparison plot...")
            comparison_plot = create_performance_comparison_plot(track_leaderboard, track)
        except Exception as e:
            print(f"Error creating comparison plot: {e}")
            comparison_plot = None

        # Generate stats text with safe formatting
        try:
            print("Generating stats...")
            track_config = EVALUATION_TRACKS[track]

            best_model = "None"
            best_score = 0.0
            if not track_leaderboard.empty:
                best_model = str(track_leaderboard.iloc[0]['model_name'])
                quality_col = f'{track}_quality'
                if quality_col in track_leaderboard.columns:
                    try:
                        score_val = track_leaderboard.iloc[0][quality_col]
                        best_score = float(score_val) if pd.notnull(score_val) else 0.0
                    except (ValueError, TypeError):
                        best_score = 0.0

            stats_text = f"""
### 📊 {track_config['name']} Statistics
- **Total Models**: {len(track_leaderboard)}
- **Best Model**: {best_model}
- **Best Score**: {best_score:.4f}

### 🔬 Track Information:
{track_config.get('description', 'No description available')}
"""
            print("Stats generated successfully")
        except Exception as e:
            print(f"Error generating stats: {e}")
            stats_text = f"Error loading {track} statistics: {str(e)}"

        print("Track refresh completed successfully")
        return display_df, ranking_plot, comparison_plot, stats_text

    except Exception as e:
        error_msg = f"Error loading {track} leaderboard: {str(e)}"
        print(f"MAIN ERROR: {error_msg}")
        traceback.print_exc()
        return pd.DataFrame(), None, None, error_msg
def get_language_pair_comparison(track: str) -> Tuple[pd.DataFrame, object]:
    """Get language pair comparison data and visualization."""
    try:
        global current_leaderboard
        if current_leaderboard is None:
            return pd.DataFrame(), None

        track_leaderboard = get_track_leaderboard(current_leaderboard, track)
        if track_leaderboard.empty:
            return pd.DataFrame(), None

        # Create language pair comparison table
        pairs_data = []
        track_languages = EVALUATION_TRACKS[track]["languages"]

        for src in track_languages:
            for tgt in track_languages:
                if src == tgt:
                    continue

                pair_key = f"{src}_to_{tgt}"
                pair_display = f"{LANGUAGE_NAMES.get(src, src)} → {LANGUAGE_NAMES.get(tgt, tgt)}"

                for _, model in track_leaderboard.iterrows():
                    # Extract detailed results if available
                    detailed_col = f'detailed_{track}'
                    if detailed_col in model and pd.notna(model[detailed_col]):
                        try:
                            detailed_results = json.loads(model[detailed_col])
                            pair_metrics = detailed_results.get('pair_metrics', {})
                            if pair_key in pair_metrics:
                                metrics = pair_metrics[pair_key]
                                pairs_data.append({
                                    'Language Pair': pair_display,
                                    'Model': model['model_name'],
                                    'Category': model['model_category'],
                                    'Quality Score': metrics.get('quality_score', {}).get('mean', 0),
                                    'BLEU': metrics.get('bleu', {}).get('mean', 0),
                                    'ChrF': metrics.get('chrf', {}).get('mean', 0),
                                    'Samples': metrics.get('sample_count', 0)
                                })
                        except (json.JSONDecodeError, KeyError):
                            continue

        pairs_df = pd.DataFrame(pairs_data)
        if pairs_df.empty:
            return pd.DataFrame(), None

        # Create visualization
        comparison_plot = create_language_pair_comparison_plot(pairs_df, track)
        return pairs_df, comparison_plot
    except Exception as e:
        print(f"Error in language pair comparison: {e}")
        return pd.DataFrame(), None
# Initialize data on startup
initialization_success = initialize_data()

# Create Gradio interface
with gr.Blocks(
    title="🏆 SALT Translation Leaderboard",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1600px !important;
        margin: 0 auto;
    }
    /* Force readable text in all themes */
    .markdown, .gr-markdown, .gr-html {
        color: var(--body-text-color) !important;
        background: var(--background-fill-primary) !important;
    }
    .markdown h1, .markdown h2, .markdown h3,
    .gr-markdown h1, .gr-markdown h2, .gr-markdown h3 {
        color: var(--body-text-color) !important;
    }
    .markdown p, .markdown li, .markdown strong,
    .gr-markdown p, .gr-markdown li, .gr-markdown strong {
        color: var(--body-text-color) !important;
    }
    /* Table styling */
    .dataframe, .gr-dataframe {
        color: var(--body-text-color) !important;
        background: var(--background-fill-primary) !important;
    }
    /* Button and input styling */
    .gr-button, .gr-textbox, .gr-dropdown {
        color: var(--body-text-color) !important;
    }
    /* Ensure plot backgrounds work in both themes */
    .plot-container {
        background: var(--background-fill-primary) !important;
    }
    """
) as demo:

    # Header
    gr.HTML("""
    <div style="text-align: center; margin-bottom: 2rem; padding: 2rem; background: linear-gradient(135deg, #1e3a8a 0%, #3730a3 50%, #1e40af 100%); color: white !important; border-radius: 10px;">
        <h1 style="color: white !important;">🏆 SALT Translation Leaderboard</h1>
        <p style="color: white !important;"><strong>Rigorous Evaluation of Translation Models on Ugandan Languages</strong></p>
        <p style="color: white !important;">Three-tier evaluation • Statistical confidence intervals • Research-grade analysis</p>
    </div>
    """)

    # Status indicator
    if initialization_success:
        status_msg = "✅ System initialized successfully"
    else:
        status_msg = "❌ System initialization failed - some features may not work"
    gr.Markdown(f"**System Status**: {status_msg}")
    with gr.Tabs():
        # Tab 1: Download Test Set
        with gr.Tab("📥 Download Test Set", id="download"):
            gr.Markdown("""
            ## 📥 Get the SALT Test Set
            Download our test set for translation model evaluation.
            """)

            download_btn = gr.Button("📥 Download Test Set", variant="primary", size="lg")

            with gr.Row():
                with gr.Column():
                    download_file = gr.File(label="📄 Test Set File", interactive=False)
                with gr.Column():
                    download_info = gr.Markdown()

        # Tab 2: Submit Predictions
        with gr.Tab("📤 Submit Predictions", id="submit"):
            gr.Markdown("""
            ## 🎯 Submit Your Model's Predictions
            Upload predictions for evaluation across all tracks.
            """)

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 📋 Model Information")
                    model_name_input = gr.Textbox(
                        label="🤖 Model Name",
                        placeholder="e.g., MyTranslator-v2.0",
                        info="Unique name for your model"
                    )
                    author_input = gr.Textbox(
                        label="👤 Author/Organization",
                        placeholder="Your name or organization",
                        value="Anonymous"
                    )
                    description_input = gr.Textbox(
                        label="📝 Model Description",
                        placeholder="Architecture, training data, special features...",
                        lines=4
                    )
                    predictions_file = gr.File(
                        label="📄 Predictions File",
                        file_types=[".csv", ".tsv", ".json"]
                    )
                    validate_btn = gr.Button("✅ Validate Submission", variant="secondary")
                    submit_btn = gr.Button("🚀 Submit for Evaluation", variant="primary", interactive=False)

                with gr.Column(scale=1):
                    validation_output = gr.Markdown()
                    gr.Markdown("### 📊 Evaluation Results")
                    evaluation_output = gr.Markdown()

            with gr.Row():
                with gr.Column():
                    submission_plot = gr.Plot(label="📈 Performance Analysis")
                with gr.Column():
                    results_table = gr.Dataframe(label="🏆 Updated Leaderboard", interactive=False)
        # Tab 3: Google-Comparable Track
        with gr.Tab("🤖 Google-Comparable Track", id="google_track"):
            gr.Markdown(f"""
            ## {EVALUATION_TRACKS['google_comparable']['name']}
            **{EVALUATION_TRACKS['google_comparable']['description']}**

            This track evaluates models on language pairs supported by Google Translate,
            enabling direct comparison with commercial baselines.
            """)

            with gr.Row():
                with gr.Column(scale=2):
                    google_search = gr.Textbox(label="🔍 Search Models", placeholder="Search by model name, author...")
                with gr.Column(scale=1):
                    google_category = gr.Dropdown(
                        label="🏷️ Category Filter",
                        choices=["all"] + list(MODEL_CATEGORIES.keys()),
                        value="all"
                    )
                with gr.Column(scale=1):
                    google_refresh = gr.Button("🔄 Refresh", variant="secondary")

            google_stats = gr.Markdown()

            with gr.Row():
                with gr.Column():
                    google_ranking_plot = gr.Plot(label="🏆 Rankings")
                with gr.Column():
                    google_comparison_plot = gr.Plot(label="📊 Performance Comparison")

            google_leaderboard = gr.Dataframe(label="🏆 Google-Comparable Leaderboard", interactive=False)
        # Tab 4: UG40-Complete Track
        with gr.Tab("🌍 UG40-Complete Track", id="ug40_track"):
            gr.Markdown(f"""
            ## {EVALUATION_TRACKS['ug40_complete']['name']}
            **{EVALUATION_TRACKS['ug40_complete']['description']}**

            This track evaluates models on all UG40 language pairs,
            providing a comprehensive assessment of Ugandan language translation capabilities.
            """)

            with gr.Row():
                with gr.Column(scale=2):
                    ug40_search = gr.Textbox(label="🔍 Search Models", placeholder="Search by model name, author...")
                with gr.Column(scale=1):
                    ug40_category = gr.Dropdown(
                        label="🏷️ Category Filter",
                        choices=["all"] + list(MODEL_CATEGORIES.keys()),
                        value="all"
                    )
                with gr.Column(scale=1):
                    ug40_refresh = gr.Button("🔄 Refresh", variant="secondary")

            ug40_stats = gr.Markdown()

            with gr.Row():
                with gr.Column():
                    ug40_ranking_plot = gr.Plot(label="🏆 Rankings")
                with gr.Column():
                    ug40_comparison_plot = gr.Plot(label="📊 Performance Comparison")

            ug40_leaderboard = gr.Dataframe(label="🏆 UG40-Complete Leaderboard", interactive=False)
        # Tab 5: Language Pair Analysis
        with gr.Tab("🔍 Language Pair Analysis", id="pairs_analysis"):
            gr.Markdown("""
            ## 🔍 Language Pair Performance Analysis
            Compare model performance across individual language pairs with detailed breakdowns.
            """)

            with gr.Row():
                with gr.Column(scale=1):
                    pairs_track_select = gr.Dropdown(
                        label="📊 Select Track",
                        choices=list(EVALUATION_TRACKS.keys()),
                        value="google_comparable"
                    )
                with gr.Column(scale=1):
                    pairs_refresh = gr.Button("🔍 Analyze Language Pairs", variant="primary")

            pairs_comparison_plot = gr.Plot(label="📊 Language Pair Comparison")
            pairs_table = gr.Dataframe(label="📋 Language Pair Performance", interactive=False)
        # Tab 6: Documentation
        with gr.Tab("📚 Documentation", id="docs"):
            gr.Markdown(f"""
            # 📚 SALT Translation Leaderboard Documentation

            ## 🎯 Overview
            The SALT Translation Leaderboard provides rigorous evaluation of translation models
            on Ugandan languages, using dedicated evaluation tracks for fair comparison.

            ## 🏆 Evaluation Tracks

            **1. 🤖 Google-Comparable Track**
            - **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])}
            - **Purpose**: Fair comparison with commercial translation systems
            - **Language Pairs**: {len(GOOGLE_SUPPORTED_LANGUAGES) * (len(GOOGLE_SUPPORTED_LANGUAGES) - 1)}

            **2. 🌍 UG40-Complete Track**
            - **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages
            - **Purpose**: Comprehensive Ugandan language capability assessment
            - **Language Pairs**: {len(ALL_UG40_LANGUAGES) * (len(ALL_UG40_LANGUAGES) - 1)}

            ## 📊 Evaluation Metrics

            ### Primary Metrics
            - **Quality Score**: Composite metric (0-1) combining BLEU, ChrF, and error rates
            - **BLEU**: Bilingual Evaluation Understudy (0-100)
            - **ChrF**: Character-level F-score (0-1)

            ### Model Categories
            Models are automatically categorized for fair comparison:
            - **🏢 Commercial**: Production translation systems
            - **🔬 Research**: Academic and research institution models
            - **📊 Baseline**: Simple baseline and reference models
            - **👥 Community**: User-submitted models

            ## 📤 Submission Process

            ### Step 1: Download Test Set
            1. Click "Download Test Set" in the first tab
            2. Save the test set file

            ### Step 2: Generate Predictions
            1. Load the test set in your evaluation pipeline
            2. For each row, translate `source_text` from `source_language` to `target_language`
            3. Save the results as a CSV with columns `sample_id`, `prediction` (see the sketch under File Formats below)

            ### Step 3: Submit & Evaluate
            1. Fill in model information
            2. Upload your predictions file
            3. Review the validation report
            4. Submit for evaluation

            ## 📄 File Formats

            ### Test Set Format
            ```csv
            sample_id,source_text,source_language,target_language,domain,google_comparable
            salt_000001,"Hello world",eng,lug,general,true
            salt_000002,"How are you?",eng,ach,conversation,true
            ```

            ### Predictions Format
            ```csv
            sample_id,prediction
            salt_000001,"Amakuru ensi"
            salt_000002,"Ibino nining?"
            ```
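
            A minimal sketch of producing this file with pandas (the file names and the
            `my_model.translate` call are placeholders for your own paths and model API):

            ```python
            import pandas as pd

            test = pd.read_csv("salt_test_set.csv")  # the downloaded test set
            predictions = []
            for row in test.itertuples():
                # hypothetical model interface: adapt to your own system
                predictions.append(my_model.translate(row.source_text, row.source_language, row.target_language))
            test["prediction"] = predictions
            test[["sample_id", "prediction"]].to_csv("predictions.csv", index=False)
            ```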

            ## 🤝 Contributing
            This leaderboard is designed for the research community. When using results:
            1. Consider the appropriate track for your comparison
            2. Report confidence intervals when available
            3. Acknowledge the model category in comparisons

            ---
            *For questions, contact the team at research@sunbird.ai*
            """)
    # Event handlers
    predictions_validated = gr.State(value=None)
    detected_category_state = gr.State(value="community")

    # Download test set
    download_btn.click(
        fn=download_test_set,
        outputs=[download_file, download_info]
    )

    # Validate predictions
    def handle_validation(file, model_name, author, description):
        report, predictions, category = validate_submission_file(file, model_name, author, description)
        can_evaluate = predictions is not None
        if can_evaluate:
            button_status = "\n\n✅ **Ready to submit for evaluation!**"
        else:
            button_status = "\n\n❌ **Please fix the issues above before evaluation**"
        enhanced_report = report + button_status
        return (
            enhanced_report,
            predictions,
            category,
            gr.update(interactive=can_evaluate)
        )

    validate_btn.click(
        fn=handle_validation,
        inputs=[predictions_file, model_name_input, author_input, description_input],
        outputs=[validation_output, predictions_validated, detected_category_state, submit_btn]
    )

    # Submit for evaluation
    submit_btn.click(
        fn=evaluate_submission,
        inputs=[predictions_validated, model_name_input, author_input, description_input, detected_category_state],
        # The hidden Plot serves as a sink for the unused fourth return value of evaluate_submission
        outputs=[evaluation_output, results_table, submission_plot, gr.Plot(visible=False)]
    )

    # Track leaderboard refresh functions
    google_refresh.click(
        fn=lambda *args: refresh_track_leaderboard("google_comparable", *args),
        inputs=[google_search, google_category],
        outputs=[google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats]
    )
    ug40_refresh.click(
        fn=lambda *args: refresh_track_leaderboard("ug40_complete", *args),
        inputs=[ug40_search, ug40_category],
        outputs=[ug40_leaderboard, ug40_ranking_plot, ug40_comparison_plot, ug40_stats]
    )

    # Language pair analysis
    pairs_refresh.click(
        fn=get_language_pair_comparison,
        inputs=[pairs_track_select],
        outputs=[pairs_table, pairs_comparison_plot]
    )
    # Load initial data on app start
    def load_initial_data():
        try:
            print("Loading initial data...")
            global current_leaderboard

            # Make sure we have a leaderboard
            if current_leaderboard is None:
                current_leaderboard = load_leaderboard()
            print(f"Current leaderboard has {len(current_leaderboard)} entries")

            # Try to load Google track data
            try:
                google_data = refresh_track_leaderboard("google_comparable", "", "all")
                print("Successfully loaded Google track data")
                return google_data
            except Exception as e:
                print(f"Error loading Google track: {e}")
                # Return empty data if there's an error
                empty_df = pd.DataFrame()
                return (empty_df, None, None, "No data available")
        except Exception as e:
            print(f"Error in load_initial_data: {e}")
            empty_df = pd.DataFrame()
            return (empty_df, None, None, "Error loading data")

    demo.load(
        fn=load_initial_data,
        outputs=[google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats]
    )

# Launch the application
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )
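
# Note: listening on 0.0.0.0:7860 is the standard configuration for a Hugging Face Space.
# To run locally (assuming this Space's requirements.txt is installed), start the app with
# `python app.py` and open http://localhost:7860 in a browser.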