# app.py
import subprocess
import sys
import os
from pathlib import Path
import traceback
from datetime import datetime
from typing import Optional, Dict, Tuple, List


def setup_salt():
    """Clone and setup SALT library like in Colab."""
    try:
        import salt.dataset
        print("āœ… SALT library already available")
        return True
    except ImportError:
        pass

    print("šŸ“„ Setting up SALT library...")
    try:
        salt_dir = Path("salt")
        if not salt_dir.exists():
            print("šŸ”„ Cloning SALT repository...")
            subprocess.check_call([
                "git", "clone", "https://github.com/sunbirdai/salt.git"
            ])
        else:
            print("šŸ“ SALT repository already exists")

        salt_requirements = salt_dir / "requirements.txt"
        if salt_requirements.exists():
            print("šŸ“¦ Installing SALT requirements...")
            subprocess.check_call([
                sys.executable, "-m", "pip", "install", "-q",
                "-r", str(salt_requirements)
            ])

        salt_path = str(salt_dir.absolute())
        if salt_path not in sys.path:
            sys.path.insert(0, salt_path)
            print(f"šŸ”— Added {salt_path} to Python path")

        import salt.dataset
        print("āœ… SALT library setup completed successfully")
        return True
    except Exception as e:
        print(f"āŒ Failed to setup SALT: {e}")
        return False


# Setup SALT on startup
print("šŸš€ Starting SALT Translation Leaderboard...")
if not setup_salt():
    print("āŒ Cannot continue without SALT library")
    sys.exit(1)

import gradio as gr
import pandas as pd
import json

# Import our modules
from src.test_set import (
    get_public_test_set,
    get_complete_test_set,
    create_test_set_download
)
from src.validation import validate_submission
from src.evaluation import evaluate_predictions, generate_evaluation_report
from src.leaderboard import (
    load_leaderboard,
    add_model_to_leaderboard,
    get_track_leaderboard,
    prepare_leaderboard_display
)
from src.plotting import (
    create_leaderboard_plot,
    create_language_pair_heatmap,
    create_performance_comparison_plot,
    create_language_pair_comparison_plot
)
from src.utils import sanitize_model_name, get_all_language_pairs
from config import *

# Global variables for caching
current_leaderboard = None
public_test_set = None
complete_test_set = None


def initialize_data():
    """Initialize test sets and leaderboard data."""
    global public_test_set, complete_test_set, current_leaderboard

    try:
        print("šŸ“„ Loading test sets...")
        public_test_set = get_public_test_set()
        complete_test_set = get_complete_test_set()

        print("šŸ† Loading leaderboard...")
        current_leaderboard = load_leaderboard()

        # Debug leaderboard content
        print(f"Leaderboard loaded with {len(current_leaderboard)} entries")
        if not current_leaderboard.empty:
            print(f"Leaderboard columns: {list(current_leaderboard.columns)}")
            print(f"Sample row types: {current_leaderboard.dtypes.to_dict()}")
        else:
            print("Leaderboard is empty - will show empty interface")

        print(f"āœ… Initialization complete!")
        print(f"   - Test set: {len(public_test_set):,} samples")
        print(f"   - Current models: {len(current_leaderboard)}")

        return True
    except Exception as e:
        print(f"āŒ Initialization failed: {e}")
        import traceback
        traceback.print_exc()
        return False


def download_test_set() -> Tuple[str, str]:
    """Create downloadable test set and return file path and info."""
    try:
        global public_test_set
        if public_test_set is None:
            public_test_set = get_public_test_set()

        download_path, stats = create_test_set_download()

        info_msg = f"""
## šŸ“„ SALT Test Set Downloaded Successfully!
### šŸ“Š Dataset Statistics:
- **Total Samples**: {stats['total_samples']:,}
- **Languages**: {len(stats.get('languages', []))} ({', '.join(stats.get('languages', []))})
- **Google Comparable**: {stats.get('google_comparable_samples', 0):,} samples
- **Language Pairs**: {stats.get('language_pairs', 0)}

### šŸ Track Breakdown:
"""

        track_breakdown = stats.get('track_breakdown', {})
        for track_name, track_info in track_breakdown.items():
            info_msg += f"""
**{EVALUATION_TRACKS[track_name]['name']}**:
- Samples: {track_info.get('total_samples', 0):,}
- Language Pairs: {track_info.get('language_pairs', 0)}
"""

        info_msg += f"""
### šŸ“‹ File Format:
- `sample_id`: Unique identifier for each sample
- `source_text`: Text to be translated
- `source_language`: Source language code
- `target_language`: Target language code
- `domain`: Content domain (if available)
- `google_comparable`: Whether this pair can be compared with Google Translate

### šŸ”¬ Next Steps:
1. **Run your model** on the source texts to generate translations
2. **Create a predictions file** with columns: `sample_id`, `prediction`
3. **Submit** your predictions using the submission tab
"""

        return download_path, info_msg

    except Exception as e:
        error_msg = f"āŒ Error creating test set download: {str(e)}"
        return None, error_msg


def validate_submission_file(file, model_name: str, author: str, description: str) -> Tuple[str, Optional[pd.DataFrame], str]:
    """Validate uploaded prediction file."""
    try:
        if file is None:
            return "āŒ Please upload a predictions file", None, "community"

        if not model_name.strip():
            return "āŒ Please provide a model name", None, "community"

        # Handle different file input types
        if isinstance(file, bytes):
            file_content = file
        elif isinstance(file, str):
            if os.path.exists(file):
                with open(file, "rb") as f:
                    file_content = f.read()
            else:
                file_content = file.encode("utf-8")
        elif hasattr(file, "name") and os.path.exists(file.name):
            with open(file.name, "rb") as f:
                file_content = f.read()
        else:
            return "āŒ Could not read uploaded file", None, "community"

        filename = getattr(file, "name", None) or getattr(file, "filename", None) or "predictions.csv"

        global complete_test_set
        if complete_test_set is None:
            complete_test_set = get_complete_test_set()

        validation_result = validate_submission(
            file_content, filename, complete_test_set, model_name, author, description
        )

        detected_category = validation_result.get("category", "community")

        if validation_result.get("can_evaluate", False):
            return validation_result["report"], validation_result["predictions"], detected_category
        else:
            return validation_result["report"], None, detected_category

    except Exception as e:
        return f"āŒ Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}", None, "community"


def evaluate_submission(
    predictions_df: pd.DataFrame,
    model_name: str,
    author: str,
    description: str,
    detected_category: str,
) -> Tuple[str, pd.DataFrame, object, object]:
    """Evaluate validated predictions."""
    try:
        if predictions_df is None:
            return "āŒ No valid predictions to evaluate", None, None, None

        global complete_test_set, current_leaderboard
        if complete_test_set is None:
            complete_test_set = get_complete_test_set()

        print(f"šŸ”¬ Starting evaluation for {model_name}...")
        evaluation_results = evaluate_predictions(predictions_df, complete_test_set, detected_category)

        if evaluation_results.get('error'):
            return f"āŒ Evaluation error: {evaluation_results['error']}", None, None, None

        print("šŸ† Adding to leaderboard...")
        updated_leaderboard = add_model_to_leaderboard(
            model_name=sanitize_model_name(model_name),
            author=author or "Anonymous",
            evaluation_results=evaluation_results,
            model_category=detected_category,
            description=description or ""
        )
        current_leaderboard = updated_leaderboard

        report = generate_evaluation_report(evaluation_results, model_name)

        # Create visualizations
        summary_plot = create_performance_comparison_plot(updated_leaderboard, "google_comparable")
        google_leaderboard = get_track_leaderboard(updated_leaderboard, "google_comparable")
        display_leaderboard = prepare_leaderboard_display(google_leaderboard, "google_comparable")

        success_msg = f"""
## šŸŽ‰ Evaluation Complete!

### šŸ“Š Model Information:
- **Model**: {model_name}
- **Category**: {MODEL_CATEGORIES.get(detected_category, {}).get('name', detected_category)}
- **Author**: {author or 'Anonymous'}

{report}
"""

        return success_msg, display_leaderboard, summary_plot, None

    except Exception as e:
        error_msg = f"āŒ Evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        return error_msg, None, None, None


def refresh_track_leaderboard(track: str, search_query: str = "", category_filter: str = "all") -> Tuple[pd.DataFrame, object, object, str]:
    """Refresh leaderboard for a specific track with filters."""
    try:
        print(f"Refreshing {track} leaderboard...")

        global current_leaderboard
        if current_leaderboard is None:
            print("Loading leaderboard...")
            current_leaderboard = load_leaderboard()

        print(f"Leaderboard loaded with {len(current_leaderboard)} entries")

        # Get track leaderboard with robust error handling
        try:
            print(f"Getting track leaderboard for {track}...")
            track_leaderboard = get_track_leaderboard(current_leaderboard, track, category_filter=category_filter)
            print(f"Track leaderboard has {len(track_leaderboard)} entries")
        except Exception as e:
            print(f"Error getting track leaderboard: {e}")
            track_leaderboard = pd.DataFrame()

        # Apply search filter
        if search_query and not track_leaderboard.empty:
            try:
                print(f"Applying search filter: {search_query}")
                query_lower = search_query.lower()
                mask = (
                    track_leaderboard['model_name'].str.lower().str.contains(query_lower, na=False) |
                    track_leaderboard['author'].str.lower().str.contains(query_lower, na=False)
                )
                track_leaderboard = track_leaderboard[mask]
                print(f"After search filter: {len(track_leaderboard)} entries")
            except Exception as e:
                print(f"Error applying search filter: {e}")

        # Prepare display with error handling
        try:
            print("Preparing display...")
            display_df = prepare_leaderboard_display(track_leaderboard, track)
            print(f"Display prepared with {len(display_df)} rows")
        except Exception as e:
            print(f"Error preparing display: {e}")
            display_df = pd.DataFrame()

        # Create plots with error handling
        try:
            print("Creating ranking plot...")
            ranking_plot = create_leaderboard_plot(track_leaderboard, track)
        except Exception as e:
            print(f"Error creating ranking plot: {e}")
            ranking_plot = None

        try:
            print("Creating comparison plot...")
            comparison_plot = create_performance_comparison_plot(track_leaderboard, track)
        except Exception as e:
            print(f"Error creating comparison plot: {e}")
            comparison_plot = None

        # Generate stats text with safe formatting
        try:
            print("Generating stats...")
            track_config = EVALUATION_TRACKS[track]

            best_model = "None"
            best_score = 0.0

            if not track_leaderboard.empty:
                best_model = str(track_leaderboard.iloc[0]['model_name'])
                quality_col = f'{track}_quality'
                if quality_col in track_leaderboard.columns:
                    try:
                        score_val = track_leaderboard.iloc[0][quality_col]
                        best_score = float(score_val) if pd.notnull(score_val) else 0.0
                    except (ValueError, TypeError):
                        best_score = 0.0

            stats_text = f"""
### šŸ“Š {track_config['name']} Statistics
- **Total Models**: {len(track_leaderboard)}
- **Best Model**: {best_model}
- **Best Score**: {best_score:.4f}

### šŸ”¬ Track Information:
{track_config.get('description', 'No description available')}
"""
            print("Stats generated successfully")
        except Exception as e:
            print(f"Error generating stats: {e}")
            stats_text = f"Error loading {track} statistics: {str(e)}"

        print("Track refresh completed successfully")
        return display_df, ranking_plot, comparison_plot, stats_text

    except Exception as e:
        error_msg = f"Error loading {track} leaderboard: {str(e)}"
        print(f"MAIN ERROR: {error_msg}")
        import traceback
        traceback.print_exc()
        return pd.DataFrame(), None, None, error_msg


def get_language_pair_comparison(track: str) -> Tuple[pd.DataFrame, object]:
    """Get language pair comparison data and visualization."""
    try:
        global current_leaderboard
        if current_leaderboard is None:
            return pd.DataFrame(), None

        track_leaderboard = get_track_leaderboard(current_leaderboard, track)

        if track_leaderboard.empty:
            return pd.DataFrame(), None

        # Create language pair comparison table
        pairs_data = []
        track_languages = EVALUATION_TRACKS[track]["languages"]

        for src in track_languages:
            for tgt in track_languages:
                if src == tgt:
                    continue

                pair_key = f"{src}_to_{tgt}"
                pair_display = f"{LANGUAGE_NAMES.get(src, src)} → {LANGUAGE_NAMES.get(tgt, tgt)}"

                for _, model in track_leaderboard.iterrows():
                    # Extract detailed results if available
                    detailed_col = f'detailed_{track}'
                    if detailed_col in model and pd.notna(model[detailed_col]):
                        try:
                            detailed_results = json.loads(model[detailed_col])
                            pair_metrics = detailed_results.get('pair_metrics', {})

                            if pair_key in pair_metrics:
                                metrics = pair_metrics[pair_key]
                                pairs_data.append({
                                    'Language Pair': pair_display,
                                    'Model': model['model_name'],
                                    'Category': model['model_category'],
                                    'Quality Score': metrics.get('quality_score', {}).get('mean', 0),
                                    'BLEU': metrics.get('bleu', {}).get('mean', 0),
                                    'ChrF': metrics.get('chrf', {}).get('mean', 0),
                                    'Samples': metrics.get('sample_count', 0)
                                })
                        except (json.JSONDecodeError, KeyError):
                            continue

        pairs_df = pd.DataFrame(pairs_data)

        if pairs_df.empty:
            return pd.DataFrame(), None

        # Create visualization
        comparison_plot = create_language_pair_comparison_plot(pairs_df, track)

        return pairs_df, comparison_plot

    except Exception as e:
        print(f"Error in language pair comparison: {e}")
        return pd.DataFrame(), None


# Initialize data on startup
initialization_success = initialize_data()

# Create Gradio interface
with gr.Blocks(
    title="šŸ† SALT Translation Leaderboard",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1600px !important;
        margin: 0 auto;
    }

    /* Force readable text in all themes */
    .markdown, .gr-markdown, .gr-html {
        color: var(--body-text-color) !important;
        background: var(--background-fill-primary) !important;
    }

    .markdown h1, .markdown h2, .markdown h3,
    .gr-markdown h1, .gr-markdown h2, .gr-markdown h3 {
        color: var(--body-text-color) !important;
    }

    .markdown p, .markdown li, .markdown strong,
    .gr-markdown p, .gr-markdown li, .gr-markdown strong {
        color: var(--body-text-color) !important;
    }

    /* Table styling */
    .dataframe, .gr-dataframe {
        color: var(--body-text-color) !important;
        background: var(--background-fill-primary) !important;
    }

    /* Button and input styling */
    .gr-button, .gr-textbox, .gr-dropdown {
        color: var(--body-text-color) !important;
    }

    /* Ensure plot backgrounds work in both themes */
    .plot-container {
        background: var(--background-fill-primary) !important;
    }
!important; } """ ) as demo: # Header gr.HTML("""

šŸ† SALT Translation Leaderboard

Rigorous Evaluation of Translation Models on Ugandan Languages

Three-tier evaluation • Statistical confidence intervals • Research-grade analysis

""") # Status indicator if initialization_success: status_msg = "āœ… System initialized successfully" else: status_msg = "āŒ System initialization failed - some features may not work" gr.Markdown(f"**System Status**: {status_msg}") with gr.Tabs(): # Tab 1: Download Test Set with gr.Tab("šŸ“„ Download Test Set", id="download"): gr.Markdown(""" ## šŸ“‹ Get the SALT Test Set Download our test set for translation model evaluation. """) download_btn = gr.Button("šŸ“„ Download Test Set", variant="primary", size="lg") with gr.Row(): with gr.Column(): download_file = gr.File(label="šŸ“‚ Test Set File", interactive=False) with gr.Column(): download_info = gr.Markdown() # Tab 2: Submit Predictions with gr.Tab("šŸš€ Submit Predictions", id="submit"): gr.Markdown(""" ## šŸŽÆ Submit Your Model's Predictions Upload predictions for evaluation across all tracks. """) with gr.Row(): with gr.Column(scale=1): gr.Markdown("### šŸ“ Model Information") model_name_input = gr.Textbox( label="šŸ¤– Model Name", placeholder="e.g., MyTranslator-v2.0", info="Unique name for your model" ) author_input = gr.Textbox( label="šŸ‘¤ Author/Organization", placeholder="Your name or organization", value="Anonymous" ) description_input = gr.Textbox( label="šŸ“„ Model Description", placeholder="Architecture, training data, special features...", lines=4 ) predictions_file = gr.File( label="šŸ“‚ Predictions File", file_types=[".csv", ".tsv", ".json"] ) validate_btn = gr.Button("āœ… Validate Submission", variant="secondary") submit_btn = gr.Button("šŸš€ Submit for Evaluation", variant="primary", interactive=False) with gr.Column(scale=1): validation_output = gr.Markdown() gr.Markdown("### šŸ† Evaluation Results") evaluation_output = gr.Markdown() with gr.Row(): with gr.Column(): submission_plot = gr.Plot(label="šŸ“ˆ Performance Analysis") with gr.Column(): results_table = gr.Dataframe(label="šŸ“Š Updated Leaderboard", interactive=False) # Tab 3: Google-Comparable Track with gr.Tab("šŸ¤– Google-Comparable Track", id="google_track"): gr.Markdown(f""" ## {EVALUATION_TRACKS['google_comparable']['name']} **{EVALUATION_TRACKS['google_comparable']['description']}** This track evaluates models on language pairs supported by Google Translate, enabling direct comparison with commercial baselines. """) with gr.Row(): with gr.Column(scale=2): google_search = gr.Textbox(label="šŸ” Search Models", placeholder="Search by model name, author...") with gr.Column(scale=1): google_category = gr.Dropdown( label="šŸ·ļø Category Filter", choices=["all"] + list(MODEL_CATEGORIES.keys()), value="all" ) with gr.Column(scale=1): google_refresh = gr.Button("šŸ”„ Refresh", variant="secondary") google_stats = gr.Markdown() with gr.Row(): with gr.Column(): google_ranking_plot = gr.Plot(label="šŸ† Rankings") with gr.Column(): google_comparison_plot = gr.Plot(label="šŸ“Š Performance Comparison") google_leaderboard = gr.Dataframe(label="šŸ“ˆ Google-Comparable Leaderboard", interactive=False) # Tab 4: UG40-Complete Track with gr.Tab("šŸŒ UG40-Complete Track", id="ug40_track"): gr.Markdown(f""" ## {EVALUATION_TRACKS['ug40_complete']['name']} **{EVALUATION_TRACKS['ug40_complete']['description']}** This track evaluates models on all UG40 language pairs, providing comprehensive assessment of Ugandan language translation capabilities. 
""") with gr.Row(): with gr.Column(scale=2): ug40_search = gr.Textbox(label="šŸ” Search Models", placeholder="Search by model name, author...") with gr.Column(scale=1): ug40_category = gr.Dropdown( label="šŸ·ļø Category Filter", choices=["all"] + list(MODEL_CATEGORIES.keys()), value="all" ) with gr.Column(scale=1): ug40_refresh = gr.Button("šŸ”„ Refresh", variant="secondary") ug40_stats = gr.Markdown() with gr.Row(): with gr.Column(): ug40_ranking_plot = gr.Plot(label="šŸ† Rankings") with gr.Column(): ug40_comparison_plot = gr.Plot(label="šŸ“Š Performance Comparison") ug40_leaderboard = gr.Dataframe(label="šŸ“ˆ UG40-Complete Leaderboard", interactive=False) # Tab 5: Language Pair Analysis with gr.Tab("šŸ“Š Language Pair Analysis", id="pairs_analysis"): gr.Markdown(""" ## šŸ“Š Language Pair Performance Analysis Compare model performance across individual language pairs with detailed breakdowns. """) with gr.Row(): with gr.Column(scale=1): pairs_track_select = gr.Dropdown( label="šŸ Select Track", choices=list(EVALUATION_TRACKS.keys()), value="google_comparable" ) with gr.Column(scale=1): pairs_refresh = gr.Button("šŸ”„ Analyze Language Pairs", variant="primary") pairs_comparison_plot = gr.Plot(label="šŸ“Š Language Pair Comparison") pairs_table = gr.Dataframe(label="šŸ“ˆ Language Pair Performance", interactive=False) # Tab 6: Documentation with gr.Tab("šŸ“š Documentation", id="docs"): gr.Markdown(f""" # šŸ“– SALT Translation Leaderboard Documentation ## šŸŽÆ Overview The SALT Translation Leaderboard provides rigorous evaluation of translation models on Ugandan languages using three different tracks for fair comparison. ## šŸ Evaluation Tracks **1. šŸ¤– Google-Comparable Track** - **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])} - **Purpose**: Fair comparison with commercial translation systems - **Language Pairs**: {len([1 for src in GOOGLE_SUPPORTED_LANGUAGES for tgt in GOOGLE_SUPPORTED_LANGUAGES if src != tgt])} **2. šŸŒ UG40-Complete Track** - **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages - **Purpose**: Comprehensive Ugandan language capability assessment - **Language Pairs**: {len([1 for src in ALL_UG40_LANGUAGES for tgt in ALL_UG40_LANGUAGES if src != tgt])} ## šŸ“Š Evaluation Metrics ### Primary Metrics - **Quality Score**: Composite metric (0-1) combining BLEU, ChrF, and error rates - **BLEU**: Bilingual Evaluation Understudy (0-100) - **ChrF**: Character-level F-score (0-1) ### Model Categories Models are automatically categorized for fair comparison: - **šŸ¢ Commercial**: Production translation systems - **šŸ”¬ Research**: Academic and research institution models - **šŸ“Š Baseline**: Simple baseline and reference models - **šŸ‘„ Community**: User-submitted models ## šŸ”„ Submission Process ### Step 1: Download Test Set 1. Click "Download Test Set" in the first tab 2. Save the test set file ### Step 2: Generate Predictions 1. Load the test set in your evaluation pipeline 2. For each row, translate `source_text` from `source_language` to `target_language` 3. Save results as CSV with columns: `sample_id`, `prediction` ### Step 3: Submit & Evaluate 1. Fill in model information 2. Upload your predictions file 3. Review validation report 4. 
## šŸ“‹ File Formats

### Test Set Format
```csv
sample_id,source_text,source_language,target_language,domain,google_comparable
salt_000001,"Hello world",eng,lug,general,true
salt_000002,"How are you?",eng,ach,conversation,true
```

### Predictions Format
```csv
sample_id,prediction
salt_000001,"Amakuru ensi"
salt_000002,"Ibino nining?"
```

## šŸ¤ Contributing

This leaderboard is designed for the research community. When using results:
1. Consider the appropriate track for your comparison
2. Report confidence intervals when available
3. Acknowledge the model category in comparisons

---

*For questions, contact the team at research@sunbird.ai*
            """)

    # Event handlers
    predictions_validated = gr.State(value=None)
    detected_category_state = gr.State(value="community")

    # Download test set
    download_btn.click(
        fn=download_test_set,
        outputs=[download_file, download_info]
    )

    # Validate predictions
    def handle_validation(file, model_name, author, description):
        report, predictions, category = validate_submission_file(file, model_name, author, description)

        can_evaluate = predictions is not None
        if can_evaluate:
            button_status = "\n\nāœ… **Ready to submit for evaluation!**"
        else:
            button_status = "\n\nāŒ **Please fix issues above before evaluation**"

        enhanced_report = report + button_status

        return (
            enhanced_report,
            predictions,
            category,
            gr.update(interactive=can_evaluate)
        )

    validate_btn.click(
        fn=handle_validation,
        inputs=[predictions_file, model_name_input, author_input, description_input],
        outputs=[validation_output, predictions_validated, detected_category_state, submit_btn]
    )

    # Submit for evaluation
    submit_btn.click(
        fn=evaluate_submission,
        inputs=[predictions_validated, model_name_input, author_input, description_input, detected_category_state],
        outputs=[evaluation_output, results_table, submission_plot, gr.Plot(visible=False)]
    )

    # Track leaderboard refresh functions
    google_refresh.click(
        fn=lambda *args: refresh_track_leaderboard("google_comparable", *args),
        inputs=[google_search, google_category],
        outputs=[google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats]
    )

    ug40_refresh.click(
        fn=lambda *args: refresh_track_leaderboard("ug40_complete", *args),
        inputs=[ug40_search, ug40_category],
        outputs=[ug40_leaderboard, ug40_ranking_plot, ug40_comparison_plot, ug40_stats]
    )

    # Language pair analysis
    pairs_refresh.click(
        fn=get_language_pair_comparison,
        inputs=[pairs_track_select],
        outputs=[pairs_table, pairs_comparison_plot]
    )

    # Load initial data and update dropdowns
    def load_initial_data():
        try:
            print("Loading initial data...")
            global current_leaderboard

            # Make sure we have a leaderboard
            if current_leaderboard is None:
                current_leaderboard = load_leaderboard()

            print(f"Current leaderboard has {len(current_leaderboard)} entries")

            # Try to load Google track data
            try:
                google_data = refresh_track_leaderboard("google_comparable", "", "all")
                print("Successfully loaded Google track data")
                return google_data
            except Exception as e:
                print(f"Error loading Google track: {e}")
                # Return empty data if there's an error
                empty_df = pd.DataFrame()
                return (empty_df, None, None, "No data available")

        except Exception as e:
            print(f"Error in load_initial_data: {e}")
            empty_df = pd.DataFrame()
            return (empty_df, None, None, "Error loading data")

    demo.load(
        fn=load_initial_data,
        outputs=[google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats]
    )

# Launch the application
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )
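
# ---------------------------------------------------------------------------
# Illustrative sketch (not used by the app): one way a submitter could turn the
# downloaded test set into a predictions file in the format expected by the
# submission tab. The file names and the `translate_fn` callable are
# assumptions for illustration only; substitute your own model's translation
# function and the actual name of the downloaded test set file.
# ---------------------------------------------------------------------------
def _example_generate_predictions(
    test_set_path: str = "salt_test_set.csv",      # assumed name of the downloaded test set
    output_path: str = "predictions.csv",
    translate_fn=lambda text, src, tgt: text,      # placeholder: identity "translation"
) -> None:
    """Write a `sample_id,prediction` CSV from the public test set."""
    test_df = pd.read_csv(test_set_path)
    # Translate each source text into the requested target language.
    test_df["prediction"] = [
        translate_fn(row.source_text, row.source_language, row.target_language)
        for row in test_df.itertuples()
    ]
    # Keep only the two columns the validator expects.
    test_df[["sample_id", "prediction"]].to_csv(output_path, index=False)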