# app.py
import subprocess
import sys
import os
from pathlib import Path
import traceback
from datetime import datetime
from typing import Optional, Dict, Tuple, List


def setup_salt():
    """Clone and setup SALT library like in Colab."""
    try:
        import salt.dataset
        print("SALT library already available")
        return True
    except ImportError:
        pass

    print("Setting up SALT library...")
    try:
        salt_dir = Path("salt")
        if not salt_dir.exists():
            print("Cloning SALT repository...")
            subprocess.check_call([
                "git", "clone", "https://github.com/sunbirdai/salt.git"
            ])
        else:
            print("SALT repository already exists")

        salt_requirements = salt_dir / "requirements.txt"
        if salt_requirements.exists():
            print("Installing SALT requirements...")
            subprocess.check_call([
                sys.executable, "-m", "pip", "install", "-q",
                "-r", str(salt_requirements)
            ])

        salt_path = str(salt_dir.absolute())
        if salt_path not in sys.path:
            sys.path.insert(0, salt_path)
            print(f"Added {salt_path} to Python path")

        import salt.dataset
        print("SALT library setup completed successfully")
        return True
    except Exception as e:
        print(f"Failed to setup SALT: {e}")
        return False


# Setup SALT on startup
print("Starting SALT Translation Leaderboard...")
if not setup_salt():
    print("Cannot continue without SALT library")
    sys.exit(1)

import gradio as gr
import pandas as pd
import json

# Import our modules
from src.test_set import (
    get_public_test_set,
    get_complete_test_set,
    create_test_set_download
)
from src.validation import validate_submission
from src.evaluation import evaluate_predictions, generate_evaluation_report
from src.leaderboard import (
    load_leaderboard,
    add_model_to_leaderboard,
    get_track_leaderboard,
    prepare_leaderboard_display
)
from src.plotting import (
    create_leaderboard_plot,
    create_language_pair_heatmap,
    create_performance_comparison_plot,
    create_language_pair_comparison_plot
)
from src.utils import sanitize_model_name, get_all_language_pairs
from config import *

# Global variables for caching
current_leaderboard = None
public_test_set = None
complete_test_set = None


def initialize_data():
    """Initialize test sets and leaderboard data."""
    global public_test_set, complete_test_set, current_leaderboard
    try:
        print("Loading test sets...")
        public_test_set = get_public_test_set()
        complete_test_set = get_complete_test_set()

        print("Loading leaderboard...")
        current_leaderboard = load_leaderboard()

        # Debug leaderboard content
        print(f"Leaderboard loaded with {len(current_leaderboard)} entries")
        if not current_leaderboard.empty:
            print(f"Leaderboard columns: {list(current_leaderboard.columns)}")
            print(f"Sample row types: {current_leaderboard.dtypes.to_dict()}")
        else:
            print("Leaderboard is empty - will show empty interface")

        print("Initialization complete!")
        print(f"  - Test set: {len(public_test_set):,} samples")
        print(f"  - Current models: {len(current_leaderboard)}")
        return True
    except Exception as e:
        print(f"Initialization failed: {e}")
        import traceback
        traceback.print_exc()
        return False


def download_test_set() -> Tuple[str, str]:
    """Create downloadable test set and return file path and info."""
    try:
        global public_test_set
        if public_test_set is None:
            public_test_set = get_public_test_set()

        download_path, stats = create_test_set_download()

        info_msg = f"""
## SALT Test Set Downloaded Successfully!

### Dataset Statistics:
- **Total Samples**: {stats['total_samples']:,}
- **Languages**: {len(stats.get('languages', []))} ({', '.join(stats.get('languages', []))})
- **Google Comparable**: {stats.get('google_comparable_samples', 0):,} samples
- **Language Pairs**: {stats.get('language_pairs', 0)}

### Track Breakdown:
"""

        track_breakdown = stats.get('track_breakdown', {})
        for track_name, track_info in track_breakdown.items():
            info_msg += f"""
**{EVALUATION_TRACKS[track_name]['name']}**:
- Samples: {track_info.get('total_samples', 0):,}
- Language Pairs: {track_info.get('language_pairs', 0)}
"""

        info_msg += """
### File Format:
- `sample_id`: Unique identifier for each sample
- `source_text`: Text to be translated
- `source_language`: Source language code
- `target_language`: Target language code
- `domain`: Content domain (if available)
- `google_comparable`: Whether this pair can be compared with Google Translate

### Next Steps:
1. **Run your model** on the source texts to generate translations
2. **Create a predictions file** with columns: `sample_id`, `prediction`
3. **Submit** your predictions using the submission tab
"""

        return download_path, info_msg

    except Exception as e:
        error_msg = f"Error creating test set download: {str(e)}"
        return None, error_msg
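
# Illustrative sketch (not executed by this app) of how a submitter might build the
# predictions file described in the download instructions above. `my_model.translate`
# is a hypothetical stand-in for whatever translation system is being evaluated, and
# the file names are assumptions:
#
#   import pandas as pd
#   test_df = pd.read_csv("salt_test_set.csv")  # file from the download tab (name assumed)
#   predictions = [
#       my_model.translate(text, src, tgt)  # hypothetical model call
#       for text, src, tgt in zip(
#           test_df["source_text"], test_df["source_language"], test_df["target_language"]
#       )
#   ]
#   pd.DataFrame(
#       {"sample_id": test_df["sample_id"], "prediction": predictions}
#   ).to_csv("predictions.csv", index=False)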


def validate_submission_file(file, model_name: str, author: str, description: str) -> Tuple[str, Optional[pd.DataFrame], str]:
    """Validate uploaded prediction file."""
    try:
        if file is None:
            return "Please upload a predictions file", None, "community"
        if not model_name.strip():
            return "Please provide a model name", None, "community"

        # Handle different file input types
        if isinstance(file, bytes):
            file_content = file
        elif isinstance(file, str):
            if os.path.exists(file):
                with open(file, "rb") as f:
                    file_content = f.read()
            else:
                file_content = file.encode("utf-8")
        elif hasattr(file, "name") and os.path.exists(file.name):
            with open(file.name, "rb") as f:
                file_content = f.read()
        else:
            return "Could not read uploaded file", None, "community"

        filename = getattr(file, "name", None) or getattr(file, "filename", None) or "predictions.csv"

        global complete_test_set
        if complete_test_set is None:
            complete_test_set = get_complete_test_set()

        validation_result = validate_submission(
            file_content, filename, complete_test_set, model_name, author, description
        )

        detected_category = validation_result.get("category", "community")

        if validation_result.get("can_evaluate", False):
            return validation_result["report"], validation_result["predictions"], detected_category
        else:
            return validation_result["report"], None, detected_category

    except Exception as e:
        return f"Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}", None, "community"
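
# Note (inferred from the usage above; see src/validation for the authoritative contract):
# validate_submission(...) is expected to return a dict containing at least
#   "report" (str), "predictions" (pd.DataFrame or None),
#   "category" (str, defaulting to "community"), and "can_evaluate" (bool).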


def evaluate_submission(
    predictions_df: pd.DataFrame,
    model_name: str,
    author: str,
    description: str,
    detected_category: str,
) -> Tuple[str, pd.DataFrame, object, object]:
    """Evaluate validated predictions."""
    try:
        if predictions_df is None:
            return "No valid predictions to evaluate", None, None, None

        global complete_test_set, current_leaderboard
        if complete_test_set is None:
            complete_test_set = get_complete_test_set()

        print(f"Starting evaluation for {model_name}...")
        evaluation_results = evaluate_predictions(predictions_df, complete_test_set, detected_category)

        if evaluation_results.get('error'):
            return f"Evaluation error: {evaluation_results['error']}", None, None, None

        print("Adding to leaderboard...")
        updated_leaderboard = add_model_to_leaderboard(
            model_name=sanitize_model_name(model_name),
            author=author or "Anonymous",
            evaluation_results=evaluation_results,
            model_category=detected_category,
            description=description or ""
        )
        current_leaderboard = updated_leaderboard

        report = generate_evaluation_report(evaluation_results, model_name)

        # Create visualizations
        summary_plot = create_performance_comparison_plot(updated_leaderboard, "google_comparable")

        google_leaderboard = get_track_leaderboard(updated_leaderboard, "google_comparable")
        display_leaderboard = prepare_leaderboard_display(google_leaderboard, "google_comparable")

        success_msg = f"""
## Evaluation Complete!

### Model Information:
- **Model**: {model_name}
- **Category**: {MODEL_CATEGORIES.get(detected_category, {}).get('name', detected_category)}
- **Author**: {author or 'Anonymous'}

{report}
"""

        return success_msg, display_leaderboard, summary_plot, None

    except Exception as e:
        error_msg = f"Evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        return error_msg, None, None, None


def refresh_track_leaderboard(track: str, search_query: str = "", category_filter: str = "all") -> Tuple[pd.DataFrame, object, object, str]:
    """Refresh leaderboard for a specific track with filters."""
    try:
        print(f"Refreshing {track} leaderboard...")

        global current_leaderboard
        if current_leaderboard is None:
            print("Loading leaderboard...")
            current_leaderboard = load_leaderboard()

        print(f"Leaderboard loaded with {len(current_leaderboard)} entries")

        # Get track leaderboard with robust error handling
        try:
            print(f"Getting track leaderboard for {track}...")
            track_leaderboard = get_track_leaderboard(current_leaderboard, track, category_filter=category_filter)
            print(f"Track leaderboard has {len(track_leaderboard)} entries")
        except Exception as e:
            print(f"Error getting track leaderboard: {e}")
            track_leaderboard = pd.DataFrame()

        # Apply search filter
        if search_query and not track_leaderboard.empty:
            try:
                print(f"Applying search filter: {search_query}")
                query_lower = search_query.lower()
                mask = (
                    track_leaderboard['model_name'].str.lower().str.contains(query_lower, na=False) |
                    track_leaderboard['author'].str.lower().str.contains(query_lower, na=False)
                )
                track_leaderboard = track_leaderboard[mask]
                print(f"After search filter: {len(track_leaderboard)} entries")
            except Exception as e:
                print(f"Error applying search filter: {e}")

        # Prepare display with error handling
        try:
            print("Preparing display...")
            display_df = prepare_leaderboard_display(track_leaderboard, track)
            print(f"Display prepared with {len(display_df)} rows")
        except Exception as e:
            print(f"Error preparing display: {e}")
            display_df = pd.DataFrame()

        # Create plots with error handling
        try:
            print("Creating ranking plot...")
            ranking_plot = create_leaderboard_plot(track_leaderboard, track)
        except Exception as e:
            print(f"Error creating ranking plot: {e}")
            ranking_plot = None

        try:
            print("Creating comparison plot...")
            comparison_plot = create_performance_comparison_plot(track_leaderboard, track)
        except Exception as e:
            print(f"Error creating comparison plot: {e}")
            comparison_plot = None

        # Generate stats text with safe formatting
        try:
            print("Generating stats...")
            track_config = EVALUATION_TRACKS[track]

            best_model = "None"
            best_score = 0.0
            if not track_leaderboard.empty:
                best_model = str(track_leaderboard.iloc[0]['model_name'])
                quality_col = f'{track}_quality'
                if quality_col in track_leaderboard.columns:
                    try:
                        score_val = track_leaderboard.iloc[0][quality_col]
                        best_score = float(score_val) if pd.notnull(score_val) else 0.0
                    except (ValueError, TypeError):
                        best_score = 0.0

            stats_text = f"""
### {track_config['name']} Statistics
- **Total Models**: {len(track_leaderboard)}
- **Best Model**: {best_model}
- **Best Score**: {best_score:.4f}

### Track Information:
{track_config.get('description', 'No description available')}
"""
            print("Stats generated successfully")
        except Exception as e:
            print(f"Error generating stats: {e}")
            stats_text = f"Error loading {track} statistics: {str(e)}"

        print("Track refresh completed successfully")
        return display_df, ranking_plot, comparison_plot, stats_text

    except Exception as e:
        error_msg = f"Error loading {track} leaderboard: {str(e)}"
        print(f"MAIN ERROR: {error_msg}")
        import traceback
        traceback.print_exc()
        return pd.DataFrame(), None, None, error_msg
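
# Assumed shape of the per-track "detailed_<track>" JSON column parsed below
# (illustrative only; the exact schema is defined by src/evaluation and values are made up):
#
#   {
#     "pair_metrics": {
#       "eng_to_lug": {
#         "quality_score": {"mean": 0.61},
#         "bleu": {"mean": 23.4},
#         "chrf": {"mean": 48.2},
#         "sample_count": 500
#       },
#       ...
#     }
#   }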


def get_language_pair_comparison(track: str) -> Tuple[pd.DataFrame, object]:
    """Get language pair comparison data and visualization."""
    try:
        global current_leaderboard
        if current_leaderboard is None:
            return pd.DataFrame(), None

        track_leaderboard = get_track_leaderboard(current_leaderboard, track)
        if track_leaderboard.empty:
            return pd.DataFrame(), None

        # Create language pair comparison table
        pairs_data = []
        track_languages = EVALUATION_TRACKS[track]["languages"]

        for src in track_languages:
            for tgt in track_languages:
                if src == tgt:
                    continue

                pair_key = f"{src}_to_{tgt}"
                pair_display = f"{LANGUAGE_NAMES.get(src, src)} → {LANGUAGE_NAMES.get(tgt, tgt)}"

                for _, model in track_leaderboard.iterrows():
                    # Extract detailed results if available
                    detailed_col = f'detailed_{track}'
                    if detailed_col in model and pd.notna(model[detailed_col]):
                        try:
                            detailed_results = json.loads(model[detailed_col])
                            pair_metrics = detailed_results.get('pair_metrics', {})
                            if pair_key in pair_metrics:
                                metrics = pair_metrics[pair_key]
                                pairs_data.append({
                                    'Language Pair': pair_display,
                                    'Model': model['model_name'],
                                    'Category': model['model_category'],
                                    'Quality Score': metrics.get('quality_score', {}).get('mean', 0),
                                    'BLEU': metrics.get('bleu', {}).get('mean', 0),
                                    'ChrF': metrics.get('chrf', {}).get('mean', 0),
                                    'Samples': metrics.get('sample_count', 0)
                                })
                        except (json.JSONDecodeError, KeyError):
                            continue

        pairs_df = pd.DataFrame(pairs_data)
        if pairs_df.empty:
            return pd.DataFrame(), None

        # Create visualization
        comparison_plot = create_language_pair_comparison_plot(pairs_df, track)

        return pairs_df, comparison_plot

    except Exception as e:
        print(f"Error in language pair comparison: {e}")
        return pd.DataFrame(), None


# Initialize data on startup
initialization_success = initialize_data()

# Create Gradio interface
with gr.Blocks(
    title="SALT Translation Leaderboard",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1600px !important;
        margin: 0 auto;
    }

    /* Force readable text in all themes */
    .markdown, .gr-markdown, .gr-html {
        color: var(--body-text-color) !important;
        background: var(--background-fill-primary) !important;
    }
    .markdown h1, .markdown h2, .markdown h3,
    .gr-markdown h1, .gr-markdown h2, .gr-markdown h3 {
        color: var(--body-text-color) !important;
    }
    .markdown p, .markdown li, .markdown strong,
    .gr-markdown p, .gr-markdown li, .gr-markdown strong {
        color: var(--body-text-color) !important;
    }

    /* Table styling */
    .dataframe, .gr-dataframe {
        color: var(--body-text-color) !important;
        background: var(--background-fill-primary) !important;
    }

    /* Button and input styling */
    .gr-button, .gr-textbox, .gr-dropdown {
        color: var(--body-text-color) !important;
    }

    /* Ensure plot backgrounds work in both themes */
    .plot-container {
        background: var(--background-fill-primary) !important;
    }
    """
) as demo:

    # Header
    gr.HTML("""
        Rigorous Evaluation of Translation Models on Ugandan Languages
        Three-tier evaluation • Statistical confidence intervals • Research-grade analysis