# app.py
import subprocess
import sys
import os
from pathlib import Path
import traceback
from datetime import datetime
from typing import Optional, Dict, Tuple, List


def setup_salt():
    """Clone and setup SALT library like in Colab."""
    try:
        import salt.dataset
        print("āœ… SALT library already available")
        return True
    except ImportError:
        pass

    print("šŸ“„ Setting up SALT library...")
    try:
        salt_dir = Path("salt")
        if not salt_dir.exists():
            print("šŸ”„ Cloning SALT repository...")
            subprocess.check_call([
                "git", "clone", "https://github.com/sunbirdai/salt.git"
            ])
        else:
            print("šŸ“ SALT repository already exists")

        salt_requirements = salt_dir / "requirements.txt"
        if salt_requirements.exists():
            print("šŸ“¦ Installing SALT requirements...")
            subprocess.check_call([
                sys.executable, "-m", "pip", "install", "-q",
                "-r", str(salt_requirements)
            ])

        salt_path = str(salt_dir.absolute())
        if salt_path not in sys.path:
            sys.path.insert(0, salt_path)
            print(f"šŸ”— Added {salt_path} to Python path")

        import salt.dataset
        print("āœ… SALT library setup completed successfully")
        return True
    except Exception as e:
        print(f"āŒ Failed to setup SALT: {e}")
        return False


# Setup SALT on startup
print("šŸš€ Starting SALT Translation Leaderboard...")
if not setup_salt():
    print("āŒ Cannot continue without SALT library")
    sys.exit(1)

import gradio as gr
import pandas as pd
import json

# Import our modules
from src.test_set import (
    get_public_test_set,
    get_complete_test_set,
    create_test_set_download
)
from src.validation import validate_submission
from src.evaluation import evaluate_predictions, generate_evaluation_report
from src.leaderboard import (
    load_leaderboard,
    add_model_to_leaderboard,
    get_track_leaderboard,
    prepare_leaderboard_display
)
from src.plotting import (
    create_leaderboard_plot,
    create_language_pair_heatmap,
    create_performance_comparison_plot,
    create_language_pair_comparison_plot
)
from src.utils import sanitize_model_name, get_all_language_pairs
from config import *

# Global variables for caching
current_leaderboard = None
public_test_set = None
complete_test_set = None


def initialize_data():
    """Initialize test sets and leaderboard data."""
    global public_test_set, complete_test_set, current_leaderboard

    try:
        print("šŸ“„ Loading test sets...")
        public_test_set = get_public_test_set()
        complete_test_set = get_complete_test_set()

        print("šŸ† Loading leaderboard...")
        current_leaderboard = load_leaderboard()

        # Debug leaderboard content
        print(f"Leaderboard loaded with {len(current_leaderboard)} entries")
        if not current_leaderboard.empty:
            print(f"Leaderboard columns: {list(current_leaderboard.columns)}")
            print(f"Sample row types: {current_leaderboard.dtypes.to_dict()}")
        else:
            print("Leaderboard is empty - will show empty interface")

        print(f"āœ… Initialization complete!")
        print(f"   - Test set: {len(public_test_set):,} samples")
        print(f"   - Current models: {len(current_leaderboard)}")

        return True
    except Exception as e:
        print(f"āŒ Initialization failed: {e}")
        import traceback
        traceback.print_exc()
        return False


def download_test_set() -> Tuple[str, str]:
    """Create downloadable test set and return file path and info."""
    try:
        global public_test_set
        if public_test_set is None:
            public_test_set = get_public_test_set()

        download_path, stats = create_test_set_download()

        info_msg = f"""
## šŸ“„ SALT Test Set Downloaded Successfully!
### šŸ“Š Dataset Statistics:
- **Total Samples**: {stats['total_samples']:,}
- **Languages**: {len(stats.get('languages', []))} ({', '.join(stats.get('languages', []))})
- **Google Comparable**: {stats.get('google_comparable_samples', 0):,} samples
- **Language Pairs**: {stats.get('language_pairs', 0)}

### šŸ Track Breakdown:
"""

        track_breakdown = stats.get('track_breakdown', {})
        for track_name, track_info in track_breakdown.items():
            info_msg += f"""
**{EVALUATION_TRACKS[track_name]['name']}**:
- Samples: {track_info.get('total_samples', 0):,}
- Language Pairs: {track_info.get('language_pairs', 0)}
"""

        info_msg += f"""
### šŸ“‹ File Format:
- `sample_id`: Unique identifier for each sample
- `source_text`: Text to be translated
- `source_language`: Source language code
- `target_language`: Target language code
- `domain`: Content domain (if available)
- `google_comparable`: Whether this pair can be compared with Google Translate

### šŸ”¬ Next Steps:
1. **Run your model** on the source texts to generate translations
2. **Create a predictions file** with columns: `sample_id`, `prediction`
3. **Submit** your predictions using the submission tab
"""

        return download_path, info_msg

    except Exception as e:
        error_msg = f"āŒ Error creating test set download: {str(e)}"
        return None, error_msg


def validate_submission_file(file, model_name: str, author: str, description: str) -> Tuple[str, Optional[pd.DataFrame], str]:
    """Validate uploaded prediction file."""
    try:
        if file is None:
            return "āŒ Please upload a predictions file", None, "community"

        if not model_name.strip():
            return "āŒ Please provide a model name", None, "community"

        # Handle different file input types
        if isinstance(file, bytes):
            file_content = file
        elif isinstance(file, str):
            if os.path.exists(file):
                with open(file, "rb") as f:
                    file_content = f.read()
            else:
                file_content = file.encode("utf-8")
        elif hasattr(file, "name") and os.path.exists(file.name):
            with open(file.name, "rb") as f:
                file_content = f.read()
        else:
            return "āŒ Could not read uploaded file", None, "community"

        filename = getattr(file, "name", None) or getattr(file, "filename", None) or "predictions.csv"

        global complete_test_set
        if complete_test_set is None:
            complete_test_set = get_complete_test_set()

        validation_result = validate_submission(
            file_content, filename, complete_test_set, model_name, author, description
        )

        detected_category = validation_result.get("category", "community")

        if validation_result.get("can_evaluate", False):
            return validation_result["report"], validation_result["predictions"], detected_category
        else:
            return validation_result["report"], None, detected_category

    except Exception as e:
        return f"āŒ Validation error: {e}\n\nTraceback:\n{traceback.format_exc()}", None, "community"


def evaluate_submission(
    predictions_df: pd.DataFrame,
    model_name: str,
    author: str,
    description: str,
    detected_category: str,
) -> Tuple[str, pd.DataFrame, object, object]:
    """Evaluate validated predictions."""
    try:
        if predictions_df is None:
            return "āŒ No valid predictions to evaluate", None, None, None

        global complete_test_set, current_leaderboard
        if complete_test_set is None:
            complete_test_set = get_complete_test_set()

        print(f"šŸ”¬ Starting evaluation for {model_name}...")
        evaluation_results = evaluate_predictions(predictions_df, complete_test_set, detected_category)

        if evaluation_results.get('error'):
            return f"āŒ Evaluation error: {evaluation_results['error']}", None, None, None

        print("šŸ† Adding to leaderboard...")
        updated_leaderboard = add_model_to_leaderboard(
            model_name=sanitize_model_name(model_name),
            author=author or "Anonymous",
            evaluation_results=evaluation_results,
            model_category=detected_category,
            description=description or ""
        )
        current_leaderboard = updated_leaderboard

        report = generate_evaluation_report(evaluation_results, model_name)

        # Create visualizations
        summary_plot = create_performance_comparison_plot(updated_leaderboard, "google_comparable")
        google_leaderboard = get_track_leaderboard(updated_leaderboard, "google_comparable")
        display_leaderboard = prepare_leaderboard_display(google_leaderboard, "google_comparable")

        success_msg = f"""
## šŸŽ‰ Evaluation Complete!

### šŸ“Š Model Information:
- **Model**: {model_name}
- **Category**: {MODEL_CATEGORIES.get(detected_category, {}).get('name', detected_category)}
- **Author**: {author or 'Anonymous'}

{report}
"""

        return success_msg, display_leaderboard, summary_plot, None

    except Exception as e:
        error_msg = f"āŒ Evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        return error_msg, None, None, None


def refresh_track_leaderboard(track: str, search_query: str = "", category_filter: str = "all") -> Tuple[pd.DataFrame, object, object, str]:
    """Refresh leaderboard for a specific track with filters."""
    try:
        print(f"Refreshing {track} leaderboard...")

        global current_leaderboard
        if current_leaderboard is None:
            print("Loading leaderboard...")
            current_leaderboard = load_leaderboard()

        print(f"Leaderboard loaded with {len(current_leaderboard)} entries")

        # Get track leaderboard with robust error handling
        try:
            print(f"Getting track leaderboard for {track}...")
            track_leaderboard = get_track_leaderboard(current_leaderboard, track, category_filter=category_filter)
            print(f"Track leaderboard has {len(track_leaderboard)} entries")
        except Exception as e:
            print(f"Error getting track leaderboard: {e}")
            track_leaderboard = pd.DataFrame()

        # Apply search filter
        if search_query and not track_leaderboard.empty:
            try:
                print(f"Applying search filter: {search_query}")
                query_lower = search_query.lower()
                mask = (
                    track_leaderboard['model_name'].str.lower().str.contains(query_lower, na=False) |
                    track_leaderboard['author'].str.lower().str.contains(query_lower, na=False)
                )
                track_leaderboard = track_leaderboard[mask]
                print(f"After search filter: {len(track_leaderboard)} entries")
            except Exception as e:
                print(f"Error applying search filter: {e}")

        # Prepare display with error handling
        try:
            print("Preparing display...")
            display_df = prepare_leaderboard_display(track_leaderboard, track)
            print(f"Display prepared with {len(display_df)} rows")
        except Exception as e:
            print(f"Error preparing display: {e}")
            display_df = pd.DataFrame()

        # Create plots with error handling
        try:
            print("Creating ranking plot...")
            ranking_plot = create_leaderboard_plot(track_leaderboard, track)
        except Exception as e:
            print(f"Error creating ranking plot: {e}")
            ranking_plot = None

        try:
            print("Creating comparison plot...")
            comparison_plot = create_performance_comparison_plot(track_leaderboard, track)
        except Exception as e:
            print(f"Error creating comparison plot: {e}")
            comparison_plot = None

        # Generate stats text with safe formatting
        try:
            print("Generating stats...")
            track_config = EVALUATION_TRACKS[track]

            best_model = "None"
            best_score = 0.0

            if not track_leaderboard.empty:
                best_model = str(track_leaderboard.iloc[0]['model_name'])
                quality_col = f'{track}_quality'
                if quality_col in track_leaderboard.columns:
                    try:
                        score_val = track_leaderboard.iloc[0][quality_col]
                        best_score = float(score_val) if pd.notnull(score_val) else 0.0
                    except (ValueError, TypeError):
                        best_score = 0.0

            stats_text = f"""
### šŸ“Š {track_config['name']} Statistics
- **Total Models**: {len(track_leaderboard)}
- **Best Model**: {best_model}
- **Best Score**: {best_score:.4f}

### šŸ”¬ Track Information:
{track_config.get('description', 'No description available')}
"""
            print("Stats generated successfully")
        except Exception as e:
            print(f"Error generating stats: {e}")
            stats_text = f"Error loading {track} statistics: {str(e)}"

        print("Track refresh completed successfully")
        return display_df, ranking_plot, comparison_plot, stats_text

    except Exception as e:
        error_msg = f"Error loading {track} leaderboard: {str(e)}"
        print(f"MAIN ERROR: {error_msg}")
        import traceback
        traceback.print_exc()
        return pd.DataFrame(), None, None, error_msg


def get_language_pair_comparison(track: str) -> Tuple[pd.DataFrame, object]:
    """Get language pair comparison data and visualization."""
    try:
        global current_leaderboard
        if current_leaderboard is None:
            return pd.DataFrame(), None

        track_leaderboard = get_track_leaderboard(current_leaderboard, track)

        if track_leaderboard.empty:
            return pd.DataFrame(), None

        # Create language pair comparison table
        pairs_data = []
        track_languages = EVALUATION_TRACKS[track]["languages"]

        for src in track_languages:
            for tgt in track_languages:
                if src == tgt:
                    continue

                pair_key = f"{src}_to_{tgt}"
                pair_display = f"{LANGUAGE_NAMES.get(src, src)} → {LANGUAGE_NAMES.get(tgt, tgt)}"

                for _, model in track_leaderboard.iterrows():
                    # Extract detailed results if available
                    detailed_col = f'detailed_{track}'
                    if detailed_col in model and pd.notna(model[detailed_col]):
                        try:
                            detailed_results = json.loads(model[detailed_col])
                            pair_metrics = detailed_results.get('pair_metrics', {})

                            if pair_key in pair_metrics:
                                metrics = pair_metrics[pair_key]
                                pairs_data.append({
                                    'Language Pair': pair_display,
                                    'Model': model['model_name'],
                                    'Category': model['model_category'],
                                    'Quality Score': metrics.get('quality_score', {}).get('mean', 0),
                                    'BLEU': metrics.get('bleu', {}).get('mean', 0),
                                    'ChrF': metrics.get('chrf', {}).get('mean', 0),
                                    'Samples': metrics.get('sample_count', 0)
                                })
                        except (json.JSONDecodeError, KeyError):
                            continue

        pairs_df = pd.DataFrame(pairs_data)

        if pairs_df.empty:
            return pd.DataFrame(), None

        # Create visualization
        comparison_plot = create_language_pair_comparison_plot(pairs_df, track)

        return pairs_df, comparison_plot

    except Exception as e:
        print(f"Error in language pair comparison: {e}")
        return pd.DataFrame(), None


# Initialize data on startup
initialization_success = initialize_data()

# Create Gradio interface
with gr.Blocks(
    title="šŸ† SALT Translation Leaderboard",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container {
        max-width: 1600px !important;
        margin: 0 auto;
    }

    /* Force readable text in all themes */
    .markdown, .gr-markdown, .gr-html {
        color: var(--body-text-color) !important;
        background: var(--background-fill-primary) !important;
    }

    .markdown h1, .markdown h2, .markdown h3,
    .gr-markdown h1, .gr-markdown h2, .gr-markdown h3 {
        color: var(--body-text-color) !important;
    }

    .markdown p, .markdown li, .markdown strong,
    .gr-markdown p, .gr-markdown li, .gr-markdown strong {
        color: var(--body-text-color) !important;
    }

    /* Table styling */
    .dataframe, .gr-dataframe {
        color: var(--body-text-color) !important;
        background: var(--background-fill-primary) !important;
    }

    /* Button and input styling */
    .gr-button, .gr-textbox, .gr-dropdown {
        color: var(--body-text-color) !important;
    }

    /* Ensure plot backgrounds work in both themes */
    .plot-container {
        background: var(--background-fill-primary) !important;
    }
!important; } """ ) as demo: # Header gr.HTML("""

šŸ† SALT Translation Leaderboard

Rigorous Evaluation of Translation Models on Ugandan Languages

Three-tier evaluation • Statistical confidence intervals • Research-grade analysis

""") # Status indicator if initialization_success: status_msg = "āœ… System initialized successfully" else: status_msg = "āŒ System initialization failed - some features may not work" gr.Markdown(f"**System Status**: {status_msg}") with gr.Tabs(): # Tab 1: Download Test Set with gr.Tab("šŸ“„ Download Test Set", id="download"): gr.Markdown(""" ## šŸ“‹ Get the SALT Test Set Download our test set for translation model evaluation. """) download_btn = gr.Button("šŸ“„ Download Test Set", variant="primary", size="lg") with gr.Row(): with gr.Column(): download_file = gr.File(label="šŸ“‚ Test Set File", interactive=False) with gr.Column(): download_info = gr.Markdown() # Tab 2: Submit Predictions with gr.Tab("šŸš€ Submit Predictions", id="submit"): gr.Markdown(""" ## šŸŽÆ Submit Your Model's Predictions Upload predictions for evaluation across all tracks. """) with gr.Row(): with gr.Column(scale=1): gr.Markdown("### šŸ“ Model Information") model_name_input = gr.Textbox( label="šŸ¤– Model Name", placeholder="e.g., MyTranslator-v2.0", info="Unique name for your model" ) author_input = gr.Textbox( label="šŸ‘¤ Author/Organization", placeholder="Your name or organization", value="Anonymous" ) description_input = gr.Textbox( label="šŸ“„ Model Description", placeholder="Architecture, training data, special features...", lines=4 ) predictions_file = gr.File( label="šŸ“‚ Predictions File", file_types=[".csv", ".tsv", ".json"] ) validate_btn = gr.Button("āœ… Validate Submission", variant="secondary") submit_btn = gr.Button("šŸš€ Submit for Evaluation", variant="primary", interactive=False) with gr.Column(scale=1): validation_output = gr.Markdown() gr.Markdown("### šŸ† Evaluation Results") evaluation_output = gr.Markdown() with gr.Row(): with gr.Column(): submission_plot = gr.Plot(label="šŸ“ˆ Performance Analysis") with gr.Column(): results_table = gr.Dataframe(label="šŸ“Š Updated Leaderboard", interactive=False) # Tab 3: Google-Comparable Track with gr.Tab("šŸ¤– Google-Comparable Track", id="google_track"): gr.Markdown(f""" ## {EVALUATION_TRACKS['google_comparable']['name']} **{EVALUATION_TRACKS['google_comparable']['description']}** This track evaluates models on language pairs supported by Google Translate, enabling direct comparison with commercial baselines. """) with gr.Row(): with gr.Column(scale=2): google_search = gr.Textbox(label="šŸ” Search Models", placeholder="Search by model name, author...") with gr.Column(scale=1): google_category = gr.Dropdown( label="šŸ·ļø Category Filter", choices=["all"] + list(MODEL_CATEGORIES.keys()), value="all" ) with gr.Column(scale=1): google_refresh = gr.Button("šŸ”„ Refresh", variant="secondary") google_stats = gr.Markdown() with gr.Row(): with gr.Column(): google_ranking_plot = gr.Plot(label="šŸ† Rankings") with gr.Column(): google_comparison_plot = gr.Plot(label="šŸ“Š Performance Comparison") google_leaderboard = gr.Dataframe(label="šŸ“ˆ Google-Comparable Leaderboard", interactive=False) # Tab 4: UG40-Complete Track with gr.Tab("šŸŒ UG40-Complete Track", id="ug40_track"): gr.Markdown(f""" ## {EVALUATION_TRACKS['ug40_complete']['name']} **{EVALUATION_TRACKS['ug40_complete']['description']}** This track evaluates models on all UG40 language pairs, providing comprehensive assessment of Ugandan language translation capabilities. 
""") with gr.Row(): with gr.Column(scale=2): ug40_search = gr.Textbox(label="šŸ” Search Models", placeholder="Search by model name, author...") with gr.Column(scale=1): ug40_category = gr.Dropdown( label="šŸ·ļø Category Filter", choices=["all"] + list(MODEL_CATEGORIES.keys()), value="all" ) with gr.Column(scale=1): ug40_refresh = gr.Button("šŸ”„ Refresh", variant="secondary") ug40_stats = gr.Markdown() with gr.Row(): with gr.Column(): ug40_ranking_plot = gr.Plot(label="šŸ† Rankings") with gr.Column(): ug40_comparison_plot = gr.Plot(label="šŸ“Š Performance Comparison") ug40_leaderboard = gr.Dataframe(label="šŸ“ˆ UG40-Complete Leaderboard", interactive=False) # Tab 5: Language Pair Analysis with gr.Tab("šŸ“Š Language Pair Analysis", id="pairs_analysis"): gr.Markdown(""" ## šŸ“Š Language Pair Performance Analysis Compare model performance across individual language pairs with detailed breakdowns. """) with gr.Row(): with gr.Column(scale=1): pairs_track_select = gr.Dropdown( label="šŸ Select Track", choices=list(EVALUATION_TRACKS.keys()), value="google_comparable" ) with gr.Column(scale=1): pairs_refresh = gr.Button("šŸ”„ Analyze Language Pairs", variant="primary") pairs_comparison_plot = gr.Plot(label="šŸ“Š Language Pair Comparison") pairs_table = gr.Dataframe(label="šŸ“ˆ Language Pair Performance", interactive=False) # Tab 6: Documentation with gr.Tab("šŸ“š Documentation", id="docs"): gr.Markdown(f""" # šŸ“– SALT Translation Leaderboard Documentation ## šŸŽÆ Overview The SALT Translation Leaderboard provides rigorous evaluation of translation models on Ugandan languages using three different tracks for fair comparison. ## šŸ Evaluation Tracks **1. šŸ¤– Google-Comparable Track** - **Languages**: {', '.join([LANGUAGE_NAMES[lang] for lang in GOOGLE_SUPPORTED_LANGUAGES])} - **Purpose**: Fair comparison with commercial translation systems - **Language Pairs**: {len([1 for src in GOOGLE_SUPPORTED_LANGUAGES for tgt in GOOGLE_SUPPORTED_LANGUAGES if src != tgt])} **2. šŸŒ UG40-Complete Track** - **Languages**: All {len(ALL_UG40_LANGUAGES)} UG40 languages - **Purpose**: Comprehensive Ugandan language capability assessment - **Language Pairs**: {len([1 for src in ALL_UG40_LANGUAGES for tgt in ALL_UG40_LANGUAGES if src != tgt])} ## šŸ“Š Evaluation Metrics ### Primary Metrics - **Quality Score**: Composite metric (0-1) combining BLEU, ChrF, and error rates - **BLEU**: Bilingual Evaluation Understudy (0-100) - **ChrF**: Character-level F-score (0-1) ### Model Categories Models are automatically categorized for fair comparison: - **šŸ¢ Commercial**: Production translation systems - **šŸ”¬ Research**: Academic and research institution models - **šŸ“Š Baseline**: Simple baseline and reference models - **šŸ‘„ Community**: User-submitted models ## šŸ”„ Submission Process ### Step 1: Download Test Set 1. Click "Download Test Set" in the first tab 2. Save the test set file ### Step 2: Generate Predictions 1. Load the test set in your evaluation pipeline 2. For each row, translate `source_text` from `source_language` to `target_language` 3. Save results as CSV with columns: `sample_id`, `prediction` ### Step 3: Submit & Evaluate 1. Fill in model information 2. Upload your predictions file 3. Review validation report 4. 
## šŸ“‹ File Formats

### Test Set Format
```csv
sample_id,source_text,source_language,target_language,domain,google_comparable
salt_000001,"Hello world",eng,lug,general,true
salt_000002,"How are you?",eng,ach,conversation,true
```

### Predictions Format
```csv
sample_id,prediction
salt_000001,"Amakuru ensi"
salt_000002,"Ibino nining?"
```

## šŸ¤ Contributing

This leaderboard is designed for the research community. When using results:
1. Consider the appropriate track for your comparison
2. Report confidence intervals when available
3. Acknowledge the model category in comparisons

---

*For questions, contact the team at research@sunbird.ai*
            """)

    # Event handlers
    predictions_validated = gr.State(value=None)
    detected_category_state = gr.State(value="community")

    # Download test set
    download_btn.click(
        fn=download_test_set,
        outputs=[download_file, download_info]
    )

    # Validate predictions
    def handle_validation(file, model_name, author, description):
        report, predictions, category = validate_submission_file(file, model_name, author, description)

        can_evaluate = predictions is not None
        if can_evaluate:
            button_status = "\n\nāœ… **Ready to submit for evaluation!**"
        else:
            button_status = "\n\nāŒ **Please fix issues above before evaluation**"

        enhanced_report = report + button_status

        return (
            enhanced_report,
            predictions,
            category,
            gr.update(interactive=can_evaluate)
        )

    validate_btn.click(
        fn=handle_validation,
        inputs=[predictions_file, model_name_input, author_input, description_input],
        outputs=[validation_output, predictions_validated, detected_category_state, submit_btn]
    )

    # Submit for evaluation
    submit_btn.click(
        fn=evaluate_submission,
        inputs=[predictions_validated, model_name_input, author_input, description_input, detected_category_state],
        outputs=[evaluation_output, results_table, submission_plot, gr.Plot(visible=False)]
    )

    # Track leaderboard refresh functions
    google_refresh.click(
        fn=lambda *args: refresh_track_leaderboard("google_comparable", *args),
        inputs=[google_search, google_category],
        outputs=[google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats]
    )

    ug40_refresh.click(
        fn=lambda *args: refresh_track_leaderboard("ug40_complete", *args),
        inputs=[ug40_search, ug40_category],
        outputs=[ug40_leaderboard, ug40_ranking_plot, ug40_comparison_plot, ug40_stats]
    )

    # Language pair analysis
    pairs_refresh.click(
        fn=get_language_pair_comparison,
        inputs=[pairs_track_select],
        outputs=[pairs_table, pairs_comparison_plot]
    )

    # Load initial data and update dropdowns
    def load_initial_data():
        try:
            print("Loading initial data...")
            global current_leaderboard

            # Make sure we have a leaderboard
            if current_leaderboard is None:
                current_leaderboard = load_leaderboard()

            print(f"Current leaderboard has {len(current_leaderboard)} entries")

            # Try to load Google track data
            try:
                google_data = refresh_track_leaderboard("google_comparable", "", "all")
                print("Successfully loaded Google track data")
                return google_data
            except Exception as e:
                print(f"Error loading Google track: {e}")
                # Return empty data if there's an error
                empty_df = pd.DataFrame()
                return (empty_df, None, None, "No data available")

        except Exception as e:
            print(f"Error in load_initial_data: {e}")
            empty_df = pd.DataFrame()
            return (empty_df, None, None, "Error loading data")

    demo.load(
        fn=load_initial_data,
        outputs=[google_leaderboard, google_ranking_plot, google_comparison_plot, google_stats]
    )

# Launch the application
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True
    )
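
# ---------------------------------------------------------------------------
# Illustrative sketch (not used by the app): one way a submitter could turn the
# downloaded test set into a predictions file in the format expected by the
# submission tab. The file names and the `translate_fn` callable are
# assumptions for illustration only; substitute your own model's translation
# function and the actual name of the downloaded test set file.
# ---------------------------------------------------------------------------
def _example_generate_predictions(
    test_set_path: str = "salt_test_set.csv",      # assumed name of the downloaded test set
    output_path: str = "predictions.csv",
    translate_fn=lambda text, src, tgt: text,      # placeholder: identity "translation"
) -> None:
    """Write a `sample_id,prediction` CSV from the public test set."""
    test_df = pd.read_csv(test_set_path)
    # Translate each source text into the requested target language.
    test_df["prediction"] = [
        translate_fn(row.source_text, row.source_language, row.target_language)
        for row in test_df.itertuples()
    ]
    # Keep only the two columns the validator expects.
    test_df[["sample_id", "prediction"]].to_csv(output_path, index=False)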