"""Main Gradio application for viewing dataset with text comparison.""" import gradio as gr import random from data_loader import get_dataset_size, get_sample from diff_utils import generate_html_diff, get_diff_stats def load_sample_data(sample_index: int | None = None) -> tuple[str, str, str, str]: """Load and process a sample from the dataset.""" if sample_index is None: sample_index = random.randint(0, get_dataset_size() - 1) # Ensure index is within bounds sample_index = max(0, min(sample_index, get_dataset_size() - 1)) original_text, cleaned_text = get_sample(sample_index) # Generate diff HTML diff_html = generate_html_diff(original_text, cleaned_text) # Get statistics stats = get_diff_stats(original_text, cleaned_text) stats_text = f""" **Sample #{sample_index}** - Original length: {stats['original_length']:,} characters - Cleaned length: {stats['cleaned_length']:,} characters - Characters removed: {stats['characters_removed']:,} - Similarity: {stats['similarity_ratio']}% """ return original_text, cleaned_text, diff_html, stats_text def create_interface() -> gr.Blocks: """Create the main Gradio interface.""" with gr.Blocks( title="Dataset Text Comparison Viewer", css=""" .textbox-container { max-height: 400px; overflow-y: auto; border: 1px solid #e0e0e0; border-radius: 8px; padding: 12px; background: #fafafa; } .stats-box { background: #f8f9fa; border: 1px solid #dee2e6; border-radius: 8px; padding: 16px; margin: 8px 0; } """ ) as interface: gr.Markdown("# Dataset Text Comparison Viewer") gr.Markdown("Compare original and cleaned text from **sumuks/essential-web-v1.0-sample-1M-with-cleaned-text**") with gr.Row(): with gr.Column(): sample_input = gr.Number( label="Sample Index", value=0, minimum=0, maximum=get_dataset_size() - 1, step=1 ) with gr.Row(): load_btn = gr.Button("Load Sample", variant="primary") random_btn = gr.Button("Random Sample", variant="secondary") # Statistics display stats_output = gr.Markdown(label="Statistics", elem_classes=["stats-box"]) # Text comparison with gr.Row(): with gr.Column(): gr.Markdown("### Original Text") original_output = gr.Textbox( label="Original", lines=15, max_lines=20, show_copy_button=True, elem_classes=["textbox-container"] ) with gr.Column(): gr.Markdown("### Cleaned Text") cleaned_output = gr.Textbox( label="Cleaned", lines=15, max_lines=20, show_copy_button=True, elem_classes=["textbox-container"] ) # Diff visualization gr.Markdown("### Diff Visualization") diff_output = gr.HTML(label="Diff") # Event handlers load_btn.click( fn=load_sample_data, inputs=[sample_input], outputs=[original_output, cleaned_output, diff_output, stats_output] ) random_btn.click( fn=lambda: load_sample_data(None), inputs=[], outputs=[original_output, cleaned_output, diff_output, stats_output] ) # Load initial sample interface.load( fn=lambda: load_sample_data(0), inputs=[], outputs=[original_output, cleaned_output, diff_output, stats_output] ) return interface if __name__ == "__main__": app = create_interface() app.launch(share=False, debug=True)