Spaces:
Sleeping
Sleeping
"""Main Gradio application for viewing dataset with text comparison.""" | |
import gradio as gr | |
import random | |
from data_loader import get_dataset_size, get_sample | |
from diff_utils import generate_html_diff, get_diff_stats | |
def load_sample_data(sample_index: int | float | None = None) -> tuple[str, str, str, str]:
    """Load one dataset sample and build everything the UI displays for it.

    Args:
        sample_index: Position of the sample to load. ``None`` picks a
            random sample. Out-of-range values are clamped to the nearest
            valid index. Floats are accepted because a ``gr.Number`` input
            delivers its value as a float unless a precision is configured;
            they are truncated toward zero before use.

    Returns:
        A tuple ``(original_text, cleaned_text, diff_html, stats_text)``:
        the two texts returned by ``get_sample``, the HTML produced by
        ``generate_html_diff``, and a Markdown summary built from
        ``get_diff_stats``.
    """
    # Query the dataset size once; it is needed for both the random pick
    # and the bounds clamp.
    last_index = get_dataset_size() - 1

    if sample_index is None:
        sample_index = random.randint(0, last_index)
    else:
        # Coerce to int so the index passed to get_sample (and shown in the
        # statistics header) is never a float such as 0.0.
        sample_index = int(sample_index)
        # Ensure the index is within bounds.
        sample_index = max(0, min(sample_index, last_index))

    original_text, cleaned_text = get_sample(sample_index)

    # Generate diff HTML
    diff_html = generate_html_diff(original_text, cleaned_text)

    # Get statistics and render them as a Markdown bullet list
    stats = get_diff_stats(original_text, cleaned_text)
    stats_text = (
        f"\n**Sample #{sample_index}**\n"
        f"- Original length: {stats['original_length']:,} characters\n"
        f"- Cleaned length: {stats['cleaned_length']:,} characters\n"
        f"- Characters removed: {stats['characters_removed']:,}\n"
        f"- Similarity: {stats['similarity_ratio']}%\n"
    )

    return original_text, cleaned_text, diff_html, stats_text
def create_interface() -> gr.Blocks:
    """Create the main Gradio interface.

    The layout consists of a sample-index input with "Load Sample" and
    "Random Sample" buttons, a Markdown statistics panel, side-by-side
    textboxes for the original and cleaned text, and an HTML diff view.
    Both buttons and the initial page load call ``load_sample_data`` and
    write its four results to the same four output components.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    with gr.Blocks(
        title="Dataset Text Comparison Viewer",
        css="""
        .textbox-container {
            max-height: 400px;
            overflow-y: auto;
            border: 1px solid #e0e0e0;
            border-radius: 8px;
            padding: 12px;
            background: #fafafa;
        }
        .stats-box {
            background: #f8f9fa;
            border: 1px solid #dee2e6;
            border-radius: 8px;
            padding: 16px;
            margin: 8px 0;
        }
        """
    ) as interface:
        gr.Markdown("# Dataset Text Comparison Viewer")
        gr.Markdown("Compare original and cleaned text from **sumuks/essential-web-v1.0-sample-1M-with-cleaned-text**")

        with gr.Row():
            with gr.Column():
                sample_input = gr.Number(
                    label="Sample Index",
                    value=0,
                    minimum=0,
                    maximum=get_dataset_size() - 1,
                    step=1,
                    # precision=0 makes the component pass an int to the
                    # handler; without it the value arrives as a float.
                    precision=0,
                )
                with gr.Row():
                    load_btn = gr.Button("Load Sample", variant="primary")
                    random_btn = gr.Button("Random Sample", variant="secondary")

        # Statistics display
        stats_output = gr.Markdown(label="Statistics", elem_classes=["stats-box"])

        # Text comparison
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Original Text")
                original_output = gr.Textbox(
                    label="Original",
                    lines=15,
                    max_lines=20,
                    show_copy_button=True,
                    elem_classes=["textbox-container"],
                )
            with gr.Column():
                gr.Markdown("### Cleaned Text")
                cleaned_output = gr.Textbox(
                    label="Cleaned",
                    lines=15,
                    max_lines=20,
                    show_copy_button=True,
                    elem_classes=["textbox-container"],
                )

        # Diff visualization
        gr.Markdown("### Diff Visualization")
        diff_output = gr.HTML(label="Diff")

        # Every handler fills the same four components, in the order that
        # load_sample_data returns its values.
        outputs = [original_output, cleaned_output, diff_output, stats_output]

        # Event handlers
        load_btn.click(
            fn=load_sample_data,
            inputs=[sample_input],
            outputs=outputs,
        )
        random_btn.click(
            fn=lambda: load_sample_data(None),
            inputs=[],
            outputs=outputs,
        )

        # Load initial sample
        interface.load(
            fn=lambda: load_sample_data(0),
            inputs=[],
            outputs=outputs,
        )

    return interface
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it with debug mode on
    # and without requesting a public share link.
    app = create_interface()
    app.launch(debug=True, share=False)