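# NOTE: the following four lines are web-page header residue captured with
# the file; they are kept as comments so the module remains valid Python.
# sumuks's picture
# sumuks HF Staff
# Create app.py
# 6815523 verified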
"""Main Gradio application for viewing dataset with text comparison."""
import gradio as gr
import random
from data_loader import get_dataset_size, get_sample
from diff_utils import generate_html_diff, get_diff_stats
def load_sample_data(sample_index: int | None = None) -> tuple[str, str, str, str]:
    """Load one dataset sample and build its comparison views.

    Args:
        sample_index: Position of the sample to load. ``None`` picks a
            random sample. A non-integer number (for example a float coming
            from a numeric UI field) is truncated to an integer, and
            out-of-range values are clamped to ``[0, dataset_size - 1]``.

    Returns:
        A tuple ``(original_text, cleaned_text, diff_html, stats_text)``:
        the two texts returned by ``get_sample``, the result of
        ``generate_html_diff`` for them, and a Markdown summary built from
        ``get_diff_stats``.
    """
    # Query the size once; it is needed for both the random pick and the clamp.
    last_index = get_dataset_size() - 1

    if sample_index is None:
        sample_index = random.randint(0, last_index)
    else:
        # Coerce before clamping: a numeric input widget may hand over a
        # float (e.g. 2.0 or 2.5), which must not reach get_sample() or be
        # rendered as "Sample #2.5" in the statistics header.
        sample_index = max(0, min(int(sample_index), last_index))

    original_text, cleaned_text = get_sample(sample_index)

    # Visual diff and summary statistics for the same pair of texts.
    diff_html = generate_html_diff(original_text, cleaned_text)
    stats = get_diff_stats(original_text, cleaned_text)

    stats_text = f"""
**Sample #{sample_index}**
- Original length: {stats['original_length']:,} characters
- Cleaned length: {stats['cleaned_length']:,} characters
- Characters removed: {stats['characters_removed']:,}
- Similarity: {stats['similarity_ratio']}%
"""
    return original_text, cleaned_text, diff_html, stats_text
def create_interface() -> gr.Blocks:
    """Create the main Gradio interface.

    The page contains a sample-index field with "Load Sample" and
    "Random Sample" buttons, a statistics panel, side-by-side textboxes for
    the original and cleaned text, and an HTML diff view. Both buttons and
    the initial page load call ``load_sample_data`` and write its four
    results into the same four output components.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    # Styles for the two CSS classes attached to components below.
    custom_css = """
.textbox-container {
max-height: 400px;
overflow-y: auto;
border: 1px solid #e0e0e0;
border-radius: 8px;
padding: 12px;
background: #fafafa;
}
.stats-box {
background: #f8f9fa;
border: 1px solid #dee2e6;
border-radius: 8px;
padding: 16px;
margin: 8px 0;
}
"""

    with gr.Blocks(
        title="Dataset Text Comparison Viewer",
        css=custom_css,
    ) as interface:
        gr.Markdown("# Dataset Text Comparison Viewer")
        gr.Markdown("Compare original and cleaned text from **sumuks/essential-web-v1.0-sample-1M-with-cleaned-text**")

        with gr.Row():
            with gr.Column():
                sample_input = gr.Number(
                    label="Sample Index",
                    value=0,
                    minimum=0,
                    maximum=get_dataset_size() - 1,
                    step=1,
                    # The value is used as a dataset index, so have the
                    # component round it and hand over an int.
                    precision=0,
                )
                with gr.Row():
                    load_btn = gr.Button("Load Sample", variant="primary")
                    random_btn = gr.Button("Random Sample", variant="secondary")

        # Statistics display
        stats_output = gr.Markdown(label="Statistics", elem_classes=["stats-box"])

        # Text comparison
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Original Text")
                original_output = gr.Textbox(
                    label="Original",
                    lines=15,
                    max_lines=20,
                    show_copy_button=True,
                    elem_classes=["textbox-container"],
                )
            with gr.Column():
                gr.Markdown("### Cleaned Text")
                cleaned_output = gr.Textbox(
                    label="Cleaned",
                    lines=15,
                    max_lines=20,
                    show_copy_button=True,
                    elem_classes=["textbox-container"],
                )

        # Diff visualization
        gr.Markdown("### Diff Visualization")
        diff_output = gr.HTML(label="Diff")

        # Every handler fills the same components, in the order of the tuple
        # returned by load_sample_data.
        view_outputs = [original_output, cleaned_output, diff_output, stats_output]

        # Event handlers
        load_btn.click(
            fn=load_sample_data,
            inputs=[sample_input],
            outputs=view_outputs,
        )
        random_btn.click(
            fn=lambda: load_sample_data(None),
            inputs=[],
            outputs=view_outputs,
        )

        # Load initial sample
        interface.load(
            fn=lambda: load_sample_data(0),
            inputs=[],
            outputs=view_outputs,
        )

    return interface
if __name__ == "__main__":
    # Script entry point: build the interface, then launch it with
    # debug enabled and sharing disabled.
    app = create_interface()
    app.launch(debug=True, share=False)