Spaces:
Sleeping
Sleeping
File size: 4,245 Bytes
6815523 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
"""Main Gradio application for viewing dataset with text comparison."""
import gradio as gr
import random
from data_loader import get_dataset_size, get_sample
from diff_utils import generate_html_diff, get_diff_stats
def load_sample_data(sample_index: int | None = None) -> tuple[str, str, str, str]:
    """Load one dataset sample and build everything the UI displays for it.

    Args:
        sample_index: Zero-based index of the sample to load. ``None`` selects
            a random sample. Out-of-range values are clamped to the first or
            last valid index. Numeric values that are not already ``int``
            (for example the float sent by a numeric input widget) are
            truncated to ``int`` before use.

    Returns:
        A tuple ``(original_text, cleaned_text, diff_html, stats_text)``:
        the two texts returned by ``get_sample``, the HTML produced by
        ``generate_html_diff``, and a Markdown summary built from
        ``get_diff_stats``.
    """
    # Ask for the dataset size once; both the random pick and the clamp use it.
    last_index = get_dataset_size() - 1

    if sample_index is None:
        sample_index = random.randint(0, last_index)
    else:
        # UI number inputs may hand over a float (e.g. 3.0). Convert so that
        # the lookup receives a real integer index and the header below reads
        # "Sample #3" rather than "Sample #3.0".
        sample_index = int(sample_index)

    # Ensure index is within bounds
    sample_index = max(0, min(sample_index, last_index))

    original_text, cleaned_text = get_sample(sample_index)

    # Generate diff HTML
    diff_html = generate_html_diff(original_text, cleaned_text)

    # Get statistics
    stats = get_diff_stats(original_text, cleaned_text)

    # The Markdown lines start at column 0 on purpose: leading indentation
    # would become part of the string that is rendered.
    stats_text = f"""
**Sample #{sample_index}**
- Original length: {stats['original_length']:,} characters
- Cleaned length: {stats['cleaned_length']:,} characters
- Characters removed: {stats['characters_removed']:,}
- Similarity: {stats['similarity_ratio']}%
"""
    return original_text, cleaned_text, diff_html, stats_text
def create_interface() -> gr.Blocks:
    """Create the main Gradio interface.

    The page contains a sample-index input with "Load Sample" and
    "Random Sample" buttons, a statistics panel, two side-by-side text boxes
    (original and cleaned text), and an HTML diff view. Sample 0 is loaded
    when the page opens.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    with gr.Blocks(
        title="Dataset Text Comparison Viewer",
        css="""
.textbox-container {
max-height: 400px;
overflow-y: auto;
border: 1px solid #e0e0e0;
border-radius: 8px;
padding: 12px;
background: #fafafa;
}
.stats-box {
background: #f8f9fa;
border: 1px solid #dee2e6;
border-radius: 8px;
padding: 16px;
margin: 8px 0;
}
""",
    ) as interface:
        gr.Markdown("# Dataset Text Comparison Viewer")
        gr.Markdown("Compare original and cleaned text from **sumuks/essential-web-v1.0-sample-1M-with-cleaned-text**")

        with gr.Row():
            with gr.Column():
                sample_input = gr.Number(
                    label="Sample Index",
                    value=0,
                    minimum=0,
                    maximum=get_dataset_size() - 1,
                    step=1,
                    # precision=0 makes the component deliver an int instead
                    # of a float, which is what an index needs to be.
                    precision=0,
                )
                with gr.Row():
                    load_btn = gr.Button("Load Sample", variant="primary")
                    random_btn = gr.Button("Random Sample", variant="secondary")

        # Statistics display
        stats_output = gr.Markdown(label="Statistics", elem_classes=["stats-box"])

        # Text comparison
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Original Text")
                original_output = gr.Textbox(
                    label="Original",
                    lines=15,
                    max_lines=20,
                    show_copy_button=True,
                    elem_classes=["textbox-container"],
                )
            with gr.Column():
                gr.Markdown("### Cleaned Text")
                cleaned_output = gr.Textbox(
                    label="Cleaned",
                    lines=15,
                    max_lines=20,
                    show_copy_button=True,
                    elem_classes=["textbox-container"],
                )

        # Diff visualization
        gr.Markdown("### Diff Visualization")
        diff_output = gr.HTML(label="Diff")

        # Every handler fills the same four components, in the order that
        # load_sample_data returns its values.
        result_components = [original_output, cleaned_output, diff_output, stats_output]

        # Event handlers
        load_btn.click(
            fn=load_sample_data,
            inputs=[sample_input],
            outputs=result_components,
        )
        random_btn.click(
            fn=lambda: load_sample_data(None),
            inputs=[],
            outputs=result_components,
        )

        # Load initial sample
        interface.load(
            fn=lambda: load_sample_data(0),
            inputs=[],
            outputs=result_components,
        )

    return interface
if __name__ == "__main__":
    # Script entry point: build the Blocks UI, then start the server with
    # debug output enabled and without a public share link.
    app = create_interface()
    app.launch(debug=True, share=False)