File size: 4,245 Bytes
6815523
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""Main Gradio application for viewing dataset with text comparison."""

import gradio as gr
import random
from data_loader import get_dataset_size, get_sample
from diff_utils import generate_html_diff, get_diff_stats


def load_sample_data(sample_index: int | None = None) -> tuple[str, str, str, str]:
    """Load one dataset sample and prepare everything the UI displays.

    Args:
        sample_index: Position of the sample to load. ``None`` selects a
            random sample. Out-of-range values are clamped to the nearest
            valid index. Non-integer numbers (a ``gr.Number`` input without
            ``precision`` may deliver a float) are truncated to an integer.

    Returns:
        A tuple ``(original_text, cleaned_text, diff_html, stats_text)``.
        ``diff_html`` is the output of ``generate_html_diff`` and
        ``stats_text`` is a Markdown summary built from ``get_diff_stats``.

    Raises:
        ValueError: If the dataset reports no samples.
    """
    # Query the size once; it is needed for validation, random choice and clamping.
    dataset_size = get_dataset_size()
    if dataset_size <= 0:
        raise ValueError("Dataset is empty; there is no sample to load.")

    if sample_index is None:
        sample_index = random.randrange(dataset_size)
    else:
        # Coerce to int so a float coming from the UI neither breaks dataset
        # indexing nor shows up as "Sample #3.0", then clamp into bounds.
        sample_index = max(0, min(int(sample_index), dataset_size - 1))

    original_text, cleaned_text = get_sample(sample_index)

    # Generate diff HTML
    diff_html = generate_html_diff(original_text, cleaned_text)

    # Get statistics
    stats = get_diff_stats(original_text, cleaned_text)
    stats_text = f"""
    **Sample #{sample_index}**
    
    - Original length: {stats['original_length']:,} characters
    - Cleaned length: {stats['cleaned_length']:,} characters
    - Characters removed: {stats['characters_removed']:,}
    - Similarity: {stats['similarity_ratio']}%
    """

    return original_text, cleaned_text, diff_html, stats_text


def create_interface() -> gr.Blocks:
    """Assemble the Gradio Blocks UI for browsing dataset samples.

    Layout, top to bottom: title and description, an index selector with
    "Load Sample" / "Random Sample" buttons, a statistics panel, side-by-side
    original and cleaned text boxes, and an HTML diff. Both buttons and the
    page-load event route the result of ``load_sample_data`` into the same
    four output components.

    Returns:
        The constructed (not yet launched) ``gr.Blocks`` application.
    """
    page_css = """
        .textbox-container { 
            max-height: 400px; 
            overflow-y: auto; 
            border: 1px solid #e0e0e0; 
            border-radius: 8px; 
            padding: 12px; 
            background: #fafafa; 
        }
        .stats-box {
            background: #f8f9fa;
            border: 1px solid #dee2e6;
            border-radius: 8px;
            padding: 16px;
            margin: 8px 0;
        }
        """

    def _text_panel(heading: str, label: str) -> gr.Textbox:
        """Render a Markdown heading followed by a copyable text box."""
        gr.Markdown(heading)
        return gr.Textbox(
            label=label,
            lines=15,
            max_lines=20,
            show_copy_button=True,
            elem_classes=["textbox-container"],
        )

    with gr.Blocks(title="Dataset Text Comparison Viewer", css=page_css) as demo:
        gr.Markdown("# Dataset Text Comparison Viewer")
        gr.Markdown("Compare original and cleaned text from **sumuks/essential-web-v1.0-sample-1M-with-cleaned-text**")

        # Sample selection: numeric index plus the two action buttons.
        with gr.Row(), gr.Column():
            last_index = get_dataset_size() - 1
            sample_input = gr.Number(
                label="Sample Index",
                value=0,
                minimum=0,
                maximum=last_index,
                step=1,
            )

            with gr.Row():
                load_btn = gr.Button("Load Sample", variant="primary")
                random_btn = gr.Button("Random Sample", variant="secondary")

        # Summary statistics for the currently displayed sample.
        stats_output = gr.Markdown(label="Statistics", elem_classes=["stats-box"])

        # Original and cleaned text, shown next to each other.
        with gr.Row():
            with gr.Column():
                original_output = _text_panel("### Original Text", "Original")

            with gr.Column():
                cleaned_output = _text_panel("### Cleaned Text", "Cleaned")

        # Rendered diff between the two texts.
        gr.Markdown("### Diff Visualization")
        diff_output = gr.HTML(label="Diff")

        # Every handler fills the same components, in the order returned by
        # load_sample_data.
        result_targets = [original_output, cleaned_output, diff_output, stats_output]

        load_btn.click(
            fn=load_sample_data,
            inputs=[sample_input],
            outputs=result_targets,
        )

        random_btn.click(
            fn=lambda: load_sample_data(None),
            inputs=[],
            outputs=result_targets,
        )

        # Show the first sample as soon as the page opens.
        demo.load(
            fn=lambda: load_sample_data(0),
            inputs=[],
            outputs=result_targets,
        )

    return demo


if __name__ == "__main__":
    # Runs only when this file is executed as a script, not when it is imported.
    app = create_interface()
    # share=False and debug=True are passed straight through to Gradio's
    # launch(); see the gradio Blocks.launch documentation for their effect.
    app.launch(share=False, debug=True)