sumuks HF Staff committed on
Commit
6815523
·
verified ·
1 Parent(s): a0a041d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +130 -0
app.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Main Gradio application for viewing dataset with text comparison."""
2
+
3
+ import gradio as gr
4
+ import random
5
+ from data_loader import get_dataset_size, get_sample
6
+ from diff_utils import generate_html_diff, get_diff_stats
7
+
8
+
9
def load_sample_data(sample_index: int | None = None) -> tuple[str, str, str, str]:
    """Load one dataset sample and prepare everything the UI displays for it.

    Args:
        sample_index: Position of the sample to load. ``None`` selects a
            random sample. Out-of-range values are clamped into the valid
            range, and non-integer numbers (which a numeric UI widget may
            deliver) are truncated to an integer.

    Returns:
        A 4-tuple ``(original_text, cleaned_text, diff_html, stats_text)``,
        where ``stats_text`` is a Markdown summary of the diff statistics.
        If the dataset reports no samples, the first three fields are empty
        strings and ``stats_text`` carries a short notice instead.
    """
    # Query the size once: it bounds both the random draw and the clamp.
    dataset_size = get_dataset_size()
    if dataset_size <= 0:
        # random.randint(0, -1) would raise, and there is no index 0 to load.
        return "", "", "", "**Dataset is empty.**"

    if sample_index is None:
        sample_index = random.randint(0, dataset_size - 1)
    else:
        # Guard against float input so the loader always receives an int.
        sample_index = int(sample_index)

    # Ensure index is within bounds
    sample_index = max(0, min(sample_index, dataset_size - 1))

    original_text, cleaned_text = get_sample(sample_index)

    # Generate diff HTML
    diff_html = generate_html_diff(original_text, cleaned_text)

    # Get statistics
    stats = get_diff_stats(original_text, cleaned_text)

    # Lines are joined without leading indentation so that Markdown renders
    # a bold heading and a bullet list rather than an indented code block.
    stats_text = "\n".join(
        [
            "",
            f"**Sample #{sample_index}**",
            "",
            f"- Original length: {stats['original_length']:,} characters",
            f"- Cleaned length: {stats['cleaned_length']:,} characters",
            f"- Characters removed: {stats['characters_removed']:,}",
            f"- Similarity: {stats['similarity_ratio']}%",
            "",
        ]
    )

    return original_text, cleaned_text, diff_html, stats_text
34
+
35
+
36
def create_interface() -> gr.Blocks:
    """Create the main Gradio interface.

    The layout consists of a sample-index input with "Load Sample" and
    "Random Sample" buttons, a statistics panel, side-by-side text boxes for
    the original and cleaned text, and an HTML diff view. Sample 0 is loaded
    automatically when the page opens.

    Returns:
        The assembled ``gr.Blocks`` application. It is not launched here;
        the caller is responsible for calling ``launch()``.
    """
    # All four outputs are refreshed together by every event handler.
    custom_css = """
    .textbox-container {
        max-height: 400px;
        overflow-y: auto;
        border: 1px solid #e0e0e0;
        border-radius: 8px;
        padding: 12px;
        background: #fafafa;
    }
    .stats-box {
        background: #f8f9fa;
        border: 1px solid #dee2e6;
        border-radius: 8px;
        padding: 16px;
        margin: 8px 0;
    }
    """

    with gr.Blocks(
        title="Dataset Text Comparison Viewer",
        css=custom_css,
    ) as interface:

        gr.Markdown("# Dataset Text Comparison Viewer")
        gr.Markdown("Compare original and cleaned text from **sumuks/essential-web-v1.0-sample-1M-with-cleaned-text**")

        with gr.Row():
            with gr.Column():
                sample_input = gr.Number(
                    label="Sample Index",
                    value=0,
                    minimum=0,
                    # Never advertise a negative maximum if the dataset is empty.
                    maximum=max(get_dataset_size() - 1, 0),
                    step=1,
                    # precision=0 makes Gradio round the value and pass it to
                    # the callback as an int rather than a float.
                    precision=0,
                )

                with gr.Row():
                    load_btn = gr.Button("Load Sample", variant="primary")
                    random_btn = gr.Button("Random Sample", variant="secondary")

        # Statistics display
        stats_output = gr.Markdown(label="Statistics", elem_classes=["stats-box"])

        # Text comparison
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Original Text")
                original_output = gr.Textbox(
                    label="Original",
                    lines=15,
                    max_lines=20,
                    show_copy_button=True,
                    elem_classes=["textbox-container"],
                )

            with gr.Column():
                gr.Markdown("### Cleaned Text")
                cleaned_output = gr.Textbox(
                    label="Cleaned",
                    lines=15,
                    max_lines=20,
                    show_copy_button=True,
                    elem_classes=["textbox-container"],
                )

        # Diff visualization
        gr.Markdown("### Diff Visualization")
        diff_output = gr.HTML(label="Diff")

        # Event handlers
        load_btn.click(
            fn=load_sample_data,
            inputs=[sample_input],
            outputs=[original_output, cleaned_output, diff_output, stats_output],
        )

        # Passing None asks load_sample_data to pick a random index.
        random_btn.click(
            fn=lambda: load_sample_data(None),
            inputs=[],
            outputs=[original_output, cleaned_output, diff_output, stats_output],
        )

        # Load initial sample
        interface.load(
            fn=lambda: load_sample_data(0),
            inputs=[],
            outputs=[original_output, cleaned_output, diff_output, stats_output],
        )

    return interface
126
+
127
+
128
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it without a public share
    # link and with Gradio's debug mode enabled.
    app = create_interface()
    app.launch(debug=True, share=False)