sumuks HF Staff committed on
Commit
5c51375
·
verified ·
1 Parent(s): 6815523

Create diff_utils.py

Browse files
Files changed (1) hide show
  1. diff_utils.py +95 -0
diff_utils.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Diff visualization utilities."""
2
+
3
+ import difflib
4
+ from typing import Iterator
5
+
6
+
7
def generate_html_diff(original: str, cleaned: str) -> str:
    """Render an HTML comparison of the original and cleaned text.

    Both inputs are split into lines (line endings kept) and handed to
    ``difflib.HtmlDiff``; the resulting page is then passed through
    ``_style_diff_html`` before being returned.

    Args:
        original: Text before cleaning.
        cleaned: Text after cleaning.

    Returns:
        The styled HTML document as a string.
    """
    before_lines = original.splitlines(keepends=True)
    after_lines = cleaned.splitlines(keepends=True)

    # Tabs expand to 2 spaces and long lines wrap at column 80.
    renderer = difflib.HtmlDiff(tabsize=2, wrapcolumn=80)

    # context=True limits the output to changed regions, with numlines
    # lines of surrounding context for each one.
    raw_page = renderer.make_file(
        before_lines,
        after_lines,
        fromdesc="Original Text",
        todesc="Cleaned Text",
        context=True,
        numlines=3,
    )
    return _style_diff_html(raw_page)
24
+
25
+
26
def _style_diff_html(html_diff: str) -> str:
    """Add custom styling to the diff HTML.

    Two transformations are applied:

    1. A ``<style>`` block is inserted immediately before ``</head>``.
    2. Every ``<table class="diff" ...>`` element is wrapped in a
       ``<div class="diff">`` container.

    Args:
        html_diff: HTML document text, such as the output of
            ``difflib.HtmlDiff.make_file``.

    Returns:
        The HTML text with the style block inserted and the diff tables
        wrapped.
    """
    import re  # Local import: only this helper needs it.

    custom_style = """
    <style>
    .diff {
        font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
        font-size: 12px;
        line-height: 1.4;
        border: 1px solid #ddd;
        border-radius: 8px;
        overflow: auto;
        max-height: 600px;
    }
    .diff table {
        width: 100%;
        border-collapse: collapse;
        margin: 0;
    }
    .diff td {
        padding: 2px 8px;
        vertical-align: top;
        white-space: pre-wrap;
        word-wrap: break-word;
    }
    .diff_header {
        background: #f8f9fa;
        font-weight: bold;
        text-align: center;
        border-bottom: 1px solid #dee2e6;
    }
    .diff_next {
        background: #e9ecef;
        text-align: center;
        font-size: 10px;
    }
    .diff_add {
        background: #d4edda;
        color: #155724;
    }
    .diff_chg {
        background: #fff3cd;
        color: #856404;
    }
    .diff_sub {
        background: #f8d7da;
        color: #721c24;
    }
    </style>
    """

    # Insert custom style before closing </head> tag
    styled_html = html_diff.replace('</head>', f'{custom_style}</head>')

    # Wrap each diff table in a <div class="diff"> container. A blanket
    # replace of '</table>' would also append '</div>' after nested tables
    # that never received an opening <div>, leaving the markup unbalanced.
    # Instead, track the stack of open tables so that only the closing tag
    # belonging to a wrapped table gets the matching '</div>'.
    wrapper_open = '<div class="diff">'
    diff_table_open = '<table class="diff"'
    open_tables = []  # One bool per open <table>: was it wrapped?

    def _wrap(match):
        tag = match.group(0)
        if tag == '</table>':
            # Tolerate stray closing tags by leaving them untouched.
            was_wrapped = open_tables.pop() if open_tables else False
            return f'{tag}</div>' if was_wrapped else tag
        is_diff_table = tag == diff_table_open
        open_tables.append(is_diff_table)
        return f'{wrapper_open}{tag}' if is_diff_table else tag

    # The diff-table alternative is listed first so it wins over the
    # generic '<table' opener at the same position.
    return re.sub(r'<table class="diff"|<table\b|</table>', _wrap, styled_html)
84
+
85
+
86
def get_diff_stats(original: str, cleaned: str) -> dict[str, float]:
    """Get statistics about the diff between original and cleaned text.

    Args:
        original: Text before cleaning.
        cleaned: Text after cleaning.

    Returns:
        A dict with the following keys:

        - ``original_length``: number of characters in ``original``.
        - ``cleaned_length``: number of characters in ``cleaned``.
        - ``similarity_ratio``: ``SequenceMatcher.ratio()`` expressed as a
          percentage, rounded to two decimals (a float).
        - ``characters_removed``: ``len(original) - len(cleaned)``; negative
          when the cleaned text is longer than the original.
    """
    # autojunk=False: the matcher compares individual characters, and the
    # default heuristic drops any element that is "popular" in a sequence
    # of 200+ items from the match index. For ordinary text that covers
    # most characters, which can drive the ratio of near-identical inputs
    # towards 0. Disabling it gives an accurate ratio at the cost of more
    # work on very large inputs.
    matcher = difflib.SequenceMatcher(None, original, cleaned, autojunk=False)

    original_length = len(original)
    cleaned_length = len(cleaned)

    return {
        'original_length': original_length,
        'cleaned_length': cleaned_length,
        'similarity_ratio': round(matcher.ratio() * 100, 2),
        'characters_removed': original_length - cleaned_length,
    }