Armando Medina
committed on
Update app.py
Browse files — reverted back to known good working version
app.py
CHANGED
@@ -25,7 +25,6 @@ COMPLIANCE_ENTITIES = {
|
|
25 |
SUPPORTED_FILE_TYPES = [".pdf", ".docx", ".txt", ".png", ".jpg", ".jpeg"]
|
26 |
|
27 |
def extract_text(doc):
|
28 |
-
MAX_OCR_CHARS = 5000 # Prevent large OCR outputs from images
|
29 |
if not hasattr(doc, "name"):
|
30 |
return "ERROR: No file uploaded."
|
31 |
try:
|
@@ -43,9 +42,6 @@ def extract_text(doc):
|
|
43 |
elif fname.endswith((".png", ".jpg", ".jpeg")):
|
44 |
img = Image.open(doc.name)
|
45 |
text = pytesseract.image_to_string(img)
|
46 |
-
# Truncate large OCR blobs
|
47 |
-
if len(text) > MAX_OCR_CHARS:
|
48 |
-
text = text[:MAX_OCR_CHARS] + "\n...[truncated]"
|
49 |
else:
|
50 |
return "ERROR: Unsupported file type."
|
51 |
if not text.strip():
|
@@ -251,7 +247,6 @@ def executive_summary_template(findings, score, regime):
|
|
251 |
return " ".join(summary_lines)
|
252 |
|
253 |
def agentic_compliance(doc, regime):
|
254 |
-
MAX_FINDINGS_DISPLAY = 20 # Only show 20 findings max
|
255 |
text = extract_text(doc)
|
256 |
if text.startswith("ERROR"):
|
257 |
return text, None, None, None
|
@@ -266,13 +261,10 @@ def agentic_compliance(doc, regime):
|
|
266 |
summary = summarize_narrative(relevant, regime)
|
267 |
exec_summary = executive_summary_template(relevant, score, regime)
|
268 |
|
269 |
-
findings_sample = relevant[:MAX_FINDINGS_DISPLAY]
|
270 |
findings_md = "\n".join([
|
271 |
f"- **{f['entity']}** (`{f['text']}`), score: {f.get('score', 0):.2f}"
|
272 |
-
for f in
|
273 |
-
]) if
|
274 |
-
if len(relevant) > MAX_FINDINGS_DISPLAY:
|
275 |
-
findings_md += f"\n...and {len(relevant) - MAX_FINDINGS_DISPLAY} more not shown."
|
276 |
|
277 |
fixes_md = "\n".join([f"- {fix}" for fix in fixes]) if fixes else "No action needed."
|
278 |
legend_md = score_legend()
|
@@ -292,21 +284,31 @@ def agentic_compliance(doc, regime):
|
|
292 |
redacted_image = None # No inline preview for PDFs
|
293 |
|
294 |
md = f"""### Compliance Regime: **{regime}**
|
|
|
295 |
**Executive Summary:**
|
296 |
{exec_summary}
|
297 |
-
|
|
|
298 |
{findings_md}
|
|
|
299 |
**Risk Score:** {score}
|
|
|
300 |
**Actionable Recommendations:**
|
301 |
{fixes_md}
|
|
|
302 |
**Summary:**
|
303 |
{summary}
|
|
|
304 |
---
|
|
|
305 |
{legend_md}
|
|
|
306 |
---
|
|
|
307 |
**Redacted Document Preview:**
|
308 |
<details>
|
309 |
<summary>Show/Hide Redacted Text</summary>
|
|
|
310 |
</details>
|
311 |
"""
|
312 |
return md.strip(), redacted_path, redacted_file_path, redacted_image
|
|
|
25 |
SUPPORTED_FILE_TYPES = [".pdf", ".docx", ".txt", ".png", ".jpg", ".jpeg"]
|
26 |
|
27 |
def extract_text(doc):
|
|
|
28 |
if not hasattr(doc, "name"):
|
29 |
return "ERROR: No file uploaded."
|
30 |
try:
|
|
|
42 |
elif fname.endswith((".png", ".jpg", ".jpeg")):
|
43 |
img = Image.open(doc.name)
|
44 |
text = pytesseract.image_to_string(img)
|
|
|
|
|
|
|
45 |
else:
|
46 |
return "ERROR: Unsupported file type."
|
47 |
if not text.strip():
|
|
|
247 |
return " ".join(summary_lines)
|
248 |
|
249 |
def agentic_compliance(doc, regime):
|
|
|
250 |
text = extract_text(doc)
|
251 |
if text.startswith("ERROR"):
|
252 |
return text, None, None, None
|
|
|
261 |
summary = summarize_narrative(relevant, regime)
|
262 |
exec_summary = executive_summary_template(relevant, score, regime)
|
263 |
|
|
|
264 |
findings_md = "\n".join([
|
265 |
f"- **{f['entity']}** (`{f['text']}`), score: {f.get('score', 0):.2f}"
|
266 |
+
for f in relevant
|
267 |
+
]) if relevant else "No relevant PII found for this regime."
|
|
|
|
|
268 |
|
269 |
fixes_md = "\n".join([f"- {fix}" for fix in fixes]) if fixes else "No action needed."
|
270 |
legend_md = score_legend()
|
|
|
284 |
redacted_image = None # No inline preview for PDFs
|
285 |
|
286 |
md = f"""### Compliance Regime: **{regime}**
|
287 |
+
|
288 |
**Executive Summary:**
|
289 |
{exec_summary}
|
290 |
+
|
291 |
+
**Findings:**
|
292 |
{findings_md}
|
293 |
+
|
294 |
**Risk Score:** {score}
|
295 |
+
|
296 |
**Actionable Recommendations:**
|
297 |
{fixes_md}
|
298 |
+
|
299 |
**Summary:**
|
300 |
{summary}
|
301 |
+
|
302 |
---
|
303 |
+
|
304 |
{legend_md}
|
305 |
+
|
306 |
---
|
307 |
+
|
308 |
**Redacted Document Preview:**
|
309 |
<details>
|
310 |
<summary>Show/Hide Redacted Text</summary>
|
311 |
+
|
312 |
</details>
|
313 |
"""
|
314 |
return md.strip(), redacted_path, redacted_file_path, redacted_image
|