Armando Medina
committed on
Upload 6 files
Browse files- .gitattributes +1 -0
- app.py +343 -0
- packages.txt +1 -0
- requirements.txt +13 -0
- sample_form.pdf +0 -0
- sample_img.png +3 -0
- sample_incident_report.docx +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
sample_img.png filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
@@ -0,0 +1,343 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pdfplumber
|
3 |
+
from presidio_analyzer import AnalyzerEngine
|
4 |
+
from presidio_anonymizer import AnonymizerEngine
|
5 |
+
from presidio_image_redactor import ImageRedactorEngine
|
6 |
+
from transformers import pipeline
|
7 |
+
import torch
|
8 |
+
import numpy as np
|
9 |
+
import re
|
10 |
+
import os
|
11 |
+
from docx import Document
|
12 |
+
from PIL import Image
|
13 |
+
import pytesseract
|
14 |
+
import fitz # pymupdf
|
15 |
+
import io
|
16 |
+
|
17 |
+
# --- LLM for executive summary (Pegasus-XSum) ---
|
18 |
+
# summarizer = pipeline("summarization", model="")
|
19 |
+
|
20 |
+
# Presidio engines, instantiated once at import time and shared by all requests.
analyzer = AnalyzerEngine()              # text PII detection
anonymizer = AnonymizerEngine()          # text anonymization (not used below yet)
image_redactor = ImageRedactorEngine()   # OCR-based image redaction

# Entity types surfaced in the report for each compliance regime.
# NOTE(review): detection always scans the full entity list in detect_pii();
# this mapping only filters what is shown and scored per regime.
COMPLIANCE_ENTITIES = {
    "HIPAA": ["PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER", "MEDICAL_RECORD_NUMBER", "SSN"],
    "GDPR": ["PERSON", "EMAIL_ADDRESS", "LOCATION"],
    "CCPA": ["PERSON", "EMAIL_ADDRESS", "IP_ADDRESS", "SSN", "CREDIT_CARD"]
}

# Extensions accepted by the Gradio file picker and handled by extract_text().
SUPPORTED_FILE_TYPES = [".pdf", ".docx", ".txt", ".png", ".jpg", ".jpeg"]
|
31 |
+
|
32 |
+
def extract_text(doc):
    """Extract plain text from an uploaded file (PDF, DOCX, TXT, or image).

    Errors are reported in-band as strings starting with "ERROR" so the
    Gradio handler can surface them without raising.
    """
    if not hasattr(doc, "name"):
        return "ERROR: No file uploaded."
    try:
        path = doc.name
        lowered = path.lower()
        if lowered.endswith(".pdf"):
            with pdfplumber.open(path) as pdf:
                extracted = "\n".join(page.extract_text() or "" for page in pdf.pages)
        elif lowered.endswith(".docx"):
            extracted = "\n".join(paragraph.text for paragraph in Document(path).paragraphs)
        elif lowered.endswith(".txt"):
            with open(path, "r", encoding="utf-8") as handle:
                extracted = handle.read()
        elif lowered.endswith((".png", ".jpg", ".jpeg")):
            # OCR the image via Tesseract.
            extracted = pytesseract.image_to_string(Image.open(path))
        else:
            return "ERROR: Unsupported file type."
        if extracted.strip():
            return extracted
        return "ERROR: Document contains no extractable text."
    except Exception as e:
        return f"ERROR: {e}"
|
57 |
+
|
58 |
+
def detect_pii(text):
    """Run Presidio plus the regex detectors over *text*.

    Returns (findings, presidio_results): findings is a list of dicts with
    entity/score/start/end/text keys. On any failure, a single ERROR
    pseudo-finding is returned instead of raising.
    """
    target_entities = [
        "PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER", "MEDICAL_RECORD_NUMBER",
        "SSN", "CREDIT_CARD", "LOCATION", "IP_ADDRESS"
    ]
    try:
        presidio_results = analyzer.analyze(text=text, entities=target_entities, language="en")
        findings = []
        for result in presidio_results:
            findings.append({
                "entity": result.entity_type,
                "score": result.score,
                "start": result.start,
                "end": result.end,
                "text": text[result.start:result.end].strip(),
            })
        # Regex-based detectors (SSN, IP) add coverage beyond Presidio.
        findings.extend(find_ssns(text))
        findings.extend(find_ip_addresses(text))
        return findings, presidio_results
    except Exception as e:
        return [{"entity": "ERROR", "text": str(e)}], []
|
81 |
+
|
82 |
+
def find_ip_addresses(text):
    """Find IPv4 addresses in *text* with a regex scan.

    Returns finding dicts shaped like detect_pii() output (score 1.0).
    Fix: the original pattern accepted impossible dotted quads such as
    999.999.999.999; a match is now kept only when every octet is <= 255.
    """
    pattern = r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b'
    findings = []
    for m in re.finditer(pattern, text):
        # Reject out-of-range octets (e.g. 300.1.2.3) — not a valid IPv4.
        if all(int(octet) <= 255 for octet in m.group().split(".")):
            findings.append({
                "entity": "IP_ADDRESS",
                "score": 1.0,
                "start": m.start(),
                "end": m.end(),
                "text": m.group()
            })
    return findings
|
94 |
+
|
95 |
+
def find_ssns(text):
    """Find US social security numbers (NNN-NN-NNNN) in *text*.

    Labeled occurrences ("SSN: 123-45-6789") get score 1.0; bare occurrences
    get score 0.95. Fix: a labeled SSN is no longer reported a second time by
    the bare-pattern fallback (the original emitted duplicate findings).
    """
    labeled = r'(?i)(ssn|social security number)[\s:]*([0-9]{3}-[0-9]{2}-[0-9]{4})'
    findings = []
    seen_spans = set()
    for m in re.finditer(labeled, text):
        seen_spans.add((m.start(2), m.end(2)))
        findings.append({
            "entity": "SSN",
            "score": 1.0,
            "start": m.start(2),
            "end": m.end(2),
            "text": m.group(2)
        })
    # Standalone SSN fallback for numbers that appear without a label.
    for m in re.finditer(r'\b[0-9]{3}-[0-9]{2}-[0-9]{4}\b', text):
        if (m.start(), m.end()) in seen_spans:
            continue  # already reported as a labeled SSN
        findings.append({
            "entity": "SSN",
            "score": 0.95,
            "start": m.start(),
            "end": m.end(),
            "text": m.group()
        })
    return findings
|
116 |
+
|
117 |
+
def clean_person_entities(findings):
    """Normalize PERSON findings: keep at most the first two name tokens and
    drop common false positives ("Date", "Department"). All other entity
    types pass through untouched; input dicts are never mutated."""
    result = []
    for finding in findings:
        if finding["entity"] != "PERSON":
            result.append(finding)
            continue
        short_name = " ".join(finding["text"].split()[:2])
        if short_name.lower() in ("date", "department"):
            continue  # known NER false positive — drop it
        trimmed = dict(finding)
        trimmed["text"] = short_name
        result.append(trimmed)
    return result
|
129 |
+
|
130 |
+
def dedupe_findings(findings):
    """Drop exact duplicates (same entity, text, and span), keeping the
    first occurrence and preserving input order."""
    unique = {}
    for finding in findings:
        key = (finding["entity"], finding["text"], finding["start"], finding["end"])
        if key not in unique:
            unique[key] = finding
    return list(unique.values())
|
139 |
+
|
140 |
+
def risk_score(findings):
    """Sum per-entity sensitivity weights; unknown entity types weigh 1."""
    sensitivity = {
        "PERSON": 1, "EMAIL_ADDRESS": 2, "CREDIT_CARD": 4, "SSN": 5,
        "IP_ADDRESS": 2, "PHONE_NUMBER": 2, "MEDICAL_RECORD_NUMBER": 3
    }
    total = 0
    for finding in findings:
        total += sensitivity.get(finding["entity"], 1)
    return total
|
146 |
+
|
147 |
+
def suggest_fixes(findings):
    """Map detected entity types to remediation advice strings.

    Returns one suggestion per distinct entity type present.
    Fix: the original returned ``list(set(...))``, whose ordering varied
    between runs; suggestions are now deduplicated while preserving the
    order in which entities first appear in *findings*.
    """
    advice = {
        "PERSON": "Remove or mask full names.",
        "EMAIL_ADDRESS": "Anonymize email addresses.",
        "CREDIT_CARD": "Remove or mask credit card numbers.",
        "SSN": "Remove or mask social security numbers.",
        "PHONE_NUMBER": "Mask phone numbers.",
        "LOCATION": "Remove or generalize location data.",
        "IP_ADDRESS": "Remove or anonymize IP addresses.",
        "MEDICAL_RECORD_NUMBER": "Anonymize medical record numbers.",
    }
    fixes = []
    for f in findings:
        fix = advice.get(f["entity"])
        if fix is not None and fix not in fixes:
            fixes.append(fix)
    return fixes
|
168 |
+
|
169 |
+
def summarize_narrative(findings, regime):
    """Build a short markdown summary of per-entity counts for *regime*."""
    if not findings:
        return "No sensitive or regulated information was found in this document."
    # Tally instances per entity type in one pass.
    tally = {}
    for finding in findings:
        tally[finding["entity"]] = tally.get(finding["entity"], 0) + 1
    lines = [f"Under **{regime}**, the document contains:"]
    for entity in sorted(tally):
        lines.append(f"- **{entity.replace('_', ' ').title()}**: {tally[entity]} instance(s)")
    lines.append("These must be anonymized or removed to ensure compliance.")
    return "\n".join(lines)
|
180 |
+
|
181 |
+
def score_legend():
    """Return the static markdown legend explaining the risk-score bands."""
    lines = [
        "**Risk Score Legend:**",
        "- 0–3: Low risk (little or no PII detected)",
        "- 4–7: Moderate risk (some PII detected, take caution)",
        "- 8+: High risk (multiple/high-value PII found—document needs urgent attention)",
        "",
        "Score is calculated based on entity sensitivity. For example, SSN and credit cards are higher risk than names.",
    ]
    return "\n".join(lines)
|
190 |
+
|
191 |
+
def redact_text(text, all_findings):
    """Redact ALL PII spans (Presidio + regex) by replacing each with [REDACTED].

    Spans are applied right-to-left so earlier start/end indices stay valid.
    Fix: a span that overlaps one already redacted is now skipped; the
    original spliced into the replacement text and could corrupt the output
    whenever findings overlapped.
    """
    ordered = sorted(all_findings, key=lambda f: f["start"], reverse=True)
    redacted_text = text
    previous_start = len(text)  # start of the most recently redacted span
    for f in ordered:
        # Skip empty or very short spans (likely noise), as before.
        if not f["text"] or len(f["text"]) < 3:
            continue
        if f["end"] > previous_start:
            continue  # overlaps a span already redacted
        redacted_text = redacted_text[:f["start"]] + "[REDACTED]" + redacted_text[f["end"]:]
        previous_start = f["start"]
    return redacted_text
|
202 |
+
|
203 |
+
def save_redacted_file(redacted_text):
    """Write *redacted_text* to a fresh temp file and return its path.

    Fix: the original wrote to a fixed /tmp/redacted_output.txt, which is
    non-portable (no /tmp on Windows) and races when two requests run
    concurrently; a unique NamedTemporaryFile avoids both problems.
    """
    import tempfile
    with tempfile.NamedTemporaryFile(
        "w", suffix=".txt", prefix="redacted_", delete=False, encoding="utf-8"
    ) as f:
        f.write(redacted_text)
        return f.name
|
208 |
+
|
209 |
+
def redact_image_with_presidio(image_path):
    """OCR-redact PII in the image at *image_path*; return the output path."""
    source = Image.open(image_path)
    cleaned = image_redactor.redact(source)
    destination = "/tmp/redacted_image.png"
    cleaned.save(destination)
    return destination
|
215 |
+
|
216 |
+
def redact_pdf_with_presidio(pdf_path):
    """Rasterize each PDF page, redact PII in the bitmap, and rebuild a PDF.

    Returns the output PDF path. The result is image-only: all text content
    is flattened into per-page bitmaps.

    Fix: both the input and output fitz documents are now closed even on
    error; the original leaked the input document handle and closed the
    output only on the success path.
    """
    doc = fitz.open(pdf_path)
    output_pdf = fitz.open()
    try:
        for page in doc:
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            redacted_img = image_redactor.redact(img)
            img_byte_arr = io.BytesIO()
            redacted_img.save(img_byte_arr, format='PNG')
            img_byte_arr.seek(0)
            # Place the redacted bitmap on a page of identical size.
            rect = fitz.Rect(0, 0, pix.width, pix.height)
            out_page = output_pdf.new_page(width=pix.width, height=pix.height)
            out_page.insert_image(rect, stream=img_byte_arr.getvalue())
        out_path = "/tmp/redacted_output.pdf"
        output_pdf.save(out_path)
        return out_path
    finally:
        output_pdf.close()
        doc.close()
|
233 |
+
|
234 |
+
def executive_summary_template(findings, score, regime):
    """Compose a one-paragraph executive summary from findings and score."""
    if not findings:
        return f"No sensitive information detected under {regime}. Document is considered low risk."
    # Band the numeric score into a human-readable level.
    if score >= 8:
        risk_level = "High Risk"
    elif score >= 4:
        risk_level = "Moderate Risk"
    else:
        risk_level = "Low Risk"
    entity_counts = {}
    for finding in findings:
        entity_counts[finding["entity"]] = entity_counts.get(finding["entity"], 0) + 1

    parts = [f"This document falls under {regime} with a risk score of {score} ({risk_level})."]
    if entity_counts:
        detected = ", ".join(f"{entity} ({count})" for entity, count in entity_counts.items())
        parts.append("Sensitive information detected: " + detected + ".")
    parts.append("Recommendation: Anonymize or redact all sensitive entities to ensure compliance.")
    return " ".join(parts)
|
260 |
+
|
261 |
+
|
262 |
+
def agentic_compliance(doc, regime):
    """End-to-end pipeline: extract text, detect PII, score, summarize, redact.

    Parameters
    ----------
    doc : uploaded file object exposing a ``.name`` path (Gradio File), or None
    regime : str — a COMPLIANCE_ENTITIES key ("HIPAA", "GDPR", or "CCPA")

    Returns
    -------
    tuple of (markdown_report, redacted_txt_path, redacted_file_path);
    the last two may be None when extraction fails or the input is not an
    image/PDF.
    """
    text = extract_text(doc)
    # extract_text signals failure in-band with an "ERROR..." string.
    if text.startswith("ERROR"):
        return text, None, None

    findings, presidio_results = detect_pii(text)
    findings = clean_person_entities(findings)
    findings = dedupe_findings(findings)

    # Only show entities relevant to regime, but redact all
    entities_needed = COMPLIANCE_ENTITIES.get(regime, [])
    relevant = [f for f in findings if f["entity"] in entities_needed]
    score = risk_score(relevant)
    fixes = suggest_fixes(relevant)
    summary = summarize_narrative(relevant, regime)
    exec_summary = executive_summary_template(relevant, score, regime)

    findings_md = "\n".join([
        f"- **{f['entity']}** (`{f['text']}`), score: {f.get('score', 0):.2f}"
        for f in relevant
    ]) if relevant else "No relevant PII found for this regime."

    fixes_md = "\n".join([f"- {fix}" for fix in fixes]) if fixes else "No action needed."
    legend_md = score_legend()

    # Redact *all* PII detected (not just Presidio, but also regex findings)
    redacted = redact_text(text, findings)
    redacted_path = save_redacted_file(redacted)

    # Generate redacted file (image or PDF) if applicable
    redacted_file_path = None
    if hasattr(doc, "name"):
        fname = doc.name.lower()
        if fname.endswith((".png", ".jpg", ".jpeg")):
            redacted_file_path = redact_image_with_presidio(doc.name)
        elif fname.endswith(".pdf"):
            redacted_file_path = redact_pdf_with_presidio(doc.name)

    # NOTE(review): the <details> preview below is empty — it looks like the
    # redacted text was meant to be interpolated inside it; confirm intent.
    md = f"""### Compliance Regime: **{regime}**

**Executive Summary:**
{exec_summary}

**Findings:**
{findings_md}

**Risk Score:** {score}

**Actionable Recommendations:**
{fixes_md}

**Summary:**
{summary}

---

{legend_md}

---

**Redacted Document Preview:**
<details>
<summary>Show/Hide Redacted Text</summary>

</details>
"""
    return md.strip(), redacted_path, redacted_file_path
|
329 |
+
|
330 |
+
# --- Gradio UI ---
with gr.Blocks(title="Agentic Compliance MCP Server") as demo:
    gr.Markdown("# Agentic Compliance MCP\nUpload a document and select a compliance regime.")
    with gr.Tab("Compliance Agent"):
        # Inputs: the document to scan and the regulatory regime to report under.
        doc = gr.File(label="Upload Document", file_types=SUPPORTED_FILE_TYPES)
        regime = gr.Dropdown(choices=list(COMPLIANCE_ENTITIES.keys()), label="Compliance Regime")
        # Outputs: markdown report plus two downloadable redacted artifacts.
        out = gr.Markdown(label="Compliance Output")
        redacted_out = gr.File(label="Download Redacted Text")
        file_redacted_out = gr.File(label="Download Redacted PDF/Image")
        gr.Button("Run Compliance Agent").click(
            agentic_compliance, inputs=[doc, regime], outputs=[out, redacted_out, file_redacted_out]
        )

# mcp_server=True also exposes the app's functions as MCP tools.
demo.launch(mcp_server=True)
|
packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
tesseract-ocr
|
requirements.txt
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
pdfplumber
|
3 |
+
transformers
|
4 |
+
torch
|
5 |
+
numpy
|
6 |
+
presidio-analyzer
|
7 |
+
presidio-anonymizer
|
8 |
+
presidio-image-redactor
|
9 |
+
spacy
|
10 |
+
python-docx
|
11 |
+
pytesseract
|
12 |
+
pillow
|
13 |
+
pymupdf
|
sample_form.pdf
ADDED
Binary file (1.52 kB). View file
|
|
sample_img.png
ADDED
![]() |
Git LFS Details
|
sample_incident_report.docx
ADDED
Binary file (16.9 kB). View file
|
|