Armando Medina committed on
Commit
8036eb5
·
verified ·
1 Parent(s): 7293214

Upload 6 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ sample_img.png filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pdfplumber
3
+ from presidio_analyzer import AnalyzerEngine
4
+ from presidio_anonymizer import AnonymizerEngine
5
+ from presidio_image_redactor import ImageRedactorEngine
6
+ from transformers import pipeline
7
+ import torch
8
+ import numpy as np
9
+ import re
10
+ import os
11
+ from docx import Document
12
+ from PIL import Image
13
+ import pytesseract
14
+ import fitz # pymupdf
15
+ import io
16
+
17
+ # --- LLM for executive summary (Pegasus-XSum) ---
18
+ # summarizer = pipeline("summarization", model="")
19
+
20
+ analyzer = AnalyzerEngine()
21
+ anonymizer = AnonymizerEngine()
22
+ image_redactor = ImageRedactorEngine()
23
+
24
+ COMPLIANCE_ENTITIES = {
25
+ "HIPAA": ["PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER", "MEDICAL_RECORD_NUMBER", "SSN"],
26
+ "GDPR": ["PERSON", "EMAIL_ADDRESS", "LOCATION"],
27
+ "CCPA": ["PERSON", "EMAIL_ADDRESS", "IP_ADDRESS", "SSN", "CREDIT_CARD"]
28
+ }
29
+
30
+ SUPPORTED_FILE_TYPES = [".pdf", ".docx", ".txt", ".png", ".jpg", ".jpeg"]
31
+
32
def extract_text(doc):
    """Extract plain text from an uploaded document.

    Supports PDF (pdfplumber), DOCX (python-docx), TXT, and common image
    formats (OCR via pytesseract). Accepts either a file-like object with a
    ``.name`` attribute (classic Gradio upload) or a plain path string
    (newer Gradio versions pass filepaths directly), so both call styles work.

    Returns the extracted text, or a string starting with "ERROR:" on
    failure — callers branch on that prefix instead of catching exceptions.
    """
    # Resolve the path from either a raw string or a file-like object.
    path = doc if isinstance(doc, str) else getattr(doc, "name", None)
    if not path:
        return "ERROR: No file uploaded."
    try:
        fname = path.lower()
        if fname.endswith(".pdf"):
            with pdfplumber.open(path) as pdf:
                pages = [page.extract_text() or "" for page in pdf.pages]
            text = "\n".join(pages)
        elif fname.endswith(".docx"):
            document = Document(path)
            text = "\n".join(p.text for p in document.paragraphs)
        elif fname.endswith(".txt"):
            with open(path, "r", encoding="utf-8") as f:
                text = f.read()
        elif fname.endswith((".png", ".jpg", ".jpeg")):
            # OCR path: context manager closes the image handle promptly.
            with Image.open(path) as img:
                text = pytesseract.image_to_string(img)
        else:
            return "ERROR: Unsupported file type."
        if not text.strip():
            return "ERROR: Document contains no extractable text."
        return text
    except Exception as e:
        # Broad catch is deliberate: the UI surfaces the message instead of crashing.
        return f"ERROR: {e}"
57
+
58
def detect_pii(text):
    """Run Presidio plus regex-based PII detection over *text*.

    Returns a tuple ``(findings, presidio_results)`` where ``findings`` is a
    list of dicts with entity/score/start/end/text keys. On any analyzer
    failure a single ERROR pseudo-finding is returned instead of raising,
    so the UI can display the problem.
    """
    try:
        # NOTE: Presidio's built-in recognizer for US social security
        # numbers is named "US_SSN" — the bare "SSN" entity matched nothing.
        # "SSN"/"MEDICAL_RECORD_NUMBER" are kept in case custom recognizers
        # are registered elsewhere; unsupported names are simply ignored.
        entities = [
            "PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER", "MEDICAL_RECORD_NUMBER",
            "SSN", "US_SSN", "CREDIT_CARD", "LOCATION", "IP_ADDRESS"
        ]
        presidio_results = analyzer.analyze(text=text, entities=entities, language="en")
        findings = [
            {
                # Normalize US_SSN to the "SSN" label used by the
                # COMPLIANCE_ENTITIES tables and the regex findings.
                "entity": "SSN" if r.entity_type == "US_SSN" else r.entity_type,
                "score": r.score,
                "start": r.start,
                "end": r.end,
                "text": text[r.start:r.end].strip()
            }
            for r in presidio_results
        ]
        # Add regex-based findings (SSN, IP) for extra coverage.
        findings += find_ssns(text)
        findings += find_ip_addresses(text)
        return findings, presidio_results
    except Exception as e:
        return [{"entity": "ERROR", "text": str(e)}], []
81
+
82
def find_ip_addresses(text):
    """Locate dotted-quad IPv4-looking strings and report them as findings.

    Each finding dict mirrors the Presidio format (entity/score/start/end/text)
    and carries a fixed confidence score of 1.0.
    """
    ip_pattern = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')
    matches = []
    for match in ip_pattern.finditer(text):
        matches.append({
            "entity": "IP_ADDRESS",
            "score": 1.0,
            "start": match.start(),
            "end": match.end(),
            "text": match.group(),
        })
    return matches
94
+
95
def find_ssns(text):
    """Find social security numbers via regex.

    A labelled match ("SSN: 123-45-6789" or "social security number ...") is
    reported with score 1.0; bare NNN-NN-NNNN fallback matches get 0.95.
    Spans already reported by the labelled pass are skipped, so the same SSN
    is not emitted twice (the original produced duplicate findings).
    """
    findings = []
    seen_spans = set()
    labelled = r'(?i)(ssn|social security number)[\s:]*([0-9]{3}-[0-9]{2}-[0-9]{4})'
    for m in re.finditer(labelled, text):
        seen_spans.add((m.start(2), m.end(2)))
        findings.append({
            "entity": "SSN",
            "score": 1.0,
            "start": m.start(2),
            "end": m.end(2),
            "text": m.group(2)
        })
    # Standalone SSN fallback — skip spans the labelled pass already covered.
    for m in re.finditer(r'\b[0-9]{3}-[0-9]{2}-[0-9]{4}\b', text):
        if (m.start(), m.end()) in seen_spans:
            continue
        findings.append({
            "entity": "SSN",
            "score": 0.95,
            "start": m.start(),
            "end": m.end(),
            "text": m.group()
        })
    return findings
116
+
117
def clean_person_entities(findings):
    """Normalize PERSON findings.

    Trims each PERSON's text to its first two whitespace-separated tokens
    and drops findings whose trimmed name is a known false positive
    ("date", "department"). Non-PERSON findings pass through untouched;
    input dicts are never mutated (a copy is made before trimming).
    """
    result = []
    for finding in findings:
        if finding["entity"] != "PERSON":
            result.append(finding)
            continue
        short_name = " ".join(finding["text"].split()[:2])
        if short_name.lower() in ("date", "department"):
            continue  # filtered out as a known OCR/NER false positive
        trimmed = dict(finding)
        trimmed["text"] = short_name
        result.append(trimmed)
    return result
129
+
130
def dedupe_findings(findings):
    """Drop exact duplicate findings.

    Two findings are duplicates when their (entity, text, start, end)
    fingerprints match; the first occurrence wins and input order is kept.
    """
    unique = {}
    for finding in findings:
        fingerprint = (finding["entity"], finding["text"],
                       finding["start"], finding["end"])
        # setdefault keeps the first finding seen for each fingerprint.
        unique.setdefault(fingerprint, finding)
    return list(unique.values())
139
+
140
def risk_score(findings):
    """Sum per-entity sensitivity weights; unknown entity types count as 1."""
    SENSITIVITY = {
        "PERSON": 1, "EMAIL_ADDRESS": 2, "CREDIT_CARD": 4, "SSN": 5,
        "IP_ADDRESS": 2, "PHONE_NUMBER": 2, "MEDICAL_RECORD_NUMBER": 3,
    }
    total = 0
    for finding in findings:
        total += SENSITIVITY.get(finding["entity"], 1)
    return total
146
+
147
def suggest_fixes(findings):
    """Map detected entity types to remediation advice.

    Returns one suggestion per distinct entity type, deduplicated while
    preserving first-seen order. (The original ``list(set(...))`` made the
    output order nondeterministic between runs; a dispatch table also
    replaces the long if-chain.)
    """
    ADVICE = {
        "PERSON": "Remove or mask full names.",
        "EMAIL_ADDRESS": "Anonymize email addresses.",
        "CREDIT_CARD": "Remove or mask credit card numbers.",
        "SSN": "Remove or mask social security numbers.",
        "PHONE_NUMBER": "Mask phone numbers.",
        "LOCATION": "Remove or generalize location data.",
        "IP_ADDRESS": "Remove or anonymize IP addresses.",
        "MEDICAL_RECORD_NUMBER": "Anonymize medical record numbers.",
    }
    fixes = {}  # dict preserves insertion order -> deterministic output
    for f in findings:
        advice = ADVICE.get(f["entity"])
        if advice:
            fixes.setdefault(advice, None)
    return list(fixes)
168
+
169
def summarize_narrative(findings, regime):
    """Render a markdown summary of which entity types appear and how often.

    Entity types are listed alphabetically with their occurrence counts;
    an explicit "nothing found" message is returned for empty findings.
    """
    if not findings:
        return "No sensitive or regulated information was found in this document."
    counts = {}
    for finding in findings:
        counts[finding["entity"]] = counts.get(finding["entity"], 0) + 1
    lines = [f"Under **{regime}**, the document contains:"]
    for entity in sorted(counts):
        pretty = entity.replace('_', ' ').title()
        lines.append(f"- **{pretty}**: {counts[entity]} instance(s)")
    lines.append("These must be anonymized or removed to ensure compliance.")
    return "\n".join(lines)
180
+
181
def score_legend():
    """Explain how the numeric risk score maps to the three risk bands."""
    lines = [
        "**Risk Score Legend:**",
        "- 0–3: Low risk (little or no PII detected)",
        "- 4–7: Moderate risk (some PII detected, take caution)",
        "- 8+: High risk (multiple/high-value PII found—document needs urgent attention)",
        "",
        "Score is calculated based on entity sensitivity. For example, SSN and credit cards are higher risk than names.",
    ]
    return "\n".join(lines)
190
+
191
def redact_text(text, all_findings):
    """Redact ALL PII spans by replacing each with "[REDACTED]".

    Overlapping and duplicate spans are merged before replacement — the
    original replaced spans right-to-left without overlap handling, which
    could splice into an already-inserted marker and corrupt the output.
    Spans with empty text or fewer than 3 characters are ignored as noise.
    """
    # Collect (start, end) spans worth redacting, sorted left-to-right.
    spans = sorted(
        (f["start"], f["end"]) for f in all_findings
        if f["text"] and len(f["text"]) >= 3
    )
    # Merge overlapping/adjacent-duplicate spans into disjoint regions.
    merged = []
    for start, end in spans:
        if merged and start <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    # Replace right-to-left so earlier offsets stay valid.
    redacted = text
    for start, end in reversed(merged):
        redacted = redacted[:start] + "[REDACTED]" + redacted[end:]
    return redacted
202
+
203
def save_redacted_file(redacted_text):
    """Write the redacted text to a temp file and return its path.

    Uses the platform temp directory instead of a hard-coded "/tmp" so the
    app also works on Windows.
    """
    import tempfile  # local import keeps this fix self-contained
    path = os.path.join(tempfile.gettempdir(), "redacted_output.txt")
    with open(path, "w", encoding="utf-8") as f:
        f.write(redacted_text)
    return path
208
+
209
def redact_image_with_presidio(image_path):
    """OCR-redact an image with Presidio's ImageRedactorEngine.

    Returns the path of the redacted PNG. The source image handle is closed
    promptly via a context manager (the original left it open), and the
    output goes to the platform temp directory rather than a hard-coded /tmp.
    """
    import tempfile
    with Image.open(image_path) as img:
        # redact() returns a new image, so saving after close is safe.
        redacted_img = image_redactor.redact(img)
    out_path = os.path.join(tempfile.gettempdir(), "redacted_image.png")
    redacted_img.save(out_path)
    return out_path
215
+
216
def redact_pdf_with_presidio(pdf_path):
    """Rasterize each PDF page, redact it as an image, and rebuild a PDF.

    The output PDF is image-only (the text layer is lost), which guarantees
    the PII pixels are actually removed. Both PyMuPDF documents are closed
    even if a page fails to render (the original leaked them on error), and
    the output goes to the platform temp directory.
    """
    import tempfile
    src = fitz.open(pdf_path)
    output_pdf = fitz.open()
    try:
        for page in src:
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            redacted_img = image_redactor.redact(img)
            buf = io.BytesIO()
            redacted_img.save(buf, format='PNG')
            rect = fitz.Rect(0, 0, pix.width, pix.height)
            out_page = output_pdf.new_page(width=pix.width, height=pix.height)
            out_page.insert_image(rect, stream=buf.getvalue())
        out_path = os.path.join(tempfile.gettempdir(), "redacted_output.pdf")
        output_pdf.save(out_path)
    finally:
        output_pdf.close()
        src.close()
    return out_path
233
+
234
def executive_summary_template(findings, score, regime):
    """Compose a one-paragraph executive summary for the report header.

    Maps *score* to a risk band (8+ high, 4-7 moderate, else low), lists
    per-entity counts, and closes with a standard recommendation. Returns a
    fixed low-risk sentence when there are no findings.
    """
    if not findings:
        return (
            f"No sensitive information detected under {regime}. Document is considered low risk."
        )
    if score >= 8:
        risk_level = "High Risk"
    elif score >= 4:
        risk_level = "Moderate Risk"
    else:
        risk_level = "Low Risk"

    entity_counts = {}
    for finding in findings:
        entity = finding["entity"]
        entity_counts[entity] = entity_counts.get(entity, 0) + 1

    parts = [
        f"This document falls under {regime} with a risk score of {score} ({risk_level})."
    ]
    if entity_counts:
        detected = ", ".join(f"{name} ({count})" for name, count in entity_counts.items())
        parts.append(f"Sensitive information detected: {detected}.")
    parts.append(
        "Recommendation: Anonymize or redact all sensitive entities to ensure compliance."
    )
    return " ".join(parts)
260
+
261
+
262
def agentic_compliance(doc, regime):
    """End-to-end compliance pass: extract -> detect -> score -> redact -> report.

    Returns a tuple ``(markdown_report, redacted_text_path, redacted_binary_path)``.
    The third element is None unless the upload was an image or PDF.
    Fixes the original's empty "Redacted Document Preview" section: the
    redacted text was never inserted into the <details> block.
    """
    text = extract_text(doc)
    if text.startswith("ERROR"):
        return text, None, None

    findings, presidio_results = detect_pii(text)
    findings = clean_person_entities(findings)
    findings = dedupe_findings(findings)

    # Only show entities relevant to the regime, but redact all findings.
    entities_needed = COMPLIANCE_ENTITIES.get(regime, [])
    relevant = [f for f in findings if f["entity"] in entities_needed]
    score = risk_score(relevant)
    fixes = suggest_fixes(relevant)
    summary = summarize_narrative(relevant, regime)
    exec_summary = executive_summary_template(relevant, score, regime)

    findings_md = "\n".join(
        f"- **{f['entity']}** (`{f['text']}`), score: {f.get('score', 0):.2f}"
        for f in relevant
    ) if relevant else "No relevant PII found for this regime."

    fixes_md = "\n".join(f"- {fix}" for fix in fixes) if fixes else "No action needed."
    legend_md = score_legend()

    # Redact *all* PII detected (Presidio plus regex findings).
    redacted = redact_text(text, findings)
    redacted_path = save_redacted_file(redacted)

    # Cap the inline preview so very large documents don't bloat the page.
    preview = redacted if len(redacted) <= 4000 else redacted[:4000] + "\n… (truncated)"

    # Generate a redacted binary (image or PDF) when applicable.
    redacted_file_path = None
    if hasattr(doc, "name"):
        fname = doc.name.lower()
        if fname.endswith((".png", ".jpg", ".jpeg")):
            redacted_file_path = redact_image_with_presidio(doc.name)
        elif fname.endswith(".pdf"):
            redacted_file_path = redact_pdf_with_presidio(doc.name)

    md = f"""### Compliance Regime: **{regime}**

**Executive Summary:**
{exec_summary}

**Findings:**
{findings_md}

**Risk Score:** {score}

**Actionable Recommendations:**
{fixes_md}

**Summary:**
{summary}

---

{legend_md}

---

**Redacted Document Preview:**
<details>
<summary>Show/Hide Redacted Text</summary>

```
{preview}
```

</details>
"""
    return md.strip(), redacted_path, redacted_file_path
329
+
330
# --- Gradio UI ---
with gr.Blocks(title="Agentic Compliance MCP Server") as demo:
    gr.Markdown("# Agentic Compliance MCP\nUpload a document and select a compliance regime.")
    with gr.Tab("Compliance Agent"):
        doc = gr.File(label="Upload Document", file_types=SUPPORTED_FILE_TYPES)
        regime = gr.Dropdown(choices=list(COMPLIANCE_ENTITIES.keys()), label="Compliance Regime")
        out = gr.Markdown(label="Compliance Output")
        redacted_out = gr.File(label="Download Redacted Text")
        file_redacted_out = gr.File(label="Download Redacted PDF/Image")
        gr.Button("Run Compliance Agent").click(
            agentic_compliance, inputs=[doc, regime], outputs=[out, redacted_out, file_redacted_out]
        )

# Launch only when run as a script, so the module can be imported (e.g. by tests
# or tooling) without starting a server. HF Spaces runs `python app.py`, so the
# deployed behavior is unchanged.
if __name__ == "__main__":
    demo.launch(mcp_server=True)
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ tesseract-ocr
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ pdfplumber
3
+ transformers
4
+ torch
5
+ numpy
6
+ presidio-analyzer
7
+ presidio-anonymizer
8
+ presidio-image-redactor
9
+ spacy
10
+ python-docx
11
+ pytesseract
12
+ pillow
13
+ pymupdf
sample_form.pdf ADDED
Binary file (1.52 kB). View file
 
sample_img.png ADDED

Git LFS Details

  • SHA256: c0504915374cc8761931aca79698a0058a2550ada9079dc86e364963deb01a30
  • Pointer size: 131 Bytes
  • Size of remote file: 707 kB
sample_incident_report.docx ADDED
Binary file (16.9 kB). View file