Armando Medina committed on
Commit
8036eb5
·
verified ·
1 Parent(s): 7293214

Upload 6 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ sample_img.png filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pdfplumber
3
+ from presidio_analyzer import AnalyzerEngine
4
+ from presidio_anonymizer import AnonymizerEngine
5
+ from presidio_image_redactor import ImageRedactorEngine
6
+ from transformers import pipeline
7
+ import torch
8
+ import numpy as np
9
+ import re
10
+ import os
11
+ from docx import Document
12
+ from PIL import Image
13
+ import pytesseract
14
+ import fitz # pymupdf
15
+ import io
16
+
17
+ # --- LLM for executive summary (Pegasus-XSum) ---
18
+ # summarizer = pipeline("summarization", model="")
19
+
20
+ analyzer = AnalyzerEngine()
21
+ anonymizer = AnonymizerEngine()
22
+ image_redactor = ImageRedactorEngine()
23
+
24
+ COMPLIANCE_ENTITIES = {
25
+ "HIPAA": ["PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER", "MEDICAL_RECORD_NUMBER", "SSN"],
26
+ "GDPR": ["PERSON", "EMAIL_ADDRESS", "LOCATION"],
27
+ "CCPA": ["PERSON", "EMAIL_ADDRESS", "IP_ADDRESS", "SSN", "CREDIT_CARD"]
28
+ }
29
+
30
+ SUPPORTED_FILE_TYPES = [".pdf", ".docx", ".txt", ".png", ".jpg", ".jpeg"]
31
+
32
def extract_text(doc):
    """Extract plain text from an uploaded document.

    Supports PDF (pdfplumber), DOCX (python-docx), TXT, and common image
    formats (OCR via pytesseract). Accepts either a file-like object with a
    ``.name`` attribute (classic Gradio upload) or a plain path string
    (newer Gradio versions pass filepaths directly), so both call styles work.

    Returns the extracted text, or a string starting with "ERROR:" on
    failure — callers branch on that prefix instead of catching exceptions.
    """
    # Resolve the path from either a raw string or a file-like object.
    path = doc if isinstance(doc, str) else getattr(doc, "name", None)
    if not path:
        return "ERROR: No file uploaded."
    try:
        fname = path.lower()
        if fname.endswith(".pdf"):
            with pdfplumber.open(path) as pdf:
                pages = [page.extract_text() or "" for page in pdf.pages]
            text = "\n".join(pages)
        elif fname.endswith(".docx"):
            document = Document(path)
            text = "\n".join(p.text for p in document.paragraphs)
        elif fname.endswith(".txt"):
            with open(path, "r", encoding="utf-8") as f:
                text = f.read()
        elif fname.endswith((".png", ".jpg", ".jpeg")):
            # OCR path: context manager closes the image handle promptly.
            with Image.open(path) as img:
                text = pytesseract.image_to_string(img)
        else:
            return "ERROR: Unsupported file type."
        if not text.strip():
            return "ERROR: Document contains no extractable text."
        return text
    except Exception as e:
        # Broad catch is deliberate: the UI surfaces the message instead of crashing.
        return f"ERROR: {e}"
57
+
58
def detect_pii(text):
    """Run Presidio plus regex-based PII detection over *text*.

    Returns a tuple ``(findings, presidio_results)`` where ``findings`` is a
    list of dicts with entity/score/start/end/text keys. On any analyzer
    failure a single ERROR pseudo-finding is returned instead of raising,
    so the UI can display the problem.
    """
    try:
        # NOTE: Presidio's built-in recognizer for US social security
        # numbers is named "US_SSN" — the bare "SSN" entity matched nothing.
        # "SSN"/"MEDICAL_RECORD_NUMBER" are kept in case custom recognizers
        # are registered elsewhere; unsupported names are simply ignored.
        entities = [
            "PERSON", "EMAIL_ADDRESS", "PHONE_NUMBER", "MEDICAL_RECORD_NUMBER",
            "SSN", "US_SSN", "CREDIT_CARD", "LOCATION", "IP_ADDRESS"
        ]
        presidio_results = analyzer.analyze(text=text, entities=entities, language="en")
        findings = [
            {
                # Normalize US_SSN to the "SSN" label used by the
                # COMPLIANCE_ENTITIES tables and the regex findings.
                "entity": "SSN" if r.entity_type == "US_SSN" else r.entity_type,
                "score": r.score,
                "start": r.start,
                "end": r.end,
                "text": text[r.start:r.end].strip()
            }
            for r in presidio_results
        ]
        # Add regex-based findings (SSN, IP) for extra coverage.
        findings += find_ssns(text)
        findings += find_ip_addresses(text)
        return findings, presidio_results
    except Exception as e:
        return [{"entity": "ERROR", "text": str(e)}], []
81
+
82
def find_ip_addresses(text):
    """Locate dotted-quad IPv4-looking strings and report them as findings.

    Each finding dict mirrors the Presidio format (entity/score/start/end/text)
    and carries a fixed confidence score of 1.0.
    """
    ip_pattern = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')
    matches = []
    for match in ip_pattern.finditer(text):
        matches.append({
            "entity": "IP_ADDRESS",
            "score": 1.0,
            "start": match.start(),
            "end": match.end(),
            "text": match.group(),
        })
    return matches
94
+
95
def find_ssns(text):
    """Find social security numbers via regex.

    A labelled match ("SSN: 123-45-6789" or "social security number ...") is
    reported with score 1.0; bare NNN-NN-NNNN fallback matches get 0.95.
    Spans already reported by the labelled pass are skipped, so the same SSN
    is not emitted twice (the original produced duplicate findings).
    """
    findings = []
    seen_spans = set()
    labelled = r'(?i)(ssn|social security number)[\s:]*([0-9]{3}-[0-9]{2}-[0-9]{4})'
    for m in re.finditer(labelled, text):
        seen_spans.add((m.start(2), m.end(2)))
        findings.append({
            "entity": "SSN",
            "score": 1.0,
            "start": m.start(2),
            "end": m.end(2),
            "text": m.group(2)
        })
    # Standalone SSN fallback — skip spans the labelled pass already covered.
    for m in re.finditer(r'\b[0-9]{3}-[0-9]{2}-[0-9]{4}\b', text):
        if (m.start(), m.end()) in seen_spans:
            continue
        findings.append({
            "entity": "SSN",
            "score": 0.95,
            "start": m.start(),
            "end": m.end(),
            "text": m.group()
        })
    return findings
116
+
117
def clean_person_entities(findings):
    """Normalize PERSON findings.

    Trims each PERSON's text to its first two whitespace-separated tokens
    and drops findings whose trimmed name is a known false positive
    ("date", "department"). Non-PERSON findings pass through untouched;
    input dicts are never mutated (a copy is made before trimming).
    """
    result = []
    for finding in findings:
        if finding["entity"] != "PERSON":
            result.append(finding)
            continue
        short_name = " ".join(finding["text"].split()[:2])
        if short_name.lower() in ("date", "department"):
            continue  # filtered out as a known OCR/NER false positive
        trimmed = dict(finding)
        trimmed["text"] = short_name
        result.append(trimmed)
    return result
129
+
130
def dedupe_findings(findings):
    """Drop exact duplicate findings.

    Two findings are duplicates when their (entity, text, start, end)
    fingerprints match; the first occurrence wins and input order is kept.
    """
    unique = {}
    for finding in findings:
        fingerprint = (finding["entity"], finding["text"],
                       finding["start"], finding["end"])
        # setdefault keeps the first finding seen for each fingerprint.
        unique.setdefault(fingerprint, finding)
    return list(unique.values())
139
+
140
def risk_score(findings):
    """Sum per-entity sensitivity weights; unknown entity types count as 1."""
    SENSITIVITY = {
        "PERSON": 1, "EMAIL_ADDRESS": 2, "CREDIT_CARD": 4, "SSN": 5,
        "IP_ADDRESS": 2, "PHONE_NUMBER": 2, "MEDICAL_RECORD_NUMBER": 3,
    }
    total = 0
    for finding in findings:
        total += SENSITIVITY.get(finding["entity"], 1)
    return total
146
+
147
def suggest_fixes(findings):
    """Map detected entity types to remediation advice.

    Returns one suggestion per distinct entity type, deduplicated while
    preserving first-seen order. (The original ``list(set(...))`` made the
    output order nondeterministic between runs; a dispatch table also
    replaces the long if-chain.)
    """
    ADVICE = {
        "PERSON": "Remove or mask full names.",
        "EMAIL_ADDRESS": "Anonymize email addresses.",
        "CREDIT_CARD": "Remove or mask credit card numbers.",
        "SSN": "Remove or mask social security numbers.",
        "PHONE_NUMBER": "Mask phone numbers.",
        "LOCATION": "Remove or generalize location data.",
        "IP_ADDRESS": "Remove or anonymize IP addresses.",
        "MEDICAL_RECORD_NUMBER": "Anonymize medical record numbers.",
    }
    fixes = {}  # dict preserves insertion order -> deterministic output
    for f in findings:
        advice = ADVICE.get(f["entity"])
        if advice:
            fixes.setdefault(advice, None)
    return list(fixes)
168
+
169
def summarize_narrative(findings, regime):
    """Render a markdown summary of which entity types appear and how often.

    Entity types are listed alphabetically with their occurrence counts;
    an explicit "nothing found" message is returned for empty findings.
    """
    if not findings:
        return "No sensitive or regulated information was found in this document."
    counts = {}
    for finding in findings:
        counts[finding["entity"]] = counts.get(finding["entity"], 0) + 1
    lines = [f"Under **{regime}**, the document contains:"]
    for entity in sorted(counts):
        pretty = entity.replace('_', ' ').title()
        lines.append(f"- **{pretty}**: {counts[entity]} instance(s)")
    lines.append("These must be anonymized or removed to ensure compliance.")
    return "\n".join(lines)
180
+
181
def score_legend():
    """Explain how the numeric risk score maps to the three risk bands."""
    lines = [
        "**Risk Score Legend:**",
        "- 0–3: Low risk (little or no PII detected)",
        "- 4–7: Moderate risk (some PII detected, take caution)",
        "- 8+: High risk (multiple/high-value PII found—document needs urgent attention)",
        "",
        "Score is calculated based on entity sensitivity. For example, SSN and credit cards are higher risk than names.",
    ]
    return "\n".join(lines)
190
+
191
def redact_text(text, all_findings):
    """Redact ALL PII spans by replacing each with "[REDACTED]".

    Overlapping and duplicate spans are merged before replacement — the
    original replaced spans right-to-left without overlap handling, which
    could splice into an already-inserted marker and corrupt the output.
    Spans with empty text or fewer than 3 characters are ignored as noise.
    """
    # Collect (start, end) spans worth redacting, sorted left-to-right.
    spans = sorted(
        (f["start"], f["end"]) for f in all_findings
        if f["text"] and len(f["text"]) >= 3
    )
    # Merge overlapping/adjacent-duplicate spans into disjoint regions.
    merged = []
    for start, end in spans:
        if merged and start <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    # Replace right-to-left so earlier offsets stay valid.
    redacted = text
    for start, end in reversed(merged):
        redacted = redacted[:start] + "[REDACTED]" + redacted[end:]
    return redacted
202
+
203
def save_redacted_file(redacted_text):
    """Write the redacted text to a temp file and return its path.

    Uses the platform temp directory instead of a hard-coded "/tmp" so the
    app also works on Windows.
    """
    import tempfile  # local import keeps this fix self-contained
    path = os.path.join(tempfile.gettempdir(), "redacted_output.txt")
    with open(path, "w", encoding="utf-8") as f:
        f.write(redacted_text)
    return path
208
+
209
def redact_image_with_presidio(image_path):
    """OCR-redact an image with Presidio's ImageRedactorEngine.

    Returns the path of the redacted PNG. The source image handle is closed
    promptly via a context manager (the original left it open), and the
    output goes to the platform temp directory rather than a hard-coded /tmp.
    """
    import tempfile
    with Image.open(image_path) as img:
        # redact() returns a new image, so saving after close is safe.
        redacted_img = image_redactor.redact(img)
    out_path = os.path.join(tempfile.gettempdir(), "redacted_image.png")
    redacted_img.save(out_path)
    return out_path
215
+
216
def redact_pdf_with_presidio(pdf_path):
    """Rasterize each PDF page, redact it as an image, and rebuild a PDF.

    The output PDF is image-only (the text layer is lost), which guarantees
    the PII pixels are actually removed. Both PyMuPDF documents are closed
    even if a page fails to render (the original leaked them on error), and
    the output goes to the platform temp directory.
    """
    import tempfile
    src = fitz.open(pdf_path)
    output_pdf = fitz.open()
    try:
        for page in src:
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            redacted_img = image_redactor.redact(img)
            buf = io.BytesIO()
            redacted_img.save(buf, format='PNG')
            rect = fitz.Rect(0, 0, pix.width, pix.height)
            out_page = output_pdf.new_page(width=pix.width, height=pix.height)
            out_page.insert_image(rect, stream=buf.getvalue())
        out_path = os.path.join(tempfile.gettempdir(), "redacted_output.pdf")
        output_pdf.save(out_path)
    finally:
        output_pdf.close()
        src.close()
    return out_path
233
+
234
def executive_summary_template(findings, score, regime):
    """Compose a one-paragraph executive summary for the report header.

    Maps *score* to a risk band (8+ high, 4-7 moderate, else low), lists
    per-entity counts, and closes with a standard recommendation. Returns a
    fixed low-risk sentence when there are no findings.
    """
    if not findings:
        return (
            f"No sensitive information detected under {regime}. Document is considered low risk."
        )
    if score >= 8:
        risk_level = "High Risk"
    elif score >= 4:
        risk_level = "Moderate Risk"
    else:
        risk_level = "Low Risk"

    entity_counts = {}
    for finding in findings:
        entity = finding["entity"]
        entity_counts[entity] = entity_counts.get(entity, 0) + 1

    parts = [
        f"This document falls under {regime} with a risk score of {score} ({risk_level})."
    ]
    if entity_counts:
        detected = ", ".join(f"{name} ({count})" for name, count in entity_counts.items())
        parts.append(f"Sensitive information detected: {detected}.")
    parts.append(
        "Recommendation: Anonymize or redact all sensitive entities to ensure compliance."
    )
    return " ".join(parts)
260
+
261
+
262
def agentic_compliance(doc, regime):
    """End-to-end compliance pass: extract -> detect -> score -> redact -> report.

    Returns a tuple ``(markdown_report, redacted_text_path, redacted_binary_path)``.
    The third element is None unless the upload was an image or PDF.
    Fixes the original's empty "Redacted Document Preview" section: the
    redacted text was never inserted into the <details> block.
    """
    text = extract_text(doc)
    if text.startswith("ERROR"):
        return text, None, None

    findings, presidio_results = detect_pii(text)
    findings = clean_person_entities(findings)
    findings = dedupe_findings(findings)

    # Only show entities relevant to the regime, but redact all findings.
    entities_needed = COMPLIANCE_ENTITIES.get(regime, [])
    relevant = [f for f in findings if f["entity"] in entities_needed]
    score = risk_score(relevant)
    fixes = suggest_fixes(relevant)
    summary = summarize_narrative(relevant, regime)
    exec_summary = executive_summary_template(relevant, score, regime)

    findings_md = "\n".join(
        f"- **{f['entity']}** (`{f['text']}`), score: {f.get('score', 0):.2f}"
        for f in relevant
    ) if relevant else "No relevant PII found for this regime."

    fixes_md = "\n".join(f"- {fix}" for fix in fixes) if fixes else "No action needed."
    legend_md = score_legend()

    # Redact *all* PII detected (Presidio plus regex findings).
    redacted = redact_text(text, findings)
    redacted_path = save_redacted_file(redacted)

    # Cap the inline preview so very large documents don't bloat the page.
    preview = redacted if len(redacted) <= 4000 else redacted[:4000] + "\n… (truncated)"

    # Generate a redacted binary (image or PDF) when applicable.
    redacted_file_path = None
    if hasattr(doc, "name"):
        fname = doc.name.lower()
        if fname.endswith((".png", ".jpg", ".jpeg")):
            redacted_file_path = redact_image_with_presidio(doc.name)
        elif fname.endswith(".pdf"):
            redacted_file_path = redact_pdf_with_presidio(doc.name)

    md = f"""### Compliance Regime: **{regime}**

**Executive Summary:**
{exec_summary}

**Findings:**
{findings_md}

**Risk Score:** {score}

**Actionable Recommendations:**
{fixes_md}

**Summary:**
{summary}

---

{legend_md}

---

**Redacted Document Preview:**
<details>
<summary>Show/Hide Redacted Text</summary>

```
{preview}
```

</details>
"""
    return md.strip(), redacted_path, redacted_file_path
329
+
330
# --- Gradio UI ---
with gr.Blocks(title="Agentic Compliance MCP Server") as demo:
    gr.Markdown("# Agentic Compliance MCP\nUpload a document and select a compliance regime.")
    with gr.Tab("Compliance Agent"):
        doc = gr.File(label="Upload Document", file_types=SUPPORTED_FILE_TYPES)
        regime = gr.Dropdown(choices=list(COMPLIANCE_ENTITIES.keys()), label="Compliance Regime")
        out = gr.Markdown(label="Compliance Output")
        redacted_out = gr.File(label="Download Redacted Text")
        file_redacted_out = gr.File(label="Download Redacted PDF/Image")
        gr.Button("Run Compliance Agent").click(
            agentic_compliance, inputs=[doc, regime], outputs=[out, redacted_out, file_redacted_out]
        )

# Launch only when run as a script, so the module can be imported (e.g. by tests
# or tooling) without starting a server. HF Spaces runs `python app.py`, so the
# deployed behavior is unchanged.
if __name__ == "__main__":
    demo.launch(mcp_server=True)
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ tesseract-ocr
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ pdfplumber
3
+ transformers
4
+ torch
5
+ numpy
6
+ presidio-analyzer
7
+ presidio-anonymizer
8
+ presidio-image-redactor
9
+ spacy
10
+ python-docx
11
+ pytesseract
12
+ pillow
13
+ pymupdf
sample_form.pdf ADDED
Binary file (1.52 kB). View file
 
sample_img.png ADDED

Git LFS Details

  • SHA256: c0504915374cc8761931aca79698a0058a2550ada9079dc86e364963deb01a30
  • Pointer size: 131 Bytes
  • Size of remote file: 707 kB
sample_incident_report.docx ADDED
Binary file (16.9 kB). View file