Spaces:
Running
Running
import runpod | |
import base64 | |
import fitz # PyMuPDF | |
def handler(job):
    """Convert a base64-encoded PDF into simple markdown text.

    Args:
        job: Job dict. ``job["input"]`` is read for ``pdf_base64``
            (base64-encoded PDF bytes, required) and ``filename``
            (optional, defaults to ``"document.pdf"``).

    Returns:
        On success, a dict with ``markdown`` (a title line, an extraction
        note and the text of every page under a ``--- Page N ---``
        separator), ``filename``, ``status`` (``"success"``) and ``pages``
        (the page count).
        On failure, a dict with ``error`` (message string) and ``status``
        (``"failed"``). Any ``Exception`` raised while processing is
        reported this way rather than propagated.
    """
    try:
        job_input = job["input"]

        # Get PDF data from base64
        pdf_base64 = job_input.get("pdf_base64")
        filename = job_input.get("filename", "document.pdf")

        if not pdf_base64:
            return {"error": "No PDF data provided", "status": "failed"}

        # Decode base64 PDF
        pdf_data = base64.b64decode(pdf_base64)

        # Extract text using PyMuPDF. The context manager closes the document
        # even when extraction fails part-way through.
        with fitz.open(stream=pdf_data, filetype="pdf") as doc:
            # Read the page count while the document is still open: calling
            # len() on a closed document raises ValueError.
            page_count = len(doc)
            page_sections = [
                f"\n\n--- Page {page_num} ---\n\n{page.get_text()}"
                for page_num, page in enumerate(doc, start=1)
            ]

        # Convert to simple markdown
        markdown_content = (
            f"# {filename}\n\n"
            "*Extracted using PyMuPDF (simplified version)*\n\n"
            + "".join(page_sections)
        )

        return {
            "markdown": markdown_content,
            "filename": filename,
            "status": "success",
            "pages": page_count,
        }
    except Exception as e:
        # Top-level worker boundary: report the failure in the job result
        # instead of letting the exception escape the handler.
        return {
            "error": str(e),
            "status": "failed",
        }
# Start the RunPod serverless worker, registering `handler` as the job callback.
_worker_config = {"handler": handler}
runpod.serverless.start(_worker_config)