# Spaces: Running
import asyncio
import os
import shutil
import tempfile
import uuid
from pathlib import Path
from typing import Dict, Optional

from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
from fastapi.responses import HTMLResponse, FileResponse
# ASGI application instance; title and version are the API metadata handed
# to FastAPI.
app = FastAPI(title="MinerU PDF Converter", version="0.2.0")
@app.get("/")
async def root():
    """Return a hello-world payload identifying the running service.

    The ``environment`` field is the value of the ``SPACE_ID`` environment
    variable, or ``"local"`` when that variable is not set.
    """
    return {
        "message": "Hello World from MinerU PDF Converter!",
        "status": "running",
        "environment": os.environ.get("SPACE_ID", "local"),
    }
@app.get("/health")
async def health_check():
    """Return a fixed payload reporting the service as healthy."""
    return {"status": "healthy", "service": "pdf2md"}
# response_class=HTMLResponse makes FastAPI send the returned string as an
# HTML document instead of JSON-encoding it.
@app.get("/test", response_class=HTMLResponse)
async def test_page():
    """Return a static HTML page confirming that the service is running.

    The page links to the API documentation (``/docs``) and the health
    check (``/health``).
    """
    return """
    <html>
        <head>
            <title>PDF to Markdown - Test</title>
            <style>
                body {
                    font-family: Arial, sans-serif;
                    max-width: 800px;
                    margin: 0 auto;
                    padding: 20px;
                }
                .status {
                    background: #e8f5e9;
                    padding: 10px;
                    border-radius: 5px;
                    margin: 20px 0;
                }
            </style>
        </head>
        <body>
            <h1>PDF to Markdown Converter</h1>
            <div class="status">
                ✅ Service is running!
            </div>
            <p>This is a test deployment. Full functionality coming soon.</p>
            <p>
                <a href="/docs">API Documentation</a> |
                <a href="/health">Health Check</a>
            </p>
        </body>
    </html>
    """
@app.get("/api/info")
async def api_info():
    """Return the API name, version and a map of endpoint paths to descriptions."""
    return {
        "name": "PDF to Markdown Converter API",
        "version": "0.2.0",
        "endpoints": {
            "/": "Main endpoint",
            "/health": "Health check",
            "/test": "Test HTML page",
            "/docs": "FastAPI automatic documentation",
            "/api/info": "This endpoint",
            "/api/convert": "Convert PDF to Markdown (POST)",
            "/api/status/{task_id}": "Check conversion status",
            "/api/download/{task_id}": "Download converted markdown",
        },
    }
# In-memory store for conversion tasks, keyed by task ID (a UUID string).
# Each entry is a dict with the keys "status", "filename", "result", "error"
# and "temp_dir". The store lives only in this process; nothing in the code
# shown here ever removes an entry.
conversion_tasks: Dict[str, dict] = {}
@app.post("/api/convert")
async def convert_pdf(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...)
):
    """Accept a PDF upload and queue its conversion to Markdown.

    The upload is copied into a fresh temporary directory, a task entry is
    recorded in ``conversion_tasks`` and ``process_pdf_conversion`` is queued
    on ``background_tasks``.

    Args:
        background_tasks: Queue that receives the conversion job.
        file: The uploaded file; its name must end in ``.pdf`` (any case).

    Returns:
        A dict with the new ``task_id``, the initial ``"processing"`` status,
        a human-readable message and the URL for polling the task status.

    Raises:
        HTTPException: 400 if the upload is not named like a PDF, 500 if it
            cannot be written to disk.
    """
    # The filename is client-controlled: keep only its last path component so
    # names such as "../../x.pdf" or "/abs/x.pdf" cannot escape temp_dir.
    # A missing filename becomes "" and is rejected by the check below.
    safe_name = Path(file.filename or "").name
    if not safe_name.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    # Generate unique task ID
    task_id = str(uuid.uuid4())

    # Save uploaded file
    temp_dir = Path(tempfile.mkdtemp())
    pdf_path = temp_dir / safe_name
    try:
        with open(pdf_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
    except Exception as e:
        # Best-effort cleanup: a failure here must not hide the original error.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise HTTPException(
            status_code=500, detail=f"Failed to save file: {str(e)}"
        ) from e

    # Initialize task status
    conversion_tasks[task_id] = {
        "status": "processing",
        "filename": file.filename,
        "result": None,
        "error": None,
        "temp_dir": str(temp_dir),
    }

    # Start conversion in background
    background_tasks.add_task(process_pdf_conversion, task_id, str(pdf_path))

    return {
        "task_id": task_id,
        "status": "processing",
        "message": "PDF conversion started",
        "check_status_url": f"/api/status/{task_id}",
    }
async def process_pdf_conversion(task_id: str, pdf_path: str):
    """Run the (currently simulated) conversion for one task.

    Writes a placeholder Markdown file next to ``pdf_path`` and records the
    outcome on the task's entry in ``conversion_tasks``: on success the
    status becomes ``"completed"`` and ``"result"`` holds the output path;
    on any exception the status becomes ``"failed"`` and ``"error"`` holds
    the exception text.

    Args:
        task_id: Key of the task entry in ``conversion_tasks``.
        pdf_path: Path of the saved PDF; the output uses the same path with
            a ``.md`` suffix.
    """
    try:
        # For now, just simulate conversion
        await asyncio.sleep(2)  # Simulate processing

        # Create a dummy markdown file. The encoding is explicit because the
        # heading embeds the uploaded filename, which may be non-ASCII, and
        # the platform default encoding is locale-dependent.
        output_path = Path(pdf_path).with_suffix('.md')
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(f"# Converted from {Path(pdf_path).name}\n\n")
            f.write("This is a placeholder conversion. Full MinerU integration coming soon.\n")

        conversion_tasks[task_id]["status"] = "completed"
        conversion_tasks[task_id]["result"] = str(output_path)
    except Exception as e:
        # Background-task boundary: record the failure on the task so the
        # status endpoint can report it instead of losing the error.
        conversion_tasks[task_id]["status"] = "failed"
        conversion_tasks[task_id]["error"] = str(e)
@app.get("/api/status/{task_id}")
async def get_conversion_status(task_id: str):
    """Report the current state of a conversion task.

    Args:
        task_id: Key of the task in ``conversion_tasks``.

    Returns:
        A dict with ``task_id``, ``status`` and ``filename``; plus
        ``download_url`` when the task is completed, or ``error`` when it
        has failed.

    Raises:
        HTTPException: 404 if no task with this ID exists.
    """
    if task_id not in conversion_tasks:
        raise HTTPException(status_code=404, detail="Task not found")

    task = conversion_tasks[task_id]
    response = {
        "task_id": task_id,
        "status": task["status"],
        "filename": task["filename"],
    }

    if task["status"] == "completed":
        response["download_url"] = f"/api/download/{task_id}"
    elif task["status"] == "failed":
        response["error"] = task["error"]

    return response
@app.get("/api/download/{task_id}")
async def download_converted_file(task_id: str):
    """Send the converted Markdown file for a completed task.

    Args:
        task_id: Key of the task in ``conversion_tasks``.

    Returns:
        A ``FileResponse`` serving the task's result file as
        ``text/markdown``, named after the file on disk.

    Raises:
        HTTPException: 404 if the task is unknown or its result file is
            missing, 400 if the conversion has not completed.
    """
    if task_id not in conversion_tasks:
        raise HTTPException(status_code=404, detail="Task not found")

    task = conversion_tasks[task_id]
    if task["status"] != "completed":
        raise HTTPException(status_code=400, detail="Conversion not completed")

    # The status can say "completed" while the file is gone (for example if
    # the temporary directory was removed), so check the disk as well.
    if not task["result"] or not Path(task["result"]).exists():
        raise HTTPException(status_code=404, detail="Converted file not found")

    return FileResponse(
        task["result"],
        media_type="text/markdown",
        filename=Path(task["result"]).name,
    )