marcosremar2 committed on
Commit
39baca3
·
1 Parent(s): 4a9d776

Simplify app for testing

Browse files

- Create minimal version to test PDF upload
- Remove complex dependencies temporarily
- Add test endpoint to verify deployment
- Version 0.3.0 for tracking

Files changed (3) hide show
  1. app.py +18 -159
  2. app_full.py +182 -0
  3. test_api_local.py +66 -0
app.py CHANGED
@@ -1,14 +1,8 @@
1
- from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
2
- from fastapi.responses import HTMLResponse, FileResponse
3
  import os
4
- import tempfile
5
- import shutil
6
- from pathlib import Path
7
- import asyncio
8
- from typing import Dict, Optional
9
- import uuid
10
 
11
- app = FastAPI(title="MinerU PDF Converter", version="0.2.0")
12
 
13
  @app.get("/")
14
  async def root():
@@ -16,167 +10,32 @@ async def root():
16
  return {
17
  "message": "Hello World from MinerU PDF Converter!",
18
  "status": "running",
 
19
  "environment": os.environ.get("SPACE_ID", "local")
20
  }
21
 
22
  @app.get("/health")
23
  async def health_check():
24
  """Health check endpoint"""
25
- return {"status": "healthy", "service": "pdf2md"}
26
-
27
- @app.get("/test", response_class=HTMLResponse)
28
- async def test_page():
29
- """Simple HTML test page"""
30
- return """
31
- <html>
32
- <head>
33
- <title>PDF to Markdown - Test</title>
34
- <style>
35
- body {
36
- font-family: Arial, sans-serif;
37
- max-width: 800px;
38
- margin: 0 auto;
39
- padding: 20px;
40
- }
41
- .status {
42
- background: #e8f5e9;
43
- padding: 10px;
44
- border-radius: 5px;
45
- margin: 20px 0;
46
- }
47
- </style>
48
- </head>
49
- <body>
50
- <h1>PDF to Markdown Converter</h1>
51
- <div class="status">
52
- ✅ Service is running!
53
- </div>
54
- <p>This is a test deployment. Full functionality coming soon.</p>
55
- <p>
56
- <a href="/docs">API Documentation</a> |
57
- <a href="/health">Health Check</a>
58
- </p>
59
- </body>
60
- </html>
61
- """
62
-
63
- @app.get("/api/info")
64
- async def api_info():
65
- """API information endpoint"""
66
- return {
67
- "name": "PDF to Markdown Converter API",
68
- "version": "0.2.0",
69
- "endpoints": {
70
- "/": "Main endpoint",
71
- "/health": "Health check",
72
- "/test": "Test HTML page",
73
- "/docs": "FastAPI automatic documentation",
74
- "/api/info": "This endpoint",
75
- "/api/convert": "Convert PDF to Markdown (POST)",
76
- "/api/status/{task_id}": "Check conversion status",
77
- "/api/download/{task_id}": "Download converted markdown"
78
- }
79
- }
80
-
81
- # Store for conversion tasks
82
- conversion_tasks: Dict[str, dict] = {}
83
 
84
  @app.post("/api/convert")
85
- async def convert_pdf(
86
- background_tasks: BackgroundTasks,
87
- file: UploadFile = File(...)
88
- ):
89
- """Convert PDF to Markdown"""
90
  if not file.filename.endswith('.pdf'):
91
  raise HTTPException(status_code=400, detail="Only PDF files are supported")
92
 
93
- # Generate unique task ID
94
- task_id = str(uuid.uuid4())
95
-
96
- # Save uploaded file
97
- temp_dir = Path(tempfile.mkdtemp())
98
- pdf_path = temp_dir / file.filename
99
-
100
- try:
101
- with open(pdf_path, "wb") as buffer:
102
- shutil.copyfileobj(file.file, buffer)
103
- except Exception as e:
104
- shutil.rmtree(temp_dir)
105
- raise HTTPException(status_code=500, detail=f"Failed to save file: {str(e)}")
106
-
107
- # Initialize task status
108
- conversion_tasks[task_id] = {
109
- "status": "processing",
110
- "filename": file.filename,
111
- "result": None,
112
- "error": None,
113
- "temp_dir": str(temp_dir)
114
- }
115
-
116
- # Start conversion in background
117
- background_tasks.add_task(process_pdf_conversion, task_id, str(pdf_path))
118
-
119
  return {
120
- "task_id": task_id,
121
- "status": "processing",
122
- "message": "PDF conversion started",
123
- "check_status_url": f"/api/status/{task_id}"
124
  }
125
 
126
- async def process_pdf_conversion(task_id: str, pdf_path: str):
127
- """Process PDF conversion in background"""
128
- try:
129
- # For now, just simulate conversion
130
- await asyncio.sleep(2) # Simulate processing
131
-
132
- # Create a dummy markdown file
133
- output_path = Path(pdf_path).with_suffix('.md')
134
- with open(output_path, 'w') as f:
135
- f.write(f"# Converted from {Path(pdf_path).name}\n\n")
136
- f.write("This is a placeholder conversion. Full MinerU integration coming soon.\n")
137
-
138
- conversion_tasks[task_id]["status"] = "completed"
139
- conversion_tasks[task_id]["result"] = str(output_path)
140
-
141
- except Exception as e:
142
- conversion_tasks[task_id]["status"] = "failed"
143
- conversion_tasks[task_id]["error"] = str(e)
144
-
145
- @app.get("/api/status/{task_id}")
146
- async def get_conversion_status(task_id: str):
147
- """Check conversion status"""
148
- if task_id not in conversion_tasks:
149
- raise HTTPException(status_code=404, detail="Task not found")
150
-
151
- task = conversion_tasks[task_id]
152
- response = {
153
- "task_id": task_id,
154
- "status": task["status"],
155
- "filename": task["filename"]
156
- }
157
-
158
- if task["status"] == "completed":
159
- response["download_url"] = f"/api/download/{task_id}"
160
- elif task["status"] == "failed":
161
- response["error"] = task["error"]
162
-
163
- return response
164
-
165
- @app.get("/api/download/{task_id}")
166
- async def download_converted_file(task_id: str):
167
- """Download converted markdown file"""
168
- if task_id not in conversion_tasks:
169
- raise HTTPException(status_code=404, detail="Task not found")
170
-
171
- task = conversion_tasks[task_id]
172
- if task["status"] != "completed":
173
- raise HTTPException(status_code=400, detail="Conversion not completed")
174
-
175
- if not task["result"] or not Path(task["result"]).exists():
176
- raise HTTPException(status_code=404, detail="Converted file not found")
177
-
178
- return FileResponse(
179
- task["result"],
180
- media_type="text/markdown",
181
- filename=Path(task["result"]).name
182
- )
 
1
+ from fastapi import FastAPI, File, UploadFile, HTTPException
2
+ from fastapi.responses import HTMLResponse, JSONResponse
3
  import os
 
 
 
 
 
 
4
 
5
+ app = FastAPI(title="MinerU PDF Converter", version="0.3.0")
6
 
7
  @app.get("/")
8
  async def root():
 
10
  return {
11
  "message": "Hello World from MinerU PDF Converter!",
12
  "status": "running",
13
+ "version": "0.3.0",
14
  "environment": os.environ.get("SPACE_ID", "local")
15
  }
16
 
17
  @app.get("/health")
18
  async def health_check():
19
  """Health check endpoint"""
20
+ return {"status": "healthy", "service": "pdf2md", "version": "0.3.0"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  @app.post("/api/convert")
23
+ async def convert_pdf(file: UploadFile = File(...)):
24
+ """Test PDF upload endpoint"""
 
 
 
25
  if not file.filename.endswith('.pdf'):
26
  raise HTTPException(status_code=400, detail="Only PDF files are supported")
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  return {
29
+ "message": "PDF upload endpoint is working!",
30
+ "filename": file.filename,
31
+ "size": len(await file.read()),
32
+ "status": "test_mode"
33
  }
34
 
35
+ @app.get("/api/test")
36
+ async def test_endpoint():
37
+ """Test that new endpoints are available"""
38
+ return {
39
+ "message": "New endpoints are working!",
40
+ "endpoints": ["/api/convert", "/api/test"]
41
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app_full.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
2
+ from fastapi.responses import HTMLResponse, FileResponse
3
+ import os
4
+ import tempfile
5
+ import shutil
6
+ from pathlib import Path
7
+ import asyncio
8
+ from typing import Dict, Optional
9
+ import uuid
10
+
11
+ app = FastAPI(title="MinerU PDF Converter", version="0.2.0")
12
+
13
+ @app.get("/")
14
+ async def root():
15
+ """Simple hello world endpoint"""
16
+ return {
17
+ "message": "Hello World from MinerU PDF Converter!",
18
+ "status": "running",
19
+ "environment": os.environ.get("SPACE_ID", "local")
20
+ }
21
+
22
+ @app.get("/health")
23
+ async def health_check():
24
+ """Health check endpoint"""
25
+ return {"status": "healthy", "service": "pdf2md"}
26
+
27
+ @app.get("/test", response_class=HTMLResponse)
28
+ async def test_page():
29
+ """Simple HTML test page"""
30
+ return """
31
+ <html>
32
+ <head>
33
+ <title>PDF to Markdown - Test</title>
34
+ <style>
35
+ body {
36
+ font-family: Arial, sans-serif;
37
+ max-width: 800px;
38
+ margin: 0 auto;
39
+ padding: 20px;
40
+ }
41
+ .status {
42
+ background: #e8f5e9;
43
+ padding: 10px;
44
+ border-radius: 5px;
45
+ margin: 20px 0;
46
+ }
47
+ </style>
48
+ </head>
49
+ <body>
50
+ <h1>PDF to Markdown Converter</h1>
51
+ <div class="status">
52
+ ✅ Service is running!
53
+ </div>
54
+ <p>This is a test deployment. Full functionality coming soon.</p>
55
+ <p>
56
+ <a href="/docs">API Documentation</a> |
57
+ <a href="/health">Health Check</a>
58
+ </p>
59
+ </body>
60
+ </html>
61
+ """
62
+
63
+ @app.get("/api/info")
64
+ async def api_info():
65
+ """API information endpoint"""
66
+ return {
67
+ "name": "PDF to Markdown Converter API",
68
+ "version": "0.2.0",
69
+ "endpoints": {
70
+ "/": "Main endpoint",
71
+ "/health": "Health check",
72
+ "/test": "Test HTML page",
73
+ "/docs": "FastAPI automatic documentation",
74
+ "/api/info": "This endpoint",
75
+ "/api/convert": "Convert PDF to Markdown (POST)",
76
+ "/api/status/{task_id}": "Check conversion status",
77
+ "/api/download/{task_id}": "Download converted markdown"
78
+ }
79
+ }
80
+
81
+ # Store for conversion tasks
82
+ conversion_tasks: Dict[str, dict] = {}
83
+
84
+ @app.post("/api/convert")
85
+ async def convert_pdf(
86
+ background_tasks: BackgroundTasks,
87
+ file: UploadFile = File(...)
88
+ ):
89
+ """Convert PDF to Markdown"""
90
+ if not file.filename.endswith('.pdf'):
91
+ raise HTTPException(status_code=400, detail="Only PDF files are supported")
92
+
93
+ # Generate unique task ID
94
+ task_id = str(uuid.uuid4())
95
+
96
+ # Save uploaded file
97
+ temp_dir = Path(tempfile.mkdtemp())
98
+ pdf_path = temp_dir / file.filename
99
+
100
+ try:
101
+ with open(pdf_path, "wb") as buffer:
102
+ shutil.copyfileobj(file.file, buffer)
103
+ except Exception as e:
104
+ shutil.rmtree(temp_dir)
105
+ raise HTTPException(status_code=500, detail=f"Failed to save file: {str(e)}")
106
+
107
+ # Initialize task status
108
+ conversion_tasks[task_id] = {
109
+ "status": "processing",
110
+ "filename": file.filename,
111
+ "result": None,
112
+ "error": None,
113
+ "temp_dir": str(temp_dir)
114
+ }
115
+
116
+ # Start conversion in background
117
+ background_tasks.add_task(process_pdf_conversion, task_id, str(pdf_path))
118
+
119
+ return {
120
+ "task_id": task_id,
121
+ "status": "processing",
122
+ "message": "PDF conversion started",
123
+ "check_status_url": f"/api/status/{task_id}"
124
+ }
125
+
126
+ async def process_pdf_conversion(task_id: str, pdf_path: str):
127
+ """Process PDF conversion in background"""
128
+ try:
129
+ # For now, just simulate conversion
130
+ await asyncio.sleep(2) # Simulate processing
131
+
132
+ # Create a dummy markdown file
133
+ output_path = Path(pdf_path).with_suffix('.md')
134
+ with open(output_path, 'w') as f:
135
+ f.write(f"# Converted from {Path(pdf_path).name}\n\n")
136
+ f.write("This is a placeholder conversion. Full MinerU integration coming soon.\n")
137
+
138
+ conversion_tasks[task_id]["status"] = "completed"
139
+ conversion_tasks[task_id]["result"] = str(output_path)
140
+
141
+ except Exception as e:
142
+ conversion_tasks[task_id]["status"] = "failed"
143
+ conversion_tasks[task_id]["error"] = str(e)
144
+
145
+ @app.get("/api/status/{task_id}")
146
+ async def get_conversion_status(task_id: str):
147
+ """Check conversion status"""
148
+ if task_id not in conversion_tasks:
149
+ raise HTTPException(status_code=404, detail="Task not found")
150
+
151
+ task = conversion_tasks[task_id]
152
+ response = {
153
+ "task_id": task_id,
154
+ "status": task["status"],
155
+ "filename": task["filename"]
156
+ }
157
+
158
+ if task["status"] == "completed":
159
+ response["download_url"] = f"/api/download/{task_id}"
160
+ elif task["status"] == "failed":
161
+ response["error"] = task["error"]
162
+
163
+ return response
164
+
165
+ @app.get("/api/download/{task_id}")
166
+ async def download_converted_file(task_id: str):
167
+ """Download converted markdown file"""
168
+ if task_id not in conversion_tasks:
169
+ raise HTTPException(status_code=404, detail="Task not found")
170
+
171
+ task = conversion_tasks[task_id]
172
+ if task["status"] != "completed":
173
+ raise HTTPException(status_code=400, detail="Conversion not completed")
174
+
175
+ if not task["result"] or not Path(task["result"]).exists():
176
+ raise HTTPException(status_code=404, detail="Converted file not found")
177
+
178
+ return FileResponse(
179
+ task["result"],
180
+ media_type="text/markdown",
181
+ filename=Path(task["result"]).name
182
+ )
test_api_local.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Test PDF conversion API locally"""
3
+
4
+ import requests
5
+ import time
6
+ import sys
7
+
8
+ def test_pdf_conversion(pdf_path, api_base_url="http://localhost:7860"):
9
+ """Test PDF conversion through API"""
10
+
11
+ # 1. Upload PDF
12
+ print(f"Uploading PDF: {pdf_path}")
13
+ with open(pdf_path, 'rb') as f:
14
+ files = {'file': (pdf_path.split('/')[-1], f, 'application/pdf')}
15
+ response = requests.post(f"{api_base_url}/api/convert", files=files)
16
+
17
+ if response.status_code != 200:
18
+ print(f"Upload failed: {response.status_code}")
19
+ print(response.text)
20
+ return
21
+
22
+ result = response.json()
23
+ task_id = result['task_id']
24
+ print(f"Task ID: {task_id}")
25
+ print(f"Status: {result['status']}")
26
+
27
+ # 2. Check status
28
+ print("\nChecking conversion status...")
29
+ while True:
30
+ response = requests.get(f"{api_base_url}/api/status/{task_id}")
31
+ if response.status_code != 200:
32
+ print(f"Status check failed: {response.status_code}")
33
+ break
34
+
35
+ status = response.json()
36
+ print(f"Status: {status['status']}")
37
+
38
+ if status['status'] == 'completed':
39
+ print(f"Download URL: {status['download_url']}")
40
+
41
+ # 3. Download result
42
+ response = requests.get(f"{api_base_url}{status['download_url']}")
43
+ if response.status_code == 200:
44
+ output_file = f"output_{task_id}.md"
45
+ with open(output_file, 'w') as f:
46
+ f.write(response.text)
47
+ print(f"\nMarkdown saved to: {output_file}")
48
+ print("\nContent preview:")
49
+ print(response.text[:500])
50
+ break
51
+
52
+ elif status['status'] == 'failed':
53
+ print(f"Conversion failed: {status.get('error', 'Unknown error')}")
54
+ break
55
+
56
+ time.sleep(1)
57
+
58
+ if __name__ == "__main__":
59
+ if len(sys.argv) > 1:
60
+ pdf_path = sys.argv[1]
61
+ else:
62
+ pdf_path = "/Users/marcos/Documents/projects/pdf2md/batch-files/test-simple.pdf"
63
+
64
+ # Test on HF Space
65
+ print("Testing on Hugging Face Space...")
66
+ test_pdf_conversion(pdf_path, "https://marcosremar2-mineru2.hf.space")