docling-app / app.py
AyoubChLin's picture
[INIT]
19907be verified
raw
history blame
1.86 kB
import os
import shutil
import tempfile
from threading import Lock
from uuid import uuid4

from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
# Application instance; the route handlers in this module register on it
# through its decorators (e.g. ``@app.post``).
app = FastAPI()
# Singleton class for PdfConverter
class PdfConverterSingleton:
    """Process-wide holder for a single ``PdfConverter``.

    Calling the class always yields the same object. The converter is built
    on the first call only, via ``create_model_dict()``, and reused afterwards.
    """

    # The one shared instance, or None until the first construction.
    _instance = None
    # Guards first-time construction against concurrent callers.
    _lock = Lock()

    def __new__(cls):
        """Return the shared instance, creating and initializing it if needed."""
        # Fast path: already built, no need to take the lock.
        if cls._instance is not None:
            return cls._instance

        with cls._lock:
            # Re-check under the lock: another thread may have finished
            # construction while this one was waiting.
            if cls._instance is None:
                created = super().__new__(cls)
                created._initialize()
                # Publish only after initialization has completed.
                cls._instance = created
        return cls._instance

    def _initialize(self):
        """Build the converter from a freshly created model dictionary."""
        artifacts = create_model_dict()
        self.converter = PdfConverter(artifact_dict=artifacts)

    def get_text(self, pdf_path: str) -> str:
        """Convert the PDF at ``pdf_path`` and return its text as a ``str``.

        Args:
            pdf_path: Filesystem path of the PDF handed to the converter.

        Returns:
            The first of the three values produced by ``text_from_rendered``,
            coerced to ``str``.
        """
        extracted, _, _ = text_from_rendered(self.converter(pdf_path))
        return str(extracted)
# API function to call converter
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text extracted from the PDF located at ``pdf_path``.

    Delegates to ``PdfConverterSingleton().get_text`` so every call goes
    through the same converter object.
    """
    shared_converter = PdfConverterSingleton()
    return shared_converter.get_text(pdf_path)
# Endpoint to upload a file and extract markdown text
@app.post("/extract-pdf-text")
async def extract_pdf_text(file: UploadFile = File(...)):
    """Extract text from an uploaded PDF and return it as JSON.

    The upload is spooled to a temporary ``.pdf`` file, passed to
    ``extract_text_from_pdf``, and the temporary file is removed afterwards
    whether or not extraction succeeded.

    Args:
        file: The uploaded file; its content type must be ``application/pdf``.

    Returns:
        A JSON response of the form ``{"markdown_text": <extracted text>}``.

    Raises:
        HTTPException: 400 when the upload is not declared as a PDF;
            500 (with the error message as detail) when saving or
            converting the file fails.
    """
    if file.content_type != "application/pdf":
        raise HTTPException(status_code=400, detail="Only PDF files are supported.")

    temp_filename = None
    try:
        # Let tempfile choose the directory and a unique name rather than
        # hard-coding "/tmp": this honours the platform/TMPDIR setting.
        # delete=False because the converter reopens the file by path after
        # this handle has been closed.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as buffer:
            temp_filename = buffer.name
            shutil.copyfileobj(file.file, buffer)

        text = extract_text_from_pdf(temp_filename)
        return JSONResponse(content={"markdown_text": text})
    except Exception as e:
        # Endpoint boundary: surface any failure as a 500, keeping the
        # original exception chained for logs and debugging.
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # temp_filename stays None if the temporary file was never created.
        if temp_filename is not None:
            try:
                os.remove(temp_filename)
            except FileNotFoundError:
                # Already gone; nothing left to clean up.
                pass