Skip to content

Instantly share code, notes, and snippets.

@adrianseeley
Created April 28, 2025 23:07
Show Gist options
  • Save adrianseeley/6825c1efd56467147eb759c6445e784b to your computer and use it in GitHub Desktop.
Save adrianseeley/6825c1efd56467147eb759c6445e784b to your computer and use it in GitHub Desktop.
pdfToMarkdownServer.py
import uvicorn
import os
import tempfile
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import PlainTextResponse
# MinerU imports
from magic_pdf.data.data_reader_writer import FileBasedDataWriter
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
# ASGI application object; the /convert route below is registered on it.
app = FastAPI(title="PDFTOMARKDOWN SERVER")
def _pdf_bytes_to_markdown(pdf_bytes: bytes) -> str:
    """Run the MinerU pipeline on raw PDF bytes and return the markdown.

    The document is classified once; that single result selects both the
    analysis mode (``ocr=True``/``ocr=False``) and the matching pipeline
    (``pipe_ocr_mode``/``pipe_txt_mode``).
    """
    ds = PymuDocDataset(pdf_bytes)

    # Classify once and reuse the answer for both stages below.
    use_ocr = ds.classify() == SupportedPdfParseMethod.OCR
    infer = ds.apply(doc_analyze, ocr=use_ocr, lang='en')

    # Extracted images go to a throwaway directory that is removed when the
    # ``with`` block exits; only the markdown text leaves this function.
    # NOTE(review): image links in the returned markdown presumably point at
    # "images/..." paths that no longer exist afterwards - confirm whether
    # callers need the images.
    with tempfile.TemporaryDirectory() as tmp:
        img_dir = os.path.join(tmp, "images")
        os.makedirs(img_dir, exist_ok=True)
        writer = FileBasedDataWriter(img_dir)

        if use_ocr:
            pipe = infer.pipe_ocr_mode(writer, lang='en')
        else:
            pipe = infer.pipe_txt_mode(writer, lang='en')
        return pipe.get_markdown("images")


@app.post("/convert", response_class=PlainTextResponse)
async def convert_pdf(file: UploadFile = File(...)):
    """Convert an uploaded PDF to markdown and return it as plain text.

    Args:
        file: Multipart upload; its content type must be ``application/pdf``.

    Returns:
        The markdown produced by the MinerU pipeline.

    Raises:
        HTTPException: 400 if the upload is not declared as a PDF or is empty.
    """
    # Validate upload
    if file.content_type != "application/pdf":
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")

    pdf_bytes = await file.read()
    if not pdf_bytes:
        # Reject a zero-byte body up front instead of handing it to the parser.
        raise HTTPException(status_code=400, detail="Uploaded file is empty.")

    # NOTE(review): the conversion below is synchronous and is called without
    # ``await`` inside this coroutine, so other requests wait while it runs.
    return _pdf_bytes_to_markdown(pdf_bytes)
if __name__ == "__main__":
    # Pass the application object rather than the import string "server:app":
    # the string only resolves when this file is named server.py, and it makes
    # uvicorn import the module a second time.
    uvicorn.run(app, host="0.0.0.0", port=8080)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment