Skip to content

Instantly share code, notes, and snippets.

@noaione
Last active April 30, 2026 01:51
Show Gist options
  • Select an option

  • Save noaione/6bbf1ec7289ea9e2c7d21376209b80a4 to your computer and use it in GitHub Desktop.

Select an option

Save noaione/6bbf1ec7289ea9e2c7d21376209b80a4 to your computer and use it in GitHub Desktop.
badly merge two PDF render source together to avoid outline text issues because some PDF publisher is stupid
"""
npdf is: https://github.com/noaione/npdf (based on poppler splash backend)
and the issue: https://gitlab.freedesktop.org/poppler/poppler/-/work_items/1653
export usually done:
- npdf export -r DPI -c auto -o source/npdf
why?
- the poppler splash backend generally works much better at rendering manga (and its output is more compact)
- it does sometimes have problems handling fake "outline" text (basically fill + outline on separate layers)
- pymupdf (uses Cairo) handles this outline stuff MUCH better, BUT its output is badly bloated in file size
- so, we use the pymupdf output for the text only and then paste it onto the poppler output
NOTE: This doesn't work great if the output color levels of the splash and cairo backends differ (which they do most of the time)
"""
import math
from pathlib import Path
from typing import TypedDict

import pymupdf
from PIL import Image
# PDF that is rendered with PyMuPDF and scanned for text blocks.
PDF_SOURCE = Path("input.pdf")

# Folder holding the pre-rendered npdf pages.
# Filename format MUST be: page-XXXX.png
SOURCE_NPDF_FOLDER = Path("source", "npdf")

# Folder receiving the merged pages; created at import time so later saves
# never hit a missing directory.
TARGET_DIR = Path("source", "fixed")
TARGET_DIR.mkdir(parents=True, exist_ok=True)

# Resolution used for both the PyMuPDF render and the text-box conversion.
TARGET_DPI = 625
class TextBoxArea(TypedDict):
    """Axis-aligned rectangle expressed in whole pixels."""

    # Top-left corner of the rectangle.
    x: int
    y: int

    # Extent of the rectangle, measured from the top-left corner.
    width: int
    height: int
class TextData(TypedDict):
    """One text block found in the PDF, located on the rendered page."""

    # 1-based page number the block belongs to.
    page: int

    # Block text with surrounding whitespace stripped.
    text: str

    # Where the block sits on the rasterised page.
    pixel_box: TextBoxArea
def extract_text_boxes_to_pixels(doc: pymupdf.Document, target_dpi: int = 300) -> list[TextData]:
    """Collect the bounding box of every non-empty text block in pixel space.

    The following function is vibecoded with Gemini 3.1 Pro, manually cleaned
    up later by me.

    Args:
        doc: Opened document; its pages are visited in order.
        target_dpi: Resolution of the raster the boxes will be applied to.

    Returns:
        One entry per text block, holding the 1-based page number, the
        stripped block text and the box as integer pixel coordinates.
    """
    # Page coordinates are in points: 1 point = 1/72 inch.
    scale_factor = target_dpi / 72.0

    extracted_boxes: list[TextData] = []
    for page_number, page in enumerate(doc, start=1):
        # Each logical block is (x0, y0, x1, y1, text, block_no, block_type).
        for x0, y0, x1, y1, text, _block_no, block_type in page.get_text("blocks"):
            # Only text blocks (type 0) are wanted; image blocks are skipped.
            if block_type != 0:
                continue

            clean_text = text.strip()
            if not clean_text:
                continue

            # Snap every edge outward to the pixel grid. Rounding the origin
            # and the size independently can leave the right/bottom edge one
            # pixel short of the real text edge and clip the glyphs.
            left = math.floor(x0 * scale_factor)
            top = math.floor(y0 * scale_factor)
            right = math.ceil(x1 * scale_factor)
            bottom = math.ceil(y1 * scale_factor)

            extracted_boxes.append({
                "page": page_number,
                "text": clean_text,
                "pixel_box": {
                    "x": left,
                    "y": top,
                    "width": right - left,
                    "height": bottom - top,
                },
            })
    return extracted_boxes
def export_page_with_pymupdf(page: pymupdf.Page, target_dpi: int = 625):
    """Rasterise *page* with PyMuPDF and return the result of ``pil_image()``.

    The page is rendered in RGB at ``target_dpi`` with annotations left out.
    """
    render_options = {
        "dpi": target_dpi,
        "colorspace": pymupdf.csRGB,
        "annots": False,
    }
    return page.get_pixmap(**render_options).pil_image()
def merge_with_poppler_export(index: int, data: list[TextData], page: pymupdf.Page, target_dpi: int = 625) -> None:
    """Paste the PyMuPDF render of each text box onto the poppler page image.

    The poppler image is read from ``SOURCE_NPDF_FOLDER / page-XXXX.png`` and
    the merged result is written under the same name into ``TARGET_DIR``.
    If the poppler image is missing, the page is skipped and nothing is
    written.

    Args:
        index: 1-based page number, used to build the file names.
        data: Text boxes of this page, in pixel coordinates.
        page: The PyMuPDF page to render.
        target_dpi: Resolution of the PyMuPDF render.
    """
    poppler_img_path = SOURCE_NPDF_FOLDER / f"page-{index:04d}.png"
    if not poppler_img_path.exists():
        print(f"Skipping page {index} as it doesn't exist in source")
        return

    print(" Exporing pymupdf...")
    pymu_image = export_page_with_pymupdf(page, target_dpi)
    print(" Opening poppler image...")
    # The context manager closes the underlying file handle once saved.
    with Image.open(poppler_img_path) as poppler_image:
        # The two renders may differ slightly in size. Only the area they
        # share is copied, so a crop never reaches past the PyMuPDF render
        # and pastes filler pixels into the page.
        shared_width = min(poppler_image.width, pymu_image.width)
        shared_height = min(poppler_image.height, pymu_image.height)

        # Apply the text box from pymu_image to poppler_image
        print(" Applying box data...")
        for box in data:
            area = box["pixel_box"]
            left = max(area["x"], 0)
            top = max(area["y"], 0)
            right = min(area["x"] + area["width"], shared_width)
            bottom = min(area["y"] + area["height"], shared_height)
            # Nothing left to copy once the box is clamped to the shared area.
            if right <= left or bottom <= top:
                continue
            patch = pymu_image.crop((left, top, right, bottom))
            poppler_image.paste(patch, (left, top))

        poppler_image.save(TARGET_DIR / f"page-{index:04d}.png")
# --- Usage ---
if __name__ == "__main__":
    # Boxes are computed at TARGET_DPI, the same resolution the PyMuPDF
    # render uses, so both share one pixel grid.
    # The with-block closes the document on every exit path, including
    # interruption.
    with pymupdf.open(PDF_SOURCE) as doc:
        try:
            results = extract_text_boxes_to_pixels(doc, target_dpi=TARGET_DPI)

            # Group by page. Pages without any text box never get an entry,
            # so nothing is written for them.
            grouped_by_pages: dict[int, list[TextData]] = {}
            for box in results:
                grouped_by_pages.setdefault(box["page"], []).append(box)

            for page, boxes in grouped_by_pages.items():
                print(f"Processing page {page}")
                # Page numbers are 1-based, document indexing is 0-based.
                merge_with_poppler_export(page, boxes, doc[page - 1], target_dpi=TARGET_DPI)
        except Exception as e:
            # Top-level boundary of the script: report the failure and stop.
            print(f"Error processing PDF: {e}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment