noaione · April 30, 2026 01:51
diff --git a/merge_pdf_text.py b/merge_pdf_text.py
 """
 npdf is: https://github.com/noaione/npdf (based on poppler splash backend)
 and the issue: https://gitlab.freedesktop.org/poppler/poppler/-/work_items/1653

 export usually done:
 - npdf export -r DPI -c auto -o source/npdf

 why?
 - the poppler splash backend generally works much better at rendering manga (and its output is more compact)
  - it does sometimes have problems handling fake "outline" text (basically fill + outline on separate layers)
 - pymupdf (uses Cairo) works MUCH better at handling this outline stuff, BUT it is bad because it just bloats the shit out of the file size
  - so, we use its output for the text only and then slap it onto the poppler output
  
 NOTE: This doesn't work great if the output color levels from the splash and cairo backends differ (which they do most of the time)
 """

 from pathlib import Path
 from typing import TypedDict

 import pymupdf
 from PIL import Image

 # PDF that is opened with pymupdf for text-box extraction and rendering.
 PDF_SOURCE = Path("input.pdf")
 # Folder holding the npdf (poppler) page renders that receive the patches.
 SOURCE_NPDF_FOLDER = Path("source") / "npdf"  # filename format MUST be: page-XXXX.png
 # Output folder for the merged pages; it is created as soon as this module runs.
 TARGET_DIR = Path("source") / "fixed"
 TARGET_DIR.mkdir(parents=True, exist_ok=True)
 # DPI used for the pymupdf render and for the point -> pixel box conversion.
 # NOTE(review): presumably the npdf export has to use the same DPI, otherwise
 # the pixel boxes would not line up with the poppler images -- confirm.
 TARGET_DPI = 625


 class TextBoxArea(TypedDict):
    """Axis-aligned rectangle expressed in pixel coordinates."""
    # Starting corner of the rectangle; used as the (left, upper) values of
    # the PIL crop box and as the paste position.
    x: int
    y: int
    # Extent of the rectangle along each axis.
    width: int
    height: int


 class TextData(TypedDict):
    """One extracted text block together with its pixel rectangle."""
    # 1-based number of the page the block was found on.
    page: int
    # Block text with leading/trailing whitespace stripped.
    text: str
    # Block bounding box scaled from PDF points to pixels.
    pixel_box: TextBoxArea


 def extract_text_boxes_to_pixels(doc: pymupdf.Document, target_dpi: float = 300) -> list[TextData]:
    """Collect the pixel-space bounding box of every non-empty text block.

    Block coordinates come back in points (1 point = 1/72 inch), so every
    value is multiplied by ``target_dpi / 72`` to land on the pixel grid of
    a raster made at ``target_dpi``.

    Note: originally generated with Gemini 3.1 Pro, then cleaned up by hand.

    Args:
        doc: Document whose pages are visited in order.
        target_dpi: Resolution of the raster the boxes will be applied to.

    Returns:
        One entry per kept block holding the 1-based page number, the
        stripped block text and the rounded pixel rectangle. Position and
        size are rounded independently of each other.
    """
    # 1 point = 1/72 inch.
    scale_factor = target_dpi / 72.0
    extracted_boxes: list[TextData] = []

    # Iterate the pages directly instead of indexing through range(len(doc)).
    for page_number, page in enumerate(doc, start=1):
        # Each "blocks" entry unpacks as
        # (x0, y0, x1, y1, text, block_no, block_type); block_no is unused.
        for x0, y0, x1, y1, text, _block_no, block_type in page.get_text("blocks"):
            # Filter out image blocks (we only want text boxes)
            if block_type != 0:
                continue

            # Blocks that contain only whitespace carry nothing to patch.
            clean_text = text.strip()
            if not clean_text:
                continue

            extracted_boxes.append({
                "page": page_number,
                "text": clean_text,
                "pixel_box": {
                    "x": round(x0 * scale_factor),
                    "y": round(y0 * scale_factor),
                    "width": round((x1 - x0) * scale_factor),
                    "height": round((y1 - y0) * scale_factor),
                },
            })

    return extracted_boxes


 def export_page_with_pymupdf(page: pymupdf.Page, target_dpi: int = 625):
    """Rasterise ``page`` with pymupdf and return ``Pixmap.pil_image()``.

    The pixmap is requested at ``target_dpi`` in the RGB colorspace with
    annotations switched off.
    """
    render_options = {
        "dpi": target_dpi,
        "colorspace": pymupdf.csRGB,
        "annots": False,
    }
    return page.get_pixmap(**render_options).pil_image()


 def merge_with_poppler_export(index: int, data: list[TextData], page: pymupdf.Page, target_dpi: int = 625):
    """Patch the text regions of one poppler page render with pymupdf pixels.

    The page is rendered with pymupdf, every box in ``data`` is cropped out
    of that render and pasted at the same position onto the poppler PNG, and
    the result is written to ``TARGET_DIR`` under the same file name.

    Args:
        index: Page number used to build the ``page-XXXX.png`` file name.
        data: Text boxes (pixel coordinates) belonging to this page.
        page: The pymupdf page to render.
        target_dpi: DPI for the pymupdf render.

    Returns:
        None. A page whose poppler PNG is missing is reported and skipped.
    """
    page_filename = f"page-{index:04d}.png"
    poppler_img_path = SOURCE_NPDF_FOLDER / page_filename
    if not poppler_img_path.exists():
        print(f"Skipping page {index} as it doesn't exist in source")
        return

    print(" Exporing pymupdf...")
    pymu_image = export_page_with_pymupdf(page, target_dpi)
    print(" Opening poppler image...")
    # Context manager so the PNG file handle is released even when a paste
    # or the save raises.
    with Image.open(poppler_img_path) as poppler_image:
        # Apply the text box from pymu_image to poppler_image
        print(" Applying box data...")
        for box in data:
            area = box["pixel_box"]
            left, top = area["x"], area["y"]
            region = (left, top, left + area["width"], top + area["height"])
            poppler_image.paste(pymu_image.crop(region), (left, top))
        poppler_image.save(TARGET_DIR / page_filename)


 # --- Usage ---
 if __name__ == "__main__":
    # Boxes are extracted and pages are rendered at TARGET_DPI.
    # Opening the document with a context manager closes it on every exit
    # path, including ones the handler below does not catch
    # (e.g. KeyboardInterrupt).
    with pymupdf.open(PDF_SOURCE) as doc:
        try:
            results = extract_text_boxes_to_pixels(doc, target_dpi=TARGET_DPI)

            # Group boxes by page number, keeping first-seen page order.
            grouped_by_pages = {}
            for box in results:
                grouped_by_pages.setdefault(box["page"], []).append(box)

            for page, boxes in grouped_by_pages.items():
                print(f"Processing page {page}")
                # Page numbers are 1-based, document indexing is 0-based.
                merge_with_poppler_export(page, boxes, doc[page - 1], target_dpi=TARGET_DPI)

        except Exception as e:
            # Best-effort script: report the failure instead of a traceback.
            print(f"Error processing PDF: {e}")
	"""
	npdf is: https://github.com/noaione/npdf (based on poppler splash backend)
	and the issue: https://gitlab.freedesktop.org/poppler/poppler/-/work_items/1653

	export usually done:
	- npdf export -r DPI -c auto -o source/npdf

	why?
	- the poppler splash backend generally works much better at rendering manga (and its output is more compact)
	- it does sometimes have problems handling fake "outline" text (basically fill + outline on separate layers)
	- pymupdf (uses Cairo) works MUCH better at handling this outline stuff, BUT it is bad because it just bloats the shit out of the file size
	- so, we use its output for the text only and then slap it onto the poppler output

	NOTE: This doesn't work great if the output color levels from the splash and cairo backends differ (which they do most of the time)
	"""

	from pathlib import Path
	from typing import TypedDict

	import pymupdf
	from PIL import Image

	# PDF that is opened with pymupdf for text-box extraction and rendering.
	PDF_SOURCE = Path("input.pdf")
	# Folder holding the npdf (poppler) page renders that receive the patches.
	SOURCE_NPDF_FOLDER = Path("source") / "npdf" # filename format MUST be: page-XXXX.png
	# Output folder for the merged pages; it is created as soon as this module runs.
	TARGET_DIR = Path("source") / "fixed"
	TARGET_DIR.mkdir(parents=True, exist_ok=True)
	# DPI used for the pymupdf render and for the point -> pixel box conversion.
	# NOTE(review): presumably the npdf export has to use the same DPI, otherwise
	# the pixel boxes would not line up with the poppler images -- confirm.
	TARGET_DPI = 625


	class TextBoxArea(TypedDict):
	    """Axis-aligned rectangle expressed in pixel coordinates."""

	    # Starting corner of the rectangle; used as the (left, upper) values
	    # of the PIL crop box and as the paste position.
	    x: int
	    y: int
	    # Extent of the rectangle along each axis.
	    width: int
	    height: int


	class TextData(TypedDict):
	    """One extracted text block together with its pixel rectangle."""

	    # 1-based number of the page the block was found on.
	    page: int
	    # Block text with leading/trailing whitespace stripped.
	    text: str
	    # Block bounding box scaled from PDF points to pixels.
	    pixel_box: TextBoxArea


	def extract_text_boxes_to_pixels(doc: pymupdf.Document, target_dpi: float = 300) -> list[TextData]:
	    """Collect the pixel-space bounding box of every non-empty text block.

	    Block coordinates come back in points (1 point = 1/72 inch), so every
	    value is multiplied by ``target_dpi / 72`` to land on the pixel grid
	    of a raster made at ``target_dpi``.

	    Note: originally generated with Gemini 3.1 Pro, then cleaned up by hand.

	    Args:
	        doc: Document whose pages are visited in order.
	        target_dpi: Resolution of the raster the boxes will be applied to.

	    Returns:
	        One entry per kept block holding the 1-based page number, the
	        stripped block text and the rounded pixel rectangle. Position and
	        size are rounded independently of each other.
	    """
	    # 1 point = 1/72 inch.
	    scale_factor = target_dpi / 72.0
	    extracted_boxes: list[TextData] = []

	    # Iterate the pages directly instead of indexing through range(len(doc)).
	    for page_number, page in enumerate(doc, start=1):
	        # Each "blocks" entry unpacks as
	        # (x0, y0, x1, y1, text, block_no, block_type); block_no is unused.
	        for x0, y0, x1, y1, text, _block_no, block_type in page.get_text("blocks"):
	            # Filter out image blocks (we only want text boxes)
	            if block_type != 0:
	                continue

	            # Blocks that contain only whitespace carry nothing to patch.
	            clean_text = text.strip()
	            if not clean_text:
	                continue

	            extracted_boxes.append({
	                "page": page_number,
	                "text": clean_text,
	                "pixel_box": {
	                    "x": round(x0 * scale_factor),
	                    "y": round(y0 * scale_factor),
	                    "width": round((x1 - x0) * scale_factor),
	                    "height": round((y1 - y0) * scale_factor),
	                },
	            })

	    return extracted_boxes


	def export_page_with_pymupdf(page: pymupdf.Page, target_dpi: int = 625):
	    """Rasterise ``page`` with pymupdf and return ``Pixmap.pil_image()``.

	    The pixmap is requested at ``target_dpi`` in the RGB colorspace with
	    annotations switched off.
	    """
	    pixmap = page.get_pixmap(dpi=target_dpi, colorspace=pymupdf.csRGB, annots=False)
	    return pixmap.pil_image()


	def merge_with_poppler_export(index: int, data: list[TextData], page: pymupdf.Page, target_dpi: int = 625):
	    """Patch the text regions of one poppler page render with pymupdf pixels.

	    The page is rendered with pymupdf, every box in ``data`` is cropped
	    out of that render and pasted at the same position onto the poppler
	    PNG, and the result is written to ``TARGET_DIR`` under the same name.

	    Args:
	        index: Page number used to build the ``page-XXXX.png`` file name.
	        data: Text boxes (pixel coordinates) belonging to this page.
	        page: The pymupdf page to render.
	        target_dpi: DPI for the pymupdf render.

	    Returns:
	        None. A page whose poppler PNG is missing is reported and skipped.
	    """
	    page_filename = f"page-{index:04d}.png"
	    poppler_img_path = SOURCE_NPDF_FOLDER / page_filename
	    if not poppler_img_path.exists():
	        print(f"Skipping page {index} as it doesn't exist in source")
	        return

	    print(" Exporing pymupdf...")
	    pymu_image = export_page_with_pymupdf(page, target_dpi)
	    print(" Opening poppler image...")
	    # Context manager so the PNG file handle is released even when a
	    # paste or the save raises.
	    with Image.open(poppler_img_path) as poppler_image:
	        # Apply the text box from pymu_image to poppler_image
	        print(" Applying box data...")
	        for box in data:
	            area = box["pixel_box"]
	            left, top = area["x"], area["y"]
	            region = (left, top, left + area["width"], top + area["height"])
	            poppler_image.paste(pymu_image.crop(region), (left, top))
	        poppler_image.save(TARGET_DIR / page_filename)


	# --- Usage ---
	if __name__ == "__main__":
	    # Boxes are extracted and pages are rendered at TARGET_DPI.
	    # Opening the document with a context manager closes it on every exit
	    # path, including ones the handler below does not catch
	    # (e.g. KeyboardInterrupt).
	    with pymupdf.open(PDF_SOURCE) as doc:
	        try:
	            results = extract_text_boxes_to_pixels(doc, target_dpi=TARGET_DPI)

	            # Group boxes by page number, keeping first-seen page order.
	            grouped_by_pages = {}
	            for box in results:
	                grouped_by_pages.setdefault(box["page"], []).append(box)

	            for page, boxes in grouped_by_pages.items():
	                print(f"Processing page {page}")
	                # Page numbers are 1-based, document indexing is 0-based.
	                merge_with_poppler_export(page, boxes, doc[page - 1], target_dpi=TARGET_DPI)

	        except Exception as e:
	            # Best-effort script: report the failure instead of a traceback.
	            print(f"Error processing PDF: {e}")
No results found