Last active
April 30, 2026 01:51
-
-
Save noaione/6bbf1ec7289ea9e2c7d21376209b80a4 to your computer and use it in GitHub Desktop.
badly merge two PDF render source together to avoid outline text issues because some PDF publisher is stupid
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| npdf is: https://github.com/noaione/npdf (based on poppler splash backend) | |
| and the issue: https://gitlab.freedesktop.org/poppler/poppler/-/work_items/1653 | |
| export usually done: | |
| - npdf export -r DPI -c auto -o source/npdf | |
| why? | |
| - the poppler splash backend generally works much better at rendering manga (and its output is more compact) | |
| - it sometimes has problems handling fake "outline" text (basically fill + outline on separate layers) | |
| - pymupdf (uses Cairo) works MUCH better at handling this outline stuff, BUT its output badly bloats the file size | |
| - so, we use the pymupdf output for the text regions only and then paste it onto the poppler output | |
| NOTE: This doesn't work well if the output color levels from the splash and cairo backends differ (which they do most of the time) | |
| """ | |
import math
from pathlib import Path
from typing import TypedDict

import pymupdf
from PIL import Image
# PDF that is re-rendered with pymupdf to obtain the text regions.
PDF_SOURCE = Path("input.pdf")
# Folder with the npdf (poppler) page renders; filename format MUST be: page-XXXX.png
SOURCE_NPDF_FOLDER = Path("source", "npdf")
# Merged pages are written here; the folder is created at import time if missing.
TARGET_DIR = Path("source", "fixed")
TARGET_DIR.mkdir(exist_ok=True, parents=True)
# Resolution used for the pymupdf render and for the point -> pixel conversion.
# NOTE(review): presumably has to equal the DPI used for the npdf export so that
# pixel coordinates line up between both renders -- confirm.
TARGET_DPI = 625
class TextBoxArea(TypedDict):
    """Rectangle of one text block, expressed in whole pixels of the rendered page."""

    x: int  # left edge of the box
    y: int  # top edge of the box
    width: int  # horizontal extent, starting at x
    height: int  # vertical extent, starting at y
class TextData(TypedDict):
    """One extracted text block together with where it sits on the rendered page."""

    page: int  # 1-based page number the block was found on
    text: str  # block text with surrounding whitespace stripped
    pixel_box: TextBoxArea  # bounding box of the block in pixels
def extract_text_boxes_to_pixels(doc: pymupdf.Document, target_dpi=300) -> list[TextData]:
    """Collect the bounding box of every non-empty text block, in pixel coordinates.

    Args:
        doc: Opened document; each page must provide ``get_text("blocks")``.
        target_dpi: Resolution of the raster the boxes will be applied to.
            PDF coordinates are in points (1/72 inch), so every coordinate is
            scaled by ``target_dpi / 72``.

    Returns:
        One entry per text block, in document order, carrying the 1-based page
        number, the stripped block text and the pixel rectangle of the block.
        Image blocks and blocks containing only whitespace are left out.
    """
    # 1 point = 1/72 inch.
    scale_factor = target_dpi / 72.0
    extracted_boxes: list[TextData] = []

    for page_number, page in enumerate(doc, start=1):
        # Each block is a tuple: (x0, y0, x1, y1, text, block_no, block_type).
        for x0, y0, x1, y1, text, _block_no, block_type in page.get_text("blocks"):
            # Only block_type 0 is kept; other types (image blocks) are skipped.
            if block_type != 0:
                continue

            clean_text = text.strip()
            if not clean_text:
                continue

            # Snap the rectangle outwards to whole pixels. Rounding the origin
            # and the size independently could leave the box up to a pixel
            # short of the real block and clip the edge of the glyphs.
            left = math.floor(x0 * scale_factor)
            top = math.floor(y0 * scale_factor)
            right = math.ceil(x1 * scale_factor)
            bottom = math.ceil(y1 * scale_factor)

            extracted_boxes.append({
                "page": page_number,
                "text": clean_text,
                "pixel_box": {"x": left, "y": top, "width": right - left, "height": bottom - top},
            })

    return extracted_boxes
def export_page_with_pymupdf(page: pymupdf.Page, target_dpi: int = 625):
    """Rasterize *page* with pymupdf and return the result as a PIL image.

    The pixmap is requested at ``target_dpi`` in the RGB colorspace with
    ``annots=False``, then converted through ``Pixmap.pil_image()``.
    """
    render_options = {"dpi": target_dpi, "colorspace": pymupdf.csRGB, "annots": False}
    return page.get_pixmap(**render_options).pil_image()
def merge_with_poppler_export(index: int, data: list[TextData], page: pymupdf.Page, target_dpi: int = 625):
    """Paste the pymupdf render of every text box over the poppler render of a page.

    Args:
        index: 1-based page number; selects ``page-XXXX.png`` in
            ``SOURCE_NPDF_FOLDER`` and names the output file in ``TARGET_DIR``.
        data: Text boxes of this page, in pixel coordinates.
        page: The pymupdf page that matches ``index``.
        target_dpi: Resolution for the pymupdf render.
            NOTE(review): presumably must equal the DPI of the poppler export,
            otherwise the pasted regions will not line up -- confirm.

    Returns:
        None. The merged image is saved to ``TARGET_DIR``; when the poppler
        render for the page is missing, a message is printed and nothing is
        written.
    """
    poppler_img_path = SOURCE_NPDF_FOLDER / f"page-{index:04d}.png"
    if not poppler_img_path.exists():
        print(f"Skipping page {index} as it doesn't exist in source")
        return

    print(" Exporing pymupdf...")
    pymu_image = export_page_with_pymupdf(page, target_dpi)
    print(" Opening poppler image...")
    # The context manager releases the underlying file handle even when a
    # paste or the save fails; a bare Image.open() would leak one per page.
    with Image.open(poppler_img_path) as poppler_image:
        # Apply the text box from pymu_image to poppler_image
        print(" Applying box data...")
        for box in data:
            area = box["pixel_box"]
            x, y, w, h = area["x"], area["y"], area["width"], area["height"]
            # A box without area has nothing to copy (and an inverted crop
            # rectangle is rejected by Pillow), so skip it.
            if w <= 0 or h <= 0:
                continue
            poppler_image.paste(pymu_image.crop((x, y, x + w, y + h)), (x, y))
        poppler_image.save(TARGET_DIR / f"page-{index:04d}.png")
# --- Usage ---
if __name__ == "__main__":
    # Process PDF_SOURCE at TARGET_DPI: find the text boxes, then patch every
    # page that has at least one box. Pages without text boxes are not written.
    # The context manager closes the document on every exit path, including
    # interrupts that are not caught by the `except Exception` below.
    with pymupdf.open(PDF_SOURCE) as doc:
        try:
            results = extract_text_boxes_to_pixels(doc, target_dpi=TARGET_DPI)
            # group by page (1-based page number -> boxes on that page)
            grouped_by_pages: dict[int, list[TextData]] = {}
            for box in results:
                grouped_by_pages.setdefault(box["page"], []).append(box)
            for page, boxes in grouped_by_pages.items():
                print(f"Processing page {page}")
                # Box page numbers are 1-based, document indexing is 0-based.
                merge_with_poppler_export(page, boxes, doc[page - 1], target_dpi=TARGET_DPI)
        except Exception as e:
            # Top-level boundary: report the failure; remaining pages are not processed.
            print(f"Error processing PDF: {e}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment