Skip to content

Instantly share code, notes, and snippets.

@mara004
Last active April 24, 2025 20:57
Show Gist options
  • Save mara004/87276da4f8be31c80c38036c6ab667d7 to your computer and use it in GitHub Desktop.
Save mara004/87276da4f8be31c80c38036c6ab667d7 to your computer and use it in GitHub Desktop.
PDF rendering with pdf.js, from Python
# Four lines intentionally left blank
# SPDX-FileCopyrightText: 2025 geisserml <[email protected]>
# SPDX-License-Identifier: Apache-2.0 OR MPL-2.0
# See also https://github.com/extremeheat/JSPyBridge/blob/master/examples/python/pdfjs.py
# Py-Depends: pillow, javascript >= 1.1.0 (jspybridge)
# Js-Depends: pdfjs-dist, canvas
# Use `python -m pip install` and `python -m javascript --install`
# NOTE This currently assumes you have a custom pdf.js build in the same directory as this file, because require("pdfjs-dist") appears broken on the author's nodejs 20. See upstream build instructions. Commit 8b50836d is confirmed to work. Patch the require() calls if you want otherwise.
import time
import argparse
from pathlib import Path
import PIL.Image
import javascript
# Absolute directory of this script, kept as a plain string.
THIS_DIR = str(Path(__file__).resolve().parent)
# NOTE canvas must be the build pdfjs is linked against, otherwise it'll fail with type error
# Both JS modules are loaded from the pdf.js checkout that sits next to this file.
pdfjs = javascript.require(str(Path(THIS_DIR, "pdf.js/build/generic/build/pdf.js")))
libcanvas = javascript.require(str(Path(THIS_DIR, "pdf.js/node_modules/canvas")))
def _timed_blob(js_buffer):
    """Pull a JS buffer over to Python as bytes and print how long that took."""
    # Note that blobValueOf() is much faster than valueOf()["data"] for large byte buffers.
    starttm = time.time()
    data = js_buffer.blobValueOf()
    print(f"Data transfer took {time.time() - starttm}s")
    return data


def render_pdf(input, outdir, scale):
    """Rasterise every page of a PDF with pdf.js and save each one as a JPEG.

    Args:
        input: File path or URL, handed to ``pdfjs.getDocument()`` unchanged.
        outdir: Directory (``pathlib.Path``) receiving ``out_<n>.jpg``, where
            ``<n>`` is the zero-padded, 1-based page number.
        scale: Scale factor passed to ``page.getViewport()``.
    """
    document = pdfjs.getDocument(input).promise
    n_pages = document.numPages
    # Pad page numbers to a common width.
    n_digits = len(str(n_pages))

    for i in range(1, n_pages + 1):
        pdf_page = document.getPage(i)
        view = pdf_page.getViewport({"scale": scale})
        width = int(view.width)
        height = int(view.height)

        surface = libcanvas.createCanvas(width, height)
        render_params = {
            "canvasContext": surface.getContext("2d"),
            "viewport": view,
        }
        pdf_page.render(render_params).promise

        pixels = _timed_blob(surface.toBuffer("raw"))

        # The raw canvas buffer is declared as BGRX here and stored as an RGBX image.
        image = PIL.Image.frombuffer(
            "RGBX", (width, height), pixels, "raw", "BGRX", 0, 1
        )
        target = outdir / f"out_{i:0{n_digits}d}.jpg"
        image.save(target)

    document.destroy()
def main():
    """Parse the command line and render the requested PDF.

    Positional ``input`` is a file path or an http(s) URL. Paths are expanded
    and resolved to absolute form; URLs are passed through untouched.
    ``--outdir``/``-o`` (required) is created if missing, and ``--scale``
    defaults to 4.
    """
    parser = argparse.ArgumentParser(
        description="Render a PDF file with Mozilla pdf.js via JsPyBridge.\n" +
        "Known issues: - URL support is buggy; - certain PDFs may hit memory limits.",
    )

    def path_type(p):
        return Path(p).expanduser().resolve()

    def input_type(p):
        # Require a real scheme separator: a bare "http" prefix would also match
        # local files such as "http_notes.pdf" and skip path resolution for them.
        if p.startswith(("http://", "https://")):
            return p
        return str(path_type(p))

    parser.add_argument(
        "input", type=input_type,
        help="Input file path or URL.",
    )
    parser.add_argument("--outdir", "-o", type=path_type, required=True)
    parser.add_argument("--scale", type=float, default=4)
    args = parser.parse_args()

    # exist_ok=True already covers the "directory is there" case, so no prior exists() check.
    args.outdir.mkdir(parents=True, exist_ok=True)
    render_pdf(args.input, args.outdir, scale=args.scale)


main()
# SPDX-FileCopyrightText: 2025 geisserml <[email protected]>
# SPDX-License-Identifier: Apache-2.0 OR MPL-2.0
# This is an experimental pdf.js interface using shared memory.
# Unfortunately, shm-typed-array does not support Windows (except through Cygwin/WSL), so this is not exactly portable.
# For another version by the same author that uses pipe-based data transfer via JSPyBridge's .blobValueOf(), see above.
# Py-Depends: pillow, javascript (JSPyBridge), posix_ipc
# Js-Depends: pdfjs-dist, canvas, shm-typed-array
# You can use `python -m pip install`, and `python -m javascript --install`
# NOTE This currently assumes you have a custom pdf.js build in the same directory as this file, because require("pdfjs-dist") appears broken on the author's nodejs 20. See upstream build instructions. Commit 8b50836d is confirmed to work. Patch the require() calls if you want otherwise.
import time
# Start the clock before the remaining imports so that their cost is part of the
# "Imports took ..." figure printed at the end of this section.
starttm = time.time()
import mmap
import argparse
from pathlib import Path
# third-party
import PIL.Image
import javascript
import posix_ipc
# Absolute directory of this script, stored as a string; the `/` joins below still
# work because the right-hand operand is a Path.
THIS_DIR = str(Path(__file__).resolve().parent)
# NOTE canvas must be the build pdfjs is linked against, otherwise it'll fail with type error
pdfjs = javascript.require( str(THIS_DIR / Path("pdf.js/build/generic/build/pdf.js")) )
libcanvas = javascript.require( str(THIS_DIR / Path("pdf.js/node_modules/canvas")) )
# JS-side shared memory package, used in render_pdf() to create the segment.
libshm = javascript.require("shm-typed-array")
# Report the elapsed time and drop the temporary so it does not linger as a module global.
print(f"Imports took {time.time() - starttm}s"); del starttm
def render_pdf(input, outdir, scale):
    """Render every page of a PDF to JPEG, handing pixels over via POSIX shared memory.

    A first pass measures all pages so that a single shared memory segment can be
    sized for the largest one. Each page is then rendered on the JS side, copied
    into the segment, and read from the Python mapping of the same segment.

    Args:
        input: File path or URL, handed to ``pdfjs.getDocument()`` unchanged.
        outdir: Directory (``pathlib.Path``) receiving ``out_<n>.jpg``, where
            ``<n>`` is the zero-padded, 1-based page number.
        scale: Scale factor passed to ``page.getViewport()``.

    Raises:
        RuntimeError: If the JS side could not create the shared memory segment.
    """
    pdf = pdfjs.getDocument(input).promise
    n_pages = pdf.numPages
    n_digits = len(str(n_pages))

    # Pass 1: collect the pixel dimensions of every page.
    starttm = time.time()
    sizes = []
    for i in range(1, n_pages+1):
        page = pdf.getPage(i)
        viewport = page.getViewport({"scale": scale})
        sizes.append( (int(viewport.width), int(viewport.height)) )
    # 4 bytes per pixel in the raw canvas buffer.
    max_alloc = max(w*h for w, h in sizes) * 4
    print(f"Shared memory size in bytes: {max_alloc} (took {time.time() - starttm}s to determine)"); del starttm

    memkey = "/pypdfjs_render_shm"
    js_shm = libshm.create(max_alloc, "Buffer", memkey)
    # Explicit check rather than assert, so it is not stripped when running with -O.
    if js_shm is None:
        raise RuntimeError("Shared memory of this name already exists, go to /dev/shm and remove it.")
    py_shm_handle = posix_ipc.SharedMemory(memkey)

    py_shm = None
    try:
        py_shm = mmap.mmap(py_shm_handle.fd, py_shm_handle.size)

        # Pass 2: render. Pages are numbered from 1, both for pdf.js and for the
        # output file names.
        for i, (w, h) in enumerate(sizes, start=1):
            page = pdf.getPage(i)
            viewport = page.getViewport({"scale": scale})
            canvas = libcanvas.createCanvas(w, h)
            context = canvas.getContext("2d")
            page.render({"canvasContext": context, "viewport": viewport}).promise

            # the author is not aware of a way to create a canvas backed by an external buffer, so this copies
            js_buffer = canvas.toBuffer("raw")
            starttm = time.time()
            js_buffer.copy(js_shm)
            py_shm.seek(0)
            print(f"Data transfer took {time.time() - starttm}s")

            # The segment is sized for the largest page, so it may be longer than this page needs.
            pil_image = PIL.Image.frombuffer("RGBX", (w, h), py_shm, "raw", "BGRX", 0, 1)
            pil_image.save(outdir / f"out_{i:0{n_digits}d}.jpg")
    finally:
        try:
            # Release the Python mapping, which was previously left open.
            if py_shm is not None:
                py_shm.close()
        finally:
            # Need to use native (non-js) functions to reliably destroy shared memory. Bridge seems to break in case of KeyboardInterrupt.
            py_shm_handle.close_fd()
            py_shm_handle.unlink()
            assert not Path("/dev/shm" + memkey).exists()

    pdf.destroy()
def main():
    """Parse the command line and render the requested PDF.

    Positional ``input`` is a file path or an http(s) URL. Paths are expanded
    and resolved to absolute form; URLs are passed through untouched.
    ``--outdir``/``-o`` (required) is created if missing, and ``--scale``
    defaults to 4.
    """
    parser = argparse.ArgumentParser(
        description="Render a PDF file with Mozilla pdf.js via JsPyBridge.\n" +
        "Known issues: - URL support is buggy; - certain PDFs may hit memory limits.",
    )

    def path_type(p):
        return Path(p).expanduser().resolve()

    def input_type(p):
        # Require a real scheme separator: a bare "http" prefix would also match
        # local files such as "http_notes.pdf" and skip path resolution for them.
        if p.startswith(("http://", "https://")):
            return p
        return str(path_type(p))

    parser.add_argument(
        "input", type=input_type,
        help="Input file path or URL.",
    )
    parser.add_argument("--outdir", "-o", type=path_type, required=True)
    parser.add_argument("--scale", type=float, default=4)
    args = parser.parse_args()

    # exist_ok=True already covers the "directory is there" case, so no prior exists() check.
    args.outdir.mkdir(parents=True, exist_ok=True)
    render_pdf(args.input, args.outdir, scale=args.scale)


main()
# SPDX-FileCopyrightText: 2025 geisserml <[email protected]>
# SPDX-License-Identifier: Apache-2.0 OR MPL-2.0
def import_js():
    """Import the JSPyBridge package and publish it as the module global ``javascript``.

    Used as the target of the import thread started right after this definition.
    """
    import javascript as bridge
    globals()["javascript"] = bridge
from threading import Thread
# Import the JS bridge on a separate thread; the remaining imports and the ctypes
# setup below run in the meantime, and the thread is joined before `javascript` is used.
import_thread = Thread(target=import_js)
print("Starting import thread")
import_thread.start()
import mmap
import ctypes as ct
import argparse
from pathlib import Path
from ctypes.util import find_library
import PIL.Image
LIBC_PATH = find_library("c")
py_libc = ct.CDLL(LIBC_PATH)
# flags, see /usr/include/bits/fcntl-linux.h
O_CREAT = 0o100 # 64
O_EXCL = 0o200 # 128
O_NONBLOCK = 0o4000 # 2048
O_RDONLY = 0o0 # 0
O_WRONLY = 0o1 # 1
O_RDWR = 0o2 # 2
O_TRUNC = 0o1000 # 512
# <unistd.h>
py_ftruncate = py_libc.ftruncate
py_ftruncate.argtypes = [ct.c_int, ct.c_long]
py_ftruncate.restype = ct.c_int
# <sys/mman.h>
py_shm_open = py_libc.shm_open
py_shm_open.argtypes = [ct.c_char_p, ct.c_int, ct.c_uint]
py_shm_open.restype = ct.c_int
py_shm_unlink = py_libc.shm_unlink
py_shm_unlink.argtypes = [ct.c_char_p]
py_shm_unlink.restype = ct.c_int
# Absolute directory of this script, stored as a string; the `/` joins below still
# work because the right-hand operand is a Path.
THIS_DIR = str(Path(__file__).resolve().parent)
PDFJS_PATH = THIS_DIR / Path("pdf.js/build/generic/build/pdf.js")
CANVAS_PATH = THIS_DIR / Path("pdf.js/node_modules/canvas")
# The `javascript` global only exists once import_js() has finished, so wait for it here.
print("Waiting for import thread to finish")
import_thread.join()
print("Importing JS libraries")
globalThis = javascript.globalThis
pdfjs = javascript.require(str(PDFJS_PATH))
libcanvas = javascript.require(str(CANVAS_PATH))
# koffi is the FFI package used on the JS side; it loads the same libc path as the
# ctypes binding above.
# NOTE(review): find_library() may return None if no libc is found - confirm how koffi.load() handles that.
koffi = javascript.require("koffi")
js_libc = koffi.load(LIBC_PATH)
# JS-side bindings, declared from C-style prototype strings.
js_shm_open = js_libc.func("int shm_open(char* name, int oflag, unsigned int mode)")
js_mmap = js_libc.func("void* mmap(void* addr, size_t len, int prot, int flags, int fd, long offset)")
print("Done")
def render_pdf(input, outdir, scale):
    """Render every page of a PDF to JPEG, handing pixels over via POSIX shared memory.

    The segment is created from Python through ctypes and opened a second time
    from JS through koffi. Each page is rendered on the JS side, copied into the
    JS view of the segment, and read from the Python mapping.

    Args:
        input: File path or URL, handed to ``pdfjs.getDocument()`` unchanged.
        outdir: Directory (``pathlib.Path``) receiving ``out_<n>.jpg``, where
            ``<n>`` is the zero-padded, 1-based page number.
        scale: Scale factor passed to ``page.getViewport()``.

    Raises:
        OSError: If the shared memory segment cannot be created or resized.
    """
    import os  # only needed here, to close the raw shared memory descriptor

    pdf = pdfjs.getDocument(input).promise
    n_pages = pdf.numPages
    n_digits = len(str(n_pages))

    print("Determine shared memory size ...")
    sizes = []
    for i in range(1, n_pages+1):
        page = pdf.getPage(i)
        viewport = page.getViewport({"scale": scale})
        sizes.append( (int(viewport.width), int(viewport.height)) )
    # 4 bytes per pixel in the raw canvas buffer; sized for the largest page.
    shm_size = max(w*h for w, h in sizes) * 4
    print(f"Shared memory size in bytes: {shm_size}")

    shm_key_s = "pdfjs_render_shm"
    shm_key_b = shm_key_s.encode("ascii")

    print("Creating shared memory ...")
    shm_fd_py = py_shm_open(shm_key_b, O_CREAT|O_RDWR, 0o666)
    # The C function signals failure through its return value, not an exception.
    if shm_fd_py < 0:
        raise OSError(f"shm_open() failed for {shm_key_s!r}")

    # Initialised up front so the cleanup below never hits an unbound name.
    memmap_py = None
    memview_py = None
    try:
        if py_ftruncate(shm_fd_py, shm_size) != 0:
            raise OSError(f"ftruncate() to {shm_size} bytes failed for {shm_key_s!r}")

        print("Setting up cross-language handles for shared memory ...")
        shm_fd_js = js_shm_open(shm_key_s, O_RDWR, 0o666)
        print(shm_fd_py, shm_fd_js)

        memmap_py = mmap.mmap(shm_fd_py, shm_size, flags=mmap.MAP_SHARED, prot=mmap.PROT_READ)
        memview_py = memoryview(memmap_py)

        # NOTE the JS-side descriptor and mapping are not released here; this file
        # declares no close()/munmap() binding on the JS side.
        memmap_ptr_js = js_mmap(0, shm_size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, shm_fd_js, 0)
        array_buffer_js = koffi.view(memmap_ptr_js, shm_size)
        array_js = globalThis.Uint8Array(array_buffer_js, 0, shm_size)
        print(memmap_py, memmap_ptr_js, array_buffer_js, array_js)

        for i, (w, h) in enumerate(sizes, start=1):
            print(f"Page {i}")
            page = pdf.getPage(i)
            viewport = page.getViewport({"scale": scale})
            canvas = libcanvas.createCanvas(w, h)
            context = canvas.getContext("2d")
            page.render({"canvasContext": context, "viewport": viewport}).promise

            # TODO file a feature request with the node-canvas library to create a canvas in shared memory directly
            js_buffer = canvas.toBuffer("raw")
            js_buffer.copy(array_js)

            # We assume that PIL doesn't mind if the buffer is longer
            # NOTE passing a memoryview requires PIL >= 9.5
            pil_image = PIL.Image.frombuffer("RGBX", (w, h), memview_py, "raw", "BGRX", 0, 1)
            pil_image.save(outdir / f"out_{i:0{n_digits}d}.jpg")
    finally:
        try:
            # The memoryview must be released before the mmap can be closed, otherwise:
            # "BufferError: cannot close exported pointers exist"
            if memview_py is not None:
                memview_py.release()
            if memmap_py is not None:
                memmap_py.close()
        finally:
            try:
                # The descriptor from shm_open() was previously never closed.
                os.close(shm_fd_py)
            finally:
                py_shm_unlink(shm_key_b)

    pdf.destroy()
def main():
    """Parse the command line and render the requested PDF.

    Positional ``input`` is a file path or an http(s) URL. Paths are expanded
    and resolved to absolute form; URLs are passed through untouched.
    ``--outdir``/``-o`` (required) is created if missing, and ``--scale``
    defaults to 4.
    """
    parser = argparse.ArgumentParser(
        description="Render a PDF file with Mozilla pdf.js via JsPyBridge."
    )

    def path_type(p):
        return Path(p).expanduser().resolve()

    def input_type(p):
        # Require a real scheme separator: a bare "http" prefix would also match
        # local files such as "http_notes.pdf" and skip path resolution for them.
        if p.startswith(("http://", "https://")):
            return p
        return str(path_type(p))

    parser.add_argument(
        "input", type=input_type,
        help="Input file path or URL.",
    )
    parser.add_argument("--outdir", "-o", type=path_type, required=True)
    parser.add_argument("--scale", type=float, default=4)
    args = parser.parse_args()

    # exist_ok=True already covers the "directory is there" case, so no prior exists() check.
    args.outdir.mkdir(parents=True, exist_ok=True)
    render_pdf(args.input, args.outdir, scale=args.scale)


main()
@mara004
Copy link
Author

mara004 commented Nov 30, 2023

Historical note: the author created extremeheat/JSPyBridge#103 (blobValueOf()) on behalf of this use case.

@mara004
Copy link
Author

mara004 commented Apr 3, 2025

We might want to rewrite the shared memory approach with ctypes and koffi (or maybe bun:ffi or node-ffi-rs) for direct access to the OS shared memory APIs, since the available third-party JS shared memory packages seemed unsatisfactory.

Technically, this should be doable, as most of it could be written with ctypes, and we would just need to get a JS object view of the shared memory in the end (Buffer or something), so it should be possible to keep the koffi part minimal.
Ideally, canvas should expose the underlying cairo API to create a canvas backed by an external (caller-provided) buffer.

@mara004
Copy link
Author

mara004 commented Apr 14, 2025

Future ideas

Chores:

  • handle windows shared memory
  • factor out helper functions for shared memory (rather than inline FFI usage in render_pdf()), maybe move the bindings to a separate file
  • provide an option whether to use shared memory or blobValueOf(), merging the separate files into one.
  • add ability to plug in different shmem size predictors, or to pass a custom value (use blobValueOf() if page is larger than buffer)

pdfJS features:

  • handling different bitmap formats
  • crop, matrix transformations
  • play with other RenderParameters, e.g. pageColors, intent, ...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment