Last active
April 24, 2025 20:57
-
-
Save mara004/87276da4f8be31c80c38036c6ab667d7 to your computer and use it in GitHub Desktop.
PDF rendering with pdf.js, from Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Four lines intentionally left blank | |
# SPDX-FileCopyrightText: 2025 geisserml <[email protected]> | |
# SPDX-License-Identifier: Apache-2.0 OR MPL-2.0 | |
# See also https://github.com/extremeheat/JSPyBridge/blob/master/examples/python/pdfjs.py | |
# Py-Depends: pillow, javascript >= 1.1.0 (jspybridge) | |
# Js-Depends: pdfjs-dist, canvas | |
# Use `python -m pip install` and `python -m javascript --install` | |
# NOTE This currently assumes you have a custom pdf.js build in the same directory as this file, because require("pdfjs-dist") appears broken on the author's nodejs 20. See upstream build instructions. Commit 8b50836d is confirmed to work. Patch the require() calls if you want otherwise. | |
import time | |
import argparse | |
from pathlib import Path | |
import PIL.Image | |
import javascript | |
# Directory containing this script, stored as a plain string.
THIS_DIR = str(Path(__file__).resolve().parent)

# NOTE canvas must be the build pdfjs is linked against, otherwise it'll fail with type error
# Both modules are loaded from the local pdf.js checkout next to this file.
pdfjs = javascript.require(
    str(Path(THIS_DIR).joinpath("pdf.js/build/generic/build/pdf.js"))
)
libcanvas = javascript.require(
    str(Path(THIS_DIR).joinpath("pdf.js/node_modules/canvas"))
)
def render_pdf(input, outdir, scale):
    """Render every page of a PDF with pdf.js and save each one as a JPEG.

    input: value handed to pdfjs.getDocument() (main() passes a path or URL string).
    outdir: object supporting the ``/`` operator (main() passes a pathlib.Path);
        pages are written as ``out_<page>.jpg`` with the 1-based page number
        zero-padded to the number of digits of the page count.
    scale: scale factor forwarded to page.getViewport().
    """
    pdf = pdfjs.getDocument(input).promise
    n_pages = pdf.numPages
    n_digits = len(str(n_pages))

    for i in range(1, n_pages + 1):
        pdf_page = pdf.getPage(i)
        vp = pdf_page.getViewport({"scale": scale})
        w = int(vp.width)
        h = int(vp.height)

        # Draw the page onto a fresh node-canvas of the viewport's size.
        cv = libcanvas.createCanvas(w, h)
        ctx = cv.getContext("2d")
        pdf_page.render({"canvasContext": ctx, "viewport": vp}).promise

        # Note that blobValueOf() is much faster than valueOf()["data"] for large byte buffers.
        raw_js = cv.toBuffer("raw")
        t0 = time.time()
        raw_py = raw_js.blobValueOf()
        elapsed = time.time() - t0
        print(f"Data transfer took {elapsed}s")

        # The raw canvas bytes are decoded as BGRX into an RGBX image.
        img = PIL.Image.frombuffer("RGBX", (w, h), raw_py, "raw", "BGRX", 0, 1)
        img.save(outdir / f"out_{i:0{n_digits}d}.jpg")

    pdf.destroy()
def main():
    """Parse the command line and render the requested PDF into --outdir.

    Arguments: a positional ``input`` (file path or http(s) URL), a required
    ``--outdir``/``-o`` directory (created if missing) and an optional
    ``--scale`` float (default 4).
    """

    def path_type(p):
        # Expand "~" and make the path absolute.
        return Path(p).expanduser().resolve()

    def input_type(p):
        # Only real http(s) URLs are passed through untouched. A bare "http"
        # prefix test would misclassify local files such as "httpd.pdf".
        if p.startswith(("http://", "https://")):
            return p
        return str(path_type(p))

    parser = argparse.ArgumentParser(
        description="Render a PDF file with Mozilla pdf.js via JsPyBridge.\n" +
        "Known issues: - URL support is buggy; - certain PDFs may hit memory limits.",
    )
    parser.add_argument(
        "input", type=input_type,
        help="Input file path or URL.",
    )
    parser.add_argument("--outdir", "-o", type=path_type, required=True)
    parser.add_argument("--scale", type=float, default=4)
    args = parser.parse_args()

    # exist_ok=True already tolerates an existing directory, so no separate
    # exists() check (and no check-then-create race) is needed.
    args.outdir.mkdir(parents=True, exist_ok=True)
    render_pdf(args.input, args.outdir, scale=args.scale)


main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2025 geisserml <[email protected]> | |
# SPDX-License-Identifier: Apache-2.0 OR MPL-2.0 | |
# This is an experimental pdf.js interface using shared memory. | |
# Unfortunately, shm-typed-array does not support Windows (except through Cygwin/WSL), so this is not exactly portable. | |
# For another version by the same author that uses pipe-based data transfer via JSPyBridge's .blobValueOf(), see above. | |
# Py-Depends: pillow, javascript (JSPyBridge), posix_ipc | |
# Js-Depends: pdfjs-dist, canvas, shm-typed-array | |
# You can use `python -m pip install`, and `python -m javascript --install` | |
# NOTE This currently assumes you have a custom pdf.js build in the same directory as this file, because require("pdfjs-dist") appears broken on the author's nodejs 20. See upstream build instructions. Commit 8b50836d is confirmed to work. Patch the require() calls if you want otherwise. | |
import time
# Start the clock before the remaining imports so their cost can be reported below.
starttm = time.time()

import mmap
import argparse
from pathlib import Path
# third-party
import PIL.Image
import javascript
import posix_ipc

# Directory containing this script, stored as a plain string.
THIS_DIR = str(Path(__file__).resolve().parent)

# NOTE canvas must be the build pdfjs is linked against, otherwise it'll fail with type error
pdfjs = javascript.require(
    str(Path(THIS_DIR).joinpath("pdf.js/build/generic/build/pdf.js"))
)
libcanvas = javascript.require(
    str(Path(THIS_DIR).joinpath("pdf.js/node_modules/canvas"))
)
libshm = javascript.require("shm-typed-array")

print(f"Imports took {time.time() - starttm}s")
del starttm
def render_pdf(input, outdir, scale):
    """Render every page of a PDF to JPEG, moving pixels through POSIX shared memory.

    A first pass over all pages determines the largest page bitmap (4 bytes
    per pixel) so that one shared memory segment can be reused for every page.
    The JS side copies each rendered canvas into that segment; the Python side
    reads it through an mmap.

    input: value handed to pdfjs.getDocument() (main() passes a path or URL string).
    outdir: object supporting the ``/`` operator (main() passes a pathlib.Path);
        pages are written as ``out_<page>.jpg`` using the 1-based page number,
        matching the sibling scripts.
    scale: scale factor forwarded to page.getViewport().

    Raises RuntimeError if the shared memory segment could not be created.
    """
    pdf = pdfjs.getDocument(input).promise
    n_pages = pdf.numPages
    n_digits = len(str(n_pages))

    # Pass 1: collect the pixel size of every page to size the shared buffer.
    starttm = time.time()
    sizes = []
    for page_no in range(1, n_pages + 1):
        page = pdf.getPage(page_no)
        viewport = page.getViewport({"scale": scale})
        sizes.append((int(viewport.width), int(viewport.height)))
    max_alloc = max(w*h for w, h in sizes) * 4
    print(f"Shared memory size in bytes: {max_alloc} (took {time.time() - starttm}s to determine)")

    memkey = "/pypdfjs_render_shm"
    js_shm = libshm.create(max_alloc, "Buffer", memkey)
    # Explicit raise instead of assert: this is a runtime condition and must
    # not disappear when Python runs with -O.
    if js_shm is None:
        raise RuntimeError("Shared memory of this name already exists, go to /dev/shm and remove it.")
    py_shm_handle = posix_ipc.SharedMemory(memkey)

    try:
        py_shm = mmap.mmap(py_shm_handle.fd, py_shm_handle.size)

        # Pass 2: render each page and pull its pixels out of shared memory.
        for page_no, (w, h) in enumerate(sizes, start=1):
            page = pdf.getPage(page_no)
            viewport = page.getViewport({"scale": scale})
            canvas = libcanvas.createCanvas(w, h)
            context = canvas.getContext("2d")
            page.render({"canvasContext": context, "viewport": viewport}).promise

            # the author is not aware of a way to create a canvas backed by an external buffer, so this copies
            js_buffer = canvas.toBuffer("raw")
            starttm = time.time()
            js_buffer.copy(js_shm)
            py_shm.seek(0)
            print(f"Data transfer took {time.time() - starttm}s")

            # The segment is sized for the largest page, so it may be longer
            # than this page's w*h*4 bytes.
            pil_image = PIL.Image.frombuffer("RGBX", (w, h), py_shm, "raw", "BGRX", 0, 1)
            pil_image.save(outdir / f"out_{page_no:0{n_digits}d}.jpg")
    finally:
        # Need to use native (non-js) functions to reliably destroy shared memory. Bridge seems to break in case of KeyboardInterrupt.
        py_shm_handle.close_fd()
        py_shm_handle.unlink()
        assert not Path("/dev/shm" + memkey).exists()

    # Deliberately outside the finally block: this goes through the JS bridge,
    # which may no longer be usable after an interrupt (see comment above).
    pdf.destroy()
def main():
    """Parse the command line and render the requested PDF into --outdir.

    Arguments: a positional ``input`` (file path or http(s) URL), a required
    ``--outdir``/``-o`` directory (created if missing) and an optional
    ``--scale`` float (default 4).
    """

    def path_type(p):
        # Expand "~" and make the path absolute.
        return Path(p).expanduser().resolve()

    def input_type(p):
        # Only real http(s) URLs are passed through untouched. A bare "http"
        # prefix test would misclassify local files such as "httpd.pdf".
        if p.startswith(("http://", "https://")):
            return p
        return str(path_type(p))

    parser = argparse.ArgumentParser(
        description="Render a PDF file with Mozilla pdf.js via JsPyBridge.\n" +
        "Known issues: - URL support is buggy; - certain PDFs may hit memory limits.",
    )
    parser.add_argument(
        "input", type=input_type,
        help="Input file path or URL.",
    )
    parser.add_argument("--outdir", "-o", type=path_type, required=True)
    parser.add_argument("--scale", type=float, default=4)
    args = parser.parse_args()

    # exist_ok=True already tolerates an existing directory, so no separate
    # exists() check (and no check-then-create race) is needed.
    args.outdir.mkdir(parents=True, exist_ok=True)
    render_pdf(args.input, args.outdir, scale=args.scale)


main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: 2025 geisserml <[email protected]> | |
# SPDX-License-Identifier: Apache-2.0 OR MPL-2.0 | |
def import_js():
    """Import the ``javascript`` package and publish it as a module-level global.

    Written as a function so that it can be used as a thread target.
    """
    import javascript as bridge
    globals()["javascript"] = bridge
from threading import Thread

# Run the `javascript` import on a background thread so that the setup below
# can proceed in the meantime (presumably because that import is slow).
import_thread = Thread(target=import_js)
print("Starting import thread")
import_thread.start()

import os
import mmap
import ctypes as ct
import argparse
from pathlib import Path
from ctypes.util import find_library
import PIL.Image

LIBC_PATH = find_library("c")
py_libc = ct.CDLL(LIBC_PATH)

# open(2)-style flags. Taken from the `os` module rather than hard-coded from
# the Linux headers, because the numeric values differ between platforms.
O_CREAT = os.O_CREAT
O_EXCL = os.O_EXCL
O_NONBLOCK = os.O_NONBLOCK
O_RDONLY = os.O_RDONLY
O_WRONLY = os.O_WRONLY
O_RDWR = os.O_RDWR
O_TRUNC = os.O_TRUNC

# <unistd.h>
py_ftruncate = py_libc.ftruncate
py_ftruncate.argtypes = [ct.c_int, ct.c_long]
py_ftruncate.restype = ct.c_int

# <sys/mman.h>
py_shm_open = py_libc.shm_open
py_shm_open.argtypes = [ct.c_char_p, ct.c_int, ct.c_uint]
py_shm_open.restype = ct.c_int
py_shm_unlink = py_libc.shm_unlink
py_shm_unlink.argtypes = [ct.c_char_p]
py_shm_unlink.restype = ct.c_int

THIS_DIR = str(Path(__file__).resolve().parent)
PDFJS_PATH = THIS_DIR / Path("pdf.js/build/generic/build/pdf.js")
CANVAS_PATH = THIS_DIR / Path("pdf.js/node_modules/canvas")

# The bridge must be fully imported before any JS module can be required.
print("Waiting for import thread to finish")
import_thread.join()

print("Importing JS libraries")
globalThis = javascript.globalThis
pdfjs = javascript.require(str(PDFJS_PATH))
libcanvas = javascript.require(str(CANVAS_PATH))
koffi = javascript.require("koffi")

# JS-side bindings to the same libc, so both languages can open and map the
# same shared memory object.
js_libc = koffi.load(LIBC_PATH)
js_shm_open = js_libc.func("int shm_open(char* name, int oflag, unsigned int mode)")
js_mmap = js_libc.func("void* mmap(void* addr, size_t len, int prot, int flags, int fd, long offset)")
print("Done")
def render_pdf(input, outdir, scale):
    """Render every page of a PDF to JPEG via a shared memory object opened from both languages.

    A first pass over all pages determines the largest page bitmap (4 bytes
    per pixel). One shared memory object of that size is created through
    libc from Python, opened and mapped a second time from JS via koffi, and
    reused for every page.

    input: value handed to pdfjs.getDocument() (main() passes a path or URL string).
    outdir: object supporting the ``/`` operator (main() passes a pathlib.Path);
        pages are written as ``out_<page>.jpg`` with the 1-based page number.
    scale: scale factor forwarded to page.getViewport().

    Raises OSError if shm_open() or ftruncate() report failure.
    """
    import os  # local import: only needed here to close the raw shm descriptor

    pdf = pdfjs.getDocument(input).promise
    n_pages = pdf.numPages
    n_digits = len(str(n_pages))

    print("Determine shared memory size ...")
    sizes = []
    for i in range(1, n_pages+1):
        page = pdf.getPage(i)
        viewport = page.getViewport({"scale": scale})
        sizes.append((int(viewport.width), int(viewport.height)))
    shm_size = max(w*h for w, h in sizes) * 4
    print(f"Shared memory size in bytes: {shm_size}")

    shm_key_s = "pdfjs_render_shm"
    shm_key_b = shm_key_s.encode("ascii")

    # Pre-bind everything the cleanup code touches, so that a failure early in
    # the try block cannot raise UnboundLocalError and mask the real error.
    shm_fd_py = -1
    memmap_py = None
    memview_py = None

    try:
        print("Creating shared memory ...")
        shm_fd_py = py_shm_open(shm_key_b, O_CREAT|O_RDWR, 0o666)
        if shm_fd_py < 0:
            raise OSError(f"shm_open() failed for {shm_key_s!r}")
        if py_ftruncate(shm_fd_py, shm_size) != 0:
            raise OSError(f"ftruncate() to {shm_size} bytes failed for {shm_key_s!r}")

        print("Setting up cross-language handles for shared memory ...")
        shm_fd_js = js_shm_open(shm_key_s, O_RDWR, 0o666)
        if shm_fd_js < 0:
            raise OSError(f"shm_open() on the JS side failed for {shm_key_s!r}")
        print(shm_fd_py, shm_fd_js)

        # Python only reads; the JS side maps the same object read/write.
        memmap_py = mmap.mmap(shm_fd_py, shm_size, flags=mmap.MAP_SHARED, prot=mmap.PROT_READ)
        memview_py = memoryview(memmap_py)
        memmap_ptr_js = js_mmap(0, shm_size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, shm_fd_js, 0)
        array_buffer_js = koffi.view(memmap_ptr_js, shm_size)
        array_js = globalThis.Uint8Array(array_buffer_js, 0, shm_size)
        print(memmap_py, memmap_ptr_js, array_buffer_js, array_js)

        for i, (w, h) in enumerate(sizes, start=1):
            print(f"Page {i}")
            page = pdf.getPage(i)
            viewport = page.getViewport({"scale": scale})
            canvas = libcanvas.createCanvas(w, h)
            context = canvas.getContext("2d")
            page.render({"canvasContext": context, "viewport": viewport}).promise

            # TODO file a feature request with the node-canvas library to create a canvas in shared memory directly
            js_buffer = canvas.toBuffer("raw")
            js_buffer.copy(array_js)

            # We assume that PIL doesn't mind if the buffer is longer
            # NOTE passing a memoryview requires PIL >= 9.5
            pil_image = PIL.Image.frombuffer("RGBX", (w, h), memview_py, "raw", "BGRX", 0, 1)
            pil_image.save(outdir / f"out_{i:0{n_digits}d}.jpg")
    finally:
        try:
            # Our reference to the memoryview must be dropped before closing the mmap, to avoid
            # "BufferError: cannot close exported pointers exist".
            memview_py = None
            if memmap_py is not None:
                memmap_py.close()
        finally:
            # Native cleanup only. The JS-side descriptor and mapping are not
            # released here (no close/munmap binding is set up on the JS side).
            if shm_fd_py >= 0:
                os.close(shm_fd_py)
            py_shm_unlink(shm_key_b)

    pdf.destroy()
def main():
    """Parse the command line and render the requested PDF into --outdir.

    Arguments: a positional ``input`` (file path or http(s) URL), a required
    ``--outdir``/``-o`` directory (created if missing) and an optional
    ``--scale`` float (default 4).
    """

    def path_type(p):
        # Expand "~" and make the path absolute.
        return Path(p).expanduser().resolve()

    def input_type(p):
        # Only real http(s) URLs are passed through untouched. A bare "http"
        # prefix test would misclassify local files such as "httpd.pdf".
        if p.startswith(("http://", "https://")):
            return p
        return str(path_type(p))

    parser = argparse.ArgumentParser(
        description="Render a PDF file with Mozilla pdf.js via JsPyBridge."
    )
    parser.add_argument(
        "input", type=input_type,
        help="Input file path or URL.",
    )
    parser.add_argument("--outdir", "-o", type=path_type, required=True)
    parser.add_argument("--scale", type=float, default=4)
    args = parser.parse_args()

    # exist_ok=True already tolerates an existing directory, so no separate
    # exists() check (and no check-then-create race) is needed.
    args.outdir.mkdir(parents=True, exist_ok=True)
    render_pdf(args.input, args.outdir, scale=args.scale)


main()
We might want to rewrite the shared memory approach with ctypes and koffi (or maybe bun:ffi or node-ffi-rs) for direct access to the OS shared memory APIs, as all third-party JS shared memory packages seemed unsatisfactory.
Technically, this should be doable, as most of it could be written with ctypes, and we would just need to get a JS object view of the shared memory in the end (Buffer or something), so it should be possible to keep the koffi part minimal.
Ideally, canvas should expose the underlying cairo API to create a canvas backed by an external (caller-provided) buffer.
Future ideas
Chores:
- handle windows shared memory
- factor out helper functions for shared memory (rather than inline FFI usage in render_pdf()), maybe move the bindings to a separate file
- provide an option whether to use shared memory or blobValueOf(), merging the separate files into one
- add ability to plug in different shmem size predictors, or to pass a custom value (use blobValueOf() if page is larger than buffer)
pdfJS features:
- handling different bitmap formats
- crop, matrix transformations
- play with other RenderParameters, e.g. pageColors, intent, ...
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Historical note: the author created extremeheat/JSPyBridge#103 (blobValueOf()) on behalf of this use case.