Skip to content

Instantly share code, notes, and snippets.

@yuchen-xue
Created May 10, 2025 06:42
Show Gist options
  • Save yuchen-xue/91a3317b3f22aae8c4558279bb4941fa to your computer and use it in GitHub Desktop.
Save yuchen-xue/91a3317b3f22aae8c4558279bb4941fa to your computer and use it in GitHub Desktop.
A function for extracting text and images from a PDF file.
def _save_image_as_png(doc, xref: int, img_path: Path) -> None:
    """Write the embedded image *xref* of *doc* to *img_path* as a real PNG."""
    base_image = doc.extract_image(xref)
    if base_image.get("ext") == "png":
        # Already PNG-encoded: the raw bytes can be written unchanged.
        img_path.write_bytes(base_image["image"])
        return

    # extract_image() returns the image in its native encoding (JPEG, JPX, ...).
    # Dumping those bytes into a ".png" file would produce a mislabeled file,
    # so re-encode through a Pixmap instead.
    pix = fitz.Pixmap(doc, xref)
    if pix.n - pix.alpha > 3:
        # PNG cannot store CMYK data; convert to RGB first.
        pix = fitz.Pixmap(fitz.csRGB, pix)
    pix.save(str(img_path))


def extract_pdf(
    pdf_path: str | os.PathLike,
    img_dir: str | os.PathLike,
    text_output_path: str | os.PathLike,
    skip_first_image: bool = True,
) -> None:
    """
    Extract text and images from a PDF file.

    The text of every page is written, in page order, to ``text_output_path``.
    Every image found on a page is saved to ``img_dir`` as a PNG file named
    ``page_<page>_img_<index>.png`` (``<page>`` is 1-based, ``<index>`` is the
    0-based position of the image on its page), and an ``<img src="...">`` line
    is written to the text file right after that page's text. The ``src``
    attribute holds the bare file name only, not the path to ``img_dir``.

    Args:
        pdf_path: Path to the PDF file.
        img_dir: Directory to save extracted images. Created if missing.
        text_output_path: Path to save the extracted text. Its parent
            directory is created if missing; an existing file is overwritten.
        skip_first_image: If True, skips the first image on each page.
            This is useful if the first image is a thumbnail or cover image.
            The remaining images keep their original on-page index, so the
            numbering then starts at 1.
    """
    # Create the output locations if they don't exist
    img_dir = Path(img_dir)
    img_dir.mkdir(parents=True, exist_ok=True)
    text_output_path = Path(text_output_path)
    text_output_path.parent.mkdir(parents=True, exist_ok=True)

    # Open the PDF file and extract text and images
    with (
        fitz.open(pdf_path) as doc,
        text_output_path.open("w", encoding="utf-8") as output_file,
    ):
        for page_num, page in enumerate(doc, start=1):
            # Extract text and write to the output file
            print(page.get_text().strip(), file=output_file)

            # Extract every image on the page and save it
            for img_index, img in enumerate(page.get_images(full=True)):
                # Skip the first image since it is usually the page thumbnail
                if skip_first_image and img_index == 0:
                    continue

                # The first item of each image entry is its xref number
                img_path = img_dir / f"page_{page_num}_img_{img_index}.png"
                _save_image_as_png(doc, img[0], img_path)

                # Write the image reference in HTML format in the output file
                print(f'<img src="{img_path.name}">', file=output_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment