yuchen-xue · May 10, 2025 06:42
diff --git a/extract-pdf.py b/extract-pdf.py
 def extract_pdf(
    pdf_path: str | os.PathLike,
    img_dir: str | os.PathLike,
    text_output_path: str | os.PathLike,
    skip_first_image: bool = True,
 ) -> None:
    """
    Extract the text and embedded images of every page of a PDF file.

    For each page, the stripped page text is written to the output text file,
    followed by one ``<img src="...">`` line per image saved from that page.
    Images are rendered to real PNG data and stored in ``img_dir`` as
    ``page_<page number>_img_<image index>.png``, where the page number is
    1-based and the image index is the 0-based position of the image in the
    page's image list. Only the file name (not the directory) is written
    into the ``<img>`` tag.

    Args:
        pdf_path: Path to the PDF file.
        img_dir: Directory to save extracted images. It is created
            (including parents) if it does not exist.
        text_output_path: Path of the text file to write. An existing file
            is overwritten.
        skip_first_image: If True, the image at index 0 of every page is
            neither saved nor referenced. This is useful if the first image
            is a thumbnail or cover image.
    """

    # Create the output directory for images if it doesn't exist
    img_dir = Path(img_dir)
    img_dir.mkdir(parents=True, exist_ok=True)

    # Open the PDF file and the text output together so both are closed
    # even if extraction fails part-way through
    with (
        fitz.open(pdf_path) as doc,
        open(text_output_path, "w", encoding="utf-8") as output_file,
    ):
        # Page numbers are 1-based in the generated file names
        for page_num, page in enumerate(doc, start=1):
            # Extract text and write to the output file
            print(page.get_text().strip(), file=output_file)

            # Extract every image on the page and save it
            for img_index, img in enumerate(page.get_images(full=True)):
                # Skip the first image since it is usually the page thumbnail
                if skip_first_image and img_index == 0:
                    continue

                # The first item of an image entry is its xref number
                xref = img[0]

                # Build a pixmap instead of dumping the raw embedded bytes:
                # the embedded data may be JPEG, JPX, etc., and writing it
                # unchanged would produce a mislabelled ".png" file.
                pix = fitz.Pixmap(doc, xref)
                if pix.n - pix.alpha > 3:
                    # PNG cannot store CMYK, so convert to RGB first
                    pix = fitz.Pixmap(fitz.csRGB, pix)

                # Save the image as genuine PNG data
                img_path = img_dir / f"page_{page_num}_img_{img_index}.png"
                img_path.write_bytes(pix.tobytes("png"))

                # Write the image path following the HTML format in the output file
                print(f'<img src="{img_path.name}">', file=output_file)
	def extract_pdf(
	    pdf_path: str | os.PathLike,
	    img_dir: str | os.PathLike,
	    text_output_path: str | os.PathLike,
	    skip_first_image: bool = True,
	) -> None:
	    """
	    Extract the text and embedded images of every page of a PDF file.

	    For each page, the stripped page text is written to the output text file,
	    followed by one ``<img src="...">`` line per image saved from that page.
	    Images are rendered to real PNG data and stored in ``img_dir`` as
	    ``page_<page number>_img_<image index>.png``, where the page number is
	    1-based and the image index is the 0-based position of the image in the
	    page's image list. Only the file name (not the directory) is written
	    into the ``<img>`` tag.

	    Args:
	        pdf_path: Path to the PDF file.
	        img_dir: Directory to save extracted images. It is created
	            (including parents) if it does not exist.
	        text_output_path: Path of the text file to write. An existing file
	            is overwritten.
	        skip_first_image: If True, the image at index 0 of every page is
	            neither saved nor referenced. This is useful if the first image
	            is a thumbnail or cover image.
	    """

	    # Create the output directory for images if it doesn't exist
	    img_dir = Path(img_dir)
	    img_dir.mkdir(parents=True, exist_ok=True)

	    # Open the PDF file and the text output together so both are closed
	    # even if extraction fails part-way through
	    with (
	        fitz.open(pdf_path) as doc,
	        open(text_output_path, "w", encoding="utf-8") as output_file,
	    ):
	        # Page numbers are 1-based in the generated file names
	        for page_num, page in enumerate(doc, start=1):
	            # Extract text and write to the output file
	            print(page.get_text().strip(), file=output_file)

	            # Extract every image on the page and save it
	            for img_index, img in enumerate(page.get_images(full=True)):
	                # Skip the first image since it is usually the page thumbnail
	                if skip_first_image and img_index == 0:
	                    continue

	                # The first item of an image entry is its xref number
	                xref = img[0]

	                # Build a pixmap instead of dumping the raw embedded bytes:
	                # the embedded data may be JPEG, JPX, etc., and writing it
	                # unchanged would produce a mislabelled ".png" file.
	                pix = fitz.Pixmap(doc, xref)
	                if pix.n - pix.alpha > 3:
	                    # PNG cannot store CMYK, so convert to RGB first
	                    pix = fitz.Pixmap(fitz.csRGB, pix)

	                # Save the image as genuine PNG data
	                img_path = img_dir / f"page_{page_num}_img_{img_index}.png"
	                img_path.write_bytes(pix.tobytes("png"))

	                # Write the image path following the HTML format in the output file
	                print(f'<img src="{img_path.name}">', file=output_file)