Created
May 10, 2025 06:42
-
-
Save yuchen-xue/91a3317b3f22aae8c4558279bb4941fa to your computer and use it in GitHub Desktop.
A function for extracting text and images from a PDF file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def extract_pdf(
    pdf_path: str | os.PathLike,
    img_dir: str | os.PathLike,
    text_output_path: str | os.PathLike,
    skip_first_image: bool = True,
) -> None:
    """
    Extract text and images from a PDF file.

    For every page, the stripped page text is written to ``text_output_path``,
    followed by one ``<img src="...">`` line per image saved from that page.
    Images are written to ``img_dir`` as real PNG files named
    ``page_<page>_img_<index>.png``, where ``<page>`` is 1-based and
    ``<index>`` is the 0-based position of the image in the page's image list.
    Only the file name (not the directory) is placed in the ``src`` attribute.

    Args:
        pdf_path: Path to the PDF file.
        img_dir: Directory to save extracted images. Created (including
            parents) if it does not exist.
        text_output_path: Path of the UTF-8 text file to write. An existing
            file is overwritten.
        skip_first_image: If True, the first image listed on each page is
            neither saved nor referenced in the text file. The original
            author notes this is useful when the first image is a thumbnail
            or cover image.
    """
    # Create the output directory for images if it doesn't exist
    img_dir = Path(img_dir)
    img_dir.mkdir(parents=True, exist_ok=True)

    # Open the PDF file and the text output together so both are closed on exit
    with (
        fitz.open(pdf_path) as doc,
        open(text_output_path, "w", encoding="utf-8") as output_file,
    ):
        # Page numbers are 1-based in the generated image file names
        for page_num, page in enumerate(doc, start=1):
            # Extract text and write to the output file
            text = page.get_text().strip()
            print(text, file=output_file)

            # Extract every image on the page and save it
            images = page.get_images(full=True)
            for img_index, img in enumerate(images):
                if skip_first_image and img_index == 0:
                    continue

                # The first item of each image entry is the image's xref
                xref = img[0]

                # Build a pixmap instead of dumping the embedded bytes: the raw
                # stream may be JPEG/JPX/etc., which would produce a file whose
                # content does not match its ".png" extension.
                pix = fitz.Pixmap(doc, xref)
                if pix.n - pix.alpha > 3:
                    # More than 3 colour components (e.g. CMYK) cannot be
                    # written as PNG; convert to RGB first.
                    pix = fitz.Pixmap(fitz.csRGB, pix)

                # Save the image as PNG
                img_path = img_dir / f"page_{page_num}_img_{img_index}.png"
                pix.save(str(img_path))

                # Write the image path following the HTML format in the output file
                print(f'<img src="{img_path.name}">', file=output_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment