Skip to content

Instantly share code, notes, and snippets.

@dhsrocha
Created March 24, 2025 17:26
Show Gist options
  • Save dhsrocha/046607c3a39559bcaae8b4000b8f969e to your computer and use it in GitHub Desktop.
Save dhsrocha/046607c3a39559bcaae8b4000b8f969e to your computer and use it in GitHub Desktop.
Extract text from PDF files.
import logging
import os
import PyPDF2
import fitz
import pytesseract
from PIL import Image
from fpdf import FPDF
# Root logger setup: report INFO and above, each record prefixed with a
# timestamp and its severity level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
def extract_images_from_pdf(_pdf_file, _output_dir):
    """Render each page of a PDF to ``page_<n>.png`` inside ``_output_dir``.

    The output directory doubles as a cache: when it already contains PNG
    files, nothing is rendered and the number of PNG files found is returned.
    A directory that exists but holds no PNG files (for example one left
    behind by a run that failed before any page was saved) is NOT treated as
    a finished extraction; the pages are rendered into it.

    Args:
        _pdf_file: Path of the PDF to render.
        _output_dir: Directory receiving the page images; created if missing.

    Returns:
        The number of page images available in ``_output_dir``.
    """
    if os.path.isdir(_output_dir):
        existing = [n for n in os.listdir(_output_dir) if n.endswith('.png')]
        if existing:
            num_pages = len(existing)
            logging.warning(f'Skipping image extraction. {num_pages} images found.')
            return num_pages

    logging.info("Creating output directory and extracting images from PDF.")
    os.makedirs(_output_dir, exist_ok=True)
    with fitz.open(_pdf_file) as doc:
        # Take the page count from the document that is actually rendered so
        # the loop bound can never disagree with what load_page() accepts.
        num_pages = len(doc)
        for p in range(num_pages):
            pix = doc.load_page(p).get_pixmap()
            pix.save(os.path.join(_output_dir, f'page_{p}.png'))
            logging.info(f"Saved image for page {p}.")
    return num_pages
def extract_text_from(_output_dir, _text_dir, _output_file, _num_pages):
    """OCR the page images and write per-page and concatenated text files.

    For each page index in ``range(_num_pages)`` the text is taken from
    ``<_text_dir>/page_<n>.txt`` when that file already exists; otherwise
    ``<_output_dir>/page_<n>.png`` is run through Tesseract and the result is
    saved to that text file. The page texts are then concatenated, in page
    order and without any separator, into ``_output_file``.

    Args:
        _output_dir: Directory containing the ``page_<n>.png`` images.
        _text_dir: Directory for the per-page text files; created if missing.
        _output_file: Path of the concatenated text file (always rewritten).
        _num_pages: Number of pages to process, starting at page 0.
    """
    os.makedirs(_text_dir, exist_ok=True)

    # Collect the pieces and join once instead of growing a string with +=.
    page_texts = []
    for p in range(_num_pages):
        text_file_path = os.path.join(_text_dir, f'page_{p}.txt')
        if os.path.exists(text_file_path):
            logging.info(f'Text file for page {p} already exists. Skipping.')
            with open(text_file_path, 'r', encoding='utf-8') as text_file:
                page_texts.append(text_file.read())
            continue

        logging.info(f"Extracting text from images for page {p}.")
        # Close the image as soon as OCR is done; otherwise one file handle
        # per page stays open for the rest of the run.
        with Image.open(os.path.join(_output_dir, f'page_{p}.png')) as img:
            text = pytesseract.image_to_string(img)
        with open(text_file_path, 'w', encoding='utf-8') as text_file:
            text_file.write(text)
        logging.info(f"Saved extracted text to {text_file_path}.")
        page_texts.append(text)

    with open(_output_file, 'w', encoding='utf-8') as out_file:
        out_file.write("".join(page_texts))
    logging.info(f"Saved concatenated text to {_output_file}.")
def create_pdf_with_extracted_text(_extracted_text, _output_pdf_path):
    """Write the extracted text into a new PDF unless the file already exists.

    Args:
        _extracted_text: Either one string or an iterable of strings. Each
            string is laid out as its own wrapped block of text.
        _output_pdf_path: Destination path. When something already exists at
            this path the function logs that and returns without writing.

    Note:
        The text is set in the built-in "Arial" font, which can only encode
        Latin-1. Characters outside that range are replaced with ``?`` so
        that OCR output containing them does not abort PDF generation.
    """
    if os.path.exists(_output_pdf_path):
        logging.info("Output PDF already exists. Skipping PDF creation.")
        return

    # Iterating a bare string yields single characters, which would emit one
    # cell per character; treat a string as a single chunk instead.
    if isinstance(_extracted_text, str):
        chunks = [_extracted_text]
    else:
        chunks = _extracted_text

    logging.info("Creating new PDF with extracted text.")
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for text in chunks:
        printable = text.encode('latin-1', 'replace').decode('latin-1')
        pdf.multi_cell(0, 10, printable)
    pdf.output(_output_pdf_path)
    logging.info(f"Output PDF saved at: {_output_pdf_path}")
def main():
    """Run the pipeline on the hard-coded input: PDF pages to PNG, then OCR.

    Page images go to ``2/img``, per-page text files to ``2/text`` and the
    concatenated text to ``extracted_text.txt``.
    """
    pdf_path = 'in/input_2.pdf'
    image_dir = '2/img'
    text_dir = '2/text'
    combined_text_path = "extracted_text.txt"

    page_count = extract_images_from_pdf(pdf_path, image_dir)
    extract_text_from(image_dir, text_dir, combined_text_path, page_count)


# Only run the pipeline when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment