Created
March 24, 2025 17:26
-
-
Save dhsrocha/046607c3a39559bcaae8b4000b8f969e to your computer and use it in GitHub Desktop.
Extract text from PDF files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
import os | |
import PyPDF2 | |
import fitz | |
import pytesseract | |
from PIL import Image | |
from fpdf import FPDF | |
# Root logger setup: show INFO and above, prefixing each record with its
# timestamp and severity.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
def extract_images_from_pdf(_pdf_file, _output_dir):
    """Render every page of a PDF to ``page_<index>.png`` in a directory.

    The output directory doubles as a cache: when it already contains at
    least one ``.png`` file, nothing is rendered and the number of ``.png``
    files found is returned instead.

    Args:
        _pdf_file: Path of the PDF to render.
        _output_dir: Directory that receives the page images; created when
            missing.

    Returns:
        The number of pages rendered, or the number of ``.png`` files already
        present when rendering was skipped.
    """
    if os.path.exists(_output_dir):
        cached = sum(
            1 for name in os.listdir(_output_dir) if name.endswith('.png')
        )
        if cached:
            logging.warning(f'Skipping image extraction. {cached} images found.')
            return cached
        # A directory without any image is what a failed earlier run leaves
        # behind; treating it as a cache would report zero pages forever, so
        # fall through and render.
        # NOTE(review): a partially filled directory is still accepted as-is.

    logging.info("Creating output directory and extracting images from PDF.")
    # Open the PDF before touching the filesystem so that an unreadable input
    # does not leave an empty directory behind.
    with fitz.open(_pdf_file) as doc:
        os.makedirs(_output_dir, exist_ok=True)
        # Take the page count from the document that is actually rendered
        # instead of parsing the file a second time with another library.
        num_pages = doc.page_count
        for p in range(num_pages):
            pix = doc.load_page(p).get_pixmap()
            pix.save(os.path.join(_output_dir, f'page_{p}.png'))
            logging.info(f"Saved image for page {p}.")
    return num_pages
def extract_text_from(_output_dir, _text_dir, _output_file, _num_pages):
    """OCR page images into per-page text files and one concatenated file.

    For each page index in ``range(_num_pages)`` the text is read from
    ``<_text_dir>/page_<index>.txt`` when that file exists; otherwise
    ``<_output_dir>/page_<index>.png`` is passed to Tesseract and the result
    is written to that text file. The page texts are then written, in page
    order, to ``_output_file``, overwriting any previous content.

    Args:
        _output_dir: Directory holding the ``page_<index>.png`` images.
        _text_dir: Directory for the per-page text files; created when
            missing.
        _output_file: Path of the concatenated text file to write.
        _num_pages: Number of pages to process, starting at index 0.
    """
    os.makedirs(_text_dir, exist_ok=True)

    page_texts = []
    for p in range(_num_pages):
        text_file_path = os.path.join(_text_dir, f'page_{p}.txt')

        if os.path.exists(text_file_path):
            logging.info(f'Text file for page {p} already exists. Skipping.')
            with open(text_file_path, 'r', encoding='utf-8') as text_file:
                page_texts.append(text_file.read())
            continue

        logging.info(f"Extracting text from images for page {p}.")
        # The context manager releases the image's file handle as soon as OCR
        # is done instead of keeping one handle open per page.
        with Image.open(os.path.join(_output_dir, f'page_{p}.png')) as img:
            text = pytesseract.image_to_string(img)

        with open(text_file_path, 'w', encoding='utf-8') as text_file:
            text_file.write(text)
        logging.info(f"Saved extracted text to {text_file_path}.")
        page_texts.append(text)

    with open(_output_file, 'w', encoding='utf-8') as out_file:
        out_file.write(''.join(page_texts))
    logging.info(f"Saved concatenated text to {_output_file}.")
def create_pdf_with_extracted_text(_extracted_text, _output_pdf_path):
    """Write extracted text into a new PDF unless the target already exists.

    Args:
        _extracted_text: Either one string or an iterable of strings; each
            string is laid out as its own multi-line cell, in order.
        _output_pdf_path: Destination path. When something already exists at
            this path, nothing is written.
    """
    if os.path.exists(_output_pdf_path):
        logging.info("Output PDF already exists. Skipping PDF creation.")
        return

    # A bare string would otherwise be iterated character by character,
    # producing one cell per character; treat it as a single chunk.
    if isinstance(_extracted_text, str):
        chunks = [_extracted_text]
    else:
        chunks = _extracted_text

    logging.info("Creating new PDF with extracted text.")
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    # NOTE(review): FPDF's built-in fonts are presumably limited to Latin-1;
    # OCR output with other characters may fail here - confirm.
    pdf.set_font("Arial", size=12)
    for text in chunks:
        pdf.multi_cell(0, 10, text)
    pdf.output(_output_pdf_path)
    logging.info(f"Output PDF saved at: {_output_pdf_path}")
def main():
    """Run the pipeline on the hard-coded sample: render pages, then OCR them."""
    pdf_path = 'in/input_2.pdf'
    image_dir, text_dir = '2/img', '2/text'
    combined_text_path = "extracted_text.txt"

    # The page count returned by the rendering step drives the OCR loop.
    page_count = extract_images_from_pdf(pdf_path, image_dir)
    extract_text_from(image_dir, text_dir, combined_text_path, page_count)


# Only run the pipeline when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment