dhsrocha · March 24, 2025 17:26
diff --git a/ocr.py b/ocr.py
 import logging
 import os

 import PyPDF2
 import fitz
 import pytesseract
 from PIL import Image
 from fpdf import FPDF

 # Configure logging
 logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')


 def extract_images_from_pdf(_pdf_file, _output_dir):
    if os.path.exists(_output_dir):
        num_pages = len([n for n in os.listdir(_output_dir) if n.endswith('.png')])
        logging.warning(f'Skipping image extraction. {num_pages} images found.')
        return num_pages

    logging.info("Creating output directory and extracting images from PDF.")
    os.makedirs(_output_dir)
    with open(_pdf_file, 'rb') as file:
        num_pages = len(PyPDF2.PdfReader(file).pages)

    with fitz.open(_pdf_file) as doc:
        for p in range(num_pages):
            pix = doc.load_page(p).get_pixmap()
            pix.save(f'{_output_dir}/page_{p}.png')
            logging.info(f"Saved image for page {p}.")

    return num_pages


 def extract_text_from(_output_dir, _text_dir, _output_file, _num_pages):
    if not os.path.exists(_text_dir):
        os.makedirs(_text_dir)

    text_lines = ""

    for p in range(_num_pages):
        text_file_path = os.path.join(_text_dir, f'page_{p}.txt')
        if os.path.exists(text_file_path):
            logging.info(f'Text file for page {p} already exists. Skipping.')
            with open(text_file_path, 'r', encoding='utf-8') as text_file:
                text_lines += text_file.read()
            continue

        logging.info(f"Extracting text from images for page {p}.")
        img = Image.open(f'{_output_dir}/page_{p}.png')
        text = pytesseract.image_to_string(img)
        with open(text_file_path, 'w', encoding='utf-8') as text_file:
            text_file.write(text)
            logging.info(f"Saved extracted text to {text_file_path}.")
        text_lines += text

    with open(_output_file, 'w', encoding='utf-8') as io:
        io.write(text_lines)
        logging.info(f"Saved concatenated text to {_output_file}.")


 def create_pdf_with_extracted_text(_extracted_text, _output_pdf_path):
    if not os.path.exists(_output_pdf_path):
        logging.info("Creating new PDF with extracted text.")
        pdf = FPDF()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.add_page()
        pdf.set_font("Arial", size=12)

        for text in _extracted_text:
            pdf.multi_cell(0, 10, text)

        pdf.output(_output_pdf_path)
        logging.info(f"Output PDF saved at: {_output_pdf_path}")
    else:
        logging.info("Output PDF already exists. Skipping PDF creation.")


 def main():
    input_dir = 'in/input_2.pdf'
    page_dir = '2/img'
    ext_dir = '2/text'
    concat_path = "extracted_text.txt"

    num_pages = extract_images_from_pdf(input_dir, page_dir)
    extract_text_from(page_dir, ext_dir, concat_path, num_pages)


 if __name__ == "__main__":
    main()
	import logging
	import os

	import PyPDF2
	import fitz
	import pytesseract
	from PIL import Image
	from fpdf import FPDF

	# Configure logging
	logging.basicConfig(level=logging.INFO,
	format='%(asctime)s - %(levelname)s - %(message)s')


	def extract_images_from_pdf(_pdf_file, _output_dir):
	if os.path.exists(_output_dir):
	num_pages = len([n for n in os.listdir(_output_dir) if n.endswith('.png')])
	logging.warning(f'Skipping image extraction. {num_pages} images found.')
	return num_pages

	logging.info("Creating output directory and extracting images from PDF.")
	os.makedirs(_output_dir)
	with open(_pdf_file, 'rb') as file:
	num_pages = len(PyPDF2.PdfReader(file).pages)

	with fitz.open(_pdf_file) as doc:
	for p in range(num_pages):
	pix = doc.load_page(p).get_pixmap()
	pix.save(f'{_output_dir}/page_{p}.png')
	logging.info(f"Saved image for page {p}.")

	return num_pages


	def extract_text_from(_output_dir, _text_dir, _output_file, _num_pages):
	if not os.path.exists(_text_dir):
	os.makedirs(_text_dir)

	text_lines = ""

	for p in range(_num_pages):
	text_file_path = os.path.join(_text_dir, f'page_{p}.txt')
	if os.path.exists(text_file_path):
	logging.info(f'Text file for page {p} already exists. Skipping.')
	with open(text_file_path, 'r', encoding='utf-8') as text_file:
	text_lines += text_file.read()
	continue

	logging.info(f"Extracting text from images for page {p}.")
	img = Image.open(f'{_output_dir}/page_{p}.png')
	text = pytesseract.image_to_string(img)
	with open(text_file_path, 'w', encoding='utf-8') as text_file:
	text_file.write(text)
	logging.info(f"Saved extracted text to {text_file_path}.")
	text_lines += text

	with open(_output_file, 'w', encoding='utf-8') as io:
	io.write(text_lines)
	logging.info(f"Saved concatenated text to {_output_file}.")


	def create_pdf_with_extracted_text(_extracted_text, _output_pdf_path):
	if not os.path.exists(_output_pdf_path):
	logging.info("Creating new PDF with extracted text.")
	pdf = FPDF()
	pdf.set_auto_page_break(auto=True, margin=15)
	pdf.add_page()
	pdf.set_font("Arial", size=12)

	for text in _extracted_text:
	pdf.multi_cell(0, 10, text)

	pdf.output(_output_pdf_path)
	logging.info(f"Output PDF saved at: {_output_pdf_path}")
	else:
	logging.info("Output PDF already exists. Skipping PDF creation.")


	def main():
	input_dir = 'in/input_2.pdf'
	page_dir = '2/img'
	ext_dir = '2/text'
	concat_path = "extracted_text.txt"

	num_pages = extract_images_from_pdf(input_dir, page_dir)
	extract_text_from(page_dir, ext_dir, concat_path, num_pages)


	if __name__ == "__main__":
	main()