ssgosh · July 15, 2024 06:15
diff --git a/extract_pdf_highlights.py b/extract_pdf_highlights.py
 """
 Extract highlighted text from PDFs in the given dir and its subdirs and save them
 to text files with the same name as the PDFs, except with a .txt extension.

 Tested with Python 3.10.4 and PyMuPDF==1.24.7

 Created with the help of BingAI and various Stack Overflow and GitHub answers.

 usage: extract_pdf_highlights.py [-h] dirname

 Extract highlights from PDFs

 positional arguments:
  dirname     Directory containing PDFs

 options:
  -h, --help  show this help message and exit
 """
 import os
 import pymupdf
 import argparse


 def extract_highlights_from_pdf(pdf_path):
    """Collect the text under every highlight annotation of one PDF.

    Args:
        pdf_path: Path of the PDF file, passed to ``pymupdf.open``.

    Returns:
        A list with one string per annotation whose "subject" info field
        equals "Highlight", in the order the pages and their annotations
        are iterated. The list is empty when no such annotation exists.
    """
    highlights = []

    # Open the document as a context manager so it is closed again even
    # when text extraction raises part-way through the file.
    with pymupdf.open(pdf_path) as doc:
        for page in doc:
            for annot in page.annots():
                # NOTE(review): this filters on the annotation's "subject"
                # info field, which presumably depends on the tool that
                # created the highlight; confirm it is set for your PDFs.
                if annot.info.get("subject") == "Highlight":
                    highlights.append(get_text_from_annot(annot, page))

    return highlights


 def get_text_from_annot(annot, page):
    """Return the page text covered by one highlight annotation.

    Args:
        annot: Annotation whose ``vertices`` attribute holds the corner
            points of the highlighted area, four points per quadrilateral.
        page: Page the annotation lives on; its ``get_text`` is called once
            per quadrilateral with that quadrilateral's rectangle as clip.

    Returns:
        The stripped text of all quadrilaterals joined in order. Each piece
        is followed by a space unless it ends with "-", in which case the
        next piece is appended directly. Quadrilaterals that cover no text
        contribute nothing, and "" is returned when the annotation has no
        vertices.
    """
    # annot.vertices gives a bunch of quadrilaterals whose union
    # is the highlighted area: https://github.com/pymupdf/PyMuPDF/issues/318#issuecomment-657098888
    # A missing vertex list (None) is treated as "nothing highlighted".
    quad_points = annot.vertices or []
    assert len(quad_points) % 4 == 0
    pieces = []
    for i in range(0, len(quad_points), 4):
        # The first and the fourth point of each group are used as the
        # top-left and bottom-right corners of the clip rectangle.
        # NOTE(review): confirm this vertex order against the PyMuPDF docs.
        quad = pymupdf.Rect(*quad_points[i], *quad_points[i + 3])
        line = page.get_text("text", clip=quad).strip()
        if not line:
            # Nothing extractable under this quadrilateral; indexing the
            # last character of an empty string would raise IndexError.
            continue
        if not line.endswith("-"):
            line += " "
        pieces.append(line)
    return "".join(pieces)


 def save_highlights_to_text(highlights, txt_path):
    """Write the given highlights to a UTF-8 text file, one per line.

    Args:
        highlights: Iterable of strings; a newline is appended to each.
        txt_path: Destination path. The file is opened in "w" mode, so any
            existing content is replaced.
    """
    rows = (entry + "\n" for entry in highlights)
    with open(txt_path, "w", encoding="utf-8") as out:
        out.writelines(rows)


 def process_pdfs(directory="."):
    """Extract highlights from every PDF below a directory.

    Walks ``directory`` recursively. For each file whose name ends in
    ".pdf" (case-insensitive) the highlights are extracted and, when there
    are any, saved next to the PDF under the same name with a ".txt"
    extension. Progress is reported with ``print``.

    Args:
        directory: Root directory to scan; defaults to the current one.

    Raises:
        OSError: Re-raised when ``os.walk`` reports an error.
    """
    def reraise_walk_error(walk_error : OSError):
        # Hand the error reported by os.walk straight back to the caller.
        raise walk_error

    print("Directory:", directory)
    for root, _, files in os.walk(directory, onerror=reraise_walk_error):
        pdf_names = (name for name in files if name.lower().endswith(".pdf"))
        for filename in pdf_names:
            print("Processing:", filename)
            pdf_path = os.path.join(root, filename)
            stem, _ext = os.path.splitext(pdf_path)
            txt_path = stem + ".txt"

            highlights = extract_highlights_from_pdf(pdf_path)
            if not highlights:
                # No highlights found: do not create an empty text file.
                continue
            print("Saving highlights to:", txt_path)
            save_highlights_to_text(highlights, txt_path)


 if __name__ == "__main__":
    # Command-line entry point: takes one required positional argument,
    # the directory to scan, and hands it to process_pdfs.
    cli = argparse.ArgumentParser(description="Extract highlights from PDFs")
    cli.add_argument("dirname", help="Directory containing PDFs")
    process_pdfs(cli.parse_args().dirname)
	"""
	Extract highlighted text from PDFs in the given dir and its subdirs and save them
	to text files with the same name as the PDFs, except with a .txt extension.

	Tested with Python 3.10.4 and PyMuPDF==1.24.7

	Created with the help of BingAI and various Stack Overflow and GitHub answers.

	usage: extract_pdf_highlights.py [-h] dirname

	Extract highlights from PDFs

	positional arguments:
	dirname Directory containing PDFs

	options:
	-h, --help show this help message and exit
	"""
	import os
	import pymupdf
	import argparse


	def extract_highlights_from_pdf(pdf_path):
	    """Collect the text under every highlight annotation of one PDF.

	    Args:
	        pdf_path: Path of the PDF file, passed to ``pymupdf.open``.

	    Returns:
	        A list with one string per annotation whose "subject" info field
	        equals "Highlight", in the order the pages and their annotations
	        are iterated. The list is empty when no such annotation exists.
	    """
	    highlights = []

	    # Open the document as a context manager so it is closed again even
	    # when text extraction raises part-way through the file.
	    with pymupdf.open(pdf_path) as doc:
	        for page in doc:
	            for annot in page.annots():
	                # NOTE(review): this filters on the annotation's "subject"
	                # info field, which presumably depends on the tool that
	                # created the highlight; confirm it is set for your PDFs.
	                if annot.info.get("subject") == "Highlight":
	                    highlights.append(get_text_from_annot(annot, page))

	    return highlights


	def get_text_from_annot(annot, page):
	    """Return the page text covered by one highlight annotation.

	    Args:
	        annot: Annotation whose ``vertices`` attribute holds the corner
	            points of the highlighted area, four points per quadrilateral.
	        page: Page the annotation lives on; its ``get_text`` is called once
	            per quadrilateral with that quadrilateral's rectangle as clip.

	    Returns:
	        The stripped text of all quadrilaterals joined in order. Each piece
	        is followed by a space unless it ends with "-", in which case the
	        next piece is appended directly. Quadrilaterals that cover no text
	        contribute nothing, and "" is returned when the annotation has no
	        vertices.
	    """
	    # annot.vertices gives a bunch of quadrilaterals whose union
	    # is the highlighted area: https://github.com/pymupdf/PyMuPDF/issues/318#issuecomment-657098888
	    # A missing vertex list (None) is treated as "nothing highlighted".
	    quad_points = annot.vertices or []
	    assert len(quad_points) % 4 == 0
	    pieces = []
	    for i in range(0, len(quad_points), 4):
	        # The first and the fourth point of each group are used as the
	        # top-left and bottom-right corners of the clip rectangle.
	        # NOTE(review): confirm this vertex order against the PyMuPDF docs.
	        quad = pymupdf.Rect(quad_points[i], quad_points[i + 3])
	        line = page.get_text("text", clip=quad).strip()
	        if not line:
	            # Nothing extractable under this quadrilateral; indexing the
	            # last character of an empty string would raise IndexError.
	            continue
	        if not line.endswith("-"):
	            line += " "
	        pieces.append(line)
	    return "".join(pieces)


	def save_highlights_to_text(highlights, txt_path):
	    """Write the given highlights to a UTF-8 text file, one per line.

	    Args:
	        highlights: Iterable of strings; a newline is appended to each.
	        txt_path: Destination path. The file is opened in "w" mode, so any
	            existing content is replaced.
	    """
	    with open(txt_path, "w", encoding="utf-8") as txt_file:
	        txt_file.writelines(highlight + "\n" for highlight in highlights)


	def process_pdfs(directory="."):
	    """Extract highlights from every PDF below a directory.

	    Walks ``directory`` recursively. For each file whose name ends in
	    ".pdf" (case-insensitive) the highlights are extracted and, when there
	    are any, saved next to the PDF under the same name with a ".txt"
	    extension. Progress is reported with ``print``.

	    Args:
	        directory: Root directory to scan; defaults to the current one.

	    Raises:
	        OSError: Re-raised when ``os.walk`` reports an error.
	    """
	    def os_walk_on_error(e : OSError):
	        # Hand the error reported by os.walk straight back to the caller.
	        raise e

	    print("Directory:", directory)
	    for root, _, files in os.walk(directory, onerror=os_walk_on_error):
	        for filename in files:
	            if not filename.lower().endswith(".pdf"):
	                continue
	            print("Processing:", filename)
	            pdf_path = os.path.join(root, filename)
	            txt_path = os.path.splitext(pdf_path)[0] + ".txt"

	            highlights = extract_highlights_from_pdf(pdf_path)
	            # Only create a text file when something was highlighted.
	            if highlights:
	                print("Saving highlights to:", txt_path)
	                save_highlights_to_text(highlights, txt_path)


	if __name__ == "__main__":
	    # Command-line entry point: takes one required positional argument,
	    # the directory to scan, and hands it to process_pdfs.
	    parser = argparse.ArgumentParser(description="Extract highlights from PDFs")
	    parser.add_argument("dirname", help="Directory containing PDFs")
	    args = parser.parse_args()
	    process_pdfs(args.dirname)