Skip to content

Instantly share code, notes, and snippets.

@ssgosh
Created July 15, 2024 06:15
Show Gist options
  • Save ssgosh/eafa32980ea4efc50f7391267b4e29f2 to your computer and use it in GitHub Desktop.
Save ssgosh/eafa32980ea4efc50f7391267b4e29f2 to your computer and use it in GitHub Desktop.
Extract highlighted text from PDF files
"""
Extract highlighted text from PDFs in the given directory and its subdirectories
and save it to text files with the same names as the PDFs, except with a .txt
extension. A text file is written only for PDFs that contain at least one
highlight; an existing file with that name is overwritten.

Tested with Python 3.10.4 and PyMuPDF==1.24.7.
Created with the help of BingAI and various Stack Overflow and GitHub answers.

usage: extract_pdf_highlights.py [-h] dirname

Extract highlights from PDFs

positional arguments:
  dirname     Directory containing PDFs

options:
  -h, --help  show this help message and exit
"""
import os
import pymupdf
import argparse
def extract_highlights_from_pdf(pdf_path):
    """Collect the text under every highlight annotation in a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list with one string per highlight annotation, in page order and,
        within a page, in the order ``page.annots()`` yields them. The list
        is empty when the document has no highlights.
    """
    highlights = []
    # The context manager closes the document even when extraction raises, so
    # walking a large directory tree does not accumulate open file handles.
    with pymupdf.open(pdf_path) as doc:
        for page in doc:
            for annot in page.annots():
                # "subject" is a free-text info field filled in by whichever
                # program created the annotation and may be absent, so the
                # annotation type is accepted as well.
                is_highlight = (
                    annot.type[0] == pymupdf.PDF_ANNOT_HIGHLIGHT
                    or annot.info.get("subject") == "Highlight"
                )
                if is_highlight:
                    highlights.append(get_text_from_annot(annot, page))
    return highlights
def get_text_from_annot(annot, page):
    """Return the page text covered by a highlight annotation.

    ``annot.vertices`` is a flat list of points, four per quadrilateral; the
    union of those quadrilaterals is the highlighted area, see
    https://github.com/pymupdf/PyMuPDF/issues/318#issuecomment-657098888

    Args:
        annot: Annotation whose ``vertices`` describe the highlighted quads.
        page: Page the annotation sits on; the text is read from it.

    Returns:
        The text of all quads concatenated. Each piece is followed by a space
        unless it ends with "-", in which case the next piece is appended
        directly after the hyphen. Quads that cover no extractable text are
        skipped, and an annotation without vertices yields "".

    Raises:
        ValueError: If the number of vertices is not a multiple of four.
    """
    # ``vertices`` can be None for annotations that carry no quad points.
    quad_points = annot.vertices or []
    if len(quad_points) % 4 != 0:
        raise ValueError(
            f"expected 4 vertices per quad, got {len(quad_points)} vertices"
        )

    pieces = []
    for i in range(0, len(quad_points), 4):
        # A rectangle is defined by two opposite corners. Within each group
        # of four, point i is taken as the top-left and point i + 3 as the
        # bottom-right corner (PyMuPDF quad order: upper-left, upper-right,
        # lower-left, lower-right).
        rect = pymupdf.Rect(*quad_points[i], *quad_points[i + 3])
        line = page.get_text("text", rect).strip()
        if not line:
            # Nothing extractable under this quad (e.g. a highlight drawn
            # over an image); indexing line[-1] here would raise IndexError.
            continue
        if not line.endswith("-"):
            line += " "
        pieces.append(line)
    return "".join(pieces)
def save_highlights_to_text(highlights, txt_path):
    """Write the highlights to a UTF-8 text file, one per line.

    Args:
        highlights: Iterable of strings; each is written followed by a
            newline.
        txt_path: Destination file path. The file is opened in write mode,
            so any existing content is replaced.
    """
    with open(txt_path, "w", encoding="utf-8") as out:
        out.writelines(entry + "\n" for entry in highlights)
def process_pdfs(directory="."):
    """Extract highlights from every PDF under a directory tree.

    Walks ``directory`` recursively. For each file whose name ends in
    ".pdf" (case-insensitive) the highlights are extracted and, when there
    are any, saved next to the PDF in a file with the same base name and a
    ".txt" extension. Progress is printed to standard output.

    Args:
        directory: Root directory to search; defaults to the current one.

    Raises:
        OSError: Re-raised from ``os.walk`` when a directory cannot be
            listed, instead of being silently skipped.
    """

    def _reraise(walk_error: OSError):
        # os.walk ignores listing errors unless the handler raises them.
        raise walk_error

    print("Directory:", directory)
    for folder, _, names in os.walk(directory, onerror=_reraise):
        for name in names:
            if not name.lower().endswith(".pdf"):
                continue
            print("Processing:", name)
            source = os.path.join(folder, name)
            stem, _ext = os.path.splitext(source)
            target = stem + ".txt"
            found = extract_highlights_from_pdf(source)
            if found:
                print("Saving highlights to:", target)
                save_highlights_to_text(found, target)
if __name__ == "__main__":
    # Command-line entry point: takes one positional argument, the directory
    # to scan, and processes every PDF found beneath it.
    cli = argparse.ArgumentParser(description="Extract highlights from PDFs")
    cli.add_argument("dirname", help="Directory containing PDFs")
    process_pdfs(cli.parse_args().dirname)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment