Created
July 15, 2024 06:15
-
-
Save ssgosh/eafa32980ea4efc50f7391267b4e29f2 to your computer and use it in GitHub Desktop.
Extract highlighted text from PDF files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Extract highlighted text from PDFs in the given dir and its subdirs and save them
to text files with the same name as the PDFs, except with a .txt extension.

Tested with Python 3.10.4 and PyMuPDF==1.24.7

Created with the help of BingAI and various Stack Overflow and GitHub answers.

usage: extract_pdf_highlights.py [-h] dirname

Extract highlights from PDFs

positional arguments:
  dirname     Directory containing PDFs

options:
  -h, --help  show this help message and exit
""" | |
import os | |
import pymupdf | |
import argparse | |
def extract_highlights_from_pdf(pdf_path):
    """Collect the text under every highlight annotation in one PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list of strings, one per annotation whose "subject" info field
        equals "Highlight", in the order the annotations are encountered
        while walking the pages.
    """
    highlights = []
    # Open the document as a context manager so the underlying file is
    # closed even when text extraction fails part-way through.
    with pymupdf.open(pdf_path) as doc:
        for page in doc:
            for annot in page.annots():
                if annot.info.get("subject") == "Highlight":
                    highlights.append(get_text_from_annot(annot, page))
    return highlights
def get_text_from_annot(annot, page):
    """Return the page text covered by a highlight annotation.

    The annotation's vertices are consumed four points at a time; each
    group of four describes one quadrilateral, and the union of all
    quadrilaterals is the highlighted area. See
    https://github.com/pymupdf/PyMuPDF/issues/318#issuecomment-657098888

    Args:
        annot: Annotation object exposing a ``vertices`` sequence of points.
        page: Page object whose ``get_text`` is used to read the text
            inside each quadrilateral.

    Returns:
        The text of all quadrilaterals concatenated. A piece ending in "-"
        is joined directly to the next piece; every other piece is followed
        by a single space. Quadrilaterals that contain no text contribute
        nothing. An annotation without vertices yields "".

    Raises:
        ValueError: If the number of vertices is not a multiple of four.
    """
    quad_points = annot.vertices or []
    if len(quad_points) % 4 != 0:
        raise ValueError(
            f"expected a multiple of 4 vertices, got {len(quad_points)}"
        )

    pieces = []
    for i in range(0, len(quad_points), 4):
        # The clip rectangle is spanned by the first and the fourth point of
        # the quadrilateral.
        # NOTE(review): this relies on those two points being diagonally
        # opposite corners (PyMuPDF lists quad corners as ul, ur, ll, lr) --
        # confirm against the Annot.vertices documentation.
        rect = pymupdf.Rect(*quad_points[i], *quad_points[i + 3])
        line = page.get_text("text", clip=rect).strip()
        if not line:
            # Nothing extractable under this quad; indexing line[-1] here
            # would raise IndexError.
            continue
        # A trailing hyphen marks a word split across lines, so no space is
        # inserted before the continuation.
        pieces.append(line if line.endswith("-") else line + " ")
    return "".join(pieces)
def save_highlights_to_text(highlights, txt_path):
    """Write each highlight on its own line to a UTF-8 text file.

    Args:
        highlights: Iterable of strings to write.
        txt_path: Destination path; the file is opened in "w" mode, so any
            existing content is replaced.
    """
    with open(txt_path, "w", encoding="utf-8") as out:
        out.writelines(entry + "\n" for entry in highlights)
def process_pdfs(directory="."):
    """Extract highlights from every PDF below a directory tree.

    Walks ``directory`` recursively. For each file whose name ends in
    ".pdf" (case-insensitive) the highlights are extracted and, when at
    least one was found, written next to the PDF under the same name with a
    ".txt" extension. Progress is reported with ``print``.

    Args:
        directory: Root directory to scan; defaults to the current directory.

    Raises:
        OSError: Re-raised from ``os.walk`` when a directory cannot be
            listed, instead of being silently skipped.
    """

    def _reraise(error: OSError):
        # os.walk ignores listing errors unless the handler raises them.
        raise error

    print("Directory:", directory)
    for folder, _subdirs, names in os.walk(directory, onerror=_reraise):
        for name in names:
            if not name.lower().endswith(".pdf"):
                continue
            print("Processing:", name)
            source = os.path.join(folder, name)
            stem, _extension = os.path.splitext(source)
            target = stem + ".txt"
            found = extract_highlights_from_pdf(source)
            if not found:
                # No highlights: do not create an empty text file.
                continue
            print("Saving highlights to:", target)
            save_highlights_to_text(found, target)
if __name__ == "__main__":
    # Command-line entry point: takes one positional argument, the directory
    # to scan, and hands it to process_pdfs.
    cli = argparse.ArgumentParser(description="Extract highlights from PDFs")
    cli.add_argument("dirname", help="Directory containing PDFs")
    process_pdfs(cli.parse_args().dirname)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment