Skip to content

Instantly share code, notes, and snippets.

@scientific-coder
Forked from cpfiffer/pdf-to-structure.py
Created November 7, 2024 22:40
Show Gist options
  • Save scientific-coder/56e8f255c3079fc5f0b5395c5c6852e4 to your computer and use it in GitHub Desktop.
Save scientific-coder/56e8f255c3079fc5f0b5395c5c6852e4 to your computer and use it in GitHub Desktop.
Get structured output from PDFs. Goes through a PDF one page at a time -- it is not currently built for multiple pages, but could be extended as needed.
"""
pip install outlines torch==2.4.0 transformers accelerate typing-extensions pillow pdf2image rich requests
may need to install tkinter: https://stackoverflow.com/questions/25905540/importerror-no-module-named-tkinter
sudo apt-get install poppler-utils
"""
from enum import Enum
from io import BytesIO
from PIL import Image
from urllib.request import urlopen
import outlines
import torch
from transformers import (
LlavaForConditionalGeneration,
)
from pydantic import BaseModel, Field, confloat, constr
from pydantic.types import StringConstraints
from typing import List
from typing_extensions import Annotated
from pdf2image import convert_from_path
import os
from typing import List, Optional
from rich import print
import requests
# Checkpoint to load. Original author's note: the original magnet model is
# able to be loaded without issue.
model_name = "mistral-community/pixtral-12b"
model_class = LlavaForConditionalGeneration

# Keyword arguments handed to outlines for the model and for the processor.
model_kwargs = dict(torch_dtype=torch.bfloat16, device_map="auto")
processor_kwargs = dict(device="cuda")

# Vision-capable outlines model built from the settings above.
model = outlines.models.transformers_vision(
    model_name,
    model_class=model_class,
    model_kwargs=model_kwargs,
    processor_kwargs=processor_kwargs,
)
def convert_pdf_to_images(
    pdf_path: str,
    output_dir: Optional[str] = None,
    dpi: int = 20,
    fmt: str = 'PNG'
) -> List[Image.Image]:
    """
    Render a PDF file as a list of PIL images via ``convert_from_path``.

    Args:
        pdf_path: Path to the PDF file.
        output_dir: Optional directory. When given (and non-empty) it is
            created if missing and every image is also saved there as
            ``page_<n>.<fmt lower-cased>``, with ``n`` starting at 1.
        dpi: Resolution passed to the converter. The default is 20; note
            that the original documentation of this function recommends
            200 for vision models, so callers may want to pass a higher
            value explicitly.
        fmt: Image format passed to the converter; its lower-cased form is
            used as the file extension when saving.

    Returns:
        The list of PIL Image objects produced by the conversion.
    """
    pages = convert_from_path(pdf_path, dpi=dpi, fmt=fmt)

    # Nothing to persist: hand the in-memory images straight back.
    if not output_dir:
        return pages

    os.makedirs(output_dir, exist_ok=True)
    for page_number, page in enumerate(pages, start=1):
        target = os.path.join(output_dir, f'page_{page_number}.{fmt.lower()}')
        page.save(target)
    return pages
# Download the louf-willard pdf
# https://arxiv.org/pdf/2307.09702
pdf_url = "https://arxiv.org/pdf/2307.09702"
# A timeout keeps the script from hanging indefinitely on a stalled connection.
response = requests.get(pdf_url, timeout=60)
# Stop here on an HTTP error (4xx/5xx) instead of writing the error body to
# disk as if it were a PDF and failing later inside the PDF converter.
response.raise_for_status()
# Save the PDF locally
with open("louf-willard.pdf", "wb") as f:
    f.write(response.content)
# Load the louf-willard pdf as one image per page; the images are also
# written to the "output_images" directory.
images = convert_pdf_to_images("louf-willard.pdf", dpi=80, output_dir="output_images")
class PageSummary(BaseModel):
    # Structured result requested for each page image.
    # NOTE: documented with comments rather than a class docstring on
    # purpose -- this model's JSON schema is embedded verbatim in the prompt,
    # and a docstring would presumably show up there as a description.

    # Free-text description of the page.
    description: str
    # List of takeaway strings for the page.
    key_takeaways: List[str]
    # NOTE(review): the prompt never tells the model which page it is
    # looking at, so this value is presumably inferred by the model -- confirm.
    page_number: int
# outlines JSON generator built from the loaded model and the PageSummary schema.
page_summary_generator = outlines.generate.json(model, PageSummary)

# Prompt sent with every page image. The schema is interpolated as the
# dict returned by model_json_schema(); surrounding whitespace is stripped.
instruction = f"""
<s>[INST]
You are an expert at summarizing pages from a scientific paper.
Please summarize the page.
Your schema is:
{PageSummary.model_json_schema()}
Please extract
[IMG][/INST]
""".strip()

# Pages are handled independently: one generator call (with a single-image
# list) and one printed summary per page.
for image in images:
    page_summary = page_summary_generator(instruction, [image])
    print(page_summary)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment