Skip to content

Instantly share code, notes, and snippets.

@martenc
Created March 20, 2025 14:28
Show Gist options
  • Save martenc/5753112074c56e5989250c4d6de716af to your computer and use it in GitHub Desktop.
Save martenc/5753112074c56e5989250c4d6de716af to your computer and use it in GitHub Desktop.
Multi-modal RAG - hi_res chunking strategy
from pathlib import Path
from langchain.docstore.document import Document
from unstructured.partition.pdf import partition_pdf
from unstructured.documents.elements import Text, Image, Table, CompositeElement
class ExtractionPipeline:
    """Extract text chunks and HTML tables from every PDF in a folder.

    Each PDF is partitioned with ``partition_pdf`` using the ``by_title``
    chunking strategy, and the resulting elements are wrapped in LangChain
    ``Document`` objects tagged with their originating filename.
    """

    def __init__(self, folder_path):
        """Store *folder_path* and collect the ``*.pdf`` files found in it."""
        self.folder_path = folder_path
        self.pdf_filenames = self.get_pdf_filenames_from_folder()

    def get_pdf_filenames_from_folder(self):
        """Return the ``*.pdf`` paths directly inside ``self.folder_path``.

        The list is sorted so PDFs are processed in a deterministic order
        (``glob`` itself makes no ordering promise).
        """
        return sorted(Path(self.folder_path).glob("*.pdf"))

    def load_and_split_documents(self, folder_path=None):
        """Partition every collected PDF into text and table documents.

        Args:
            folder_path: Ignored. The PDFs collected at construction time
                (``self.pdf_filenames``) are used; the parameter is kept only
                so existing callers keep working.

        Returns:
            A ``(text_documents, table_documents)`` tuple of ``Document``
            lists accumulated over all PDFs. Text documents carry the stripped
            element text with ``source_type`` ``"text"``; table documents carry
            the element's ``text_as_html`` with ``source_type``
            ``"table_html"``. Both lists are empty when the folder has no PDFs.
        """
        text_documents = []
        table_documents = []

        for pdf_filename in self.pdf_filenames:
            print("processing document: ", pdf_filename)
            raw_pdf_elements = partition_pdf(
                filename=pdf_filename,
                chunking_strategy="by_title",
                max_characters=2000,
                new_after_n_chars=1800,
                combine_text_under_n_chars=1000,
                infer_table_structure=True,
            )

            # Exact type match on purpose: the original condition
            # ``type(e) == Text or CompositeElement`` was always true and let
            # tables leak into the text documents. ``isinstance(e, Text)``
            # would not help either, because it also matches Text subclasses
            # such as Table.
            # NOTE(review): exact matches exclude any other subclasses; confirm
            # whether further element types should be collected.
            pdf_text_documents = [
                Document(
                    page_content=e.text.strip(),
                    metadata={"filename": e.metadata.filename, "source_type": "text"},
                )
                for e in raw_pdf_elements
                if type(e) in (Text, CompositeElement)
            ]
            print("Number of Detected Text elements: ", len(pdf_text_documents))

            # Tables
            pdf_table_documents = [
                Document(
                    page_content=e.metadata.text_as_html,
                    metadata={"filename": e.metadata.filename, "source_type": "table_html"},
                )
                for e in raw_pdf_elements
                if type(e) is Table
            ]
            print("Number of Detected HTML Tables: ", len(pdf_table_documents))

            # Accumulate instead of overwriting, so every PDF contributes.
            text_documents.extend(pdf_text_documents)
            table_documents.extend(pdf_table_documents)

        return text_documents, table_documents
class VisualExtractionPipeline:
    """Extract image blocks from every PDF in a folder.

    Each PDF is partitioned with ``partition_pdf`` using the ``hi_res``
    strategy, and every ``Image`` element is wrapped in a LangChain
    ``Document`` whose content is the element's ``image_path``.
    """

    # Output directory used when the caller does not supply one; this is the
    # value that used to be hard-coded in ``load_and_split_documents``.
    DEFAULT_IMAGE_OUTPUT_DIR = "/dli/task/03-Lab/figures/"

    def __init__(self, folder_path, image_output_dir=DEFAULT_IMAGE_OUTPUT_DIR):
        """Store the configuration and collect the ``*.pdf`` files.

        Args:
            folder_path: Folder searched (non-recursively) for ``*.pdf`` files.
            image_output_dir: Value passed to ``partition_pdf`` as
                ``extract_image_block_output_dir``.
        """
        self.folder_path = folder_path
        self.image_output_dir = image_output_dir
        self.pdf_filenames = self.get_pdf_filenames_from_folder()

    def get_pdf_filenames_from_folder(self):
        """Return the ``*.pdf`` paths directly inside ``self.folder_path``.

        The list is sorted so PDFs are processed in a deterministic order
        (``glob`` itself makes no ordering promise).
        """
        return sorted(Path(self.folder_path).glob("*.pdf"))

    def load_and_split_documents(self, folder_path=None):
        """Partition every collected PDF and return its image documents.

        Args:
            folder_path: Ignored. The PDFs collected at construction time
                (``self.pdf_filenames``) are used; the parameter is kept only
                so existing callers keep working.

        Returns:
            A list of ``Document`` objects, accumulated over all PDFs, one per
            ``Image`` element, with ``page_content`` set to the element's
            ``image_path`` and ``source_type`` ``"image"``. Empty when the
            folder has no PDFs.
        """
        image_documents = []

        for pdf_filename in self.pdf_filenames:
            print("processing document: ", pdf_filename)
            image_text_elements = partition_pdf(
                filename=pdf_filename,
                strategy="hi_res",
                hi_res_model_name="yolox",
                extract_images_in_pdf=True,
                extract_image_block_types=["Image", "Table"],
                extract_image_block_to_payload=False,
                extract_image_block_output_dir=self.image_output_dir,
            )

            # "Table" is requested as an image block type above, but only
            # Image elements are turned into documents here.
            # Accumulate instead of overwriting, so every PDF contributes.
            image_documents.extend(
                Document(
                    page_content=e.metadata.image_path,
                    metadata={"filename": e.metadata.filename, "source_type": "image"},
                )
                for e in image_text_elements
                if type(e) is Image
            )

        return image_documents
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment