Created
March 20, 2025 14:28
-
-
Save martenc/5753112074c56e5989250c4d6de716af to your computer and use it in GitHub Desktop.
multi-modal RAG - high_res chunking strategy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import annotations

from pathlib import Path

from langchain.docstore.document import Document
from unstructured.documents.elements import Text, Image, Table, CompositeElement
from unstructured.partition.pdf import partition_pdf
class ExtractionPipeline:
    """Extract text chunks and HTML tables from every PDF in a folder.

    The PDFs are discovered once, at construction time, and stored in
    ``pdf_filenames``. ``load_and_split_documents`` then partitions each of
    them with ``partition_pdf`` and wraps the resulting elements in
    ``Document`` objects.
    """

    def __init__(self, folder_path: str | Path) -> None:
        """Remember *folder_path* and collect the PDF files it contains."""
        self.folder_path = folder_path
        self.pdf_filenames = self.get_pdf_filenames_from_folder()

    def get_pdf_filenames_from_folder(self) -> list[Path]:
        """Return the ``*.pdf`` paths directly inside ``self.folder_path``.

        The result is sorted so that processing order is reproducible;
        the glob is not recursive.
        """
        return sorted(Path(self.folder_path).glob("*.pdf"))

    def load_and_split_documents(
        self, folder_path: str | Path | None = None
    ) -> tuple[list[Document], list[Document]]:
        """Partition every discovered PDF into text and table documents.

        Args:
            folder_path: Accepted for backward compatibility only. It is not
                read; the files processed are those in ``self.pdf_filenames``.

        Returns:
            A ``(text_documents, table_documents)`` tuple accumulated over
            all PDFs. Text documents carry the stripped element text and
            ``source_type="text"``; table documents carry the element's
            ``text_as_html`` and ``source_type="table_html"``. Both lists are
            empty when the folder contains no PDF.
        """
        text_documents: list[Document] = []
        table_documents: list[Document] = []

        for pdf_path in self.pdf_filenames:
            print("processing document: ", pdf_path)
            elements = partition_pdf(
                filename=str(pdf_path),
                chunking_strategy="by_title",
                max_characters=2000,
                new_after_n_chars=1800,
                combine_text_under_n_chars=1000,
                infer_table_structure=True,
            )

            # Exact type match on purpose: the original condition
            # (`type(e) == Text or CompositeElement`) was always true and let
            # tables through as "text". isinstance() is avoided here because
            # it would also accept subclasses of Text.
            # NOTE(review): Table appears to derive from Text in unstructured
            # - confirm against the installed version.
            pdf_texts = [
                Document(
                    page_content=element.text.strip(),
                    metadata={
                        "filename": element.metadata.filename,
                        "source_type": "text",
                    },
                )
                for element in elements
                if type(element) in (Text, CompositeElement)
            ]
            print("Number of Detected Text elements: ", len(pdf_texts))

            # Tables
            pdf_tables = [
                Document(
                    page_content=element.metadata.text_as_html,
                    metadata={
                        "filename": element.metadata.filename,
                        "source_type": "table_html",
                    },
                )
                for element in elements
                if type(element) == Table
            ]
            print("Number of Detected HTML Tables: ", len(pdf_tables))

            # Accumulate instead of overwriting so that every PDF contributes
            # to the result, not just one of them.
            text_documents.extend(pdf_texts)
            table_documents.extend(pdf_tables)

        return text_documents, table_documents
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class VisualExtractionPipeline:
    """Extract embedded images from every PDF in a folder.

    The PDFs are discovered once, at construction time. Each one is
    partitioned with the ``hi_res`` strategy, with image blocks extracted to
    ``image_output_dir``; the returned documents hold the image file paths.
    """

    # Output directory used by the original implementation; kept as the
    # default so existing callers see no change.
    DEFAULT_IMAGE_OUTPUT_DIR = "/dli/task/03-Lab/figures/"

    def __init__(
        self,
        folder_path: str | Path,
        image_output_dir: str = DEFAULT_IMAGE_OUTPUT_DIR,
    ) -> None:
        """Remember the folders and collect the PDF files to process.

        Args:
            folder_path: Folder that is searched for ``*.pdf`` files.
            image_output_dir: Value passed to ``partition_pdf`` as
                ``extract_image_block_output_dir``.
        """
        self.folder_path = folder_path
        self.image_output_dir = image_output_dir
        self.pdf_filenames = self.get_pdf_filenames_from_folder()

    def get_pdf_filenames_from_folder(self) -> list[Path]:
        """Return the ``*.pdf`` paths directly inside ``self.folder_path``.

        The result is sorted so that processing order is reproducible;
        the glob is not recursive.
        """
        return sorted(Path(self.folder_path).glob("*.pdf"))

    def load_and_split_documents(
        self, folder_path: str | Path | None = None
    ) -> list[Document]:
        """Partition every discovered PDF and return its image documents.

        Args:
            folder_path: Accepted for backward compatibility only. It is not
                read; the files processed are those in ``self.pdf_filenames``.

        Returns:
            One ``Document`` per ``Image`` element, accumulated over all
            PDFs, whose ``page_content`` is the element's
            ``metadata.image_path`` and whose metadata has
            ``source_type="image"``. Empty when the folder contains no PDF.
        """
        image_documents: list[Document] = []

        for pdf_path in self.pdf_filenames:
            print("processing document: ", pdf_path)
            elements = partition_pdf(
                filename=str(pdf_path),
                strategy="hi_res",
                hi_res_model_name="yolox",
                extract_images_in_pdf=True,
                extract_image_block_types=["Image", "Table"],
                extract_image_block_to_payload=False,
                extract_image_block_output_dir=self.image_output_dir,
            )

            # NOTE(review): "Table" blocks are requested above, but only
            # Image elements are turned into documents; Table elements are
            # deliberately left out of the result, as in the original code.
            # Extend rather than overwrite so every PDF contributes.
            image_documents.extend(
                Document(
                    page_content=element.metadata.image_path,
                    metadata={
                        "filename": element.metadata.filename,
                        "source_type": "image",
                    },
                )
                for element in elements
                if isinstance(element, Image)
            )

        return image_documents
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment