Created
October 20, 2024 06:36
-
-
Save immma/8549a18021574e1f305efb1b2786029c to your computer and use it in GitHub Desktop.
AWS Textract Python PDF table
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import boto3 | |
import time | |
# Module-level AWS clients, created once at import time.
# `textract` is used by the Textract API calls in this file; `s3` is created
# here but is not referenced by the code visible in this file.
textract = boto3.client('textract')
s3 = boto3.client('s3')
def start_layout_analysis(document_bucket, document_key):
    """Start a Textract document-analysis job for a document stored in S3.

    Args:
        document_bucket: Name of the S3 bucket that holds the document.
        document_key: Object key of the document inside that bucket.

    Returns:
        The ``JobId`` value from the ``start_document_analysis`` response.
    """
    s3_object = {'Bucket': document_bucket, 'Name': document_key}
    # Ask for both layout elements and tables in the same job.
    job = textract.start_document_analysis(
        DocumentLocation={'S3Object': s3_object},
        FeatureTypes=['LAYOUT', 'TABLES'],
    )
    return job['JobId']
def is_textract_job_complete(job_id, poll_interval=5, timeout=None):
    """Wait until a Textract analysis job leaves ``IN_PROGRESS``.

    Despite the name, this returns the job's status string rather than a
    boolean; callers compare it against e.g. ``'SUCCEEDED'``.

    Args:
        job_id: Identifier returned when the analysis job was started.
        poll_interval: Seconds to sleep between status checks. Defaults to 5,
            the interval that used to be hard-coded.
        timeout: Optional maximum number of seconds to keep polling. ``None``
            (the default) polls indefinitely, as the function always did.

    Returns:
        The first ``JobStatus`` value observed that is not ``"IN_PROGRESS"``.

    Raises:
        TimeoutError: If ``timeout`` is given and the job is still in
            progress once that many seconds have elapsed.
    """
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        # Only the status field is needed here, so request the smallest page
        # instead of downloading up to a full page of blocks on every poll.
        response = textract.get_document_analysis(JobId=job_id, MaxResults=1)
        status = response['JobStatus']
        if status != "IN_PROGRESS":
            return status
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f'Textract job {job_id} still IN_PROGRESS after {timeout} seconds'
            )
        time.sleep(poll_interval)
def get_textract_job_results(job_id):
    """Collect every paginated response of a finished Textract analysis job.

    Args:
        job_id: Identifier of the analysis job whose results are fetched.

    Returns:
        A list of ``get_document_analysis`` responses, in the order they were
        returned. Each entry is one API response page, followed via
        ``NextToken`` until no token is returned.
    """
    pages = []
    request = {'JobId': job_id}
    while True:
        chunk = textract.get_document_analysis(**request)
        pages.append(chunk)
        token = chunk.get('NextToken')
        if not token:
            # No continuation token: this was the last response page.
            return pages
        request['NextToken'] = token
def get_text_from_cell(cell, page):
    """Return the text of a table cell as a single space-separated string.

    Only children of type ``WORD`` contribute text; any other child block
    type, and any child id that is not present in ``page``, is skipped.

    Args:
        cell: A block dict. Its optional ``'Relationships'`` list is scanned
            for entries of type ``'CHILD'``.
        page: A dict whose ``'Blocks'`` list contains the candidate child
            blocks, each with ``'Id'`` and ``'BlockType'`` keys.

    Returns:
        The ``'Text'`` values of the cell's WORD children joined by single
        spaces, with leading and trailing whitespace stripped. Returns ``''``
        when the cell has no WORD children.
    """
    # Index the blocks once instead of rescanning the whole list for every
    # child id. setdefault keeps the first block seen for an id, which is the
    # block a front-to-back scan would have picked.
    blocks_by_id = {}
    for block in page.get('Blocks', []):
        blocks_by_id.setdefault(block['Id'], block)

    words = []
    for relationship in cell.get('Relationships', []):
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            child = blocks_by_id.get(child_id)
            if child is not None and child['BlockType'] == 'WORD':
                words.append(child.get('Text', ''))
    return ' '.join(words).strip()
def _collect_cell_text(cell, blocks_by_id):
    """Join the text of the WORD children of ``cell``, looked up in ``blocks_by_id``."""
    words = []
    for relationship in cell.get('Relationships', []):
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            child = blocks_by_id.get(child_id)
            if child is not None and child['BlockType'] == 'WORD':
                words.append(child.get('Text', ''))
    return ' '.join(words).strip()


def extract_table_data(pages):
    """Build row/column grids of cell text for every TABLE block in ``pages``.

    Args:
        pages: List of analysis responses, each a dict with a ``'Blocks'``
            list. These are API response pages, so a table's cells (or a
            cell's words) may sit in a different entry than the table itself.

    Returns:
        A list with one entry per TABLE block, in the order the tables appear.
        Each table is a list of rows ordered by ``RowIndex``; each row is a
        list of cell strings ordered by ``ColumnIndex``. Every row of a table
        has the same length: a position with no CELL block is ``''``.
    """
    # Resolve ids against ALL responses, not only the one holding the TABLE
    # block. Looking in a single response raised StopIteration (or silently
    # lost text) when related blocks landed in another paginated response.
    # get_text_from_cell only searches one page, hence the private helper.
    blocks_by_id = {}
    for page in pages:
        for block in page.get('Blocks', []):
            blocks_by_id.setdefault(block['Id'], block)

    tables_data = []
    for page in pages:
        for block in page.get('Blocks', []):
            if block['BlockType'] != 'TABLE':
                continue

            # rows[row_index][col_index] -> cell text
            rows = {}
            for relationship in block.get('Relationships', []):
                if relationship['Type'] != 'CHILD':
                    continue
                for child_id in relationship['Ids']:
                    cell = blocks_by_id.get(child_id)
                    if cell is None or cell['BlockType'] != 'CELL':
                        continue
                    cell_text = _collect_cell_text(cell, blocks_by_id)
                    rows.setdefault(cell['RowIndex'], {})[cell['ColumnIndex']] = cell_text

            # Lay every row out over the same set of columns so that a row
            # missing a cell is padded with '' instead of shifting left.
            columns = sorted({col for cells in rows.values() for col in cells})
            table = [
                [rows[row_index].get(col, '') for col in columns]
                for row_index in sorted(rows)
            ]
            tables_data.append(table)
    return tables_data
# Location of the PDF to analyse. The key may carry a folder prefix, as in
# this example where the object lives under 'testfolder/'.
document_bucket = 'ardih-extractor'
document_key = 'testfolder/doctest.pdf'

job_id = start_layout_analysis(document_bucket, document_key)
print(f'Started Textract Layout Analysis Job with ID: {job_id}')

# Block until the job is no longer in progress, then branch on its status.
status = is_textract_job_complete(job_id)
if status != 'SUCCEEDED':
    print('Textract Layout Analysis job failed!')
else:
    print('Textract Layout Analysis succeeded!')

    # Download every response page, then turn the table blocks into grids.
    pages = get_textract_job_results(job_id)
    tables_data = extract_table_data(pages)
    print(tables_data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment