Skip to content

Instantly share code, notes, and snippets.

@immma
Created October 20, 2024 06:36
Show Gist options
  • Save immma/8549a18021574e1f305efb1b2786029c to your computer and use it in GitHub Desktop.
Save immma/8549a18021574e1f305efb1b2786029c to your computer and use it in GitHub Desktop.
AWS Textract Python PDF table
import boto3
import time
# Create the AWS service clients once at module level; the functions in this
# file use the `textract` client for every Textract API call.
textract = boto3.client('textract')
# S3 client; not referenced by any of the code visible in this file.
s3 = boto3.client('s3')
def start_layout_analysis(document_bucket, document_key):
    """Start an asynchronous Textract document-analysis job for an S3 object.

    Args:
        document_bucket: Name of the S3 bucket that holds the document.
        document_key: Object key (path) of the document inside the bucket.

    Returns:
        The 'JobId' value from the start_document_analysis response.
    """
    s3_object = {'Bucket': document_bucket, 'Name': document_key}
    # Request both the LAYOUT and TABLES feature types for this job.
    job = textract.start_document_analysis(
        DocumentLocation={'S3Object': s3_object},
        FeatureTypes=['LAYOUT', 'TABLES'],
    )
    return job['JobId']
def is_textract_job_complete(job_id, poll_interval=5, timeout=None):
    """Wait until a Textract analysis job leaves 'IN_PROGRESS'.

    Despite the name, this returns the job's status string, not a bool;
    callers compare the result against 'SUCCEEDED'.

    Args:
        job_id: Identifier returned when the analysis job was started.
        poll_interval: Seconds to sleep between status checks. Defaults to 5.
        timeout: Optional maximum number of seconds to keep polling. ``None``
            (the default) polls until the job stops being in progress.

    Returns:
        The first 'JobStatus' value reported by get_document_analysis that is
        not 'IN_PROGRESS'.

    Raises:
        TimeoutError: If ``timeout`` is given and the job is still in
            progress once that many seconds have elapsed.
    """
    # monotonic() is immune to wall-clock adjustments while we wait.
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        status = textract.get_document_analysis(JobId=job_id)['JobStatus']
        if status != "IN_PROGRESS":
            return status
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f'Textract job {job_id} still IN_PROGRESS after {timeout} seconds'
            )
        time.sleep(poll_interval)
def get_textract_job_results(job_id):
    """Collect every response page of a Textract analysis job.

    Args:
        job_id: Identifier of the analysis job whose results are fetched.

    Returns:
        A list of get_document_analysis response dicts, in the order they
        were returned; the first request carries no 'NextToken'.
    """
    responses = []
    request = {'JobId': job_id}
    while True:
        result = textract.get_document_analysis(**request)
        responses.append(result)
        token = result.get('NextToken')
        if not token:
            # No continuation token: this was the last page of results.
            return responses
        request['NextToken'] = token
def get_text_from_cell(cell, page):
    """Return the text of a table cell as its WORD children joined by spaces.

    Args:
        cell: A block dict; its optional 'Relationships' list is searched for
            entries of type 'CHILD', whose 'Ids' name the child blocks.
        page: A dict whose 'Blocks' list is searched for the child blocks.

    Returns:
        The 'Text' of every child block of type 'WORD', in relationship
        order, joined with single spaces and stripped. Children that are not
        found in ``page`` or are not WORD blocks are ignored. Returns '' when
        the cell has no children.
    """
    child_ids = []
    for relationship in cell.get('Relationships', []):
        if relationship['Type'] == 'CHILD':
            child_ids.extend(relationship['Ids'])
    if not child_ids:
        return ''

    # Resolve all children in one pass over the page instead of rescanning
    # the whole block list once per child id.
    wanted = set(child_ids)
    found = {}
    for block in page.get('Blocks', []):
        block_id = block['Id']
        if block_id in wanted:
            # setdefault keeps the first block with a given id, which is what
            # a front-to-back linear search would have returned.
            found.setdefault(block_id, block)

    words = []
    for child_id in child_ids:
        block = found.get(child_id)
        if block and block['BlockType'] == 'WORD':
            words.append(block.get('Text', ''))
    return ' '.join(words).strip()  # Clean up extra spaces
def _cell_text(cell, blocks_by_id):
    """Join the 'Text' of a cell's WORD children, resolved via an id index.

    Applies the same word-joining rule as get_text_from_cell, but looks
    children up in the prebuilt ``blocks_by_id`` dict.
    """
    words = []
    for relationship in cell.get('Relationships', []):
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            child = blocks_by_id.get(child_id)
            if child and child['BlockType'] == 'WORD':
                words.append(child.get('Text', ''))
    return ' '.join(words).strip()


def extract_table_data(pages):
    """Extract every TABLE block in the Textract results as rows of cell text.

    Args:
        pages: List of response dicts, each with a 'Blocks' list of block
            dicts (as returned by get_textract_job_results).

    Returns:
        A list with one entry per TABLE block, in the order encountered. Each
        table is a list of rows sorted by 'RowIndex'; each row is a list of
        cell strings sorted by 'ColumnIndex'. Every row of a table covers the
        same set of column indices, with '' for positions that have no CELL
        block, so columns stay aligned across rows.
    """
    # Index blocks across ALL responses: result pagination can place a
    # table's cells (or a cell's words) in a different response than the
    # TABLE block itself, so a per-response lookup may not find them.
    blocks_by_id = {}
    for page in pages:
        for block in page.get('Blocks', []):
            blocks_by_id.setdefault(block['Id'], block)

    tables_data = []
    for page in pages:
        for block in page.get('Blocks', []):
            if block['BlockType'] != 'TABLE':
                continue

            # rows[row_index][col_index] -> cell text
            rows = {}
            for relationship in block.get('Relationships', []):
                if relationship['Type'] != 'CHILD':
                    continue
                for child_id in relationship['Ids']:
                    cell = blocks_by_id.get(child_id)
                    # Skip ids that cannot be resolved and non-CELL children.
                    if cell is None or cell['BlockType'] != 'CELL':
                        continue
                    row_cells = rows.setdefault(cell['RowIndex'], {})
                    row_cells[cell['ColumnIndex']] = _cell_text(cell, blocks_by_id)

            # Use the union of column indices so a row that lacks a cell is
            # padded with '' rather than silently shifted left.
            column_indices = sorted({col for cells in rows.values() for col in cells})
            table = [
                [rows[row_index].get(col, '') for col in column_indices]
                for row_index in sorted(rows)
            ]
            tables_data.append(table)
    return tables_data
# Location of the input PDF in S3. The key includes the folder prefix
# ('testfolder/') because the document sits inside a folder in the bucket.
document_bucket, document_key = 'ardih-extractor', 'testfolder/doctest.pdf'

# Kick off the asynchronous analysis job.
job_id = start_layout_analysis(document_bucket, document_key)
print(f'Started Textract Layout Analysis Job with ID: {job_id}')

# Block until the job is no longer in progress, then branch on its status.
status = is_textract_job_complete(job_id)
if status != 'SUCCEEDED':
    print('Textract Layout Analysis job failed!')
else:
    print('Textract Layout Analysis succeeded!')
    # Download all result pages, then pull the tables out of them.
    pages = get_textract_job_results(job_id)
    tables_data = extract_table_data(pages)
    print(tables_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment