immma · October 20, 2024 06:36
diff --git a/extractor.py b/extractor.py
 import boto3
 import time

 # Initialize the Textract and S3 clients
 textract = boto3.client('textract')
 s3 = boto3.client('s3')

 def start_layout_analysis(document_bucket, document_key):
    """Start an asynchronous Textract document-analysis job for an S3 object.

    Args:
        document_bucket: Name of the S3 bucket that holds the document.
        document_key: Object key of the document inside that bucket.

    Returns:
        The ``JobId`` value taken from the ``start_document_analysis`` response.
    """
    s3_object = {'Bucket': document_bucket, 'Name': document_key}
    # Ask for both layout and table analysis in the same job.
    job = textract.start_document_analysis(
        DocumentLocation={'S3Object': s3_object},
        FeatureTypes=['LAYOUT', 'TABLES'],
    )
    return job['JobId']

 def is_textract_job_complete(job_id, poll_interval=5, timeout=None):
    """Block until the Textract job leaves ``IN_PROGRESS`` and return its status.

    Despite the name, the return value is the ``JobStatus`` string reported by
    ``get_document_analysis``, not a boolean.

    Args:
        job_id: Identifier of the analysis job to poll.
        poll_interval: Seconds to sleep between two status checks (default 5).
        timeout: Optional maximum number of seconds to keep polling. ``None``
            (the default) polls without an upper bound.

    Returns:
        The first ``JobStatus`` value observed that is not ``"IN_PROGRESS"``.

    Raises:
        TimeoutError: If ``timeout`` is given and the job is still in progress
            once that many seconds have elapsed.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        status = textract.get_document_analysis(JobId=job_id)['JobStatus']
        if status != "IN_PROGRESS":
            return status
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f'Textract job {job_id} still IN_PROGRESS after {timeout} seconds'
            )
        time.sleep(poll_interval)

 def get_textract_job_results(job_id):
    """Collect every response of a Textract analysis job, following ``NextToken``.

    Args:
        job_id: Identifier of the analysis job whose results are fetched.

    Returns:
        A list of ``get_document_analysis`` response dicts in the order they
        were received; the list always contains at least the first response.
    """
    responses = []
    request = {'JobId': job_id}
    while True:
        chunk = textract.get_document_analysis(**request)
        responses.append(chunk)
        token = chunk.get('NextToken')
        if not token:
            # No continuation token: this was the last response.
            break
        request['NextToken'] = token
    return responses

 def _index_blocks(pages):
    """Map every block ``Id`` found in *pages* to its block dict."""
    return {
        block['Id']: block
        for page in pages
        for block in page.get('Blocks', [])
    }


 def _child_blocks(block, blocks_by_id):
    """Yield the blocks named by *block*'s ``CHILD`` relationships, skipping unknown ids."""
    for relationship in block.get('Relationships') or []:
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            child = blocks_by_id.get(child_id)
            if child is not None:
                yield child


 def _cell_text(cell, blocks_by_id):
    """Return the space-joined ``Text`` of the ``WORD`` children of *cell*."""
    words = [
        child.get('Text', '')
        for child in _child_blocks(cell, blocks_by_id)
        if child['BlockType'] == 'WORD'
    ]
    return ' '.join(words).strip()


 def get_text_from_cell(cell, page):
    """Extract the text of a table cell from the blocks of one response.

    Args:
        cell: Block dict of the cell; its ``CHILD`` relationships list the ids
            of the blocks that make up its content.
        page: Response dict whose ``Blocks`` list is searched for those ids.

    Returns:
        The ``Text`` of every child block of type ``WORD``, joined with single
        spaces and stripped. Child ids that are not present in *page* are
        skipped; a cell without word children yields ``''``.
    """
    return _cell_text(cell, _index_blocks([page]))


 def extract_table_data(pages):
    """Extract the contents of every ``TABLE`` block in the given responses.

    Blocks are indexed by id across all of *pages* once, so a child id is
    resolved with a dict lookup even when the referenced block arrived in a
    different response than its parent. Ids that cannot be resolved at all are
    skipped instead of raising.

    Args:
        pages: Iterable-once-safe sequence (e.g. list) of response dicts, each
            with a ``Blocks`` list.

    Returns:
        A list with one entry per ``TABLE`` block, in the order encountered.
        Each entry is a list of rows sorted by ``RowIndex``; each row is a list
        of cell strings ordered by ``ColumnIndex``. Every row of a table spans
        all column indices seen anywhere in that table, with ``''`` where a row
        has no cell for a column, so columns stay aligned.
    """
    blocks_by_id = _index_blocks(pages)
    tables_data = []
    for page in pages:
        for block in page.get('Blocks', []):
            if block['BlockType'] != 'TABLE':
                continue

            # rows[row_index][col_index] -> cell text
            rows = {}
            for cell in _child_blocks(block, blocks_by_id):
                if cell['BlockType'] != 'CELL':
                    continue
                row = rows.setdefault(cell['RowIndex'], {})
                row[cell['ColumnIndex']] = _cell_text(cell, blocks_by_id)

            # Use the table-wide set of columns so sparse rows are padded
            # rather than shifted left.
            columns = sorted({col for row in rows.values() for col in row})
            table = [
                [rows[row_index].get(col, '') for col in columns]
                for row_index in sorted(rows)
            ]
            tables_data.append(table)
    return tables_data

 # S3 location of the document to analyze; the key may carry a folder prefix
 # (here the object lives under 'testfolder/').
 document_bucket = 'ardih-extractor'
 document_key = 'testfolder/doctest.pdf'

 # Kick off the analysis job and report its id.
 job_id = start_layout_analysis(document_bucket, document_key)
 print(f'Started Textract Layout Analysis Job with ID: {job_id}')

 # Poll until the job is no longer in progress.
 status = is_textract_job_complete(job_id)
 if status != 'SUCCEEDED':
    print('Textract Layout Analysis job failed!')
 else:
    print('Textract Layout Analysis succeeded!')

    # Download all result responses, then pull the tables out of them.
    pages = get_textract_job_results(job_id)
    tables_data = extract_table_data(pages)

    print(tables_data)
	import boto3
	import time

	# Initialize the Textract and S3 clients
	textract = boto3.client('textract')
	s3 = boto3.client('s3')

	def start_layout_analysis(document_bucket, document_key):
	    """Start an asynchronous Textract document-analysis job for an S3 object.

	    Args:
	        document_bucket: Name of the S3 bucket that holds the document.
	        document_key: Object key of the document inside that bucket.

	    Returns:
	        The ``JobId`` value taken from the ``start_document_analysis`` response.
	    """
	    response = textract.start_document_analysis(
	        DocumentLocation={
	            'S3Object': {
	                'Bucket': document_bucket,
	                'Name': document_key,
	            }
	        },
	        # Ask for both layout and table analysis in the same job.
	        FeatureTypes=['LAYOUT', 'TABLES'],
	    )
	    return response['JobId']

	def is_textract_job_complete(job_id, poll_interval=5, timeout=None):
	    """Block until the Textract job leaves ``IN_PROGRESS`` and return its status.

	    Despite the name, the return value is the ``JobStatus`` string reported by
	    ``get_document_analysis``, not a boolean.

	    Args:
	        job_id: Identifier of the analysis job to poll.
	        poll_interval: Seconds to sleep between two status checks (default 5).
	        timeout: Optional maximum number of seconds to keep polling. ``None``
	            (the default) polls without an upper bound.

	    Returns:
	        The first ``JobStatus`` value observed that is not ``"IN_PROGRESS"``.

	    Raises:
	        TimeoutError: If ``timeout`` is given and the job is still in progress
	            once that many seconds have elapsed.
	    """
	    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
	    deadline = None if timeout is None else time.monotonic() + timeout
	    while True:
	        status = textract.get_document_analysis(JobId=job_id)['JobStatus']
	        if status != "IN_PROGRESS":
	            return status
	        if deadline is not None and time.monotonic() >= deadline:
	            raise TimeoutError(
	                f'Textract job {job_id} still IN_PROGRESS after {timeout} seconds'
	            )
	        time.sleep(poll_interval)

	def get_textract_job_results(job_id):
	    """Collect every response of a Textract analysis job, following ``NextToken``.

	    Args:
	        job_id: Identifier of the analysis job whose results are fetched.

	    Returns:
	        A list of ``get_document_analysis`` response dicts in the order they
	        were received; the list always contains at least the first response.
	    """
	    pages = []
	    response = textract.get_document_analysis(JobId=job_id)
	    pages.append(response)

	    # Keep requesting while the service hands back a continuation token.
	    next_token = response.get('NextToken')
	    while next_token:
	        response = textract.get_document_analysis(JobId=job_id, NextToken=next_token)
	        pages.append(response)
	        next_token = response.get('NextToken')

	    return pages

	def _index_blocks(pages):
	    """Map every block ``Id`` found in *pages* to its block dict."""
	    return {
	        block['Id']: block
	        for page in pages
	        for block in page.get('Blocks', [])
	    }


	def _child_blocks(block, blocks_by_id):
	    """Yield the blocks named by *block*'s ``CHILD`` relationships, skipping unknown ids."""
	    for relationship in block.get('Relationships') or []:
	        if relationship['Type'] != 'CHILD':
	            continue
	        for child_id in relationship['Ids']:
	            child = blocks_by_id.get(child_id)
	            if child is not None:
	                yield child


	def _cell_text(cell, blocks_by_id):
	    """Return the space-joined ``Text`` of the ``WORD`` children of *cell*."""
	    words = [
	        child.get('Text', '')
	        for child in _child_blocks(cell, blocks_by_id)
	        if child['BlockType'] == 'WORD'
	    ]
	    return ' '.join(words).strip()


	def get_text_from_cell(cell, page):
	    """Extract the text of a table cell from the blocks of one response.

	    Args:
	        cell: Block dict of the cell; its ``CHILD`` relationships list the ids
	            of the blocks that make up its content.
	        page: Response dict whose ``Blocks`` list is searched for those ids.

	    Returns:
	        The ``Text`` of every child block of type ``WORD``, joined with single
	        spaces and stripped. Child ids that are not present in *page* are
	        skipped; a cell without word children yields ``''``.
	    """
	    return _cell_text(cell, _index_blocks([page]))


	def extract_table_data(pages):
	    """Extract the contents of every ``TABLE`` block in the given responses.

	    Blocks are indexed by id across all of *pages* once, so a child id is
	    resolved with a dict lookup even when the referenced block arrived in a
	    different response than its parent. Ids that cannot be resolved at all are
	    skipped instead of raising.

	    Args:
	        pages: Sequence (e.g. list) of response dicts, each with a ``Blocks``
	            list.

	    Returns:
	        A list with one entry per ``TABLE`` block, in the order encountered.
	        Each entry is a list of rows sorted by ``RowIndex``; each row is a list
	        of cell strings ordered by ``ColumnIndex``. Every row of a table spans
	        all column indices seen anywhere in that table, with ``''`` where a row
	        has no cell for a column, so columns stay aligned.
	    """
	    blocks_by_id = _index_blocks(pages)
	    tables_data = []
	    for page in pages:
	        for block in page.get('Blocks', []):
	            if block['BlockType'] != 'TABLE':
	                continue

	            # rows[row_index][col_index] -> cell text
	            rows = {}
	            for cell in _child_blocks(block, blocks_by_id):
	                if cell['BlockType'] != 'CELL':
	                    continue
	                row = rows.setdefault(cell['RowIndex'], {})
	                row[cell['ColumnIndex']] = _cell_text(cell, blocks_by_id)

	            # Use the table-wide set of columns so sparse rows are padded
	            # rather than shifted left.
	            columns = sorted({col for row in rows.values() for col in row})
	            table = [
	                [rows[row_index].get(col, '') for col in columns]
	                for row_index in sorted(rows)
	            ]
	            tables_data.append(table)
	    return tables_data

	# S3 location of the document to analyze; the key may carry a folder prefix
	# (here the object lives under 'testfolder/').
	document_bucket = 'ardih-extractor'
	document_key = 'testfolder/doctest.pdf'

	# Kick off the analysis job and report its id.
	job_id = start_layout_analysis(document_bucket, document_key)
	print(f'Started Textract Layout Analysis Job with ID: {job_id}')

	# Wait for the job to leave the in-progress state.
	status = is_textract_job_complete(job_id)
	if status == 'SUCCEEDED':
	    print('Textract Layout Analysis succeeded!')

	    # Fetch all result responses of the analysis.
	    pages = get_textract_job_results(job_id)

	    # Pull the table contents out of the responses.
	    tables_data = extract_table_data(pages)

	    print(tables_data)
	else:
	    print('Textract Layout Analysis job failed!')