filipeandre · April 21, 2025 14:52
diff --git a/extract_data_frame_paddleocr.py b/extract_data_frame_paddleocr.py
 # define functions to run the ocr,
 # annotate the images with the bounding boxes
 # consolidate the results to a dataframe
 # and iterate over a folder of images
 from PIL import Image, ImageDraw
 import pandas as pd
 from paddleocr import PaddleOCR
 import os
 import traceback


 def paddle_inference(img_path, lang='en'):
    """Run PaddleOCR on one image, save an annotated copy, and tabulate the hits.

    Args:
        img_path: Path of the image to read.
        lang: Language code forwarded to ``PaddleOCR``.

    Returns:
        ``(result_path, df)`` where ``result_path`` is the annotated JPEG
        written under ``results/`` and ``df`` has one row per detection with
        the columns ``coordinates``, ``text`` and ``confidence``.
        ``(None, None)`` when nothing usable was detected.
    """
    # NOTE: the engine is constructed on every call; callers that process many
    # images pay that construction cost once per image.
    ocr = PaddleOCR(use_gpu=True, use_angle_cls=True, lang=lang, min_subgraph_size=30)

    ocr_result = ocr.ocr(img_path, cls=True)

    results = []
    # Some PaddleOCR releases report "no detections" as None (either for the
    # whole result or for an individual page), so guard both levels before
    # unpacking instead of crashing on a blank image.
    for line in ocr_result or []:
        if not line:
            continue
        for boxes, txt_info in line:
            if len(txt_info) == 2:  # expect exactly (text, confidence)
                text, confidence = txt_info
                results.append({
                    'coordinates': boxes,
                    'text': text,
                    'confidence': confidence
                })

    if not results:
        print("No text detected.")
        return None, None

    # The context manager releases the file handle; convert() yields an
    # independent in-memory copy that stays valid after the file is closed.
    with Image.open(img_path) as source:
        image = source.convert('RGB')
    draw = ImageDraw.Draw(image)

    for res in results:
        # Flatten [[x, y], ...] into [x, y, x, y, ...] for PIL drawing
        coordinates = [pt for box in res['coordinates'] for pt in box]
        draw.polygon(coordinates, outline='red')

    # Base filename without directory or extension
    base_filename = os.path.splitext(os.path.basename(img_path))[0]

    # image.save() does not create missing directories, so make sure the
    # output folder exists before writing.
    os.makedirs('results', exist_ok=True)
    result_path = f'results/{base_filename}_result.jpg'
    image.save(result_path)

    df = pd.DataFrame(results)

    return result_path, df

 # Define the consolidate_ocr_results function
 def consolidate_ocr_results(filename, ocr_df):
    """Collapse one image's OCR rows into a single summary row.

    Only rows whose confidence is numeric and strictly greater than 0.25 are
    kept. The input frame is not modified.

    Args:
        filename: Value stored in the ``Filename`` column.
        ocr_df: DataFrame with ``text`` and ``confidence`` columns.

    Returns:
        A one-row DataFrame with the columns ``Filename``, ``OCR_Text``
        (kept texts joined by spaces) and ``Scores`` (kept confidences
        joined by spaces).
    """
    # Coerce into a separate Series rather than assigning back into ocr_df,
    # so the caller's frame is left untouched; unparseable values become NaN.
    confidence = pd.to_numeric(ocr_df['confidence'], errors='coerce')

    # NaN > 0.25 evaluates to False, so non-numeric scores are dropped as well.
    keep = confidence > 0.25

    # str() keeps join() from raising on a non-string text entry.
    all_text = ' '.join(str(text) for text in ocr_df.loc[keep, 'text'])
    all_scores = ' '.join(str(score) for score in confidence[keep])

    result_df = pd.DataFrame({
        'Filename': [filename],
        'OCR_Text': [all_text],
        'Scores': [all_scores]
    })

    return result_df

 # Define the process_folder function
 def process_folder(folder_path, lang='en'):
    """OCR every supported image in a folder and stack the per-image summaries.

    Failures on individual files are printed and appended to ``errors.txt``
    inside ``folder_path``; processing then continues with the next file.

    Args:
        folder_path: Directory to scan (not recursive).
        lang: Language code forwarded to ``paddle_inference``.

    Returns:
        A DataFrame with one row per image that produced OCR output, or an
        empty DataFrame when no image did.
    """
    supported_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')
    # os.listdir() returns entries in arbitrary order, so sort for a
    # reproducible row order; isfile() skips directories whose names merely
    # look like images.
    files = sorted(
        f for f in os.listdir(folder_path)
        if f.lower().endswith(supported_extensions)
        and os.path.isfile(os.path.join(folder_path, f))
    )

    results_dfs = []

    for file in files:
        file_path = os.path.join(folder_path, file)
        try:
            # The annotated-image path is not needed here.
            _, ocr_dataframe = paddle_inference(file_path, lang)
            if ocr_dataframe is not None:
                results_dfs.append(consolidate_ocr_results(file, ocr_dataframe))
        except Exception as e:
            # Deliberate best-effort: one bad image must not abort the batch.
            error_message = f"An error occurred while processing {file}: {e}\n{traceback.format_exc()}\n"
            print(error_message)
            # Explicit UTF-8 so non-ASCII filenames cannot break the error log.
            with open(os.path.join(folder_path, 'errors.txt'), 'a', encoding='utf-8') as error_file:
                error_file.write(error_message)

    if not results_dfs:
        return pd.DataFrame()

    return pd.concat(results_dfs, ignore_index=True)
	# define functions to run the ocr,
	# annotate the images with the bounding boxes
	# consolidate the results to a dataframe
	# and iterate over a folder of images
	from PIL import Image, ImageDraw
	import pandas as pd
	from paddleocr import PaddleOCR
	import os
	import traceback


	def paddle_inference(img_path, lang='en'):
	    """Run PaddleOCR on one image, save an annotated copy, and tabulate the hits.

	    Args:
	        img_path: Path of the image to read.
	        lang: Language code forwarded to ``PaddleOCR``.

	    Returns:
	        ``(result_path, df)`` where ``result_path`` is the annotated JPEG
	        written under ``results/`` and ``df`` has one row per detection with
	        the columns ``coordinates``, ``text`` and ``confidence``.
	        ``(None, None)`` when nothing usable was detected.
	    """
	    # NOTE: the engine is constructed on every call; callers that process many
	    # images pay that construction cost once per image.
	    ocr = PaddleOCR(use_gpu=True, use_angle_cls=True, lang=lang, min_subgraph_size=30)

	    ocr_result = ocr.ocr(img_path, cls=True)

	    results = []
	    # Some PaddleOCR releases report "no detections" as None (either for the
	    # whole result or for an individual page), so guard both levels before
	    # unpacking instead of crashing on a blank image.
	    for line in ocr_result or []:
	        if not line:
	            continue
	        for boxes, txt_info in line:
	            if len(txt_info) == 2:  # expect exactly (text, confidence)
	                text, confidence = txt_info
	                results.append({
	                    'coordinates': boxes,
	                    'text': text,
	                    'confidence': confidence
	                })

	    if not results:
	        print("No text detected.")
	        return None, None

	    # The context manager releases the file handle; convert() yields an
	    # independent in-memory copy that stays valid after the file is closed.
	    with Image.open(img_path) as source:
	        image = source.convert('RGB')
	    draw = ImageDraw.Draw(image)

	    for res in results:
	        # Flatten [[x, y], ...] into [x, y, x, y, ...] for PIL drawing
	        coordinates = [pt for box in res['coordinates'] for pt in box]
	        draw.polygon(coordinates, outline='red')

	    # Base filename without directory or extension
	    base_filename = os.path.splitext(os.path.basename(img_path))[0]

	    # image.save() does not create missing directories, so make sure the
	    # output folder exists before writing.
	    os.makedirs('results', exist_ok=True)
	    result_path = f'results/{base_filename}_result.jpg'
	    image.save(result_path)

	    df = pd.DataFrame(results)

	    return result_path, df

	# Define the consolidate_ocr_results function
	def consolidate_ocr_results(filename, ocr_df):
	    """Collapse one image's OCR rows into a single summary row.

	    Only rows whose confidence is numeric and strictly greater than 0.25 are
	    kept. The input frame is not modified.

	    Args:
	        filename: Value stored in the ``Filename`` column.
	        ocr_df: DataFrame with ``text`` and ``confidence`` columns.

	    Returns:
	        A one-row DataFrame with the columns ``Filename``, ``OCR_Text``
	        (kept texts joined by spaces) and ``Scores`` (kept confidences
	        joined by spaces).
	    """
	    # Coerce into a separate Series rather than assigning back into ocr_df,
	    # so the caller's frame is left untouched; unparseable values become NaN.
	    confidence = pd.to_numeric(ocr_df['confidence'], errors='coerce')

	    # NaN > 0.25 evaluates to False, so non-numeric scores are dropped as well.
	    keep = confidence > 0.25

	    # str() keeps join() from raising on a non-string text entry.
	    all_text = ' '.join(str(text) for text in ocr_df.loc[keep, 'text'])
	    all_scores = ' '.join(str(score) for score in confidence[keep])

	    result_df = pd.DataFrame({
	        'Filename': [filename],
	        'OCR_Text': [all_text],
	        'Scores': [all_scores]
	    })

	    return result_df

	# Define the process_folder function
	def process_folder(folder_path, lang='en'):
	    """OCR every supported image in a folder and stack the per-image summaries.

	    Failures on individual files are printed and appended to ``errors.txt``
	    inside ``folder_path``; processing then continues with the next file.

	    Args:
	        folder_path: Directory to scan (not recursive).
	        lang: Language code forwarded to ``paddle_inference``.

	    Returns:
	        A DataFrame with one row per image that produced OCR output, or an
	        empty DataFrame when no image did.
	    """
	    supported_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff')
	    # os.listdir() returns entries in arbitrary order, so sort for a
	    # reproducible row order; isfile() skips directories whose names merely
	    # look like images.
	    files = sorted(
	        f for f in os.listdir(folder_path)
	        if f.lower().endswith(supported_extensions)
	        and os.path.isfile(os.path.join(folder_path, f))
	    )

	    results_dfs = []

	    for file in files:
	        file_path = os.path.join(folder_path, file)
	        try:
	            # The annotated-image path is not needed here.
	            _, ocr_dataframe = paddle_inference(file_path, lang)
	            if ocr_dataframe is not None:
	                results_dfs.append(consolidate_ocr_results(file, ocr_dataframe))
	        except Exception as e:
	            # Deliberate best-effort: one bad image must not abort the batch.
	            error_message = f"An error occurred while processing {file}: {e}\n{traceback.format_exc()}\n"
	            print(error_message)
	            # Explicit UTF-8 so non-ASCII filenames cannot break the error log.
	            with open(os.path.join(folder_path, 'errors.txt'), 'a', encoding='utf-8') as error_file:
	                error_file.write(error_message)

	    if not results_dfs:
	        return pd.DataFrame()

	    return pd.concat(results_dfs, ignore_index=True)