Skip to content

Instantly share code, notes, and snippets.

@sgrodnik
Last active March 17, 2025 13:53
Show Gist options
  • Save sgrodnik/bfbb6a7e954c817cb582548e4769f1a7 to your computer and use it in GitHub Desktop.
Save sgrodnik/bfbb6a7e954c817cb582548e4769f1a7 to your computer and use it in GitHub Desktop.
# The script processes multiple RST text files, computes the average for column 6, and saves the result to a new file.
# Users can specify files directly or use wildcards (e.g., *.RST).
# By default, the output filename is generated automatically.
# Brief statistics on processed data, including min/max values of column 6 and empty values per column, are displayed.
# Example call in PowerShell (wildcard):
# python C:\Users\Username\Downloads\average_column6.py "C:\Users\Username\Downloads\SMW-B-1-N-012-*.RST"
# Example call in PowerShell (explicit filenames):
# python C:\Users\Username\Downloads\average_column6.py "C:\Users\Username\Downloads\SMW-B-1-N-012-0.RST" "C:\Users\Username\Downloads\SMW-B-1-N-012-1.RST" -o "C:\Users\Username\Downloads\result.RST"
import pandas as pd
import numpy as np
import argparse
import os
import glob
# Number of header lines at the top of each RST file; data rows start after these.
DATA_START_ROW = 3

# Fixed-width numeric formats for the 8 data columns (right-aligned).
COLUMN_FORMATS = [
    "{:>5.0f}",   # Column 1 (0 decimal places)
    "{:>5.0f}",   # Column 2 (0 decimal places)
    "{:>5.0f}",   # Column 3 (0 decimal places)
    "{:>8.2f}",   # Column 4 (2 decimal places)
    "{:>8.5f}",   # Column 5 (5 decimal places)
    "{:>8.2f}",   # Column 6 (2 decimal places)
    "{:>10.0f}",  # Column 7 (0 decimal places)
    "{:>10.0f}",  # Column 8 (0 decimal places)
]


def expand_input_files(patterns):
    """Expand wildcard patterns into a list of concrete file paths.

    Files whose basename contains 'AVG' are excluded so that reruns do not
    average a previously generated output file back into the result.
    Prints a warning for every pattern that matches nothing.
    """
    file_paths = []
    for pattern in patterns:
        matched_files = [f for f in glob.glob(pattern)
                         if 'AVG' not in os.path.basename(f)]
        if matched_files:
            file_paths.extend(matched_files)
        else:
            print(f"Warning: No files found matching pattern '{pattern}'")
    return file_paths


def default_output_path(first_input):
    """Derive the default output filename from the first input file.

    'DIR/NAME-0.RST' -> 'DIR/NAME-AVG.RST': the trailing '-<suffix>' of the
    stem is replaced with '-AVG'.
    """
    base_dir = os.path.dirname(first_input)
    # splitext is robust to dots inside the filename (split('.')[0] was not)
    base_name = os.path.splitext(os.path.basename(first_input))[0]
    return os.path.join(base_dir, f"{base_name.rsplit('-', 1)[0]}-AVG.RST")


def read_rst_files(file_paths):
    """Read each RST file into an all-float DataFrame, skipping header rows."""
    # sep=r"\s+" replaces the deprecated delim_whitespace=True (pandas 2.x)
    return [
        pd.read_csv(path, sep=r"\s+", skiprows=DATA_START_ROW,
                    header=None, dtype=float)
        for path in file_paths
    ]


def format_rows(result_df):
    """Render DataFrame rows as fixed-width text lines.

    Empty-string cells (formerly NaN) become blank runs with the same width
    as the column they occupy, keeping the columns aligned.
    """
    formatted_lines = []
    for row in result_df.itertuples(index=False, name=None):
        formatted_lines.append(" ".join(
            fmt.format(val) if val != "" else " " * len(fmt.format(0))
            for fmt, val in zip(COLUMN_FORMATS, row)
        ))
    return formatted_lines


def main():
    """CLI entry point: average column 6 across RST files and save the result."""
    parser = argparse.ArgumentParser(
        description='Process RST files and calculate average for column 6')
    parser.add_argument('input_patterns', nargs='+',
                        help='Input RST files (can include wildcards)')
    parser.add_argument('--output', '-o', help='Path to output file')
    args = parser.parse_args()

    file_paths = expand_input_files(args.input_patterns)
    if not file_paths:
        print("Error: No input files found.")
        raise SystemExit(1)

    print(f"Processing {len(file_paths)} files:")
    for file in file_paths:
        print(f" - {file}")

    # Output file: explicit --output wins, otherwise derive an -AVG name
    # from the first input file.
    output_path = args.output if args.output else default_output_path(file_paths[0])

    # Preserve the original header lines from the first input file.
    with open(file_paths[0], "r") as f:
        header_lines = [next(f) for _ in range(DATA_START_ROW)]

    dataframes = read_rst_files(file_paths)

    # Average column 6 (index 5) element-wise across all files.
    # NOTE(review): rows are matched purely by position across files, and a
    # NaN in any file propagates into the average for that row.
    average_column = sum(df.iloc[:, 5] for df in dataframes) / len(dataframes)

    # Build the result on a copy of the first file's data, with column 6
    # replaced by the averages; blank out missing values for formatting.
    result_df = dataframes[0].copy()
    result_df.iloc[:, 5] = average_column
    result_df = result_df.replace(np.nan, "")

    formatted_lines = format_rows(result_df)

    with open(output_path, "w") as f:
        f.writelines(header_lines)           # write original header
        f.write("\n".join(formatted_lines))  # write formatted data

    # Statistics: convert column 6 back to numbers, treating blanks as NaN.
    column_6_values = pd.to_numeric(result_df.iloc[:, 5], errors='coerce')
    total_rows = len(result_df)
    min_value = column_6_values.min(skipna=True)
    max_value = column_6_values.max(skipna=True)
    empty_values_per_column = (result_df == "").sum()

    for i, path in enumerate(file_paths):
        print(f"Input {i+1}: {path}")
    print(f"File saved: {output_path}")
    print(f"Total rows: {total_rows}")
    print(f"Column 6 - Min: {min_value:.2f}, Max: {max_value:.2f}")
    print("Empty values per column:")
    for i, count in enumerate(empty_values_per_column):
        print(f" Column {i+1}: {count}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment