Skip to content

Instantly share code, notes, and snippets.

@sgrodnik
Last active March 17, 2025 13:53
Show Gist options
  • Save sgrodnik/bfbb6a7e954c817cb582548e4769f1a7 to your computer and use it in GitHub Desktop.
Save sgrodnik/bfbb6a7e954c817cb582548e4769f1a7 to your computer and use it in GitHub Desktop.
# The script processes multiple RST text files, computes the average for column 6, and saves the result to a new file.
# Users can specify files directly or use wildcards (e.g., *.RST).
# By default, the output filename is generated automatically.
# Brief statistics on processed data, including min/max values of column 6 and empty values per column, are displayed.
# Example call in PowerShell (wildcard):
# python C:\Users\Username\Downloads\average_column6.py "C:\Users\Username\Downloads\SMW-B-1-N-012-*.RST"
# Example call in PowerShell (explicit filenames):
# python C:\Users\Username\Downloads\average_column6.py "C:\Users\Username\Downloads\SMW-B-1-N-012-0.RST" "C:\Users\Username\Downloads\SMW-B-1-N-012-1.RST" -o "C:\Users\Username\Downloads\result.RST"
import pandas as pd
import numpy as np
import argparse
import os
import glob
# Number of header lines at the top of each RST file; data rows start after these.
DATA_START_ROW = 3

# Fixed-width numeric formats for the 8 data columns (right-aligned).
COLUMN_FORMATS = [
    "{:>5.0f}",   # Column 1 (0 decimal places)
    "{:>5.0f}",   # Column 2 (0 decimal places)
    "{:>5.0f}",   # Column 3 (0 decimal places)
    "{:>8.2f}",   # Column 4 (2 decimal places)
    "{:>8.5f}",   # Column 5 (5 decimal places)
    "{:>8.2f}",   # Column 6 (2 decimal places)
    "{:>10.0f}",  # Column 7 (0 decimal places)
    "{:>10.0f}",  # Column 8 (0 decimal places)
]


def expand_input_files(patterns):
    """Expand wildcard patterns into a list of concrete file paths.

    Files whose basename contains 'AVG' are excluded so that reruns do not
    average a previously generated output file back into the result.
    Prints a warning for every pattern that matches nothing.
    """
    file_paths = []
    for pattern in patterns:
        matched_files = [f for f in glob.glob(pattern)
                         if 'AVG' not in os.path.basename(f)]
        if matched_files:
            file_paths.extend(matched_files)
        else:
            print(f"Warning: No files found matching pattern '{pattern}'")
    return file_paths


def default_output_path(first_input):
    """Derive the default output filename from the first input file.

    'DIR/NAME-0.RST' -> 'DIR/NAME-AVG.RST': the trailing '-<suffix>' of the
    stem is replaced with '-AVG'.
    """
    base_dir = os.path.dirname(first_input)
    # splitext is robust to dots inside the filename (split('.')[0] was not)
    base_name = os.path.splitext(os.path.basename(first_input))[0]
    return os.path.join(base_dir, f"{base_name.rsplit('-', 1)[0]}-AVG.RST")


def read_rst_files(file_paths):
    """Read each RST file into an all-float DataFrame, skipping header rows."""
    # sep=r"\s+" replaces the deprecated delim_whitespace=True (pandas 2.x)
    return [
        pd.read_csv(path, sep=r"\s+", skiprows=DATA_START_ROW,
                    header=None, dtype=float)
        for path in file_paths
    ]


def format_rows(result_df):
    """Render DataFrame rows as fixed-width text lines.

    Empty-string cells (formerly NaN) become blank runs with the same width
    as the column they occupy, keeping the columns aligned.
    """
    formatted_lines = []
    for row in result_df.itertuples(index=False, name=None):
        formatted_lines.append(" ".join(
            fmt.format(val) if val != "" else " " * len(fmt.format(0))
            for fmt, val in zip(COLUMN_FORMATS, row)
        ))
    return formatted_lines


def main():
    """CLI entry point: average column 6 across RST files and save the result."""
    parser = argparse.ArgumentParser(
        description='Process RST files and calculate average for column 6')
    parser.add_argument('input_patterns', nargs='+',
                        help='Input RST files (can include wildcards)')
    parser.add_argument('--output', '-o', help='Path to output file')
    args = parser.parse_args()

    file_paths = expand_input_files(args.input_patterns)
    if not file_paths:
        print("Error: No input files found.")
        raise SystemExit(1)

    print(f"Processing {len(file_paths)} files:")
    for file in file_paths:
        print(f" - {file}")

    # Output file: explicit --output wins, otherwise derive an -AVG name
    # from the first input file.
    output_path = args.output if args.output else default_output_path(file_paths[0])

    # Preserve the original header lines from the first input file.
    with open(file_paths[0], "r") as f:
        header_lines = [next(f) for _ in range(DATA_START_ROW)]

    dataframes = read_rst_files(file_paths)

    # Average column 6 (index 5) element-wise across all files.
    # NOTE(review): rows are matched purely by position across files, and a
    # NaN in any file propagates into the average for that row.
    average_column = sum(df.iloc[:, 5] for df in dataframes) / len(dataframes)

    # Build the result on a copy of the first file's data, with column 6
    # replaced by the averages; blank out missing values for formatting.
    result_df = dataframes[0].copy()
    result_df.iloc[:, 5] = average_column
    result_df = result_df.replace(np.nan, "")

    formatted_lines = format_rows(result_df)

    with open(output_path, "w") as f:
        f.writelines(header_lines)           # write original header
        f.write("\n".join(formatted_lines))  # write formatted data

    # Statistics: convert column 6 back to numbers, treating blanks as NaN.
    column_6_values = pd.to_numeric(result_df.iloc[:, 5], errors='coerce')
    total_rows = len(result_df)
    min_value = column_6_values.min(skipna=True)
    max_value = column_6_values.max(skipna=True)
    empty_values_per_column = (result_df == "").sum()

    for i, path in enumerate(file_paths):
        print(f"Input {i+1}: {path}")
    print(f"File saved: {output_path}")
    print(f"Total rows: {total_rows}")
    print(f"Column 6 - Min: {min_value:.2f}, Max: {max_value:.2f}")
    print("Empty values per column:")
    for i, count in enumerate(empty_values_per_column):
        print(f" Column {i+1}: {count}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment