Last active
March 17, 2025 13:53
-
-
Save sgrodnik/bfbb6a7e954c817cb582548e4769f1a7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The script processes multiple RST text files, computes the average for column 6, and saves the result to a new file.
# Users can specify files directly or use wildcards (e.g., *.RST).
# By default, the output filename is generated automatically.
# Brief statistics on processed data, including min/max values of column 6 and empty values per column, are displayed.
# Example call in PowerShell (wildcard):
#   python C:\Users\Username\Downloads\average_column6.py "C:\Users\Username\Downloads\SMW-B-1-N-012-*.RST"
# Example call in PowerShell (explicit filenames):
#   python C:\Users\Username\Downloads\average_column6.py "C:\Users\Username\Downloads\SMW-B-1-N-012-0.RST" "C:\Users\Username\Downloads\SMW-B-1-N-012-1.RST" -o "C:\Users\Username\Downloads\result.RST"
import argparse
import glob
import os
import sys

import numpy as np
import pandas as pd
# ----- Command-line interface -----
# One or more file paths / glob patterns, plus an optional output path.
parser = argparse.ArgumentParser(description='Process RST files and calculate average for column 6')
parser.add_argument('input_patterns', nargs='+', help='Input RST files (can include wildcards)')
parser.add_argument('--output', '-o', help='Path to output file')
args = parser.parse_args()

# Expand wildcards in input patterns.
# Files whose basename contains 'AVG' are skipped so a previously
# generated result file is never fed back into a new averaging run.
file_paths = []
for pattern in args.input_patterns:
    matched_files = glob.glob(pattern)
    matched_files = [f for f in matched_files if 'AVG' not in os.path.basename(f)]
    if matched_files:
        file_paths.extend(matched_files)
    else:
        print(f"Warning: No files found matching pattern '{pattern}'")

# De-duplicate while preserving order: overlapping patterns (or a file named
# both explicitly and via a wildcard) would otherwise count the same file
# twice and skew the average.
file_paths = list(dict.fromkeys(file_paths))

if not file_paths:
    print("Error: No input files found.")
    # sys.exit, not the builtin exit(): exit() is an interactive-session
    # helper injected by the site module and may be absent in scripts.
    sys.exit(1)

print(f"Processing {len(file_paths)} files:")
for file in file_paths:
    print(f" - {file}")
# ----- Output file -----
if args.output:
    output_path = args.output
else:
    # No explicit output: derive a name next to the first input by dropping
    # the trailing '-<index>' part and appending '-AVG', e.g.
    #   SMW-B-1-N-012-0.RST -> SMW-B-1-N-012-AVG.RST
    base_dir = os.path.dirname(file_paths[0])
    # splitext strips only the final extension; split('.')[0] would
    # truncate a file name that happens to contain an interior dot.
    base_name = os.path.splitext(os.path.basename(file_paths[0]))[0]
    output_path = os.path.join(base_dir, f"{base_name.rsplit('-', 1)[0]}-AVG.RST")
# ----- Input parsing -----
# The first three lines of each file are a header; data starts after them.
data_start_row = 3

# Keep the header of the first file so it can be copied verbatim
# into the output and the result stays a well-formed RST file.
with open(file_paths[0], "r") as f:
    header_lines = [next(f) for _ in range(data_start_row)]

# Read every file as whitespace-separated floats, skipping the header.
# sep=r'\s+' replaces delim_whitespace=True, which is deprecated in
# pandas >= 2.1 and removed in 3.0; behavior is identical.
dataframes = [
    pd.read_csv(path, sep=r'\s+', skiprows=data_start_row, header=None, dtype=float)
    for path in file_paths
]
# ----- Averaging -----
# Element-wise mean of column 6 (zero-based index 5) across all inputs.
# A NaN in any input propagates through the sum, so incomplete rows
# remain blank in the output rather than being silently averaged.
column6_series = [frame.iloc[:, 5] for frame in dataframes]
average_column = sum(column6_series) / len(column6_series)

# The result is a copy of the first file with the averaged column swapped in.
result_df = dataframes[0].copy()
result_df.iloc[:, 5] = average_column

# Blank out NaN cells so they are later written as empty fields.
result_df = result_df.replace(np.nan, "")
# ----- Fixed-width column layout -----
# (width, decimal places) for each of the 8 columns, matching the
# original RST layout; expanded into right-aligned float format strings.
_COLUMN_SPECS = [
    (5, 0),   # column 1
    (5, 0),   # column 2
    (5, 0),   # column 3
    (8, 2),   # column 4
    (8, 5),   # column 5
    (8, 2),   # column 6 (the averaged one)
    (10, 0),  # column 7
    (10, 0),  # column 8
]
column_formats = [f"{{:>{width}.{decimals}f}}" for width, decimals in _COLUMN_SPECS]
# ----- Row formatting -----
# Render each row with fixed column widths; empty cells (former NaN)
# become runs of spaces exactly as wide as the formatted column.
# The blank strings are loop-invariant, so compute them once instead of
# calling fmt.format(0) for every empty cell of every row.
blank_cells = [" " * len(fmt.format(0)) for fmt in column_formats]
formatted_lines = []
for row in result_df.itertuples(index=False, name=None):
    cells = [
        fmt.format(val) if val != "" else blank
        for fmt, val, blank in zip(column_formats, row, blank_cells)
    ]
    formatted_lines.append(" ".join(cells))
# ----- Output -----
# Original header first, then the formatted data rows.
with open(output_path, "w") as f:
    f.writelines(header_lines)          # header lines keep their own '\n'
    f.write("\n".join(formatted_lines)) # data rows, newline-separated
    f.write("\n")                       # terminate the last row; the original left the file without a final newline
# ----- Summary statistics -----
# Column 6 now holds floats plus "" placeholders; coerce the blanks back
# to NaN so min/max can skip them.
col6_numeric = pd.to_numeric(result_df.iloc[:, 5], errors='coerce')

row_count = len(result_df)
col6_min = col6_numeric.min(skipna=True)   # NaN cells ignored
col6_max = col6_numeric.max(skipna=True)   # NaN cells ignored
blanks_per_column = (result_df == "").sum()  # empty-cell count, per column

# Report inputs, output location, and the computed statistics.
for idx, path in enumerate(file_paths, start=1):
    print(f"Input {idx}: {path}")
print(f"File saved: {output_path}")
print(f"Total rows: {row_count}")
print(f"Column 6 - Min: {col6_min:.2f}, Max: {col6_max:.2f}")
print("Empty values per column:")
for idx, count in enumerate(blanks_per_column):
    print(f" Column {idx + 1}: {count}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment