Created
July 24, 2025 10:16
-
-
Save dutchLuck/81e4ce415c36d1713bf3da5113a54e0d to your computer and use it in GitHub Desktop.
Output the basic descriptive stats for one or more CSV data files containing one or more columns of numbers.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
#
# C S V S T A T S . P Y
#
# Output the basic stats for one or more data files containing
# one or more columns of numbers. If there are multiple columns
# of numbers then they must be separated by a comma. Comments
# are allowed in the data files if the comments are started with
# a # (i.e. hash character).
#
# This code was largely generated by A.I. so check the
# results that it produces.
#
# The code requires scipy, numpy and pandas. Depending on
# your python installation, you may need to use pip to
# install these libraries if they are not already installed.
#
# Standard library imports first, third-party second (PEP 8).
import argparse
import csv
import math
import os
from io import StringIO

import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis
def analyze_csv(file_path, output, csv_rows, csv_fields, has_header):
    """Compute descriptive statistics for every column of one CSV file.

    Blank lines, full-line '#' comments and inline '#' comments are
    stripped before parsing.  One stats dict per column is appended to
    csv_rows; fields that cannot be computed stay the string "undefined".

    Args:
        file_path: Path of the CSV file to analyze.
        output: Callable taking one string; used for all text output.
        csv_rows: List that receives one stats dict per column.
        csv_fields: Keys expected in every stats dict (the CSV report
            header written later by the caller).
        has_header: If True, treat the first non-comment row as names.
    """
    try:
        data_lines = []
        with open(file_path, 'r') as f:
            for line in f:
                stripped = line.strip()
                # Skip blanks and full-line comments entirely.
                if not stripped or stripped.startswith('#'):
                    continue
                # Strip inline comments; keep whatever precedes the '#'.
                line_no_comment = line.split('#', 1)[0].strip()
                if line_no_comment:
                    data_lines.append(line_no_comment + '\n')
        if not data_lines:
            output(f"\n{file_path}: No valid data found.")
            return
        clean_data = StringIO("".join(data_lines))
        if has_header:
            df = pd.read_csv(clean_data, header=0)
        else:
            df = pd.read_csv(clean_data, header=None)
            df.columns = [f"Column {i+1}" for i in range(df.shape[1])]
        num_rows = len(df)
        output(f"\nAnalyzing file: {file_path}")
        output(f"Number of data rows read: {num_rows}")
        output("-" * 80)
        for column in df.columns:
            # Build the stats dict BEFORE the try block: the except
            # handler appends it, and previously a failure inside
            # pd.to_numeric raised NameError on the undefined 'stats'.
            stats = {field: "undefined" for field in csv_fields}
            stats['File'] = os.path.basename(file_path)
            stats['Column'] = column
            try:
                data = pd.to_numeric(df[column], errors='coerce').dropna()
                if data.empty:
                    output(f"{column} has no valid numeric data.\n")
                    csv_rows.append(stats)
                    continue
                stats['Count'] = len(data)
                stats['Sum'] = data.sum()
                stats['Mean'] = data.mean()
                stats['Median'] = data.median()
                stats['Minimum'] = data.min()
                stats['Maximum'] = data.max()
                stats['Range'] = stats['Maximum'] - stats['Minimum']
                stats['25% Quartile'] = data.quantile(0.25)
                stats['75% Quartile'] = data.quantile(0.75)
                stats['Variance'] = data.var(ddof=1)          # sample variance
                stats['Std Deviation'] = data.std(ddof=1)     # sample std dev
                stats['Average Deviation'] = np.mean(np.abs(data - stats['Mean']))
                stats['Skewness'] = skew(data)
                stats['Kurtosis'] = kurtosis(data)
                # Pearson's second skewness coefficient is undefined
                # when the standard deviation is zero.
                if stats['Std Deviation'] != 0:
                    stats['Pearson Skewness'] = 3 * (stats['Mean'] - stats['Median']) / stats['Std Deviation']
                # Geometric/harmonic means only make sense for all-positive data.
                if (data > 0).all():
                    stats['Geometric Mean'] = math.exp(np.mean(np.log(data)))
                    stats['Harmonic Mean'] = len(data) / np.sum(1.0 / data)
                # Mode is only reported for integer-valued columns.
                # 'x % 1 == 0' is safe for non-finite values (inf % 1 is
                # NaN, which compares False), unlike astype(int) which
                # raised an exception on inf.
                if (data % 1 == 0).all():
                    mode_vals = data.mode()
                    if not mode_vals.empty and len(mode_vals) != len(data.unique()):
                        stats['Mode'] = ', '.join(str(v) for v in mode_vals)
                    else:
                        stats['Mode'] = "No mode (all values unique)"
                output(f"Statistics for {column}:")
                for field in csv_fields:
                    if field in ['File', 'Column']:
                        continue
                    output(f"  {field:22}: {stats[field]}")
                output("")
                csv_rows.append(stats)
            except Exception as e:
                # Report the failure but do NOT add keys (e.g. 'Error')
                # that are missing from csv_fields -- csv.DictWriter
                # raises ValueError on unexpected keys.
                output(f"Error processing {column} in {file_path}: {e}")
                csv_rows.append(stats)
    except FileNotFoundError:
        output(f"File not found: {file_path}")
    except Exception as e:
        output(f"An error occurred reading {file_path}: {e}")
def main():
    """Parse the command line and run the stats analysis on each file.

    Flags: -c/--csv-report writes a CSV summary, -r/--report writes a
    text report, -H/--header treats row one as column names, and
    -q/--quiet suppresses terminal output.
    """
    parser = argparse.ArgumentParser(
        description="Analyze numeric columns in one or more CSV files. Supports comments, header rows, and detailed text/CSV reports."
    )
    parser.add_argument("files", nargs='+', help="One or more CSV files to analyze")
    parser.add_argument("-c", "--csv-report", help="Optional CSV report output (e.g. stats.csv)")
    parser.add_argument("-H", "--header", action="store_true", help="Treat the first non-comment row as a header row")
    parser.add_argument("-q", "--quiet", action="store_true", help="Suppress terminal output")
    parser.add_argument("-r", "--report", help="Optional text report output (e.g. stats.txt)")
    args = parser.parse_args()
    csv_fields = [
        'File', 'Column', 'Count', 'Minimum', '25% Quartile', 'Median', '75% Quartile', 'Maximum',
        'Range', 'Mode', 'Mean', 'Geometric Mean', 'Harmonic Mean', 'Sum', 'Average Deviation',
        'Variance', 'Std Deviation', 'Skewness', 'Pearson Skewness', 'Kurtosis'
    ]
    csv_rows = []
    # Text output setup: a single output() honours both destinations,
    # replacing the previous tuple-returning lambdas.
    report_file = open(args.report, 'w') if args.report else None
    try:
        def output(msg):
            if not args.quiet:
                print(msg)
            if report_file is not None:
                print(msg, file=report_file)
        # Process all files
        for file_path in args.files:
            analyze_csv(file_path, output, csv_rows, csv_fields, has_header=args.header)
    finally:
        # Previously the report file leaked if analysis raised.
        if report_file is not None:
            report_file.close()
    # Write CSV report
    if args.csv_report:
        with open(args.csv_report, 'w', newline='') as csvfile:
            # extrasaction='ignore' keeps the writer from raising
            # ValueError if a stats dict ever carries an extra key.
            writer = csv.DictWriter(csvfile, fieldnames=csv_fields, extrasaction='ignore')
            writer.writeheader()
            writer.writerows(csv_rows)
        if not args.quiet:
            print(f"\nCSV report saved to: {args.csv_report}")
    if args.report and not args.quiet:
        print(f"Text report saved to: {args.report}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment