@dutchLuck
Created July 24, 2025 10:16
Output the basic descriptive stats for one or more CSV data files containing one or more columns of numbers.
#!/usr/bin/env python3
#
# C S V S T A T S . P Y
#
# Output the basic descriptive statistics for one or more data
# files containing one or more columns of numbers. If there are
# multiple columns of numbers they must be separated by commas.
# Comments are allowed in the data files provided they start
# with a # (i.e. hash) character.
#
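# For illustration, a data file accepted by this script could
# look like the following (hypothetical example, not part of
# the original gist):
#
#   # temperature and humidity readings   <- full-line comment, skipped
#   20.1, 45.0
#   21.3, 44.2   # trailing comment, stripped
#   19.8, 47.5
#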
# This code was largely generated by A.I., so check the
# results that it produces.
#
# The code requires scipy, numpy and pandas. Depending on
# your Python installation, you may need to use pip to
# install these libraries if they are not already present.
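# For example (assuming pip is available for your interpreter):
#
#   pip install scipy numpy pandas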
#
#
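# Example invocations (the file names below are hypothetical;
# the options are defined in main() further down):
#
#   python csvstats.py data.csv
#   python csvstats.py -H data1.csv data2.csv
#   python csvstats.py -q -r stats.txt -c stats.csv data.csv
#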
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
import argparse
from io import StringIO
import math
import csv
import os


def analyze_csv(file_path, output, csv_rows, csv_fields, has_header):
    """Analyze one data file: print per-column statistics via output() and append one row per column to csv_rows."""
    try:
        # Strip blank lines, full-line comments and trailing '#' comments before parsing
        data_lines = []
        with open(file_path, 'r') as f:
            for line in f:
                stripped = line.strip()
                if not stripped or stripped.startswith('#'):
                    continue
                line_no_comment = line.split('#', 1)[0].strip()
                if line_no_comment:
                    data_lines.append(line_no_comment + '\n')
        if not data_lines:
            output(f"\n{file_path}: No valid data found.")
            return
        clean_data = StringIO("".join(data_lines))
        if has_header:
            df = pd.read_csv(clean_data, header=0)
        else:
            df = pd.read_csv(clean_data, header=None)
            df.columns = [f"Column {i+1}" for i in range(df.shape[1])]
        num_rows = len(df)
        output(f"\nAnalyzing file: {file_path}")
        output(f"Number of data rows read: {num_rows}")
        output("-" * 80)
        for column in df.columns:
            try:
                # Coerce the column to numeric; non-numeric entries become NaN and are dropped
                data = pd.to_numeric(df[column], errors='coerce').dropna()
                stats = {field: "undefined" for field in csv_fields}
                stats['File'] = os.path.basename(file_path)
                stats['Column'] = column
                if data.empty:
                    output(f"{column} has no valid numeric data.\n")
                    csv_rows.append(stats)
                    continue
                stats['Count'] = len(data)
                stats['Sum'] = data.sum()
                stats['Mean'] = data.mean()
                stats['Median'] = data.median()
                stats['Minimum'] = data.min()
                stats['Maximum'] = data.max()
                stats['Range'] = stats['Maximum'] - stats['Minimum']
                stats['25% Quartile'] = data.quantile(0.25)
                stats['75% Quartile'] = data.quantile(0.75)
                stats['Variance'] = data.var(ddof=1)
                stats['Std Deviation'] = data.std(ddof=1)
                stats['Average Deviation'] = np.mean(np.abs(data - stats['Mean']))
                stats['Skewness'] = skew(data)
                stats['Kurtosis'] = kurtosis(data)
                if stats['Std Deviation'] != 0:
                    stats['Pearson Skewness'] = 3 * (stats['Mean'] - stats['Median']) / stats['Std Deviation']
                # Geometric and harmonic means are computed only when all values are positive
                if all(data > 0):
                    stats['Geometric Mean'] = math.exp(np.mean(np.log(data)))
                    stats['Harmonic Mean'] = len(data) / np.sum(1.0 / data)
                # Mode is only reported for integer-valued columns
                if np.all(data == data.astype(int)):
                    mode_vals = data.mode()
                    if not mode_vals.empty and len(mode_vals) != len(data.unique()):
                        stats['Mode'] = ', '.join(str(v) for v in mode_vals)
                    else:
                        stats['Mode'] = "No mode (all values unique)"
                output(f"Statistics for {column}:")
                for field in csv_fields:
                    if field in ['File', 'Column']:
                        continue
                    output(f" {field:22}: {stats[field]}")
                output("")
                csv_rows.append(stats)
            except Exception as e:
                output(f"Error processing {column} in {file_path}: {e}")
                stats['Error'] = str(e)
                csv_rows.append(stats)
    except FileNotFoundError:
        output(f"File not found: {file_path}")
    except Exception as e:
        output(f"An error occurred reading {file_path}: {e}")


def main():
    parser = argparse.ArgumentParser(
        description="Analyze numeric columns in one or more CSV files. Supports comments, header rows, and detailed text/CSV reports."
    )
    parser.add_argument("files", nargs='+', help="One or more CSV files to analyze")
    parser.add_argument("-c", "--csv-report", help="Optional CSV report output (e.g. stats.csv)")
    parser.add_argument("-H", "--header", action="store_true", help="Treat the first non-comment row as a header row")
    parser.add_argument("-q", "--quiet", action="store_true", help="Suppress terminal output")
    parser.add_argument("-r", "--report", help="Optional text report output (e.g. stats.txt)")
    args = parser.parse_args()
    csv_fields = [
        'File', 'Column', 'Count', 'Minimum', '25% Quartile', 'Median', '75% Quartile', 'Maximum',
        'Range', 'Mode', 'Mean', 'Geometric Mean', 'Harmonic Mean', 'Sum', 'Average Deviation',
        'Variance', 'Std Deviation', 'Skewness', 'Pearson Skewness', 'Kurtosis'
    ]
    csv_rows = []
    # Text output setup
    if args.report:
        report_file = open(args.report, 'w')
        if args.quiet:
            output = lambda msg: print(msg, file=report_file)
        else:
            output = lambda msg: (print(msg), print(msg, file=report_file))
    else:
        output = print if not args.quiet else lambda msg: None
    # Process all files
    for file_path in args.files:
        analyze_csv(file_path, output, csv_rows, csv_fields, has_header=args.header)
    # Write CSV report
    if args.csv_report:
        with open(args.csv_report, 'w', newline='') as csvfile:
            # extrasaction='ignore' skips keys (such as 'Error') that are not listed in csv_fields
            writer = csv.DictWriter(csvfile, fieldnames=csv_fields, extrasaction='ignore')
            writer.writeheader()
            writer.writerows(csv_rows)
        if not args.quiet:
            print(f"\nCSV report saved to: {args.csv_report}")
    if args.report:
        report_file.close()
        if not args.quiet:
            print(f"Text report saved to: {args.report}")


if __name__ == "__main__":
    main()