Created
July 24, 2025 10:16
-
-
Save dutchLuck/81e4ce415c36d1713bf3da5113a54e0d to your computer and use it in GitHub Desktop.
Output the basic descriptive stats for one or more CSV data files containing one or more columns of numbers.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
#
# C S V S T A T S . P Y
#
# Output the basic stats for one or more data files containing
# one or more columns of numbers. If there are multiple columns
# of numbers then they must be separated by a comma. Comments
# are allowed in the data files if the comments are started with
# a # (i.e. hash character).
#
# This code was largely generated by A.I. so check the
# results that it produces.
#
# The code requires scipy, numpy and pandas. Depending on
# your python installation, you may need to use pip to
# install these libraries if they are not already installed.
#
# Standard library imports first, third-party second (PEP 8).
import argparse
import csv
import math
import os
from io import StringIO

import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis
def analyze_csv(file_path, output, csv_rows, csv_fields, has_header):
    """Compute descriptive statistics for every column of one CSV file.

    Blank lines, full-line '#' comments and inline '#' comments are
    stripped before parsing.  One stats dict per column is appended to
    csv_rows; fields that cannot be computed stay the string "undefined".

    Args:
        file_path: Path of the CSV file to analyze.
        output: Callable taking one string; used for all text output.
        csv_rows: List that receives one stats dict per column.
        csv_fields: Keys expected in every stats dict (the CSV report
            header written later by the caller).
        has_header: If True, treat the first non-comment row as names.
    """
    try:
        data_lines = []
        with open(file_path, 'r') as f:
            for line in f:
                stripped = line.strip()
                # Skip blanks and full-line comments entirely.
                if not stripped or stripped.startswith('#'):
                    continue
                # Strip inline comments; keep whatever precedes the '#'.
                line_no_comment = line.split('#', 1)[0].strip()
                if line_no_comment:
                    data_lines.append(line_no_comment + '\n')
        if not data_lines:
            output(f"\n{file_path}: No valid data found.")
            return
        clean_data = StringIO("".join(data_lines))
        if has_header:
            df = pd.read_csv(clean_data, header=0)
        else:
            df = pd.read_csv(clean_data, header=None)
            df.columns = [f"Column {i+1}" for i in range(df.shape[1])]
        num_rows = len(df)
        output(f"\nAnalyzing file: {file_path}")
        output(f"Number of data rows read: {num_rows}")
        output("-" * 80)
        for column in df.columns:
            # Build the stats dict BEFORE the try block: the except
            # handler appends it, and previously a failure inside
            # pd.to_numeric raised NameError on the undefined 'stats'.
            stats = {field: "undefined" for field in csv_fields}
            stats['File'] = os.path.basename(file_path)
            stats['Column'] = column
            try:
                data = pd.to_numeric(df[column], errors='coerce').dropna()
                if data.empty:
                    output(f"{column} has no valid numeric data.\n")
                    csv_rows.append(stats)
                    continue
                stats['Count'] = len(data)
                stats['Sum'] = data.sum()
                stats['Mean'] = data.mean()
                stats['Median'] = data.median()
                stats['Minimum'] = data.min()
                stats['Maximum'] = data.max()
                stats['Range'] = stats['Maximum'] - stats['Minimum']
                stats['25% Quartile'] = data.quantile(0.25)
                stats['75% Quartile'] = data.quantile(0.75)
                stats['Variance'] = data.var(ddof=1)          # sample variance
                stats['Std Deviation'] = data.std(ddof=1)     # sample std dev
                stats['Average Deviation'] = np.mean(np.abs(data - stats['Mean']))
                stats['Skewness'] = skew(data)
                stats['Kurtosis'] = kurtosis(data)
                # Pearson's second skewness coefficient is undefined
                # when the standard deviation is zero.
                if stats['Std Deviation'] != 0:
                    stats['Pearson Skewness'] = 3 * (stats['Mean'] - stats['Median']) / stats['Std Deviation']
                # Geometric/harmonic means only make sense for all-positive data.
                if (data > 0).all():
                    stats['Geometric Mean'] = math.exp(np.mean(np.log(data)))
                    stats['Harmonic Mean'] = len(data) / np.sum(1.0 / data)
                # Mode is only reported for integer-valued columns.
                # 'x % 1 == 0' is safe for non-finite values (inf % 1 is
                # NaN, which compares False), unlike astype(int) which
                # raised an exception on inf.
                if (data % 1 == 0).all():
                    mode_vals = data.mode()
                    if not mode_vals.empty and len(mode_vals) != len(data.unique()):
                        stats['Mode'] = ', '.join(str(v) for v in mode_vals)
                    else:
                        stats['Mode'] = "No mode (all values unique)"
                output(f"Statistics for {column}:")
                for field in csv_fields:
                    if field in ['File', 'Column']:
                        continue
                    output(f"  {field:22}: {stats[field]}")
                output("")
                csv_rows.append(stats)
            except Exception as e:
                # Report the failure but do NOT add keys (e.g. 'Error')
                # that are missing from csv_fields -- csv.DictWriter
                # raises ValueError on unexpected keys.
                output(f"Error processing {column} in {file_path}: {e}")
                csv_rows.append(stats)
    except FileNotFoundError:
        output(f"File not found: {file_path}")
    except Exception as e:
        output(f"An error occurred reading {file_path}: {e}")
def main():
    """Parse the command line and run the stats analysis on each file.

    Flags: -c/--csv-report writes a CSV summary, -r/--report writes a
    text report, -H/--header treats row one as column names, and
    -q/--quiet suppresses terminal output.
    """
    parser = argparse.ArgumentParser(
        description="Analyze numeric columns in one or more CSV files. Supports comments, header rows, and detailed text/CSV reports."
    )
    parser.add_argument("files", nargs='+', help="One or more CSV files to analyze")
    parser.add_argument("-c", "--csv-report", help="Optional CSV report output (e.g. stats.csv)")
    parser.add_argument("-H", "--header", action="store_true", help="Treat the first non-comment row as a header row")
    parser.add_argument("-q", "--quiet", action="store_true", help="Suppress terminal output")
    parser.add_argument("-r", "--report", help="Optional text report output (e.g. stats.txt)")
    args = parser.parse_args()
    csv_fields = [
        'File', 'Column', 'Count', 'Minimum', '25% Quartile', 'Median', '75% Quartile', 'Maximum',
        'Range', 'Mode', 'Mean', 'Geometric Mean', 'Harmonic Mean', 'Sum', 'Average Deviation',
        'Variance', 'Std Deviation', 'Skewness', 'Pearson Skewness', 'Kurtosis'
    ]
    csv_rows = []
    # Text output setup: a single output() honours both destinations,
    # replacing the previous tuple-returning lambdas.
    report_file = open(args.report, 'w') if args.report else None
    try:
        def output(msg):
            if not args.quiet:
                print(msg)
            if report_file is not None:
                print(msg, file=report_file)
        # Process all files
        for file_path in args.files:
            analyze_csv(file_path, output, csv_rows, csv_fields, has_header=args.header)
    finally:
        # Previously the report file leaked if analysis raised.
        if report_file is not None:
            report_file.close()
    # Write CSV report
    if args.csv_report:
        with open(args.csv_report, 'w', newline='') as csvfile:
            # extrasaction='ignore' keeps the writer from raising
            # ValueError if a stats dict ever carries an extra key.
            writer = csv.DictWriter(csvfile, fieldnames=csv_fields, extrasaction='ignore')
            writer.writeheader()
            writer.writerows(csv_rows)
        if not args.quiet:
            print(f"\nCSV report saved to: {args.csv_report}")
    if args.report and not args.quiet:
        print(f"Text report saved to: {args.report}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment