# NumPy Pro Tips: Data Analysis & EDA Techniques

import numpy as np

# ======================================================================
# 1. STATISTICAL FUNCTIONS & AGGREGATION
# ======================================================================

# Basic descriptive statistics
data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
print(f"Mean: {np.mean(data)}")
print(f"Median: {np.median(data)}")
print(f"Standard Deviation: {np.std(data)}")
print(f"Variance: {np.var(data)}")
print(f"Min: {np.min(data)}, Max: {np.max(data)}")

# Percentiles and quantiles
print(f"25th percentile: {np.percentile(data, 25)}")
print(f"50th percentile: {np.percentile(data, 50)}")  # Same as the median
print(f"75th percentile: {np.percentile(data, 75)}")
print(f"Interquartile range (IQR): {np.percentile(data, 75) - np.percentile(data, 25)}")

# Multi-dimensional aggregation with the axis parameter
array_2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(f"Row means: {np.mean(array_2d, axis=1)}")     # Mean of each row
print(f"Column means: {np.mean(array_2d, axis=0)}")  # Mean of each column

# Weighted statistics
values = np.array([1, 2, 3, 4, 5])
weights = np.array([0.1, 0.2, 0.3, 0.3, 0.1])
weighted_avg = np.average(values, weights=weights)
print(f"Weighted average: {weighted_avg}")

# Running/cumulative statistics
print(f"Cumulative sum: {np.cumsum(data)}")
print(f"Cumulative product: {np.cumprod(data)}")
print(f"Cumulative max: {np.maximum.accumulate(data)}")

# ======================================================================
# 2. VECTORIZED DATA CLEANING & TRANSFORMATION
# ======================================================================

# Handling missing values (NaN)
data_with_nan = np.array([1, 2, np.nan, 4, 5, np.nan, 7])
print(f"Identify NaN values: {np.isnan(data_with_nan)}")
print(f"Count of NaN values: {np.isnan(data_with_nan).sum()}")
print(f"Filtered array (non-NaN): {data_with_nan[~np.isnan(data_with_nan)]}")

# Mean imputation for NaN values
mean_val = np.nanmean(data_with_nan)  # Mean ignoring NaNs
data_imputed = np.where(np.isnan(data_with_nan), mean_val, data_with_nan)
print(f"After mean imputation: {data_imputed}")

# Outlier detection with Z-scores
def detect_outliers_zscore(data, threshold=3):
    """Detect outliers using the Z-score method"""
    z_scores = (data - np.mean(data)) / np.std(data)
    return np.abs(z_scores) > threshold

# Outlier detection with the IQR method
def detect_outliers_iqr(data, k=1.5):
    """Detect outliers using the IQR method"""
    q1, q3 = np.percentile(data, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr
    return (data < lower_bound) | (data > upper_bound)
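
# Quick sanity check of both detectors (added example; the values are
# illustrative, with one obvious outlier at 95):
sample = np.array([10, 12, 11, 13, 12, 95, 11, 12])
print(f"Z-score outliers: {sample[detect_outliers_zscore(sample, threshold=2)]}")
print(f"IQR outliers: {sample[detect_outliers_iqr(sample)]}")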

# Min-max normalization
def min_max_normalize(data):
    """Scale data to the range [0, 1] (assumes max > min; a constant
    array would trigger division by zero here)"""
    return (data - np.min(data)) / (np.max(data) - np.min(data))

# Z-score standardization
def standardize(data):
    """Standardize data to mean=0, std=1"""
    return (data - np.mean(data)) / np.std(data)
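
# Example usage of both scalers (added example with illustrative values):
raw = np.array([10.0, 20.0, 30.0, 40.0, 50.0])
print(f"Min-max normalized: {min_max_normalize(raw)}")  # [0. 0.25 0.5 0.75 1.]
print(f"Standardized: {standardize(raw)}")              # mean ~0, std ~1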

# ======================================================================
# 3. BROADCASTING & VECTORIZATION TRICKS
# ======================================================================

# Element-wise operations are automatically vectorized
a = np.array([1, 2, 3, 4])
b = np.array([5, 6, 7, 8])
print(f"a + b: {a + b}")
print(f"a * b: {a * b}")
print(f"a ** 2: {a ** 2}")
print(f"np.log(a): {np.log(a)}")

# Broadcasting with different shapes
row = np.array([1, 2, 3, 4])
column = np.array([[10], [20], [30]])
print(f"Broadcasting result:\n{column + row}")  # 3x4 result

# Conditional logic without loops
conditions = [data < 3, (data >= 3) & (data < 7), data >= 7]
choices = [data * 2, data, data * 0.5]
result = np.select(conditions, choices, default=data)
print(f"Conditional result: {result}")

# Fast replacement with where
numbers = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
result = np.where(numbers % 2 == 0, "even", "odd")
print(f"Even/odd labels: {result}")

# ======================================================================
# 4. ADVANCED INDEXING & FILTERING
# ======================================================================

# Boolean masking
array = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
mask = array > 5
print(f"Filtered array: {array[mask]}")

# Multiple-condition filtering with logical operators
mask2 = (array > 3) & (array < 8)
print(f"Complex filter: {array[mask2]}")

# Fancy indexing with integer arrays
indices = np.array([0, 2, 5, 7])
print(f"Fancy indexing: {array[indices]}")

# Combining boolean masking and fancy indexing
filtered_indices = np.where(array > 5)[0]  # Indices where the condition is True
print(f"Indices where array > 5: {filtered_indices}")
print(f"Values where array > 5: {array[filtered_indices]}")

# ======================================================================
# 5. EFFICIENT COMPUTATION TECHNIQUES
# ======================================================================

# Element-wise comparison with tolerance
a = np.array([0.1, 0.2, 0.3])
b = np.array([0.10000001, 0.2, 0.30000001])
print(f"Exact equality: {a == b}")
print(f"Equality with tolerance: {np.isclose(a, b)}")
print(f"All close: {np.allclose(a, b)}")

# Optimized linear algebra
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
print(f"Matrix multiplication:\n{np.dot(A, B)}")  # or A @ B in Python 3.5+
print(f"Matrix inverse:\n{np.linalg.inv(A)}")
print(f"Determinant: {np.linalg.det(A)}")
print(f"Eigenvalues: {np.linalg.eigvals(A)}")

# Fast set operations
a = np.array([1, 2, 3, 4, 5])
b = np.array([4, 5, 6, 7, 8])
print(f"Unique values: {np.unique(np.concatenate((a, b)))}")
print(f"Intersection: {np.intersect1d(a, b)}")
print(f"Union: {np.union1d(a, b)}")
print(f"In a but not in b: {np.setdiff1d(a, b)}")
print(f"In either a or b but not both: {np.setxor1d(a, b)}")

# ======================================================================
# 6. MEMORY EFFICIENCY & PERFORMANCE TIPS
# ======================================================================

# View vs. copy
original = np.arange(10)
view = original[2:5]  # Creates a view - changes affect the original
view[0] = 99
print(f"Original after modifying view: {original}")

copy = original[2:5].copy()  # Creates a copy - changes don't affect the original
copy[0] = 88
print(f"Original after modifying copy: {original}")

# Memory-efficient dtype selection
int_array = np.arange(1000)
int_small = np.arange(1000, dtype=np.int16)  # Smaller integer type (values fit in 16 bits)
print(f"Memory usage int_array: {int_array.nbytes} bytes")
print(f"Memory usage int_small: {int_small.nbytes} bytes")

# Pre-allocate arrays for performance
def efficient_preallocate():
    result = np.zeros(1000)
    for i in range(1000):
        result[i] = i  # Fills the existing array in place; no reallocation
    return result
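
# For contrast (an added sketch): growing an array with np.append copies the
# whole array on every call, which is why pre-allocation wins for large n.
def slow_append():
    result = np.array([])
    for i in range(1000):
        result = np.append(result, i)  # O(n) copy on every iteration
    return result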

# Use ufuncs for faster aggregation
large_array = np.random.rand(1000000)
result = np.sum(large_array)  # Fast: a single compiled reduction
# result = sum(large_array)   # Slower: a Python-level loop over 1M elements
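
# A rough timing comparison backing up the claim above (added sketch;
# absolute numbers vary by machine):
import timeit
np_time = timeit.timeit(lambda: np.sum(large_array), number=10)
py_time = timeit.timeit(lambda: sum(large_array), number=10)
print(f"np.sum: {np_time:.4f}s vs built-in sum: {py_time:.4f}s")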

# ======================================================================
# 7. DATA EXPLORATION HELPERS
# ======================================================================

# Generate a descriptive-statistics report
def describe_numpy_array(arr):
    """Generate descriptive statistics for a numpy array"""
    return {
        'shape': arr.shape,
        'size': arr.size,
        'dim': arr.ndim,
        'dtype': arr.dtype,
        'min': np.min(arr),
        'max': np.max(arr),
        'mean': np.mean(arr),
        'median': np.median(arr),
        'std': np.std(arr),
        'var': np.var(arr),
        'Q1': np.percentile(arr, 25),
        'Q3': np.percentile(arr, 75),
        # Nonparametric skew (mean - median) / std, a cheap proxy for the
        # moment-based skewness
        'skewness': (np.mean(arr) - np.median(arr)) / np.std(arr) if np.std(arr) != 0 else 0,
        'unique_values': np.unique(arr).size,
        'missing_values': np.isnan(arr).sum() if np.issubdtype(arr.dtype, np.number) else None
    }
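
# Example usage (added, with illustrative values). Note that the plain
# np.min/np.mean calls above propagate NaNs, so feed a clean array or swap
# in the nan-aware variants (np.nanmean, etc.) when missing values are expected.
report = describe_numpy_array(np.array([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]))
for key, value in report.items():
    print(f"{key}: {value}")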

# Find patterns in data with the FFT
def find_frequencies(time_series):
    """Find the dominant frequency in time-series data"""
    fft_output = np.fft.fft(time_series)
    power = np.abs(fft_output)
    freq = np.fft.fftfreq(len(time_series))
    # Search only the positive-frequency half of the spectrum (it mirrors
    # the negative half for real input) and skip the DC component at index 0
    half = len(time_series) // 2
    dominant_freq_idx = np.argmax(power[1:half]) + 1
    return {
        'dominant_freq': freq[dominant_freq_idx],
        'dominant_period': 1 / freq[dominant_freq_idx] if freq[dominant_freq_idx] != 0 else np.inf,
        'fft_power': power,
        'frequencies': freq
    }
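
# Example usage on a synthetic signal (added sketch): a sine wave with a
# 20-sample period should yield a dominant frequency of ~0.05 cycles/sample.
t = np.arange(200)
signal = np.sin(2 * np.pi * t / 20)
result = find_frequencies(signal)
print(f"Dominant frequency: {result['dominant_freq']:.3f}")
print(f"Dominant period: {result['dominant_period']:.1f} samples")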