# NumPy Pro Tips: Data Analysis & EDA Techniques

import numpy as np

# ======================================================================
# 1. STATISTICAL FUNCTIONS & AGGREGATION
# ======================================================================

# Basic descriptive statistics
data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
print(f"Mean: {np.mean(data)}")
print(f"Median: {np.median(data)}")
print(f"Standard Deviation: {np.std(data)}")
print(f"Variance: {np.var(data)}")
print(f"Min: {np.min(data)}, Max: {np.max(data)}")

# Percentiles and quantiles
print(f"25th percentile: {np.percentile(data, 25)}")
print(f"50th percentile: {np.percentile(data, 50)}")  # Same as the median
print(f"75th percentile: {np.percentile(data, 75)}")
print(f"Interquartile range (IQR): {np.percentile(data, 75) - np.percentile(data, 25)}")

# Multi-dimensional aggregation with the axis parameter
array_2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(f"Row means: {np.mean(array_2d, axis=1)}")     # Mean of each row
print(f"Column means: {np.mean(array_2d, axis=0)}")  # Mean of each column

# Weighted statistics
values = np.array([1, 2, 3, 4, 5])
weights = np.array([0.1, 0.2, 0.3, 0.3, 0.1])
weighted_avg = np.average(values, weights=weights)
print(f"Weighted average: {weighted_avg}")

# Running/cumulative statistics
print(f"Cumulative sum: {np.cumsum(data)}")
print(f"Cumulative product: {np.cumprod(data)}")
print(f"Cumulative max: {np.maximum.accumulate(data)}")

# ======================================================================
# 2. VECTORIZED DATA CLEANING & TRANSFORMATION
# ======================================================================

# Handling missing values (NaN)
data_with_nan = np.array([1, 2, np.nan, 4, 5, np.nan, 7])
print(f"Identify NaN values: {np.isnan(data_with_nan)}")
print(f"Count of NaN values: {np.isnan(data_with_nan).sum()}")
print(f"Filtered array (non-NaN): {data_with_nan[~np.isnan(data_with_nan)]}")

# Mean imputation for NaN values
mean_val = np.nanmean(data_with_nan)  # Mean ignoring NaNs
data_imputed = np.where(np.isnan(data_with_nan), mean_val, data_with_nan)
print(f"After mean imputation: {data_imputed}")

# Outlier detection with Z-scores
def detect_outliers_zscore(data, threshold=3):
    """Detect outliers using the Z-score method"""
    z_scores = (data - np.mean(data)) / np.std(data)
    return np.abs(z_scores) > threshold

# Outlier detection with the IQR method
def detect_outliers_iqr(data, k=1.5):
    """Detect outliers using the IQR method"""
    q1, q3 = np.percentile(data, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr
    return (data < lower_bound) | (data > upper_bound)
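
# Quick sanity check of both detectors (added example; the values are
# illustrative, with one obvious outlier at 95):
sample = np.array([10, 12, 11, 13, 12, 95, 11, 12])
print(f"Z-score outliers: {sample[detect_outliers_zscore(sample, threshold=2)]}")
print(f"IQR outliers: {sample[detect_outliers_iqr(sample)]}")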

# Min-max normalization
def min_max_normalize(data):
    """Scale data to the range [0, 1] (assumes max > min; a constant
    array would trigger division by zero here)"""
    return (data - np.min(data)) / (np.max(data) - np.min(data))

# Z-score standardization
def standardize(data):
    """Standardize data to mean=0, std=1"""
    return (data - np.mean(data)) / np.std(data)
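
# Example usage of both scalers (added example with illustrative values):
raw = np.array([10.0, 20.0, 30.0, 40.0, 50.0])
print(f"Min-max normalized: {min_max_normalize(raw)}")  # [0. 0.25 0.5 0.75 1.]
print(f"Standardized: {standardize(raw)}")              # mean ~0, std ~1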

# ======================================================================
# 3. BROADCASTING & VECTORIZATION TRICKS
# ======================================================================

# Element-wise operations are automatically vectorized
a = np.array([1, 2, 3, 4])
b = np.array([5, 6, 7, 8])
print(f"a + b: {a + b}")
print(f"a * b: {a * b}")
print(f"a ** 2: {a ** 2}")
print(f"np.log(a): {np.log(a)}")

# Broadcasting with different shapes
row = np.array([1, 2, 3, 4])
column = np.array([[10], [20], [30]])
print(f"Broadcasting result:\n{column + row}")  # 3x4 result

# Conditional logic without loops
conditions = [data < 3, (data >= 3) & (data < 7), data >= 7]
choices = [data * 2, data, data * 0.5]
result = np.select(conditions, choices, default=data)
print(f"Conditional result: {result}")

# Fast replacement with where
numbers = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
result = np.where(numbers % 2 == 0, "even", "odd")
print(f"Even/odd labels: {result}")

# ======================================================================
# 4. ADVANCED INDEXING & FILTERING
# ======================================================================

# Boolean masking
array = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
mask = array > 5
print(f"Filtered array: {array[mask]}")

# Multiple-condition filtering with logical operators
mask2 = (array > 3) & (array < 8)
print(f"Complex filter: {array[mask2]}")

# Fancy indexing with integer arrays
indices = np.array([0, 2, 5, 7])
print(f"Fancy indexing: {array[indices]}")

# Combining boolean masking and fancy indexing
filtered_indices = np.where(array > 5)[0]  # Indices where the condition is True
print(f"Indices where array > 5: {filtered_indices}")
print(f"Values where array > 5: {array[filtered_indices]}")

# ======================================================================
# 5. EFFICIENT COMPUTATION TECHNIQUES
# ======================================================================

# Element-wise comparison with tolerance
a = np.array([0.1, 0.2, 0.3])
b = np.array([0.10000001, 0.2, 0.30000001])
print(f"Exact equality: {a == b}")
print(f"Equality with tolerance: {np.isclose(a, b)}")
print(f"All close: {np.allclose(a, b)}")

# Optimized linear algebra
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
print(f"Matrix multiplication:\n{np.dot(A, B)}")  # or A @ B in Python 3.5+
print(f"Matrix inverse:\n{np.linalg.inv(A)}")
print(f"Determinant: {np.linalg.det(A)}")
print(f"Eigenvalues: {np.linalg.eigvals(A)}")

# Fast set operations
a = np.array([1, 2, 3, 4, 5])
b = np.array([4, 5, 6, 7, 8])
print(f"Unique values: {np.unique(np.concatenate((a, b)))}")
print(f"Intersection: {np.intersect1d(a, b)}")
print(f"Union: {np.union1d(a, b)}")
print(f"In a but not in b: {np.setdiff1d(a, b)}")
print(f"In either a or b but not both: {np.setxor1d(a, b)}")

# ======================================================================
# 6. MEMORY EFFICIENCY & PERFORMANCE TIPS
# ======================================================================

# View vs. copy
original = np.arange(10)
view = original[2:5]  # Creates a view - changes affect the original
view[0] = 99
print(f"Original after modifying view: {original}")

copy = original[2:5].copy()  # Creates a copy - changes don't affect the original
copy[0] = 88
print(f"Original after modifying copy: {original}")

# Memory-efficient dtype selection
int_array = np.arange(1000)
int_small = np.arange(1000, dtype=np.int16)  # Smaller integer type (values fit in 16 bits)
print(f"Memory usage int_array: {int_array.nbytes} bytes")
print(f"Memory usage int_small: {int_small.nbytes} bytes")

# Pre-allocate arrays for performance
def efficient_preallocate():
    result = np.zeros(1000)
    for i in range(1000):
        result[i] = i  # Fills the existing array in place; no reallocation
    return result
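
# For contrast (an added sketch): growing an array with np.append copies the
# whole array on every call, which is why pre-allocation wins for large n.
def slow_append():
    result = np.array([])
    for i in range(1000):
        result = np.append(result, i)  # O(n) copy on every iteration
    return result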

# Use ufuncs for faster aggregation
large_array = np.random.rand(1000000)
result = np.sum(large_array)  # Fast: a single compiled reduction
# result = sum(large_array)   # Slower: a Python-level loop over 1M elements
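
# A rough timing comparison backing up the claim above (added sketch;
# absolute numbers vary by machine):
import timeit
np_time = timeit.timeit(lambda: np.sum(large_array), number=10)
py_time = timeit.timeit(lambda: sum(large_array), number=10)
print(f"np.sum: {np_time:.4f}s vs built-in sum: {py_time:.4f}s")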

# ======================================================================
# 7. DATA EXPLORATION HELPERS
# ======================================================================

# Generate a descriptive-statistics report
def describe_numpy_array(arr):
    """Generate descriptive statistics for a numpy array"""
    return {
        'shape': arr.shape,
        'size': arr.size,
        'dim': arr.ndim,
        'dtype': arr.dtype,
        'min': np.min(arr),
        'max': np.max(arr),
        'mean': np.mean(arr),
        'median': np.median(arr),
        'std': np.std(arr),
        'var': np.var(arr),
        'Q1': np.percentile(arr, 25),
        'Q3': np.percentile(arr, 75),
        # Nonparametric skew (mean - median) / std, a cheap proxy for the
        # moment-based skewness
        'skewness': (np.mean(arr) - np.median(arr)) / np.std(arr) if np.std(arr) != 0 else 0,
        'unique_values': np.unique(arr).size,
        'missing_values': np.isnan(arr).sum() if np.issubdtype(arr.dtype, np.number) else None
    }
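
# Example usage (added, with illustrative values). Note that the plain
# np.min/np.mean calls above propagate NaNs, so feed a clean array or swap
# in the nan-aware variants (np.nanmean, etc.) when missing values are expected.
report = describe_numpy_array(np.array([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0]))
for key, value in report.items():
    print(f"{key}: {value}")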

# Find patterns in data with the FFT
def find_frequencies(time_series):
    """Find the dominant frequency in time-series data"""
    fft_output = np.fft.fft(time_series)
    power = np.abs(fft_output)
    freq = np.fft.fftfreq(len(time_series))
    # Search only the positive-frequency half of the spectrum (it mirrors
    # the negative half for real input) and skip the DC component at index 0
    half = len(time_series) // 2
    dominant_freq_idx = np.argmax(power[1:half]) + 1
    return {
        'dominant_freq': freq[dominant_freq_idx],
        'dominant_period': 1 / freq[dominant_freq_idx] if freq[dominant_freq_idx] != 0 else np.inf,
        'fft_power': power,
        'frequencies': freq
    }
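
# Example usage on a synthetic signal (added sketch): a sine wave with a
# 20-sample period should yield a dominant frequency of ~0.05 cycles/sample.
t = np.arange(200)
signal = np.sin(2 * np.pi * t / 20)
result = find_frequencies(signal)
print(f"Dominant frequency: {result['dominant_freq']:.3f}")
print(f"Dominant period: {result['dominant_period']:.1f} samples")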