# Pandas Power Tips: Essential Techniques for Data Manipulation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# ======================================================================
# 1. DATA TYPE CONVERSION & HANDLING
# ======================================================================
# Convert to numeric with error handling
df = pd.DataFrame({'A': ['1', '2', 'x', '4']})
df['A_numeric'] = pd.to_numeric(df['A'], errors='coerce')  # 'x' becomes NaN
df['A_ignore'] = pd.to_numeric(df['A'], errors='ignore')   # keeps 'x' as string (deprecated in pandas >= 2.2)
# pd.to_numeric(df['A'], errors='raise')                   # would raise ValueError because of 'x'
# Advanced datetime conversion with mixed formats
df = pd.DataFrame({'date': ['2023-01-01', '01/02/2023', 'Jan 3, 2023']})
df['date_parsed'] = pd.to_datetime(df['date'], format='mixed', errors='coerce')  # parse each value independently (pandas >= 2.0)
# Automatic data type detection on CSV import (pandas infers dtypes by default)
df = pd.read_csv('file.csv')
df = df.convert_dtypes()  # optionally upgrade to nullable extension dtypes
# Optimize memory usage by downcasting numeric types
def reduce_mem_usage(df, verbose=True):
    """Reduce memory usage of DataFrame by downcasting numeric types."""
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    reduction = 100 * (start_mem - end_mem) / start_mem
    if verbose:
        print(f'Memory usage reduced from {start_mem:.2f}MB to {end_mem:.2f}MB ({reduction:.2f}% reduction)')
    return df
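# Example usage (sketch): `sample_df` is a hypothetical DataFrame; any frame with
# numeric columns works. Note that float16 can lose precision, so keep a copy if
# exact values matter.
sample_df = pd.DataFrame({
    'id': np.arange(100_000),
    'score': np.random.rand(100_000) * 100
})
sample_df = reduce_mem_usage(sample_df)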
# ======================================================================
# 2. CLEANING TECHNIQUES
# ======================================================================
# Identify and handle duplicates intelligently
df = pd.DataFrame({'A': [1, 2, 1, 3], 'B': [1, 2, 1, 4]})
duplicate_rows = df[df.duplicated(keep=False)]  # All rows that appear more than once
df_unique = df.drop_duplicates(subset=['A'])    # Keep first occurrence of each A value
# Missing value handling - core techniques
df = pd.DataFrame({'A': [1, 2, np.nan, 4], 'B': [5, np.nan, np.nan, 8]})
# Drop rows with any NaN values
df_clean = df.dropna(inplace=False)  # Returns a new DataFrame without NaN rows
# df.dropna(inplace=True)  # Modifies the original DataFrame instead
# Drop rows with NaN only in specific columns
df_clean_subset = df.dropna(subset=['A'])  # Only consider NaN in column A
# Fill all NaN values with a constant
df_filled = df.fillna(0)  # Replace all NaN with 0
# Advanced missing value imputation
df['A_filled'] = df['A'].fillna(df['A'].mean())  # Fill with column mean
df['B_filled'] = df['B'].ffill()  # Forward fill (fillna(method='ffill') is deprecated)
# Fill missing values based on group statistics
df = pd.DataFrame({
    'group': ['A', 'A', 'B', 'B', 'B'],
    'value': [1, np.nan, 3, np.nan, 5]
})
group_means = df.groupby('group')['value'].transform('mean')
df['value_filled'] = df['value'].fillna(group_means)
# Smart string cleaning with regex
df = pd.DataFrame({'text': [' Hello, World! ', 'Python 3.9 ', ' Data Science ']})
df['text_clean'] = df['text'].str.strip().str.lower()  # Strip whitespace, convert to lowercase
df['text_no_punct'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)  # Remove punctuation
df['text_numbers'] = df['text'].str.extract(r'(\d+\.?\d*)')  # Extract numbers
# Helper to detect and remove outliers using the IQR method
def remove_outliers(df, column):
    """Remove outliers using the IQR method."""
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
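# Example usage (sketch): `prices_df` is a made-up frame with two obvious outliers;
# the helper keeps only rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
prices_df = pd.DataFrame({'price': [10, 12, 11, 13, 12, 200, -150, 11]})
prices_clean = remove_outliers(prices_df, 'price')  # drops 200 and -150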
# ======================================================================
# 3. ADVANCED FILTERING & SELECTION
# ======================================================================
# Boolean indexing combinations
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'C': [True, False, True, False, True]
})
# Multiple conditions
mask = (df['A'] > 2) & (df['C'] == True)
filtered_df = df[mask]
# Using query() for cleaner filtering syntax
filtered_df = df.query('A > 2 and C == True')
# Selecting with isin for multiple values
df[df['B'].isin(['a', 'c', 'e'])]
# Between operator shorthand
df[df['A'].between(2, 4)]
# Complex string filtering with contains, startswith, endswith
df[df['B'].str.contains('a|e')]  # Contains 'a' or 'e'
df[df['B'].str.startswith('a')]  # Starts with 'a'
# Filter rows where any numeric value in the row meets a condition
df[(df.select_dtypes('number') > 3).any(axis=1)]  # Any numeric value in row > 3
# ======================================================================
# 4. PERFORMANCE OPTIMIZATION
# ======================================================================
# Vectorized operations instead of loops
df = pd.DataFrame({'A': range(1, 6), 'B': range(6, 11)})
# Slow: looping row by row
for i in range(len(df)):
    df.at[i, 'C'] = df.at[i, 'A'] * df.at[i, 'B']
# Fast: vectorized
df['C'] = df['A'] * df['B']
# Use numpy for complex math operations
df['log_ratio'] = np.log(df['A'] / df['B'])
# apply() when vectorization isn't practical (flexible, but slower)
df['squares'] = df['A'].apply(lambda x: x**2)  # Apply to a Series element-wise
df['row_sum'] = df.apply(lambda x: x['A'] + x['B'], axis=1)  # Apply to each row
# Using eval() for complex expressions
df.eval('D = A + B * C', inplace=True)  # Creates a new column from the expression
# Chunking for large files
reader = pd.read_csv('large_file.csv', chunksize=10000)
chunks = []
for chunk in reader:
    # Process each chunk
    processed = chunk[chunk['value'] > 0]
    chunks.append(processed)
# Combine all processed chunks
result = pd.concat(chunks)
# ======================================================================
# 5. TIME SERIES MAGIC
# ======================================================================
# Generate date ranges easily
date_range = pd.date_range(start='2023-01-01', end='2023-01-31', freq='D')
df = pd.DataFrame({'date': date_range})
# Extract date components with the dt accessor
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['weekday'] = df['date'].dt.day_name()
df['is_weekend'] = df['date'].dt.dayofweek >= 5  # Weekend detector
# Efficient date filtering
df[(df['date'] >= '2023-01-15') & (df['date'] <= '2023-01-20')]
df[df['date'].between('2023-01-15', '2023-01-20')]
# Resample time series data
time_df = pd.DataFrame({
    'date': pd.date_range(start='2023-01-01', periods=100, freq='H'),
    'value': np.random.randn(100)
})
time_df.set_index('date', inplace=True)
daily_mean = time_df.resample('D').mean()  # Daily average
weekly_sum = time_df.resample('W').sum()   # Weekly sum
# Rolling statistics
time_df['rolling_mean'] = time_df['value'].rolling(window=24).mean()  # 24-hour moving average
time_df['rolling_std'] = time_df['value'].rolling(window=24).std()    # Rolling standard deviation
# Calculate year-over-year growth
monthly_data = pd.DataFrame({
    'date': pd.date_range(start='2020-01-01', periods=36, freq='M'),
    'sales': np.random.randint(100, 200, 36)
})
monthly_data['sales_yoy'] = monthly_data['sales'].pct_change(12) * 100  # YoY growth percentage
# ======================================================================
# 6. RESHAPING & TRANSFORMATION
# ======================================================================
# Pivot tables with flexible aggregation
df = pd.DataFrame({
    'date': ['2023-01-01', '2023-01-01', '2023-01-02', '2023-01-02'],
    'product': ['A', 'B', 'A', 'B'],
    'sales': [100, 200, 150, 250],
    'units': [10, 20, 15, 25]
})
# Basic pivot table
pivot_df = df.pivot_table(
    index='date',
    columns='product',
    values='sales',
    aggfunc='sum'
)
# Multi-level pivot with multiple aggregations
advanced_pivot = df.pivot_table(
    index='date',
    columns='product',
    values=['sales', 'units'],
    aggfunc={'sales': ['sum', 'mean'], 'units': 'sum'}
)
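# Common follow-up (sketch, not in the original tips): the multi-aggregation pivot
# above has MultiIndex columns; flattening them into plain strings simplifies
# downstream use.
advanced_pivot.columns = ['_'.join(map(str, col)).strip('_') for col in advanced_pivot.columns]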
# Melt (unpivot) operation for wide-to-long format
wide_df = pd.DataFrame({
    'id': [1, 2, 3],
    'q1': [10, 11, 12],
    'q2': [13, 14, 15],
    'q3': [16, 17, 18],
    'q4': [19, 20, 21]
})
long_df = pd.melt(
    wide_df,
    id_vars=['id'],
    value_vars=['q1', 'q2', 'q3', 'q4'],
    var_name='quarter',
    value_name='sales'
)
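# Sanity check (sketch): pivoting the long frame back recovers the wide layout,
# a quick way to confirm the melt did what you expected.
wide_again = long_df.pivot(index='id', columns='quarter', values='sales')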
# Efficient crosstab for contingency tables
df = pd.DataFrame({
    'gender': ['M', 'F', 'M', 'F', 'M', 'F', 'M', 'F'],
    'response': ['Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes', 'No']
})
contingency = pd.crosstab(
    df['gender'],
    df['response'],
    margins=True,     # Include row and column totals
    normalize='all'   # Show proportions instead of counts
)
# Advanced GroupBy techniques
df = pd.DataFrame({
    'group': ['A', 'A', 'B', 'B', 'B', 'C'],
    'val1': [1, 2, 3, 4, 5, 6],
    'val2': [7, 8, 9, 10, 11, 12]
})
# Transform to align aggregated values with the original DataFrame
df['group_mean'] = df.groupby('group')['val1'].transform('mean')
df['group_rank'] = df.groupby('group')['val1'].transform('rank')
# Multiple aggregations in one go
agg_df = df.groupby('group').agg({
    'val1': ['mean', 'median', 'std'],
    'val2': ['min', 'max', 'count']
})
# Custom aggregation function
def range_diff(x):
    return x.max() - x.min()
agg_custom = df.groupby('group').agg(
    val1_range=('val1', range_diff),
    val2_mean=('val2', 'mean')
)
# Complex window functions like SQL
df['cum_sum'] = df.groupby('group')['val1'].cumsum()
df['expanding_mean'] = df.groupby('group')['val1'].expanding().mean().reset_index(level=0, drop=True)
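# Related sketch (not in the original list): a SQL-style LAG per group via shift(),
# which pairs naturally with the cumulative and expanding windows above.
df['prev_val1'] = df.groupby('group')['val1'].shift(1)  # previous row's value within each group
df['val1_diff'] = df['val1'] - df['prev_val1']          # change from the previous row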
# ======================================================================
# 7. VISUALIZATION SHORTCUTS
# ======================================================================
# One-liners for common plots
df = pd.DataFrame({
    'cat': ['A', 'B', 'C', 'A', 'B', 'C'],
    'val': [10, 15, 7, 9, 14, 8]
})
# Quick histogram
df['val'].plot.hist(bins=10, alpha=0.5)
# Bar chart from value counts
df['cat'].value_counts().plot.bar()
# Scatter plot with color coding
df.plot.scatter(x='cat', y='val', c='val', colormap='viridis')
# Boxplot by category
df.boxplot(column='val', by='cat')
# Quick correlation heatmap
corr_df = pd.DataFrame(np.random.randn(10, 5), columns=['A', 'B', 'C', 'D', 'E'])
plt.figure(figsize=(8, 6))
plt.imshow(corr_df.corr(), cmap='coolwarm')
plt.colorbar()
plt.xticks(range(len(corr_df.columns)), corr_df.columns)
plt.yticks(range(len(corr_df.columns)), corr_df.columns)
# ======================================================================
# 8. METHOD CHAINING FOR ELEGANT TRANSFORMATIONS
# ======================================================================
# Before (verbose)
df = pd.DataFrame({'A': [1, 2, np.nan, 4, 5], 'B': [np.nan, 2, 3, 4, 5]})
df = df.fillna(0)
df = df[df['A'] > 1]
df = df.rename(columns={'A': 'Value A', 'B': 'Value B'})
df = df.reset_index(drop=True)
# After (chained)
df = (pd.DataFrame({'A': [1, 2, np.nan, 4, 5], 'B': [np.nan, 2, 3, 4, 5]})
      .fillna(0)
      .query('A > 1')
      .rename(columns={'A': 'Value A', 'B': 'Value B'})
      .reset_index(drop=True))
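# Optional extension (sketch): .pipe() slots custom functions into a chain.
# `add_total` is a hypothetical helper, not part of the original tips.
def add_total(frame, cols):
    frame = frame.copy()
    frame['total'] = frame[cols].sum(axis=1)
    return frame
df = (df
      .pipe(add_total, cols=['Value A', 'Value B'])
      .sort_values('total', ascending=False))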
# ======================================================================
# 9. WORKING WITH MULTIPLE DATAFRAMES
# ======================================================================
# Merge dataframes using SQL-like joins
customers = pd.DataFrame({
    'customer_id': [1, 2, 3, 4, 5],
    'name': ['John', 'Mary', 'Bob', 'Alice', 'Dave']
})
orders = pd.DataFrame({
    'order_id': [101, 102, 103, 104],
    'customer_id': [1, 2, 3, 6],
    'amount': [100, 200, 300, 400]
})
# Inner join
inner_join = pd.merge(customers, orders, on='customer_id', how='inner')
# Left join with indicator
left_join = pd.merge(customers, orders, on='customer_id', how='left', indicator=True)
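# Typical follow-up (sketch): the `_merge` column added by indicator=True makes it
# easy to spot customers with no matching orders.
no_orders = left_join[left_join['_merge'] == 'left_only']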
# Efficient concatenation with metadata preservation
df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})
# Add source information
df1['source'] = 'df1'
df2['source'] = 'df2'
# Concatenate with keys for hierarchical index
combined = pd.concat([df1, df2], keys=['first', 'second'])
# ======================================================================
# 10. TEXT & STRING MANIPULATION
# ======================================================================
# Extract information with regex
text_df = pd.DataFrame({
    'text': [
        'Customer ID: 123',
        'Customer ID: 456, Order: 789',
        'No ID found',
        'Customer ID: ABC'
    ]
})
# Extract customer IDs
text_df['customer_id'] = text_df['text'].str.extract(r'Customer ID: (\w+)')
# Extract multiple patterns with named groups
pattern = r'Customer ID: (?P<cust_id>\w+)(?:, Order: (?P<order_id>\w+))?'
extracted = text_df['text'].str.extract(pattern)
# Handling special characters in column names
def clean_column_names(df):
    """Clean column names by replacing spaces and special chars with underscores."""
    df.columns = (df.columns.str.strip()
                            .str.lower()
                            .str.replace(r'[^\w\s]', '', regex=True)
                            .str.replace(' ', '_'))
    return df
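# Example usage (sketch): `messy` is a made-up DataFrame with awkward column names.
messy = pd.DataFrame({' First Name ': ['Ann'], 'Total ($)': [100]})
messy = clean_column_names(messy)  # columns become 'first_name' and 'total_'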