# Pandas Power Tips: Essential Techniques for Data Manipulation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# ======================================================================
# 1. DATA TYPE CONVERSION & HANDLING
# ======================================================================
# Convert to numeric with error handling
df = pd.DataFrame({'A': ['1', '2', 'x', '4']})
df['A_numeric'] = pd.to_numeric(df['A'], errors='coerce') # 'x' becomes NaN
df['A_ignore'] = pd.to_numeric(df['A'], errors='ignore') # leaves the whole column untouched when any value fails (deprecated in newer pandas)
# pd.to_numeric(df['A'], errors='raise')                 # would raise ValueError here because of 'x'
# Advanced datetime conversion with custom formats
df = pd.DataFrame({'date': ['2023-01-01', '01/02/2023', 'Jan 3, 2023']})
df['date_parsed'] = pd.to_datetime(df['date'], format='mixed', errors='coerce') # per-element parsing (pandas >= 2.0); replaces the deprecated infer_datetime_format
# Automatic data type detection for CSV import (read_csv infers dtypes by default; convert_dtypes() upgrades to nullable types)
df = pd.read_csv('file.csv').convert_dtypes()
# Optimize memory usage by downcasting numeric types
def reduce_mem_usage(df, verbose=True):
    """Reduce memory usage of DataFrame by downcasting numeric types."""
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    reduction = 100 * (start_mem - end_mem) / start_mem
    if verbose:
        print(f'Memory usage reduced from {start_mem:.2f}MB to {end_mem:.2f}MB ({reduction:.2f}% reduction)')
    return df
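# Minimal usage sketch for reduce_mem_usage (the sample frame below is illustrative, not from the tips above)
big_df = pd.DataFrame({'id': np.arange(1000, dtype=np.int64),
                       'score': np.random.rand(1000)})
big_df = reduce_mem_usage(big_df)  # 'id' fits int16, 'score' fits float16 given their value ranges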
# ======================================================================
# 2. CLEANING TECHNIQUES
# ======================================================================
# Identify and handle duplicates intelligently
df = pd.DataFrame({'A': [1, 2, 1, 3], 'B': [1, 2, 1, 4]})
duplicate_rows = df[df.duplicated(keep=False)] # Marks all duplicates
df_unique = df.drop_duplicates(subset=['A']) # Keep first occurrence of each A value
# Missing value handling - core techniques
df = pd.DataFrame({'A': [1, 2, np.nan, 4], 'B': [5, np.nan, np.nan, 8]})
# Drop rows with any NaN values
df_clean = df.dropna(inplace=False) # Returns new DataFrame without NaN rows
# df.dropna(inplace=True) # Modifies original DataFrame
# Drop rows with NaN only in specific columns
df_clean_subset = df.dropna(subset=['A']) # Only consider NaN in column A
# Fill all NaN values with a constant
df_filled = df.fillna(0) # Replace all NaN with 0
# Advanced missing value imputation
df['A_filled'] = df['A'].fillna(df['A'].mean()) # Fill with column mean
df['B_filled'] = df['B'].ffill() # Forward fill (fillna(method='ffill') is deprecated in newer pandas)
# Fill missing values based on group statistics
df = pd.DataFrame({
'group': ['A', 'A', 'B', 'B', 'B'],
'value': [1, np.nan, 3, np.nan, 5]
})
group_means = df.groupby('group')['value'].transform('mean')
df['value_filled'] = df['value'].fillna(group_means)
# Smart string cleaning with regex
df = pd.DataFrame({'text': [' Hello, World! ', 'Python 3.9 ', ' Data Science ']})
df['text_clean'] = df['text'].str.strip().str.lower() # Strip whitespace, convert to lowercase
df['text_no_punct'] = df['text'].str.replace(r'[^\w\s]', '', regex=True) # Remove punctuation
df['text_numbers'] = df['text'].str.extract(r'(\d+\.?\d*)') # Extract numbers
# One-liner to detect and remove outliers using IQR method
def remove_outliers(df, column):
    """Remove outliers using the IQR method."""
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
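# Usage sketch: drop IQR outliers from a hypothetical 'price' column
sales_df = pd.DataFrame({'price': [10, 12, 11, 13, 12, 500]})  # 500 is the obvious outlier
sales_df_clean = remove_outliers(sales_df, 'price')  # keeps only rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]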
# ======================================================================
# 3. ADVANCED FILTERING & SELECTION
# ======================================================================
# Boolean indexing combinations
df = pd.DataFrame({
'A': [1, 2, 3, 4, 5],
'B': ['a', 'b', 'c', 'd', 'e'],
'C': [True, False, True, False, True]
})
# Multiple conditions
mask = (df['A'] > 2) & (df['C'] == True)
filtered_df = df[mask]
# Using query() for cleaner filtering syntax
filtered_df = df.query('A > 2 and C == True')
# Selecting with isin for multiple values
df[df['B'].isin(['a', 'c', 'e'])]
# Between operator shorthand
df[df['A'].between(2, 4)]
# Complex string filtering with contains, startswith, endswith
df[df['B'].str.contains('a|e')] # Contains 'a' or 'e'
df[df['B'].str.startswith('a')] # Starts with 'a'
# Filter rows where any value in the row meets a condition
df[(df.select_dtypes('number') > 3).any(axis=1)] # Any numeric value in the row > 3 (a plain row-wise comparison breaks on mixed types)
# ======================================================================
# 4. PERFORMANCE OPTIMIZATION
# ======================================================================
# Vectorized operations instead of loops
df = pd.DataFrame({'A': range(1, 6), 'B': range(6, 11)})
# Slow: Using loop
for i in range(len(df)):
    df.loc[i, 'C'] = df.loc[i, 'A'] * df.loc[i, 'B']
# Fast: Vectorized
df['C'] = df['A'] * df['B']
# Use numpy for complex math operations
df['log_ratio'] = np.log(df['A'] / df['B'])
# Efficient apply with axis specification
df['squares'] = df['A'].apply(lambda x: x**2) # Apply to Series
df['row_sum'] = df.apply(lambda x: x['A'] + x['B'], axis=1) # Apply to each row
# Using eval() for complex expressions
df.eval('D = A + B * C', inplace=True) # Creates new column with expression
# Chunking for large files
reader = pd.read_csv('large_file.csv', chunksize=10000)
chunks = []
for chunk in reader:
    # Process each chunk
    processed = chunk[chunk['value'] > 0]
    chunks.append(processed)
# Combine all processed chunks
result = pd.concat(chunks)
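# Alternative sketch: aggregate per chunk instead of keeping all filtered rows in memory
# (assumes the same hypothetical 'large_file.csv' with a numeric 'value' column)
total, count = 0.0, 0
for chunk in pd.read_csv('large_file.csv', chunksize=10000):
    total += chunk['value'].sum()
    count += len(chunk)
overall_mean = total / count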
# ======================================================================
# 5. TIME SERIES MAGIC
# ======================================================================
# Generate date ranges easily
date_range = pd.date_range(start='2023-01-01', end='2023-01-31', freq='D')
df = pd.DataFrame({'date': date_range})
# Extract date components with dt accessor
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['weekday'] = df['date'].dt.day_name()
df['is_weekend'] = df['date'].dt.dayofweek >= 5 # Weekend detector
# Efficient date filtering
df[(df['date'] >= '2023-01-15') & (df['date'] <= '2023-01-20')]
df[df['date'].between('2023-01-15', '2023-01-20')]
# Resample time series data
time_df = pd.DataFrame({
'date': pd.date_range(start='2023-01-01', periods=100, freq='H'),
'value': np.random.randn(100)
})
time_df.set_index('date', inplace=True)
daily_mean = time_df.resample('D').mean() # Daily average
weekly_sum = time_df.resample('W').sum() # Weekly sum
# Rolling statistics
time_df['rolling_mean'] = time_df['value'].rolling(window=24).mean() # 24-hour moving average
time_df['rolling_std'] = time_df['value'].rolling(window=24).std() # Rolling standard deviation
# Calculate year-over-year growth
monthly_data = pd.DataFrame({
'date': pd.date_range(start='2020-01-01', periods=36, freq='M'),
'sales': np.random.randint(100, 200, 36)
})
monthly_data['sales_yoy'] = monthly_data['sales'].pct_change(12) * 100 # YoY growth percentage
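# Related sketch: shift() exposes the value pct_change(12) compares against
monthly_data['sales_prev_year'] = monthly_data['sales'].shift(12)
monthly_data['sales_yoy_manual'] = (monthly_data['sales'] / monthly_data['sales_prev_year'] - 1) * 100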
# ======================================================================
# 6. RESHAPING & TRANSFORMATION
# ======================================================================
# Pivot tables with flexible aggregation
df = pd.DataFrame({
'date': ['2023-01-01', '2023-01-01', '2023-01-02', '2023-01-02'],
'product': ['A', 'B', 'A', 'B'],
'sales': [100, 200, 150, 250],
'units': [10, 20, 15, 25]
})
# Basic pivot table
pivot_df = df.pivot_table(
index='date',
columns='product',
values='sales',
aggfunc='sum'
)
# Multi-level pivot with multiple aggregations
advanced_pivot = df.pivot_table(
index='date',
columns='product',
values=['sales', 'units'],
aggfunc={'sales': ['sum', 'mean'], 'units': 'sum'}
)
# Melt (unpivot) operation for wide to long format
wide_df = pd.DataFrame({
'id': [1, 2, 3],
'q1': [10, 11, 12],
'q2': [13, 14, 15],
'q3': [16, 17, 18],
'q4': [19, 20, 21]
})
long_df = pd.melt(
wide_df,
id_vars=['id'],
value_vars=['q1', 'q2', 'q3', 'q4'],
var_name='quarter',
value_name='sales'
)
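# Round-trip sketch: pivot the long format back to wide to verify the reshape
wide_again = long_df.pivot(index='id', columns='quarter', values='sales').reset_index()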
# Efficient crosstab for contingency tables
df = pd.DataFrame({
'gender': ['M', 'F', 'M', 'F', 'M', 'F', 'M', 'F'],
'response': ['Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes', 'No']
})
contingency = pd.crosstab(
df['gender'],
df['response'],
margins=True, # Include row and column totals
normalize='all' # Show proportions instead of counts
)
# Advanced GroupBy techniques
df = pd.DataFrame({
'group': ['A', 'A', 'B', 'B', 'B', 'C'],
'val1': [1, 2, 3, 4, 5, 6],
'val2': [7, 8, 9, 10, 11, 12]
})
# Transform to align aggregated values with original dataframe
df['group_mean'] = df.groupby('group')['val1'].transform('mean')
df['group_rank'] = df.groupby('group')['val1'].transform('rank')
# Multiple aggregations in one go
agg_df = df.groupby('group').agg({
'val1': ['mean', 'median', 'std'],
'val2': ['min', 'max', 'count']
})
# Custom aggregation function
def range_diff(x):
    return x.max() - x.min()
agg_custom = df.groupby('group').agg(
val1_range=('val1', range_diff),
val2_mean=('val2', 'mean')
)
# Complex window functions like SQL
df['cum_sum'] = df.groupby('group')['val1'].cumsum()
df['expanding_mean'] = df.groupby('group')['val1'].expanding().mean().reset_index(level=0, drop=True)
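# Sketch of a SQL LAG()-style window with groupby + shift
df['prev_val1'] = df.groupby('group')['val1'].shift(1)   # previous row's value within each group
df['val1_diff'] = df['val1'] - df['prev_val1']           # change versus the previous row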
# ======================================================================
# 7. VISUALIZATION SHORTCUTS
# ======================================================================
# One-liners for common plots
df = pd.DataFrame({
'cat': ['A', 'B', 'C', 'A', 'B', 'C'],
'val': [10, 15, 7, 9, 14, 8]
})
# Quick histogram
df['val'].plot.hist(bins=10, alpha=0.5)
# Bar chart from value counts
df['cat'].value_counts().plot.bar()
# Scatter plot with color coding (a numeric pair makes a clearer scatter; 'val2' is added here just for the example)
df['val2'] = [3, 5, 2, 4, 6, 1]
df.plot.scatter(x='val', y='val2', c='val', colormap='viridis')
# Boxplot by category
df.boxplot(column='val', by='cat')
# Quick correlation heatmap
corr_df = pd.DataFrame(np.random.randn(10, 5), columns=['A', 'B', 'C', 'D', 'E'])
plt.figure(figsize=(8, 6))
plt.imshow(corr_df.corr(), cmap='coolwarm')
plt.colorbar()
plt.xticks(range(len(corr_df.columns)), corr_df.columns)
plt.yticks(range(len(corr_df.columns)), corr_df.columns)
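# Optional finishing touches for the heatmap (a sketch; assumes a plain-script context where plt.show() is needed)
corr = corr_df.corr()
for i in range(len(corr.columns)):
    for j in range(len(corr.columns)):
        plt.text(j, i, f'{corr.iloc[i, j]:.2f}', ha='center', va='center', fontsize=8)
plt.title('Correlation heatmap')
plt.show()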
# ======================================================================
# 8. METHOD CHAINING FOR ELEGANT TRANSFORMATIONS
# ======================================================================
# Before (verbose)
df = pd.DataFrame({'A': [1, 2, np.nan, 4, 5], 'B': [np.nan, 2, 3, 4, 5]})
df = df.fillna(0)
df = df[df['A'] > 1]
df = df.rename(columns={'A': 'Value A', 'B': 'Value B'})
df = df.reset_index(drop=True)
# After (chained)
df = (pd.DataFrame({'A': [1, 2, np.nan, 4, 5], 'B': [np.nan, 2, 3, 4, 5]})
      .fillna(0)
      .query('A > 1')
      .rename(columns={'A': 'Value A', 'B': 'Value B'})
      .reset_index(drop=True))
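# Chaining also combines well with assign() and pipe(); a small sketch reusing the same sample data
df = (pd.DataFrame({'A': [1, 2, np.nan, 4, 5], 'B': [np.nan, 2, 3, 4, 5]})
      .fillna(0)
      .assign(total=lambda d: d['A'] + d['B'])   # derive a column mid-chain
      .pipe(lambda d: d[d['total'] > 2])         # arbitrary function step
      .reset_index(drop=True))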
# ======================================================================
# 9. WORKING WITH MULTIPLE DATAFRAMES
# ======================================================================
# Merge dataframes using SQL-like joins
customers = pd.DataFrame({
'customer_id': [1, 2, 3, 4, 5],
'name': ['John', 'Mary', 'Bob', 'Alice', 'Dave']
})
orders = pd.DataFrame({
'order_id': [101, 102, 103, 104],
'customer_id': [1, 2, 3, 6],
'amount': [100, 200, 300, 400]
})
# Inner join
inner_join = pd.merge(customers, orders, on='customer_id', how='inner')
# Left join with indicator
left_join = pd.merge(customers, orders, on='customer_id', how='left', indicator=True)
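# Follow-up sketch: the indicator column makes an anti-join easy (customers with no orders)
customers_without_orders = left_join[left_join['_merge'] == 'left_only']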
# Efficient concatenation with metadata preservation
df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})
# Add source information
df1['source'] = 'df1'
df2['source'] = 'df2'
# Concatenate with keys for hierarchical index
combined = pd.concat([df1, df2], keys=['first', 'second'])
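# The keys become the outer index level, so each source block can be recovered with .loc or .xs
first_block = combined.loc['first']    # rows that came from df1
second_block = combined.xs('second')   # equivalent selection for df2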
# ======================================================================
# 10. TEXT & STRING MANIPULATION
# ======================================================================
# Extract information with regex
text_df = pd.DataFrame({
'text': [
'Customer ID: 123',
'Customer ID: 456, Order: 789',
'No ID found',
'Customer ID: ABC'
]
})
# Extract customer IDs
text_df['customer_id'] = text_df['text'].str.extract(r'Customer ID: (\w+)')
# Extract multiple patterns with named groups
pattern = r'Customer ID: (?P<cust_id>\w+)(?:, Order: (?P<order_id>\w+))?'
extracted = text_df['text'].str.extract(pattern)
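# Convenience sketch: attach the named-group matches back onto the original frame
text_df = text_df.join(extracted)  # adds 'cust_id' and 'order_id'; unmatched rows get NaN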
# Handling special characters in files
def clean_column_names(df):
    """Clean column names by replacing spaces and special chars with underscores."""
    df.columns = (df.columns.str.strip()
                  .str.lower()
                  .str.replace(r'[^\w\s]', '', regex=True)
                  .str.replace(' ', '_'))
    return df
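# Usage sketch on a hypothetical frame with messy headers
messy = pd.DataFrame({' First Name ': [1], 'Total ($)': [2]})
messy = clean_column_names(messy)  # columns become ['first_name', 'total_']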