# Pandas Power Tips: Essential Techniques for Data Manipulation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# ======================================================================
# 1. DATA TYPE CONVERSION & HANDLING
# ======================================================================
# Convert to numeric with error handling
df = pd.DataFrame({'A': ['1', '2', 'x', '4']})
df['A_numeric'] = pd.to_numeric(df['A'], errors='coerce')  # 'x' becomes NaN
df['A_ignore'] = pd.to_numeric(df['A'], errors='ignore')   # keeps 'x' as string (deprecated in pandas >= 2.2)
# pd.to_numeric(df['A'], errors='raise')                   # would raise ValueError because of 'x'
# Advanced datetime conversion with mixed formats
df = pd.DataFrame({'date': ['2023-01-01', '01/02/2023', 'Jan 3, 2023']})
df['date_parsed'] = pd.to_datetime(df['date'], format='mixed', errors='coerce')  # parse each value independently (pandas >= 2.0)
# Automatic data type detection on CSV import (pandas infers dtypes by default)
df = pd.read_csv('file.csv')
df = df.convert_dtypes()  # optionally upgrade to nullable extension dtypes
# Optimize memory usage by downcasting numeric types
def reduce_mem_usage(df, verbose=True):
    """Reduce memory usage of DataFrame by downcasting numeric types."""
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    reduction = 100 * (start_mem - end_mem) / start_mem
    if verbose:
        print(f'Memory usage reduced from {start_mem:.2f}MB to {end_mem:.2f}MB ({reduction:.2f}% reduction)')
    return df
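# Example usage (sketch): `sample_df` is a hypothetical DataFrame; any frame with
# numeric columns works. Note that float16 can lose precision, so keep a copy if
# exact values matter.
sample_df = pd.DataFrame({
    'id': np.arange(100_000),
    'score': np.random.rand(100_000) * 100
})
sample_df = reduce_mem_usage(sample_df)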
# ======================================================================
# 2. CLEANING TECHNIQUES
# ======================================================================
# Identify and handle duplicates intelligently
df = pd.DataFrame({'A': [1, 2, 1, 3], 'B': [1, 2, 1, 4]})
duplicate_rows = df[df.duplicated(keep=False)]  # All rows that appear more than once
df_unique = df.drop_duplicates(subset=['A'])    # Keep first occurrence of each A value
# Missing value handling - core techniques
df = pd.DataFrame({'A': [1, 2, np.nan, 4], 'B': [5, np.nan, np.nan, 8]})
# Drop rows with any NaN values
df_clean = df.dropna(inplace=False)  # Returns a new DataFrame without NaN rows
# df.dropna(inplace=True)  # Modifies the original DataFrame instead
# Drop rows with NaN only in specific columns
df_clean_subset = df.dropna(subset=['A'])  # Only consider NaN in column A
# Fill all NaN values with a constant
df_filled = df.fillna(0)  # Replace all NaN with 0
# Advanced missing value imputation
df['A_filled'] = df['A'].fillna(df['A'].mean())  # Fill with column mean
df['B_filled'] = df['B'].ffill()  # Forward fill (fillna(method='ffill') is deprecated)
# Fill missing values based on group statistics
df = pd.DataFrame({
    'group': ['A', 'A', 'B', 'B', 'B'],
    'value': [1, np.nan, 3, np.nan, 5]
})
group_means = df.groupby('group')['value'].transform('mean')
df['value_filled'] = df['value'].fillna(group_means)
# Smart string cleaning with regex
df = pd.DataFrame({'text': [' Hello, World! ', 'Python 3.9 ', ' Data Science ']})
df['text_clean'] = df['text'].str.strip().str.lower()  # Strip whitespace, convert to lowercase
df['text_no_punct'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)  # Remove punctuation
df['text_numbers'] = df['text'].str.extract(r'(\d+\.?\d*)')  # Extract numbers
# Helper to detect and remove outliers using the IQR method
def remove_outliers(df, column):
    """Remove outliers using the IQR method."""
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
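# Example usage (sketch): `prices_df` is a made-up frame with two obvious outliers;
# the helper keeps only rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
prices_df = pd.DataFrame({'price': [10, 12, 11, 13, 12, 200, -150, 11]})
prices_clean = remove_outliers(prices_df, 'price')  # drops 200 and -150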
# ======================================================================
# 3. ADVANCED FILTERING & SELECTION
# ======================================================================
# Boolean indexing combinations
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'C': [True, False, True, False, True]
})
# Multiple conditions
mask = (df['A'] > 2) & (df['C'] == True)
filtered_df = df[mask]
# Using query() for cleaner filtering syntax
filtered_df = df.query('A > 2 and C == True')
# Selecting with isin for multiple values
df[df['B'].isin(['a', 'c', 'e'])]
# Between operator shorthand
df[df['A'].between(2, 4)]
# Complex string filtering with contains, startswith, endswith
df[df['B'].str.contains('a|e')]  # Contains 'a' or 'e'
df[df['B'].str.startswith('a')]  # Starts with 'a'
# Filter rows where any numeric value in the row meets a condition
df[(df.select_dtypes('number') > 3).any(axis=1)]  # Any numeric value in row > 3
# ======================================================================
# 4. PERFORMANCE OPTIMIZATION
# ======================================================================
# Vectorized operations instead of loops
df = pd.DataFrame({'A': range(1, 6), 'B': range(6, 11)})
# Slow: looping row by row
for i in range(len(df)):
    df.at[i, 'C'] = df.at[i, 'A'] * df.at[i, 'B']
# Fast: vectorized
df['C'] = df['A'] * df['B']
# Use numpy for complex math operations
df['log_ratio'] = np.log(df['A'] / df['B'])
# apply() when vectorization isn't practical (flexible, but slower)
df['squares'] = df['A'].apply(lambda x: x**2)  # Apply to a Series element-wise
df['row_sum'] = df.apply(lambda x: x['A'] + x['B'], axis=1)  # Apply to each row
# Using eval() for complex expressions
df.eval('D = A + B * C', inplace=True)  # Creates a new column from the expression
# Chunking for large files
reader = pd.read_csv('large_file.csv', chunksize=10000)
chunks = []
for chunk in reader:
    # Process each chunk
    processed = chunk[chunk['value'] > 0]
    chunks.append(processed)
# Combine all processed chunks
result = pd.concat(chunks)
# ======================================================================
# 5. TIME SERIES MAGIC
# ======================================================================
# Generate date ranges easily
date_range = pd.date_range(start='2023-01-01', end='2023-01-31', freq='D')
df = pd.DataFrame({'date': date_range})
# Extract date components with the dt accessor
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['weekday'] = df['date'].dt.day_name()
df['is_weekend'] = df['date'].dt.dayofweek >= 5  # Weekend detector
# Efficient date filtering
df[(df['date'] >= '2023-01-15') & (df['date'] <= '2023-01-20')]
df[df['date'].between('2023-01-15', '2023-01-20')]
# Resample time series data
time_df = pd.DataFrame({
    'date': pd.date_range(start='2023-01-01', periods=100, freq='H'),
    'value': np.random.randn(100)
})
time_df.set_index('date', inplace=True)
daily_mean = time_df.resample('D').mean()  # Daily average
weekly_sum = time_df.resample('W').sum()   # Weekly sum
# Rolling statistics
time_df['rolling_mean'] = time_df['value'].rolling(window=24).mean()  # 24-hour moving average
time_df['rolling_std'] = time_df['value'].rolling(window=24).std()    # Rolling standard deviation
# Calculate year-over-year growth
monthly_data = pd.DataFrame({
    'date': pd.date_range(start='2020-01-01', periods=36, freq='M'),
    'sales': np.random.randint(100, 200, 36)
})
monthly_data['sales_yoy'] = monthly_data['sales'].pct_change(12) * 100  # YoY growth percentage
# ======================================================================
# 6. RESHAPING & TRANSFORMATION
# ======================================================================
# Pivot tables with flexible aggregation
df = pd.DataFrame({
    'date': ['2023-01-01', '2023-01-01', '2023-01-02', '2023-01-02'],
    'product': ['A', 'B', 'A', 'B'],
    'sales': [100, 200, 150, 250],
    'units': [10, 20, 15, 25]
})
# Basic pivot table
pivot_df = df.pivot_table(
    index='date',
    columns='product',
    values='sales',
    aggfunc='sum'
)
# Multi-level pivot with multiple aggregations
advanced_pivot = df.pivot_table(
    index='date',
    columns='product',
    values=['sales', 'units'],
    aggfunc={'sales': ['sum', 'mean'], 'units': 'sum'}
)
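# Common follow-up (sketch, not in the original tips): the multi-aggregation pivot
# above has MultiIndex columns; flattening them into plain strings simplifies
# downstream use.
advanced_pivot.columns = ['_'.join(map(str, col)).strip('_') for col in advanced_pivot.columns]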
# Melt (unpivot) operation for wide-to-long format
wide_df = pd.DataFrame({
    'id': [1, 2, 3],
    'q1': [10, 11, 12],
    'q2': [13, 14, 15],
    'q3': [16, 17, 18],
    'q4': [19, 20, 21]
})
long_df = pd.melt(
    wide_df,
    id_vars=['id'],
    value_vars=['q1', 'q2', 'q3', 'q4'],
    var_name='quarter',
    value_name='sales'
)
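# Sanity check (sketch): pivoting the long frame back recovers the wide layout,
# a quick way to confirm the melt did what you expected.
wide_again = long_df.pivot(index='id', columns='quarter', values='sales')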
# Efficient crosstab for contingency tables
df = pd.DataFrame({
    'gender': ['M', 'F', 'M', 'F', 'M', 'F', 'M', 'F'],
    'response': ['Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes', 'No']
})
contingency = pd.crosstab(
    df['gender'],
    df['response'],
    margins=True,     # Include row and column totals
    normalize='all'   # Show proportions instead of counts
)
# Advanced GroupBy techniques
df = pd.DataFrame({
    'group': ['A', 'A', 'B', 'B', 'B', 'C'],
    'val1': [1, 2, 3, 4, 5, 6],
    'val2': [7, 8, 9, 10, 11, 12]
})
# Transform to align aggregated values with the original DataFrame
df['group_mean'] = df.groupby('group')['val1'].transform('mean')
df['group_rank'] = df.groupby('group')['val1'].transform('rank')
# Multiple aggregations in one go
agg_df = df.groupby('group').agg({
    'val1': ['mean', 'median', 'std'],
    'val2': ['min', 'max', 'count']
})
# Custom aggregation function
def range_diff(x):
    return x.max() - x.min()
agg_custom = df.groupby('group').agg(
    val1_range=('val1', range_diff),
    val2_mean=('val2', 'mean')
)
# Complex window functions like SQL
df['cum_sum'] = df.groupby('group')['val1'].cumsum()
df['expanding_mean'] = df.groupby('group')['val1'].expanding().mean().reset_index(level=0, drop=True)
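# Related sketch (not in the original list): a SQL-style LAG per group via shift(),
# which pairs naturally with the cumulative and expanding windows above.
df['prev_val1'] = df.groupby('group')['val1'].shift(1)  # previous row's value within each group
df['val1_diff'] = df['val1'] - df['prev_val1']          # change from the previous row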
# ======================================================================
# 7. VISUALIZATION SHORTCUTS
# ======================================================================
# One-liners for common plots
df = pd.DataFrame({
    'cat': ['A', 'B', 'C', 'A', 'B', 'C'],
    'val': [10, 15, 7, 9, 14, 8]
})
# Quick histogram
df['val'].plot.hist(bins=10, alpha=0.5)
# Bar chart from value counts
df['cat'].value_counts().plot.bar()
# Scatter plot with color coding
df.plot.scatter(x='cat', y='val', c='val', colormap='viridis')
# Boxplot by category
df.boxplot(column='val', by='cat')
# Quick correlation heatmap
corr_df = pd.DataFrame(np.random.randn(10, 5), columns=['A', 'B', 'C', 'D', 'E'])
plt.figure(figsize=(8, 6))
plt.imshow(corr_df.corr(), cmap='coolwarm')
plt.colorbar()
plt.xticks(range(len(corr_df.columns)), corr_df.columns)
plt.yticks(range(len(corr_df.columns)), corr_df.columns)
# ======================================================================
# 8. METHOD CHAINING FOR ELEGANT TRANSFORMATIONS
# ======================================================================
# Before (verbose)
df = pd.DataFrame({'A': [1, 2, np.nan, 4, 5], 'B': [np.nan, 2, 3, 4, 5]})
df = df.fillna(0)
df = df[df['A'] > 1]
df = df.rename(columns={'A': 'Value A', 'B': 'Value B'})
df = df.reset_index(drop=True)
# After (chained)
df = (pd.DataFrame({'A': [1, 2, np.nan, 4, 5], 'B': [np.nan, 2, 3, 4, 5]})
      .fillna(0)
      .query('A > 1')
      .rename(columns={'A': 'Value A', 'B': 'Value B'})
      .reset_index(drop=True))
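# Optional extension (sketch): .pipe() slots custom functions into a chain.
# `add_total` is a hypothetical helper, not part of the original tips.
def add_total(frame, cols):
    frame = frame.copy()
    frame['total'] = frame[cols].sum(axis=1)
    return frame
df = (df
      .pipe(add_total, cols=['Value A', 'Value B'])
      .sort_values('total', ascending=False))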
# ======================================================================
# 9. WORKING WITH MULTIPLE DATAFRAMES
# ======================================================================
# Merge dataframes using SQL-like joins
customers = pd.DataFrame({
    'customer_id': [1, 2, 3, 4, 5],
    'name': ['John', 'Mary', 'Bob', 'Alice', 'Dave']
})
orders = pd.DataFrame({
    'order_id': [101, 102, 103, 104],
    'customer_id': [1, 2, 3, 6],
    'amount': [100, 200, 300, 400]
})
# Inner join
inner_join = pd.merge(customers, orders, on='customer_id', how='inner')
# Left join with indicator
left_join = pd.merge(customers, orders, on='customer_id', how='left', indicator=True)
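# Typical follow-up (sketch): the `_merge` column added by indicator=True makes it
# easy to spot customers with no matching orders.
no_orders = left_join[left_join['_merge'] == 'left_only']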
# Efficient concatenation with metadata preservation
df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})
# Add source information
df1['source'] = 'df1'
df2['source'] = 'df2'
# Concatenate with keys for hierarchical index
combined = pd.concat([df1, df2], keys=['first', 'second'])
# ======================================================================
# 10. TEXT & STRING MANIPULATION
# ======================================================================
# Extract information with regex
text_df = pd.DataFrame({
    'text': [
        'Customer ID: 123',
        'Customer ID: 456, Order: 789',
        'No ID found',
        'Customer ID: ABC'
    ]
})
# Extract customer IDs
text_df['customer_id'] = text_df['text'].str.extract(r'Customer ID: (\w+)')
# Extract multiple patterns with named groups
pattern = r'Customer ID: (?P<cust_id>\w+)(?:, Order: (?P<order_id>\w+))?'
extracted = text_df['text'].str.extract(pattern)
# Handling special characters in column names
def clean_column_names(df):
    """Clean column names by replacing spaces and special chars with underscores."""
    df.columns = (df.columns.str.strip()
                            .str.lower()
                            .str.replace(r'[^\w\s]', '', regex=True)
                            .str.replace(' ', '_'))
    return df
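# Example usage (sketch): `messy` is a made-up DataFrame with awkward column names.
messy = pd.DataFrame({' First Name ': ['Ann'], 'Total ($)': [100]})
messy = clean_column_names(messy)  # columns become 'first_name' and 'total_'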