Feature Engineering for Machine Learning in Python - DataCamp
----
Selecting specific data types
----
# Create a subset of only the numeric columns
so_numeric_df = so_survey_df.select_dtypes(include=['int', 'float'])
# Print the column names contained in so_numeric_df
print(so_numeric_df.columns)
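Depending on the pandas version, 'int' and 'float' may only match their 64-bit defaults; the documented catch-all for every numeric dtype is 'number'. A minimal sketch, assuming the same so_survey_df:

# Select all numeric columns regardless of bit width (int32, float32, ...)
so_numeric_df = so_survey_df.select_dtypes(include='number')
print(so_numeric_df.columns)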
---
One-hot encoding and dummy variables
----
# Convert the Country column to a one-hot encoded DataFrame
one_hot_encoded = pd.get_dummies(so_survey_df, columns=['Country'], prefix='OH')
# Print the column names
print(one_hot_encoded.columns)
---
# Create dummy variables for the Country column
dummy = pd.get_dummies(so_survey_df, columns=['Country'], drop_first=True, prefix='DM')
# Print the column names
print(dummy.columns)
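For context: without drop_first, get_dummies produces one indicator column per category (one-hot encoding), while drop_first=True keeps n-1 columns, removing the redundant level that can cause collinearity in linear models. A self-contained toy sketch (not part of the survey data):

import pandas as pd

toy = pd.DataFrame({'Country': ['USA', 'India', 'UK']})
# One-hot: one indicator column per category (3 here)
print(pd.get_dummies(toy, columns=['Country'], prefix='OH').columns)
# Dummies: first level dropped (2 columns)
print(pd.get_dummies(toy, columns=['Country'], drop_first=True, prefix='DM').columns)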
---
Dealing with uncommon categories
----
# Create a series out of the Country column
countries = so_survey_df['Country']
# Get the counts of each category
country_counts = countries.value_counts()
# Print the count values for each category
print(country_counts)
-----
# Create a series out of the Country column
countries = so_survey_df['Country']
# Get the counts of each category
country_counts = countries.value_counts()
# Create a mask for the categories that occur fewer than 10 times
mask = countries.isin(country_counts[country_counts < 10].index)
# Print the top 5 rows of the mask series
print(mask.head())
----
# Create a series out of the Country column
countries = so_survey_df['Country']
# Get the counts of each category
country_counts = countries.value_counts()
# Create a mask for the categories that occur fewer than 10 times
mask = countries.isin(country_counts[country_counts < 10].index)
# Relabel the uncommon categories as 'Other'
countries[mask] = 'Other'
# Print the updated category counts
print(countries.value_counts())
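Writing through a mask on a Series pulled out of a DataFrame can trigger SettingWithCopyWarning on some pandas versions. A copy-safe sketch using Series.mask, which returns a new Series instead of mutating in place:

# Replace the uncommon categories without writing through a view
countries = so_survey_df['Country'].mask(mask, 'Other')
print(countries.value_counts())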
-----
Binarizing columns
-----
# Create the Paid_Job column filled with zeros
so_survey_df['Paid_Job'] = 0
# Set Paid_Job to 1 wherever ConvertedSalary is greater than 0
so_survey_df.loc[so_survey_df['ConvertedSalary'] > 0, 'Paid_Job'] = 1
# Print the first five rows of the columns
print(so_survey_df[['Paid_Job', 'ConvertedSalary']].head())
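The same 0/1 column can be built in one step by casting the boolean comparison to int (missing salaries compare as False and so map to 0, matching the two-step version). A minimal sketch:

# Boolean comparison -> 0/1 in a single expression
so_survey_df['Paid_Job'] = (so_survey_df['ConvertedSalary'] > 0).astype(int)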
-----
Binning values
-----
# Bin the continuous variable ConvertedSalary into 5 equal-width bins
so_survey_df['equal_binned'] = pd.cut(so_survey_df['ConvertedSalary'], 5)
# Print the first 5 rows of the equal_binned column
print(so_survey_df[['equal_binned', 'ConvertedSalary']].head())
------
# Import numpy
import numpy as np
# Specify the boundaries of the bins
bins = [-np.inf, 10000, 50000, 100000, 150000, np.inf]
# Bin labels
labels = ['Very low', 'Low', 'Medium', 'High', 'Very high']
# Bin the continuous variable ConvertedSalary using these boundaries
so_survey_df['boundary_binned'] = pd.cut(so_survey_df['ConvertedSalary'],
                                         bins, labels=labels)
# Print the first 5 rows of the boundary_binned column
print(so_survey_df[['boundary_binned', 'ConvertedSalary']].head())
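pd.cut produces equal-width bins; when roughly equal numbers of rows per bin are wanted instead (equal-frequency binning), pd.qcut is the usual counterpart. A sketch assuming the same column; the quantile_binned name is just for illustration:

# Quantile-based bins: roughly 20% of rows per bin
so_survey_df['quantile_binned'] = pd.qcut(so_survey_df['ConvertedSalary'], q=5,
                                          labels=['Very low', 'Low', 'Medium', 'High', 'Very high'])
# Heavily tied data can produce duplicate edges; pd.qcut(..., duplicates='drop') handles that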
-------
Finding the missing values
-----
# Show which entries in the first 10 rows of the subset are missing
print(sub_df.head(10).isnull())
--
Listwise deletion
----
# Create a new DataFrame dropping all incomplete rows
no_missing_values_rows = so_survey_df.dropna(how='any')
# Print the shape of the new DataFrame
print(no_missing_values_rows.shape)
----
# Create a new DataFrame dropping all columns that contain missing values
no_missing_values_cols = so_survey_df.dropna(how='any', axis=1)
# Print the shape of the new DataFrame
print(no_missing_values_cols.shape)
----
# Drop all rows where Gender is missing
no_gender = so_survey_df.dropna(subset=['Gender'])
# Print the shape of the new DataFrame
print(no_gender.shape)
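Between "drop any incomplete row" and "drop nothing", dropna also accepts thresh, which keeps rows holding at least a minimum number of non-missing values. A sketch with an arbitrary threshold:

# Keep rows with at least 5 non-missing values (threshold chosen for illustration)
mostly_complete = so_survey_df.dropna(thresh=5)
print(mostly_complete.shape)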
----
Replacing missing values with constants
-----
# Replace missing values with a constant label
so_survey_df['Gender'] = so_survey_df['Gender'].fillna('Not Given')
# Print the count of each value
print(so_survey_df['Gender'].value_counts())
----
Filling continuous missing values
-----
# Fill missing values with the mean
mean_val = so_survey_df['StackOverflowJobsRecommend'].mean()
so_survey_df['StackOverflowJobsRecommend'] = so_survey_df['StackOverflowJobsRecommend'].fillna(mean_val)
# Print the first five rows of the StackOverflowJobsRecommend column
print(so_survey_df['StackOverflowJobsRecommend'].head())
----
# Continuing from above: round the imputed values to whole numbers
so_survey_df['StackOverflowJobsRecommend'] = so_survey_df['StackOverflowJobsRecommend'].round()
# Print the top 5 rows
print(so_survey_df['StackOverflowJobsRecommend'].head())
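The mean is pulled around by skew and outliers; the median is a common, more robust choice for imputation. A sketch under the same setup:

# Median imputation is less sensitive to extreme values
median_val = so_survey_df['StackOverflowJobsRecommend'].median()
so_survey_df['StackOverflowJobsRecommend'] = so_survey_df['StackOverflowJobsRecommend'].fillna(median_val)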
-----
Dealing with stray characters (I)
----
# Remove the commas from the RawSalary column
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].str.replace(',', '', regex=False)
----
# Remove the dollar signs (regex=False so '$' is treated literally, not as a regex anchor)
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].str.replace('$', '', regex=False)
Dealing with stray characters (II) | |
------ | |
# Attempt to convert the column to numeric values | |
numeric_vals = pd.to_numeric(so_survey_df['RawSalary'], errors='coerce') | |
# Find the indexes of missing values | |
idx = numeric_vals.isna() | |
# Print the relevant rows | |
print(so_survey_df['RawSalary'][idx]) | |
---- | |
# Replace the offending characters | |
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].str.replace('£', '') | |
# Convert the column to float | |
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].astype('float') | |
# Print the column | |
print(so_survey_df['RawSalary']) | |
----
Method chaining
----
# Use method chaining
so_survey_df['RawSalary'] = so_survey_df['RawSalary']\
                              .str.replace(',', '', regex=False)\
                              .str.replace('$', '', regex=False)\
                              .str.replace('£', '', regex=False)\
                              .astype('float')
# Print the RawSalary column
print(so_survey_df['RawSalary'])
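The three literal replacements can also collapse into one regex pass with a character class (inside [...] the $ is literal). A sketch:

# One regex pass: strip commas, dollar signs, and pound signs together
so_survey_df['RawSalary'] = (so_survey_df['RawSalary']
                             .str.replace('[,$£]', '', regex=True)
                             .astype('float'))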
-----
What does your data look like? (I)
----
# Import matplotlib (used by all plotting snippets below)
import matplotlib.pyplot as plt
# Create a histogram of every numeric column
so_numeric_df.hist()
plt.show()
---
# Create a boxplot of two columns
so_numeric_df[['Age', 'Years Experience']].boxplot()
plt.show()
----
# Create a boxplot of ConvertedSalary
so_numeric_df[['ConvertedSalary']].boxplot()
plt.show()
-----
What does your data look like? (II)
----
# Import packages
import matplotlib.pyplot as plt
import seaborn as sns
# Plot pairwise relationships
sns.pairplot(so_numeric_df)
# Show plot
plt.show()
----
Normalization
----
# Import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
# Instantiate MinMaxScaler
MM_scaler = MinMaxScaler()
# Fit MM_scaler to the data
MM_scaler.fit(so_numeric_df[['Age']])
# Transform the data using the fitted scaler
so_numeric_df['Age_MM'] = MM_scaler.transform(so_numeric_df[['Age']])
# Compare the original and transformed columns
print(so_numeric_df[['Age_MM', 'Age']].head())
------
Standardization
------
# Import StandardScaler
from sklearn.preprocessing import StandardScaler
# Instantiate StandardScaler
SS_scaler = StandardScaler()
# Fit SS_scaler to the data
SS_scaler.fit(so_numeric_df[['Age']])
# Transform the data using the fitted scaler
so_numeric_df['Age_SS'] = SS_scaler.transform(so_numeric_df[['Age']])
# Compare the original and transformed columns
print(so_numeric_df[['Age_SS', 'Age']].head())
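When the same data is both fitted and transformed, fit_transform does the two steps in one call; a sketch with the scaler above:

# Fit and transform in a single step
so_numeric_df['Age_SS'] = SS_scaler.fit_transform(so_numeric_df[['Age']])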
----
Log transformation
----
# Import PowerTransformer
from sklearn.preprocessing import PowerTransformer
# Instantiate PowerTransformer
pow_trans = PowerTransformer()
# Fit the transformer to the data
pow_trans.fit(so_numeric_df[['ConvertedSalary']])
# Apply the power transform to the data
so_numeric_df['ConvertedSalary_LG'] = pow_trans.transform(so_numeric_df[['ConvertedSalary']])
# Plot the data before and after the transformation
so_numeric_df[['ConvertedSalary', 'ConvertedSalary_LG']].hist()
plt.show()
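One caveat: PowerTransformer defaults to the Yeo-Johnson transform, which is log-like for large positive values but not a pure log. For an actual log transform, numpy does it directly; a sketch (np.log1p computes log(1 + x), so zero salaries are safe):

# Plain log transform of the salary column
so_numeric_df['ConvertedSalary_log'] = np.log1p(so_numeric_df['ConvertedSalary'])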
-----
Percentage-based outlier removal
----
# Find the 95th percentile
quantile = so_numeric_df['ConvertedSalary'].quantile(0.95)
# Trim the outliers above it
trimmed_df = so_numeric_df[so_numeric_df['ConvertedSalary'] < quantile]
# The original histogram
so_numeric_df[['ConvertedSalary']].hist()
plt.show()
plt.clf()
# The trimmed histogram
trimmed_df[['ConvertedSalary']].hist()
plt.show()
-----
Statistical outlier removal
-----
# Find the mean and standard deviation
std = so_numeric_df['ConvertedSalary'].std()
mean = so_numeric_df['ConvertedSalary'].mean()
# Calculate the cutoff: three standard deviations from the mean
cut_off = std * 3
lower, upper = mean - cut_off, mean + cut_off
# Trim the outliers
trimmed_df = so_numeric_df[(so_numeric_df['ConvertedSalary'] < upper) \
                           & (so_numeric_df['ConvertedSalary'] > lower)]
# The trimmed box plot
trimmed_df[['ConvertedSalary']].boxplot()
plt.show()
------
Train and test transformations (I)
-----
# Import StandardScaler
from sklearn.preprocessing import StandardScaler
# Instantiate the scaler
SS_scaler = StandardScaler()
# Fit the standard scaler on the training data only
SS_scaler.fit(so_train_numeric[['Age']])
# Transform the test data using the fitted scaler
so_test_numeric['Age_ss'] = SS_scaler.transform(so_test_numeric[['Age']])
print(so_test_numeric[['Age', 'Age_ss']].head())
------
Train and test transformations (II)
------
# Compute the outlier cutoffs from the training data only
train_std = so_train_numeric['ConvertedSalary'].std()
train_mean = so_train_numeric['ConvertedSalary'].mean()
cut_off = train_std * 3
train_lower, train_upper = train_mean - cut_off, train_mean + cut_off
# Trim the test DataFrame using the train-derived cutoffs
trimmed_df = so_test_numeric[(so_test_numeric['ConvertedSalary'] < train_upper) \
                             & (so_test_numeric['ConvertedSalary'] > train_lower)]
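The pattern in both snippets is that every statistic (scaler parameters, mean, standard deviation) comes from the training data only and is then applied unchanged to the test data. Series.between is a compact way to apply the train-derived bounds; note it is inclusive by default, where the comparison above is strict. A sketch:

# Apply the train-derived cutoffs to the test set in one expression
in_bounds = so_test_numeric['ConvertedSalary'].between(train_lower, train_upper)
trimmed_df = so_test_numeric[in_bounds]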
-------
Cleaning up your text
------
# Replace all non-letter characters with a space (a regex pattern, so regex=True)
speech_df['text_clean'] = speech_df['text'].str.replace('[^a-zA-Z]', ' ', regex=True)
# Change to lower case
speech_df['text_clean'] = speech_df['text_clean'].str.lower()
# Print the first 5 rows of the text_clean column
print(speech_df['text_clean'].head())
----
High-level text features
----
# Find the character length of each text
speech_df['char_cnt'] = speech_df['text_clean'].str.len()
# Count the number of words in each text
speech_df['word_cnt'] = speech_df['text_clean'].str.split().str.len()
# Find the average word length
speech_df['avg_word_length'] = speech_df['char_cnt'] / speech_df['word_cnt']
# Print the first 5 rows of these columns
print(speech_df[['text_clean', 'char_cnt', 'word_cnt', 'avg_word_length']].head())
-----
The last remaining chapter of the course involves text features and is not included in these notes.