Feature Engineering for Machine Learning in Python - DataCamp
----
Selecting specific data types
----
# Create a subset of only the numeric columns
so_numeric_df = so_survey_df.select_dtypes(include=['int', 'float'])
# Print the column names contained in so_numeric_df
print(so_numeric_df.columns)
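Depending on the pandas version, 'int' and 'float' may only match their 64-bit defaults; the documented catch-all for every numeric dtype is 'number'. A minimal sketch, assuming the same so_survey_df:

# Select all numeric columns regardless of bit width (int32, float32, ...)
so_numeric_df = so_survey_df.select_dtypes(include='number')
print(so_numeric_df.columns)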
---
One-hot encoding and dummy variables
----
# Convert the Country column to a one-hot encoded DataFrame
one_hot_encoded = pd.get_dummies(so_survey_df, columns=['Country'], prefix='OH')
# Print the column names
print(one_hot_encoded.columns)
---
# Create dummy variables for the Country column
dummy = pd.get_dummies(so_survey_df, columns=['Country'], drop_first=True, prefix='DM')
# Print the column names
print(dummy.columns)
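For context: without drop_first, get_dummies produces one indicator column per category (one-hot encoding), while drop_first=True keeps n-1 columns, removing the redundant level that can cause collinearity in linear models. A self-contained toy sketch (not part of the survey data):

import pandas as pd

toy = pd.DataFrame({'Country': ['USA', 'India', 'UK']})
# One-hot: one indicator column per category (3 here)
print(pd.get_dummies(toy, columns=['Country'], prefix='OH').columns)
# Dummies: first level dropped (2 columns)
print(pd.get_dummies(toy, columns=['Country'], drop_first=True, prefix='DM').columns)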
---
Dealing with uncommon categories
----
# Create a series out of the Country column
countries = so_survey_df['Country']
# Get the counts of each category
country_counts = countries.value_counts()
# Print the count values for each category
print(country_counts)
-----
# Create a series out of the Country column
countries = so_survey_df['Country']
# Get the counts of each category
country_counts = countries.value_counts()
# Create a mask for the categories that occur fewer than 10 times
mask = countries.isin(country_counts[country_counts < 10].index)
# Print the top 5 rows of the mask series
print(mask.head())
----
# Create a series out of the Country column
countries = so_survey_df['Country']
# Get the counts of each category
country_counts = countries.value_counts()
# Create a mask for the categories that occur fewer than 10 times
mask = countries.isin(country_counts[country_counts < 10].index)
# Relabel the uncommon categories as 'Other'
countries[mask] = 'Other'
# Print the updated category counts
print(countries.value_counts())
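Writing through a mask on a Series pulled out of a DataFrame can trigger SettingWithCopyWarning on some pandas versions. A copy-safe sketch using Series.mask, which returns a new Series instead of mutating in place:

# Replace the uncommon categories without writing through a view
countries = so_survey_df['Country'].mask(mask, 'Other')
print(countries.value_counts())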
-----
Binarizing columns
-----
# Create the Paid_Job column filled with zeros
so_survey_df['Paid_Job'] = 0
# Set Paid_Job to 1 wherever ConvertedSalary is greater than 0
so_survey_df.loc[so_survey_df['ConvertedSalary'] > 0, 'Paid_Job'] = 1
# Print the first five rows of the columns
print(so_survey_df[['Paid_Job', 'ConvertedSalary']].head())
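The same 0/1 column can be built in one step by casting the boolean comparison to int (missing salaries compare as False and so map to 0, matching the two-step version). A minimal sketch:

# Boolean comparison -> 0/1 in a single expression
so_survey_df['Paid_Job'] = (so_survey_df['ConvertedSalary'] > 0).astype(int)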
-----
Binning values
-----
# Bin the continuous variable ConvertedSalary into 5 equal-width bins
so_survey_df['equal_binned'] = pd.cut(so_survey_df['ConvertedSalary'], 5)
# Print the first 5 rows of the equal_binned column
print(so_survey_df[['equal_binned', 'ConvertedSalary']].head())
------
# Import numpy
import numpy as np
# Specify the boundaries of the bins
bins = [-np.inf, 10000, 50000, 100000, 150000, np.inf]
# Bin labels
labels = ['Very low', 'Low', 'Medium', 'High', 'Very high']
# Bin the continuous variable ConvertedSalary using these boundaries
so_survey_df['boundary_binned'] = pd.cut(so_survey_df['ConvertedSalary'],
                                         bins, labels=labels)
# Print the first 5 rows of the boundary_binned column
print(so_survey_df[['boundary_binned', 'ConvertedSalary']].head())
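pd.cut produces equal-width bins; when roughly equal numbers of rows per bin are wanted instead (equal-frequency binning), pd.qcut is the usual counterpart. A sketch assuming the same column; the quantile_binned name is just for illustration:

# Quantile-based bins: roughly 20% of rows per bin
so_survey_df['quantile_binned'] = pd.qcut(so_survey_df['ConvertedSalary'], q=5,
                                          labels=['Very low', 'Low', 'Medium', 'High', 'Very high'])
# Heavily tied data can produce duplicate edges; pd.qcut(..., duplicates='drop') handles that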
-------
Finding the missing values
-----
# Show which entries in the first 10 rows of the subset are missing
print(sub_df.head(10).isnull())
--
Listwise deletion
----
# Create a new DataFrame dropping all incomplete rows
no_missing_values_rows = so_survey_df.dropna(how='any')
# Print the shape of the new DataFrame
print(no_missing_values_rows.shape)
----
# Create a new DataFrame dropping all columns that contain missing values
no_missing_values_cols = so_survey_df.dropna(how='any', axis=1)
# Print the shape of the new DataFrame
print(no_missing_values_cols.shape)
----
# Drop all rows where Gender is missing
no_gender = so_survey_df.dropna(subset=['Gender'])
# Print the shape of the new DataFrame
print(no_gender.shape)
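Between "drop any incomplete row" and "drop nothing", dropna also accepts thresh, which keeps rows holding at least a minimum number of non-missing values. A sketch with an arbitrary threshold:

# Keep rows with at least 5 non-missing values (threshold chosen for illustration)
mostly_complete = so_survey_df.dropna(thresh=5)
print(mostly_complete.shape)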
----
Replacing missing values with constants
-----
# Replace missing values with a constant label
so_survey_df['Gender'] = so_survey_df['Gender'].fillna('Not Given')
# Print the count of each value
print(so_survey_df['Gender'].value_counts())
----
Filling continuous missing values
-----
# Fill missing values with the mean
mean_val = so_survey_df['StackOverflowJobsRecommend'].mean()
so_survey_df['StackOverflowJobsRecommend'] = so_survey_df['StackOverflowJobsRecommend'].fillna(mean_val)
# Print the first five rows of the StackOverflowJobsRecommend column
print(so_survey_df['StackOverflowJobsRecommend'].head())
----
# Continuing from above: round the imputed values to whole numbers
so_survey_df['StackOverflowJobsRecommend'] = so_survey_df['StackOverflowJobsRecommend'].round()
# Print the top 5 rows
print(so_survey_df['StackOverflowJobsRecommend'].head())
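The mean is pulled around by skew and outliers; the median is a common, more robust choice for imputation. A sketch under the same setup:

# Median imputation is less sensitive to extreme values
median_val = so_survey_df['StackOverflowJobsRecommend'].median()
so_survey_df['StackOverflowJobsRecommend'] = so_survey_df['StackOverflowJobsRecommend'].fillna(median_val)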
-----
Dealing with stray characters (I)
----
# Remove the commas from the RawSalary column
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].str.replace(',', '', regex=False)
----
# Remove the dollar signs (regex=False so '$' is treated literally, not as a regex anchor)
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].str.replace('$', '', regex=False)
Dealing with stray characters (II) | |
------ | |
# Attempt to convert the column to numeric values | |
numeric_vals = pd.to_numeric(so_survey_df['RawSalary'], errors='coerce') | |
# Find the indexes of missing values | |
idx = numeric_vals.isna() | |
# Print the relevant rows | |
print(so_survey_df['RawSalary'][idx]) | |
---- | |
# Replace the offending characters | |
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].str.replace('£', '') | |
# Convert the column to float | |
so_survey_df['RawSalary'] = so_survey_df['RawSalary'].astype('float') | |
# Print the column | |
print(so_survey_df['RawSalary']) | |
----
Method chaining
----
# Use method chaining
so_survey_df['RawSalary'] = so_survey_df['RawSalary']\
                              .str.replace(',', '', regex=False)\
                              .str.replace('$', '', regex=False)\
                              .str.replace('£', '', regex=False)\
                              .astype('float')
# Print the RawSalary column
print(so_survey_df['RawSalary'])
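The three literal replacements can also collapse into one regex pass with a character class (inside [...] the $ is literal). A sketch:

# One regex pass: strip commas, dollar signs, and pound signs together
so_survey_df['RawSalary'] = (so_survey_df['RawSalary']
                             .str.replace('[,$£]', '', regex=True)
                             .astype('float'))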
-----
What does your data look like? (I)
----
# Import matplotlib (used by all plotting snippets below)
import matplotlib.pyplot as plt
# Create a histogram of every numeric column
so_numeric_df.hist()
plt.show()
---
# Create a boxplot of two columns
so_numeric_df[['Age', 'Years Experience']].boxplot()
plt.show()
----
# Create a boxplot of ConvertedSalary
so_numeric_df[['ConvertedSalary']].boxplot()
plt.show()
-----
What does your data look like? (II)
----
# Import packages
import matplotlib.pyplot as plt
import seaborn as sns
# Plot pairwise relationships
sns.pairplot(so_numeric_df)
# Show plot
plt.show()
----
Normalization
----
# Import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
# Instantiate MinMaxScaler
MM_scaler = MinMaxScaler()
# Fit MM_scaler to the data
MM_scaler.fit(so_numeric_df[['Age']])
# Transform the data using the fitted scaler
so_numeric_df['Age_MM'] = MM_scaler.transform(so_numeric_df[['Age']])
# Compare the original and transformed columns
print(so_numeric_df[['Age_MM', 'Age']].head())
------
Standardization
------
# Import StandardScaler
from sklearn.preprocessing import StandardScaler
# Instantiate StandardScaler
SS_scaler = StandardScaler()
# Fit SS_scaler to the data
SS_scaler.fit(so_numeric_df[['Age']])
# Transform the data using the fitted scaler
so_numeric_df['Age_SS'] = SS_scaler.transform(so_numeric_df[['Age']])
# Compare the original and transformed columns
print(so_numeric_df[['Age_SS', 'Age']].head())
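When the same data is both fitted and transformed, fit_transform does the two steps in one call; a sketch with the scaler above:

# Fit and transform in a single step
so_numeric_df['Age_SS'] = SS_scaler.fit_transform(so_numeric_df[['Age']])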
----
Log transformation
----
# Import PowerTransformer
from sklearn.preprocessing import PowerTransformer
# Instantiate PowerTransformer
pow_trans = PowerTransformer()
# Fit the transformer to the data
pow_trans.fit(so_numeric_df[['ConvertedSalary']])
# Apply the power transform to the data
so_numeric_df['ConvertedSalary_LG'] = pow_trans.transform(so_numeric_df[['ConvertedSalary']])
# Plot the data before and after the transformation
so_numeric_df[['ConvertedSalary', 'ConvertedSalary_LG']].hist()
plt.show()
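One caveat: PowerTransformer defaults to the Yeo-Johnson transform, which is log-like for large positive values but not a pure log. For an actual log transform, numpy does it directly; a sketch (np.log1p computes log(1 + x), so zero salaries are safe):

# Plain log transform of the salary column
so_numeric_df['ConvertedSalary_log'] = np.log1p(so_numeric_df['ConvertedSalary'])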
-----
Percentage-based outlier removal
----
# Find the 95th percentile
quantile = so_numeric_df['ConvertedSalary'].quantile(0.95)
# Trim the outliers above it
trimmed_df = so_numeric_df[so_numeric_df['ConvertedSalary'] < quantile]
# The original histogram
so_numeric_df[['ConvertedSalary']].hist()
plt.show()
plt.clf()
# The trimmed histogram
trimmed_df[['ConvertedSalary']].hist()
plt.show()
-----
Statistical outlier removal
-----
# Find the mean and standard deviation
std = so_numeric_df['ConvertedSalary'].std()
mean = so_numeric_df['ConvertedSalary'].mean()
# Calculate the cutoff: three standard deviations from the mean
cut_off = std * 3
lower, upper = mean - cut_off, mean + cut_off
# Trim the outliers
trimmed_df = so_numeric_df[(so_numeric_df['ConvertedSalary'] < upper) \
                           & (so_numeric_df['ConvertedSalary'] > lower)]
# The trimmed box plot
trimmed_df[['ConvertedSalary']].boxplot()
plt.show()
------
Train and test transformations (I)
-----
# Import StandardScaler
from sklearn.preprocessing import StandardScaler
# Instantiate the scaler
SS_scaler = StandardScaler()
# Fit the standard scaler on the training data only
SS_scaler.fit(so_train_numeric[['Age']])
# Transform the test data using the fitted scaler
so_test_numeric['Age_ss'] = SS_scaler.transform(so_test_numeric[['Age']])
print(so_test_numeric[['Age', 'Age_ss']].head())
------
Train and test transformations (II)
------
# Compute the outlier cutoffs from the training data only
train_std = so_train_numeric['ConvertedSalary'].std()
train_mean = so_train_numeric['ConvertedSalary'].mean()
cut_off = train_std * 3
train_lower, train_upper = train_mean - cut_off, train_mean + cut_off
# Trim the test DataFrame using the train-derived cutoffs
trimmed_df = so_test_numeric[(so_test_numeric['ConvertedSalary'] < train_upper) \
                             & (so_test_numeric['ConvertedSalary'] > train_lower)]
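The pattern in both snippets is that every statistic (scaler parameters, mean, standard deviation) comes from the training data only and is then applied unchanged to the test data. Series.between is a compact way to apply the train-derived bounds; note it is inclusive by default, where the comparison above is strict. A sketch:

# Apply the train-derived cutoffs to the test set in one expression
in_bounds = so_test_numeric['ConvertedSalary'].between(train_lower, train_upper)
trimmed_df = so_test_numeric[in_bounds]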
-------
Cleaning up your text
------
# Replace all non-letter characters with a space (a regex pattern, so regex=True)
speech_df['text_clean'] = speech_df['text'].str.replace('[^a-zA-Z]', ' ', regex=True)
# Change to lower case
speech_df['text_clean'] = speech_df['text_clean'].str.lower()
# Print the first 5 rows of the text_clean column
print(speech_df['text_clean'].head())
----
High-level text features
----
# Find the character length of each text
speech_df['char_cnt'] = speech_df['text_clean'].str.len()
# Count the number of words in each text
speech_df['word_cnt'] = speech_df['text_clean'].str.split().str.len()
# Find the average word length
speech_df['avg_word_length'] = speech_df['char_cnt'] / speech_df['word_cnt']
# Print the first 5 rows of these columns
print(speech_df[['text_clean', 'char_cnt', 'word_cnt', 'avg_word_length']].head())
-----
The last remaining chapter of the course involves text features and is not included in these notes.