(Python) Implement binary decision trees with different early stopping methods. Compare models with different stopping parameters.
#explore various techniques for preventing overfitting in decision trees
import math
import pandas as pd
import numpy as np
#the dataset consists of data from LendingClub; the task is to predict whether a loan will be
#paid off in full or charged off and possibly go into default
import sframe
loans = sframe.SFrame('lending-club-data.gl/')
#target column 'safe_loans': +1 means a safe loan, -1 means a risky loan
loans['safe_loans'] = loans['bad_loans'].apply(lambda x: +1 if x==0 else -1)
loans = loans.remove_column('bad_loans')
#use a subset of features (categorical and numeric)
features = ['grade', 'emp_length', 'home_ownership', 'term']
target = 'safe_loans'
loans = loans[features+[target]]
#combat class imbalance: undersample the larger class until the class distribution is half and half
safe_loans_raw = loans[loans[target]==+1]
risky_loans_raw = loans[loans[target]==-1]
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(percentage, seed=1)
loans_data = risky_loans.append(safe_loans)
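#optional sanity check (illustrative, not part of the original gist): after undersampling,
#the two classes should contain roughly the same number of loans
print len(safe_loans), len(risky_loans), len(loans_data)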
#one-hot encoding: turn categorical variables into binary features
categorical_variables = []
for feat_name, feat_type in zip(loans_data.column_names(), loans_data.column_types()):
    if feat_type == str:
        categorical_variables.append(feat_name)
for feature in categorical_variables:
    loans_data_one_hot_encoded = loans_data[feature].apply(lambda x: {x:1})
    loans_data_unpacked = loans_data_one_hot_encoded.unpack(column_name_prefix=feature)
    for column in loans_data_unpacked.column_names():
        loans_data_unpacked[column] = loans_data_unpacked[column].fillna(0)
    loans_data.remove_column(feature)
    loans_data.add_columns(loans_data_unpacked)
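#optional: list the binary columns produced by the one-hot encoding
#(illustrative check, not part of the original gist)
print loans_data.column_names()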
#training-test split
train_data, test_data = loans_data.random_split(0.8, seed=1)
#early stopping methods for decision trees
#check whether the number of data points is less than or equal to the minimum node size
def reached_minimum_node_size(data, min_node_size):
    return len(data)<=min_node_size
#compute the gain in error reduction
def error_reduction(error_before_split, error_after_split):
    return error_before_split-error_after_split
#compute the number of misclassified examples when the node predicts the majority class
def intermediate_node_num_mistakes(labels_in_node):
    if len(labels_in_node) == 0:
        return 0
    else:
        num_pos = sum(labels_in_node==1)
        num_neg = sum(labels_in_node==-1)
        return min(num_pos, num_neg)
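#quick sanity check (illustrative, not from the original gist): with three +1 labels
#and two -1 labels, a majority-class prediction makes 2 mistakes
print intermediate_node_num_mistakes(np.array([-1, -1, 1, 1, 1]))   #expected: 2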
#function to pick the best feature to split on (lowest weighted classification error)
def best_splitting_feature(data, features, target):
    target_values = data[target]
    best_feature = None
    best_error = 10   #classification error is always <= 1, so 10 acts as +infinity
    num_data_points = float(len(data))
    for feature in features:
        left_split = data[data[feature]==0]
        right_split = data[data[feature]==1]
        left_mistakes = intermediate_node_num_mistakes(left_split[target])
        right_mistakes = intermediate_node_num_mistakes(right_split[target])
        error = (left_mistakes+right_mistakes)/num_data_points
        if error < best_error:
            best_error = error
            best_feature = feature
    return best_feature
#a function to create a leaf node that predicts the majority class
def create_leaf(target_values):
    leaf = {'splitting_feature' : None,
            'left' : None,
            'right' : None,
            'is_leaf' : True}
    num_ones = len(target_values[target_values==1])
    num_minus_ones = len(target_values[target_values==-1])
    if num_ones > num_minus_ones:
        leaf['prediction'] = 1
    else:
        leaf['prediction'] = -1
    return leaf
#build the tree recursively, applying the early stopping conditions
def decision_tree_create(data, features, target, current_depth=0, max_depth=10, min_node_size=1,
                         min_error_reduction=0.0):
    remaining_features = features[:]
    target_values = data[target]
    print '-----------------------------------------------------'
    print 'Subtree, depth = %s (%s data points).' % (current_depth, len(target_values))
    #stopping condition 1: all points are from the same class
    if intermediate_node_num_mistakes(target_values)==0:
        print 'Stopping condition 1 reached. All data points have the same class'
        return create_leaf(target_values)
    #stopping condition 2: no more features to split on
    if remaining_features==[]:
        print 'Stopping condition 2 reached. No remaining features'
        return create_leaf(target_values)
    #early stopping condition 1: the maximum depth is reached
    if current_depth>=max_depth:
        print 'Early stopping condition 1 reached. Maximum depth'
        return create_leaf(target_values)
    #early stopping condition 2: the node contains too few data points
    if reached_minimum_node_size(data, min_node_size):
        print 'Early stopping condition 2 reached. Minimum node size'
        return create_leaf(target_values)
    #find the best splitting feature
    splitting_feature = best_splitting_feature(data, features, target)
    left_split = data[data[splitting_feature]==0]
    right_split = data[data[splitting_feature]==1]
    #early stopping condition 3: minimum error reduction
    error_before_split = intermediate_node_num_mistakes(target_values)/float(len(data))
    left_mistakes = intermediate_node_num_mistakes(left_split[target])
    right_mistakes = intermediate_node_num_mistakes(right_split[target])
    error_after_split = (left_mistakes+right_mistakes)/float(len(data))
    if error_reduction(error_before_split, error_after_split) < min_error_reduction:
        print 'Early stopping condition 3 reached. Minimum error reduction'
        return create_leaf(target_values)
    remaining_features.remove(splitting_feature)
    print 'Split on feature %s. (%s, %s)' % (splitting_feature, len(left_split), len(right_split))
    if len(left_split)==len(data):
        print 'Creating leaf node'
        return create_leaf(left_split[target])
    if len(right_split)==len(data):
        print 'Creating leaf node'
        return create_leaf(right_split[target])
    #recurse on the left and right subtrees
    left_tree = decision_tree_create(left_split, remaining_features, target,
                                     current_depth+1, max_depth, min_node_size, min_error_reduction)
    right_tree = decision_tree_create(right_split, remaining_features, target,
                                      current_depth+1, max_depth, min_node_size, min_error_reduction)
    return {'is_leaf' : False,
            'prediction' : None,
            'splitting_feature' : splitting_feature,
            'left' : left_tree,
            'right' : right_tree}
features = train_data.column_names()
features.remove(target)
my_decision_tree_new = decision_tree_create(train_data, features, target, current_depth=0, max_depth=6,
                                            min_node_size=100, min_error_reduction=0.0)
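#to compare models with different early stopping parameters, also train two more trees
#(illustrative sketch; the parameter choices and variable names below are assumptions, not from the original gist):
#one with early stopping effectively disabled and one that is much shallower
my_decision_tree_old = decision_tree_create(train_data, features, target, current_depth=0, max_depth=6,
                                            min_node_size=0, min_error_reduction=-1)
my_decision_tree_shallow = decision_tree_create(train_data, features, target, current_depth=0, max_depth=2,
                                                min_node_size=0, min_error_reduction=-1)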
#make predictions with a trained tree
def classify(tree, x, annotate=False):
    if tree['is_leaf']:
        if annotate:
            print 'At leaf, predicting %s' % tree['prediction']
        return tree['prediction']
    else:
        split_feature_value = x[tree['splitting_feature']]
        if annotate:
            print 'Split on %s = %s' % (tree['splitting_feature'], split_feature_value)
        if split_feature_value==0:
            return classify(tree['left'], x, annotate)
        else:
            return classify(tree['right'], x, annotate)
#prediction for a single test example
print test_data[0]
print 'Predicted class: %s' % classify(my_decision_tree_new, test_data[0], annotate=True)
#evaluate the classification error of the decision tree
def evaluate_classification_error(tree, data):
    prediction = data.apply(lambda x: classify(tree, x))
    return float(sum(data['safe_loans']!=prediction))/len(data)
print evaluate_classification_error(my_decision_tree_new, test_data)
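#compare classification error across the different stopping parameters
#(my_decision_tree_old and my_decision_tree_shallow are the illustrative models trained above)
print 'Tree with early stopping:    %s' % evaluate_classification_error(my_decision_tree_new, test_data)
print 'Tree without early stopping: %s' % evaluate_classification_error(my_decision_tree_old, test_data)
print 'Shallow tree (max_depth=2):  %s' % evaluate_classification_error(my_decision_tree_shallow, test_data)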
#print out a single decision stump
def print_stump(tree, name='root'):
    split_name = tree['splitting_feature']
    if split_name is None:
        print '(leaf, label: %s)' % tree['prediction']
        return None
    split_feature, split_value = split_name.split('.')
    print '   [{0} == 0]            [{0} == 1]    '.format(split_name)
    print '   (%s)                  (%s)' \
        % (('leaf, label: ' + str(tree['left']['prediction']) if tree['left']['is_leaf'] else 'subtree'),
           ('leaf, label: ' + str(tree['right']['prediction']) if tree['right']['is_leaf'] else 'subtree'))
print_stump(my_decision_tree_new)
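#a simple way to quantify model complexity when comparing stopping parameters
#(helper sketch, not part of the original gist): count the leaves in each tree
def count_leaves(tree):
    if tree['is_leaf']:
        return 1
    return count_leaves(tree['left']) + count_leaves(tree['right'])
print count_leaves(my_decision_tree_new), count_leaves(my_decision_tree_old), count_leaves(my_decision_tree_shallow)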