(Python) Implement AdaBoost with decision stumps from scratch on the LendingClub dataset. Predict whether a loan will be paid off in full or charged off, and compare training and validation error as the number of stumps grows.
#Boosting decision stumps (AdaBoost) from scratch
import pandas as pd
import numpy as np
#the dataset consists of data from LendingClub, used to predict whether a loan will be paid off
#in full or will be charged off and possibly go into default
import sframe
loans = sframe.SFrame('lending-club-data.gl/')
#target column 'safe_loans': +1 means a safe loan, -1 a risky loan
loans['safe_loans'] = loans['bad_loans'].apply(lambda x: +1 if x==0 else -1)
loans = loans.remove_column('bad_loans')
#use a subset of features (categorical and numeric)
features = ['grade', 'term', 'home_ownership', 'emp_length']
target = 'safe_loans'
loans, loans_with_na = loans[features+[target]].dropna_split()
#Count the number of rows with missing data
num_rows_with_na = loans_with_na.num_rows()
num_rows = loans.num_rows()
print 'Dropping %s observations; keeping %s' % (num_rows_with_na, num_rows)
#undersample the larger class in order to balance our dataset
safe_loans_raw = loans[loans[target]==1]
risky_loans_raw = loans[loans[target]==-1]
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
safe_loans = safe_loans_raw.sample(percentage, seed=1)
risky_loans = risky_loans_raw
loans_data = risky_loans.append(safe_loans)
print 'Percentage of safe loans :', len(safe_loans)/float(len(loans_data))
print 'Percentage of risky loans :', len(risky_loans)/float(len(loans_data))
print 'Total number of loans :', len(loans_data)
#One-hot encoding
categorical_variables = []
for feat_name, feat_type in zip(loans_data.column_names(), loans_data.column_types()):
    if feat_type == str:
        categorical_variables.append(feat_name)
for feature in categorical_variables:
    loans_data_one_hot_encoded = loans_data[feature].apply(lambda x: {x: 1})
    loans_data_unpacked = loans_data_one_hot_encoded.unpack(column_name_prefix=feature)
    for column in loans_data_unpacked.column_names():
        loans_data_unpacked[column] = loans_data_unpacked[column].fillna(0)
    loans_data.remove_column(feature)
    loans_data.add_columns(loans_data_unpacked)
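#Optional sanity check (not in the original script): after unpacking, the binary columns
#are named '<feature>.<value>'; the exact names depend on the raw LendingClub data, so
#inspect them rather than assuming specific values.
print loans_data.column_names()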
#split data into training and validation
train_data, validation_data = loans_data.random_split(0.8, seed=1)
features = train_data.column_names()
features.remove(target)
#calculate the weight of mistakes for making the 'weighted-majority' prediction
def intermediate_node_weighted_mistakes(labels_in_node, data_weights):
    #weight of mistakes when predicting -1 for everything: total weight of the +1 labels
    total_weight_positive = sum(data_weights[labels_in_node == +1])
    weighted_mistakes_all_negative = total_weight_positive
    #weight of mistakes when predicting +1 for everything: total weight of the -1 labels
    total_weight_negative = sum(data_weights[labels_in_node == -1])
    weighted_mistakes_all_positive = total_weight_negative
    #return the lower weight of mistakes together with the corresponding class prediction
    if weighted_mistakes_all_negative < weighted_mistakes_all_positive:
        return (weighted_mistakes_all_negative, -1)
    else:
        return (weighted_mistakes_all_positive, +1)
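#Quick toy check of the helper (not in the original script): with labels
#[-1, -1, +1, +1, +1] and unit weights, predicting the weighted majority (+1)
#misclassifies the two -1 points, so the expected output is (2.0, +1).
example_labels = sframe.SArray([-1, -1, +1, +1, +1])
example_weights = sframe.SArray([1.0, 1.0, 1.0, 1.0, 1.0])
print intermediate_node_weighted_mistakes(example_labels, example_weights)   #expect (2.0, 1)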
#pick the best feature to split on, with weighting of points incorporated
def best_splitting_feature(data, features, target, data_weights):
    best_feature = None
    best_error = float('+inf')
    for feature in features:
        #left split: data points where the binary feature is 0; right split: where it is 1
        left_split = data[data[feature] == 0]
        right_split = data[data[feature] == 1]
        left_data_weights = data_weights[data[feature] == 0]
        right_data_weights = data_weights[data[feature] == 1]
        left_weighted_mistakes, left_class = intermediate_node_weighted_mistakes(left_split[target], left_data_weights)
        right_weighted_mistakes, right_class = intermediate_node_weighted_mistakes(right_split[target], right_data_weights)
        #weighted classification error of this split
        error = (left_weighted_mistakes + right_weighted_mistakes) / sum(data_weights)
        if error < best_error:
            best_feature = feature
            best_error = error
    return best_feature
#create the leaf
def create_leaf(target_values, data_weights):
    leaf = {'splitting_feature': None, 'is_leaf': True}
    weighted_error, best_class = intermediate_node_weighted_mistakes(target_values, data_weights)
    leaf['prediction'] = best_class
    return leaf
#build the tree
def weighted_decision_tree_create(data, features, target, data_weights, current_depth=1, max_depth=10):
    remaining_features = features[:]
    target_values = data[target]
    print '--------------'
    print 'Subtree, depth = %s (%s data points).' % (current_depth, len(target_values))
    #stopping condition 1: the weighted error is (numerically) zero
    if intermediate_node_weighted_mistakes(target_values, data_weights)[0] <= 1e-15:
        print 'Stopping condition 1 reached.'
        return create_leaf(target_values, data_weights)
    #stopping condition 2: no more features to split on
    if remaining_features == []:
        print 'Stopping condition 2 reached.'
        return create_leaf(target_values, data_weights)
    #stopping condition 3: maximum depth reached
    if current_depth > max_depth:
        print 'Reached maximum depth.'
        return create_leaf(target_values, data_weights)
    splitting_feature = best_splitting_feature(data, remaining_features, target, data_weights)
    remaining_features.remove(splitting_feature)
    left_split = data[data[splitting_feature] == 0]
    right_split = data[data[splitting_feature] == 1]
    left_data_weights = data_weights[data[splitting_feature] == 0]
    right_data_weights = data_weights[data[splitting_feature] == 1]
    print 'Split on feature %s. (%s, %s)' % (splitting_feature, len(left_split), len(right_split))
    #if the split puts all the data on one side, make a leaf
    if len(left_split) == len(data):
        print 'Creating leaf node.'
        return create_leaf(left_split[target], data_weights)
    if len(right_split) == len(data):
        print 'Creating leaf node.'
        return create_leaf(right_split[target], data_weights)
    #recurse on the left and right subtrees
    left_tree = weighted_decision_tree_create(left_split, remaining_features, target,
                                              left_data_weights, current_depth + 1, max_depth)
    right_tree = weighted_decision_tree_create(right_split, remaining_features, target,
                                               right_data_weights, current_depth + 1, max_depth)
    return {'is_leaf': False, 'prediction': None, 'splitting_feature': splitting_feature,
            'left': left_tree, 'right': right_tree}
#count the nodes in the tree
def count_nodes(tree):
    if tree['is_leaf']:
        return 1
    return 1 + count_nodes(tree['left']) + count_nodes(tree['right'])
#make predictions with the tree
def classify(tree, x, annotate=False):
    if tree['is_leaf']:
        if annotate:
            print 'At leaf, prediction %s' % tree['prediction']
        return tree['prediction']
    else:
        split_feature_value = x[tree['splitting_feature']]
        if annotate:
            print 'Split on %s = %s' % (tree['splitting_feature'], split_feature_value)
        if split_feature_value == 0:
            return classify(tree['left'], x, annotate)
        else:
            return classify(tree['right'], x, annotate)
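#Structural toy check (hand-built stump; the feature name 'grade.A' is only illustrative):
#a single split with two leaves should count as 3 nodes, and a point with 'grade.A' == 1
#should be routed to the right leaf.
toy_stump = {'is_leaf': False, 'prediction': None, 'splitting_feature': 'grade.A',
             'left':  {'is_leaf': True, 'prediction': +1, 'splitting_feature': None},
             'right': {'is_leaf': True, 'prediction': -1, 'splitting_feature': None}}
print count_nodes(toy_stump)                 #expect 3
print classify(toy_stump, {'grade.A': 1})    #expect -1 (right leaf)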
#evaluate the classification error of a tree on a dataset
def evaluate_classification_error(tree, data, target):
    prediction = data.apply(lambda x: classify(tree, x))
    return (prediction != data[target]).sum() / float(len(data))
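#Optional check with uniform weights (all ones, not part of the original test below):
#with equal weights the weighted tree should behave like an ordinary unweighted
#decision tree on the same data.
uniform_weights = sframe.SArray([1.0]*len(train_data))
small_tree_uniform = weighted_decision_tree_create(train_data, features, target,
                                                   uniform_weights, max_depth=2)
print evaluate_classification_error(small_tree_uniform, train_data, target)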
#one test example: put all the weight on the first 10 and last 10 data points
example_data_weights = sframe.SArray([1.0]*10 + [0.0]*(len(train_data)-20) + [1.0]*10)
small_data_decision_tree_subset20 = weighted_decision_tree_create(train_data, features, target,
                                                                  example_data_weights, max_depth=2)
print evaluate_classification_error(small_data_decision_tree_subset20, train_data, target)
set20_predt = train_data.apply(lambda x: classify(small_data_decision_tree_subset20, x))
#implement AdaBoost with tree stumps from scratch
from math import log, exp
def adaboost_with_tree_stumps(data, features, target, num_tree_stumps):
    alpha = sframe.SArray([1.0]*len(data))   #data point weights, initially uniform
    weights = []                             #stump weights
    tree_stumps = []
    target_values = data[target]
    for t in range(num_tree_stumps):
        print '========================='
        print 'Adaboost Iteration %d' % t
        #fit a depth-1 weighted decision tree (a stump) to the current data weights
        tree_stump = weighted_decision_tree_create(data, features, target,
                                                   data_weights=alpha, max_depth=1)
        tree_stumps.append(tree_stump)
        predictions = data.apply(lambda x: classify(tree_stump, x))
        is_correct = predictions == target_values
        is_wrong = predictions != target_values
        #weighted error of the stump and its weight in the ensemble
        weighted_error = sum(alpha[is_wrong]) / sum(alpha)
        weight = 0.5 * log((1 - weighted_error) / weighted_error)
        weights.append(weight)
        #increase the weights of misclassified points, decrease those of correct ones, then renormalize
        adjustment = is_correct.apply(lambda correct: exp(-weight) if correct else exp(weight))
        alpha = alpha * adjustment
        alpha = alpha / sum(alpha)
    return weights, tree_stumps
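#The update above follows the standard AdaBoost formulas: with weighted error e_t, the
#stump weight is w_t = 0.5*ln((1 - e_t)/e_t), and each data weight is multiplied by
#exp(-w_t) if the point was classified correctly and exp(+w_t) otherwise, then renormalized.
#Quick numeric illustration (numbers are made up, not taken from the trained model):
print 0.5*log((1 - 0.3)/0.3)   #a stump with weighted error 0.3 gets weight ~0.42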
def predict_adaboost(stump_weights, tree_stumps, data):
    scores = sframe.SArray([0.0]*len(data))
    for i, tree_stump in enumerate(tree_stumps):
        predictions = data.apply(lambda x: classify(tree_stump, x))
        #accumulate the weighted vote of each stump
        scores = scores + stump_weights[i]*predictions
    return scores.apply(lambda score: +1 if score > 0 else -1)
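#Toy illustration of the weighted vote (weights and votes are made up): two stumps voting
#+1 and -1 with weights 0.7 and 0.4 give a score of 0.7 - 0.4 = 0.3 > 0, so the ensemble
#predicts +1.
print +1 if (0.7*(+1) + 0.4*(-1)) > 0 else -1   #expect 1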
stump_weights, tree_stumps = adaboost_with_tree_stumps(train_data, features, target, 30)
error_all = []
for n in range(1, 31):
    predictions = predict_adaboost(stump_weights[:n], tree_stumps[:n], train_data)
    error = sum(predictions != train_data[target]) / float(len(train_data))
    error_all.append(error)
    print 'Iteration %s, training error %s' % (n, error_all[n-1])
#visualize the training error
import matplotlib.pyplot as plt
plt.plot(range(1,31), error_all, '-', linewidth=2.0, label='Training error')
plt.xlabel('# of iterations')
plt.ylabel('Classification error')
plt.show()
#validation error as a function of the number of stumps
validation_error_all = []
for n in range(1, 31):
    predictions = predict_adaboost(stump_weights[:n], tree_stumps[:n], validation_data)
    error = sum(predictions != validation_data[target]) / float(len(validation_data))
    validation_error_all.append(error)
    print 'Iteration %s, validation error %s' % (n, validation_error_all[n-1])
plt.plot(range(1,31), error_all, '-', linewidth=2.0, label='Training error')
plt.plot(range(1,31), validation_error_all, '-', linewidth=2.0, label='Validation error')
plt.xlabel('# of iterations')
plt.ylabel('Classification error')
plt.legend(loc='best', prop={'size':15})
plt.show()