(Python) Boost decision-tree stumps from scratch (AdaBoost) on the LendingClub dataset to predict whether a loan will be paid off in full or charged off. Evaluate the training and validation error of the boosted ensemble as the number of stumps grows.
#Boosting a decision stump from scratch
import pandas as pd
import numpy as np
#the dataset consists of data from LendingClub, used to predict whether a loan will be paid off in full
#or charged off and possibly go into default
import sframe
loans = sframe.SFrame('lending-club-data.gl/')
#target column 'safe_loans': +1 means a safe loan, -1 a risky loan
loans['safe_loans'] = loans['bad_loans'].apply(lambda x: +1 if x==0 else -1)
loans = loans.remove_column('bad_loans')
#use a small subset of categorical features
features = ['grade', 'term', 'home_ownership', 'emp_length']
target = 'safe_loans'
loans, loans_with_na = loans[features+[target]].dropna_split()
#count the number of rows with missing data
num_rows_with_na = loans_with_na.num_rows()
num_rows = loans.num_rows()
print 'Dropping %s observations; keeping %s' % (num_rows_with_na, num_rows)
#undersample the larger class in order to balance our dataset
safe_loans_raw = loans[loans[target]==1]
risky_loans_raw = loans[loans[target]==-1]
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
safe_loans = safe_loans_raw.sample(percentage, seed=1)
risky_loans = risky_loans_raw
loans_data = risky_loans.append(safe_loans)
print 'Percentage of safe loans :', len(safe_loans)/float(len(loans_data))
print 'Percentage of risky loans :', len(risky_loans)/float(len(loans_data))
print 'Total number of loans :', len(loans_data)
#One-hot encoding of the categorical features
categorical_variables = []
for feat_name, feat_type in zip(loans_data.column_names(), loans_data.column_types()):
    if feat_type == str:
        categorical_variables.append(feat_name)
for feature in categorical_variables:
    loans_data_one_hot_encoded = loans_data[feature].apply(lambda x: {x: 1})
    loans_data_unpacked = loans_data_one_hot_encoded.unpack(column_name_prefix=feature)
    for column in loans_data_unpacked.column_names():
        loans_data_unpacked[column] = loans_data_unpacked[column].fillna(0)
    loans_data.remove_column(feature)
    loans_data.add_columns(loans_data_unpacked)
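#note: unpack() with column_name_prefix=feature names the new indicator columns
#'<feature>.<value>' (e.g. 'grade.A'), and fillna(0) turns the missing entries for
#every other category into explicit zeros, so each categorical column becomes a
#block of 0/1 features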
#split data into training and validation sets
train_data, validation_data = loans_data.random_split(0.8, seed=1)
features = train_data.column_names()
features.remove(target)
#calculate the weight of mistakes for making the 'weighted-majority' prediction
def intermediate_node_weighted_mistakes(labels_in_node, data_weights):
    total_weight_positive = sum(data_weights[labels_in_node==1])
    weighted_mistakes_all_negative = total_weight_positive
    total_weight_negative = sum(data_weights[labels_in_node==-1])
    weighted_mistakes_all_positive = total_weight_negative
    if weighted_mistakes_all_negative < weighted_mistakes_all_positive:
        return (weighted_mistakes_all_negative, -1)
    else:
        return (weighted_mistakes_all_positive, +1)
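#quick sanity check with made-up labels and weights (not part of the original gist):
#predicting all -1 misclassifies the positives (total weight 2.5), predicting all +1
#misclassifies the negatives (total weight 3.0), so the weighted majority is -1
example_labels = sframe.SArray([-1, -1, 1, 1, 1])
example_data_weights = sframe.SArray([1., 2., .5, 1., 1.])
print intermediate_node_weighted_mistakes(example_labels, example_data_weights)   #expect (2.5, -1)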
#pick the best feature to split on, with the weighting of points incorporated
def best_splitting_feature(data, features, target, data_weights):
    best_feature = None
    best_error = float('+inf')
    num_points = float(len(data))
    for feature in features:
        left_split = data[data[feature]==0]
        right_split = data[data[feature]==1]
        left_data_weights = data_weights[data[feature]==0]
        right_data_weights = data_weights[data[feature]==1]
        left_weighted_mistakes, left_class = intermediate_node_weighted_mistakes(left_split[target], left_data_weights)
        right_weighted_mistakes, right_class = intermediate_node_weighted_mistakes(right_split[target], right_data_weights)
        error = (left_weighted_mistakes + right_weighted_mistakes)/sum(data_weights)
        if error < best_error:
            best_feature = feature
            best_error = error
    return best_feature
#create a leaf node that predicts the weighted majority class
def create_leaf(target_values, data_weights):
    leaf = {'splitting_feature' : None, 'is_leaf' : True}
    weighted_error, best_class = intermediate_node_weighted_mistakes(target_values, data_weights)
    leaf['prediction'] = best_class
    return leaf
#build the weighted decision tree recursively
def weighted_decision_tree_create(data, features, target, data_weights, current_depth=1, max_depth=10):
    remaining_features = features[:]
    target_values = data[target]
    print '--------------'
    print 'Subtree, depth = %s (%s data points).' % (current_depth, len(target_values))
    #stopping condition 1: the weighted error is (effectively) zero
    if intermediate_node_weighted_mistakes(target_values, data_weights)[0] <= 1e-15:
        print 'Stopping condition 1 reached.'
        return create_leaf(target_values, data_weights)
    #stopping condition 2: no more features to split on
    if remaining_features == []:
        print 'Stopping condition 2 reached.'
        return create_leaf(target_values, data_weights)
    #stopping condition 3: maximum depth reached
    if current_depth > max_depth:
        print 'Reached maximum depth.'
        return create_leaf(target_values, data_weights)
    splitting_feature = best_splitting_feature(data, features, target, data_weights)
    remaining_features.remove(splitting_feature)
    left_split = data[data[splitting_feature]==0]
    right_split = data[data[splitting_feature]==1]
    left_data_weights = data_weights[data[splitting_feature]==0]
    right_data_weights = data_weights[data[splitting_feature]==1]
    print 'Split on feature %s. (%s, %s)' % (splitting_feature, len(left_split), len(right_split))
    #if the split sends all the data to one side, make a leaf
    if len(left_split)==len(data):
        print 'Creating leaf node.'
        return create_leaf(left_split[target], data_weights)
    if len(right_split)==len(data):
        print 'Creating leaf node.'
        return create_leaf(right_split[target], data_weights)
    #recurse on the left and right subtrees
    left_tree = weighted_decision_tree_create(left_split, remaining_features, target, left_data_weights, \
                                              current_depth+1, max_depth)
    right_tree = weighted_decision_tree_create(right_split, remaining_features, target, \
                                               right_data_weights, current_depth+1, max_depth)
    return {'is_leaf' : False, 'prediction' : None, 'splitting_feature' : splitting_feature, \
            'left' : left_tree, 'right' : right_tree}
#count the nodes in the tree
def count_nodes(tree):
    if tree['is_leaf']:
        return 1
    return 1 + count_nodes(tree['left']) + count_nodes(tree['right'])
#make predictions with the tree
def classify(tree, x, annotate=False):
    if tree['is_leaf']:
        if annotate:
            print 'At leaf, prediction %s' % tree['prediction']
        return tree['prediction']
    else:
        split_feature_value = x[tree['splitting_feature']]
        if annotate:
            print 'Split on %s = %s' % (tree['splitting_feature'], split_feature_value)
        if split_feature_value==0:
            return classify(tree['left'], x, annotate)
        else:
            return classify(tree['right'], x, annotate)
#evaluate the classification error on a dataset
def evaluate_classification_error(tree, data, target):
    prediction = data.apply(lambda x: classify(tree, x))
    return (prediction!=data[target]).sum()/float(len(data))
#test on a weighted subset: give nonzero weight to only the first and last 10 training points
example_data_weights = sframe.SArray([1.0]*10 + [0.0]*(len(train_data)-20) + [1.0]*10)
small_data_decision_tree_subset20 = weighted_decision_tree_create(train_data, features,
                                        target, example_data_weights, max_depth=2)
print evaluate_classification_error(small_data_decision_tree_subset20, train_data, target)
set20_predt = train_data.apply(lambda x: classify(small_data_decision_tree_subset20, x))
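#illustrative check (not in the original gist): trace how the depth-2 tree above routes
#a single training point; annotate=True prints each split taken on the way to a leaf
print classify(small_data_decision_tree_subset20, train_data[0], annotate=True)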
#implement AdaBoost with decision stumps from scratch
from math import log, exp
def adaboost_with_tree_stumps(data, features, target, num_tree_stumps):
    alpha = sframe.SArray([1.0]*len(data))   #data point weights, initially uniform
    weights = []                             #stump weights
    tree_stumps = []
    target_values = data[target]
    for t in range(num_tree_stumps):
        print '========================='
        print 'Adaboost Iteration %d' % t
        #fit a depth-1 weighted tree (a decision stump) to the current point weights
        tree_stump = weighted_decision_tree_create(data, features, target,
                                                   data_weights=alpha, max_depth=1)
        tree_stumps.append(tree_stump)
        predictions = data.apply(lambda x: classify(tree_stump, x))
        is_correct = predictions == target_values
        is_wrong = predictions != target_values
        weighted_error = sum(alpha[is_wrong])/sum(alpha)
        weight = 0.5*log((1-weighted_error)/weighted_error)
        weights.append(weight)
        adjustment = is_correct.apply(lambda correct: exp(-weight) if correct else exp(weight))
        alpha = alpha*adjustment
        alpha = alpha/sum(alpha)
    return weights, tree_stumps
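#note on the update above: a stump's vote is 0.5*log((1-weighted_error)/weighted_error),
#so more accurate stumps get larger weights; each point's weight alpha is multiplied by
#exp(-weight) when the stump classifies it correctly and exp(+weight) when it does not,
#then renormalized, which focuses the next stump on the points still being misclassified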
def predict_adaboost(stump_weights, tree_stumps, data):
    scores = sframe.SArray([0.0]*len(data))
    for i, tree_stump in enumerate(tree_stumps):
        predictions = data.apply(lambda x: classify(tree_stump, x))
        scores = scores + stump_weights[i]*predictions
    return scores.apply(lambda score: +1 if score > 0 else -1)
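#the ensemble prediction is the sign of the weighted vote: a point is labeled +1 when
#sum_t stump_weights[t]*prediction_t(x) is positive and -1 otherwise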
#train 30 stumps, then compute the training error of the first n stumps for n = 1..30
stump_weights, tree_stumps = adaboost_with_tree_stumps(train_data, features, target, 30)
error_all = []
for n in range(1, 31):
    predictions = predict_adaboost(stump_weights[:n], tree_stumps[:n], train_data)
    error = sum(predictions != train_data[target])/float(len(train_data))
    error_all.append(error)
    print 'Iteration %s, training error %s' % (n, error_all[n-1])
#visualize the training error
import matplotlib.pyplot as plt
plt.plot(range(1,31), error_all, '-', linewidth=2.0, label='Training error')
plt.xlabel('# of iterations')
plt.ylabel('Classification error')
plt.show()
#validation error as a function of the number of stumps
validation_error_all = []
for n in range(1, 31):
    predictions = predict_adaboost(stump_weights[:n], tree_stumps[:n], validation_data)
    error = sum(predictions != validation_data[target])/float(len(validation_data))
    validation_error_all.append(error)
    print 'Iteration %s, validation error %s' % (n, validation_error_all[n-1])
plt.plot(range(1,31), error_all, '-', linewidth=2.0, label='Training error')
plt.plot(range(1,31), validation_error_all, '-', linewidth=2.0, label='Validation error')
plt.xlabel('# of iterations')
plt.ylabel('Classification error')
plt.legend(loc='best', prop={'size':15})
plt.show()