(Python) Boost decision-tree stumps from scratch (AdaBoost) on the LendingClub dataset to predict whether a loan will be paid off in full or charged off. Evaluate the training and validation error of the boosted ensemble as the number of stumps grows.
#Boosting a decision stump from scratch
import pandas as pd
import numpy as np
#the dataset consists of data from LendingClub, used to predict whether a loan will be paid off in full
#or charged off and possibly go into default
import sframe
loans = sframe.SFrame('lending-club-data.gl/')
#target column 'safe_loans': +1 means a safe loan, -1 a risky loan
loans['safe_loans'] = loans['bad_loans'].apply(lambda x: +1 if x==0 else -1)
loans = loans.remove_column('bad_loans')
#use a small subset of categorical features
features = ['grade', 'term', 'home_ownership', 'emp_length']
target = 'safe_loans'
loans, loans_with_na = loans[features+[target]].dropna_split()
#count the number of rows with missing data
num_rows_with_na = loans_with_na.num_rows()
num_rows = loans.num_rows()
print 'Dropping %s observations; keeping %s' % (num_rows_with_na, num_rows)
#undersample the larger class in order to balance our dataset
safe_loans_raw = loans[loans[target]==1]
risky_loans_raw = loans[loans[target]==-1]
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
safe_loans = safe_loans_raw.sample(percentage, seed=1)
risky_loans = risky_loans_raw
loans_data = risky_loans.append(safe_loans)
print 'Percentage of safe loans :', len(safe_loans)/float(len(loans_data))
print 'Percentage of risky loans :', len(risky_loans)/float(len(loans_data))
print 'Total number of loans :', len(loans_data)
#One-hot encoding of the categorical features
categorical_variables = []
for feat_name, feat_type in zip(loans_data.column_names(), loans_data.column_types()):
    if feat_type == str:
        categorical_variables.append(feat_name)
for feature in categorical_variables:
    loans_data_one_hot_encoded = loans_data[feature].apply(lambda x: {x: 1})
    loans_data_unpacked = loans_data_one_hot_encoded.unpack(column_name_prefix=feature)
    for column in loans_data_unpacked.column_names():
        loans_data_unpacked[column] = loans_data_unpacked[column].fillna(0)
    loans_data.remove_column(feature)
    loans_data.add_columns(loans_data_unpacked)
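#note: unpack() with column_name_prefix=feature names the new indicator columns
#'<feature>.<value>' (e.g. 'grade.A'), and fillna(0) turns the missing entries for
#every other category into explicit zeros, so each categorical column becomes a
#block of 0/1 features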
#split data into training and validation sets
train_data, validation_data = loans_data.random_split(0.8, seed=1)
features = train_data.column_names()
features.remove(target)
#calculate the weight of mistakes for making the 'weighted-majority' prediction
def intermediate_node_weighted_mistakes(labels_in_node, data_weights):
    total_weight_positive = sum(data_weights[labels_in_node==1])
    weighted_mistakes_all_negative = total_weight_positive
    total_weight_negative = sum(data_weights[labels_in_node==-1])
    weighted_mistakes_all_positive = total_weight_negative
    if weighted_mistakes_all_negative < weighted_mistakes_all_positive:
        return (weighted_mistakes_all_negative, -1)
    else:
        return (weighted_mistakes_all_positive, +1)
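#quick sanity check with made-up labels and weights (not part of the original gist):
#predicting all -1 misclassifies the positives (total weight 2.5), predicting all +1
#misclassifies the negatives (total weight 3.0), so the weighted majority is -1
example_labels = sframe.SArray([-1, -1, 1, 1, 1])
example_data_weights = sframe.SArray([1., 2., .5, 1., 1.])
print intermediate_node_weighted_mistakes(example_labels, example_data_weights)   #expect (2.5, -1)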
#pick the best feature to split on, with the weighting of points incorporated
def best_splitting_feature(data, features, target, data_weights):
    best_feature = None
    best_error = float('+inf')
    num_points = float(len(data))
    for feature in features:
        left_split = data[data[feature]==0]
        right_split = data[data[feature]==1]
        left_data_weights = data_weights[data[feature]==0]
        right_data_weights = data_weights[data[feature]==1]
        left_weighted_mistakes, left_class = intermediate_node_weighted_mistakes(left_split[target], left_data_weights)
        right_weighted_mistakes, right_class = intermediate_node_weighted_mistakes(right_split[target], right_data_weights)
        error = (left_weighted_mistakes + right_weighted_mistakes)/sum(data_weights)
        if error < best_error:
            best_feature = feature
            best_error = error
    return best_feature
#create a leaf node that predicts the weighted majority class
def create_leaf(target_values, data_weights):
    leaf = {'splitting_feature' : None, 'is_leaf' : True}
    weighted_error, best_class = intermediate_node_weighted_mistakes(target_values, data_weights)
    leaf['prediction'] = best_class
    return leaf
#build the weighted decision tree recursively
def weighted_decision_tree_create(data, features, target, data_weights, current_depth=1, max_depth=10):
    remaining_features = features[:]
    target_values = data[target]
    print '--------------'
    print 'Subtree, depth = %s (%s data points).' % (current_depth, len(target_values))
    #stopping condition 1: the weighted error is (effectively) zero
    if intermediate_node_weighted_mistakes(target_values, data_weights)[0] <= 1e-15:
        print 'Stopping condition 1 reached.'
        return create_leaf(target_values, data_weights)
    #stopping condition 2: no more features to split on
    if remaining_features == []:
        print 'Stopping condition 2 reached.'
        return create_leaf(target_values, data_weights)
    #stopping condition 3: maximum depth reached
    if current_depth > max_depth:
        print 'Reached maximum depth.'
        return create_leaf(target_values, data_weights)
    splitting_feature = best_splitting_feature(data, features, target, data_weights)
    remaining_features.remove(splitting_feature)
    left_split = data[data[splitting_feature]==0]
    right_split = data[data[splitting_feature]==1]
    left_data_weights = data_weights[data[splitting_feature]==0]
    right_data_weights = data_weights[data[splitting_feature]==1]
    print 'Split on feature %s. (%s, %s)' % (splitting_feature, len(left_split), len(right_split))
    #if the split sends all the data to one side, make a leaf
    if len(left_split)==len(data):
        print 'Creating leaf node.'
        return create_leaf(left_split[target], data_weights)
    if len(right_split)==len(data):
        print 'Creating leaf node.'
        return create_leaf(right_split[target], data_weights)
    #recurse on the left and right subtrees
    left_tree = weighted_decision_tree_create(left_split, remaining_features, target, left_data_weights, \
                                              current_depth+1, max_depth)
    right_tree = weighted_decision_tree_create(right_split, remaining_features, target, \
                                               right_data_weights, current_depth+1, max_depth)
    return {'is_leaf' : False, 'prediction' : None, 'splitting_feature' : splitting_feature, \
            'left' : left_tree, 'right' : right_tree}
#count the nodes in the tree
def count_nodes(tree):
    if tree['is_leaf']:
        return 1
    return 1 + count_nodes(tree['left']) + count_nodes(tree['right'])
#make predictions with the tree
def classify(tree, x, annotate=False):
    if tree['is_leaf']:
        if annotate:
            print 'At leaf, prediction %s' % tree['prediction']
        return tree['prediction']
    else:
        split_feature_value = x[tree['splitting_feature']]
        if annotate:
            print 'Split on %s = %s' % (tree['splitting_feature'], split_feature_value)
        if split_feature_value==0:
            return classify(tree['left'], x, annotate)
        else:
            return classify(tree['right'], x, annotate)
#evaluate the classification error on a dataset
def evaluate_classification_error(tree, data, target):
    prediction = data.apply(lambda x: classify(tree, x))
    return (prediction!=data[target]).sum()/float(len(data))
#test on a weighted subset: give nonzero weight to only the first and last 10 training points
example_data_weights = sframe.SArray([1.0]*10 + [0.0]*(len(train_data)-20) + [1.0]*10)
small_data_decision_tree_subset20 = weighted_decision_tree_create(train_data, features,
                                        target, example_data_weights, max_depth=2)
print evaluate_classification_error(small_data_decision_tree_subset20, train_data, target)
set20_predt = train_data.apply(lambda x: classify(small_data_decision_tree_subset20, x))
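#illustrative check (not in the original gist): trace how the depth-2 tree above routes
#a single training point; annotate=True prints each split taken on the way to a leaf
print classify(small_data_decision_tree_subset20, train_data[0], annotate=True)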
#implement AdaBoost with decision stumps from scratch
from math import log, exp
def adaboost_with_tree_stumps(data, features, target, num_tree_stumps):
    alpha = sframe.SArray([1.0]*len(data))   #data point weights, initially uniform
    weights = []                             #stump weights
    tree_stumps = []
    target_values = data[target]
    for t in range(num_tree_stumps):
        print '========================='
        print 'Adaboost Iteration %d' % t
        #fit a depth-1 weighted tree (a decision stump) to the current point weights
        tree_stump = weighted_decision_tree_create(data, features, target,
                                                   data_weights=alpha, max_depth=1)
        tree_stumps.append(tree_stump)
        predictions = data.apply(lambda x: classify(tree_stump, x))
        is_correct = predictions == target_values
        is_wrong = predictions != target_values
        weighted_error = sum(alpha[is_wrong])/sum(alpha)
        weight = 0.5*log((1-weighted_error)/weighted_error)
        weights.append(weight)
        adjustment = is_correct.apply(lambda correct: exp(-weight) if correct else exp(weight))
        alpha = alpha*adjustment
        alpha = alpha/sum(alpha)
    return weights, tree_stumps
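#note on the update above: a stump's vote is 0.5*log((1-weighted_error)/weighted_error),
#so more accurate stumps get larger weights; each point's weight alpha is multiplied by
#exp(-weight) when the stump classifies it correctly and exp(+weight) when it does not,
#then renormalized, which focuses the next stump on the points still being misclassified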
def predict_adaboost(stump_weights, tree_stumps, data):
    scores = sframe.SArray([0.0]*len(data))
    for i, tree_stump in enumerate(tree_stumps):
        predictions = data.apply(lambda x: classify(tree_stump, x))
        scores = scores + stump_weights[i]*predictions
    return scores.apply(lambda score: +1 if score > 0 else -1)
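#the ensemble prediction is the sign of the weighted vote: a point is labeled +1 when
#sum_t stump_weights[t]*prediction_t(x) is positive and -1 otherwise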
#train 30 stumps, then compute the training error of the first n stumps for n = 1..30
stump_weights, tree_stumps = adaboost_with_tree_stumps(train_data, features, target, 30)
error_all = []
for n in range(1, 31):
    predictions = predict_adaboost(stump_weights[:n], tree_stumps[:n], train_data)
    error = sum(predictions != train_data[target])/float(len(train_data))
    error_all.append(error)
    print 'Iteration %s, training error %s' % (n, error_all[n-1])
#visualize the training error
import matplotlib.pyplot as plt
plt.plot(range(1,31), error_all, '-', linewidth=2.0, label='Training error')
plt.xlabel('# of iterations')
plt.ylabel('Classification error')
plt.show()
#validation error as a function of the number of stumps
validation_error_all = []
for n in range(1, 31):
    predictions = predict_adaboost(stump_weights[:n], tree_stumps[:n], validation_data)
    error = sum(predictions != validation_data[target])/float(len(validation_data))
    validation_error_all.append(error)
    print 'Iteration %s, validation error %s' % (n, validation_error_all[n-1])
plt.plot(range(1,31), error_all, '-', linewidth=2.0, label='Training error')
plt.plot(range(1,31), validation_error_all, '-', linewidth=2.0, label='Validation error')
plt.xlabel('# of iterations')
plt.ylabel('Classification error')
plt.legend(loc='best', prop={'size':15})
plt.show()