Python implementation of a random forest model using NumPy
import numpy as np
import pandas as pd
# r2_score is assumed to be available; sklearn's implementation is used here
from sklearn.metrics import r2_score


class RandomForest():
    """ Python implementation of a random forest regressor.

        Relies on a separate DecisionTree class (assumed to be defined
        elsewhere) to fit the individual trees. Expects x as a pandas
        DataFrame and y as a NumPy array.
    """
    def __init__(self, x, y, num_trees, sample_size, feature_proportion=1.0,
                 min_leaf=5, bootstrap=False, random_seed=12):
        np.random.seed(random_seed)
        self.x = x
        self.y = y
        self.num_trees = num_trees
        self.sample_size = sample_size
        self.feature_proportion = feature_proportion
        self.min_leaf = min_leaf
        self.bootstrap = bootstrap
        self.trees = [self.create_tree(bootstrap) for i in range(num_trees)]

    def create_tree(self, bootstrap=False):
        """ Form an individual decision tree """
        # obtain a random sample of indices and identify oob samples
        idxs = np.random.permutation(self.y.shape[0])[:self.sample_size]
        oob_idxs = None
        # if bootstrapping is enabled, draw a bootstrap sample and oob indices
        if bootstrap:
            idxs, oob_idxs = self.bootstrap_samples(idxs)
        return DecisionTree(self.x.iloc[idxs], self.y[idxs],
                            feat_proportion=self.feature_proportion,
                            idxs=np.array(range(self.sample_size)),
                            oob_idxs=oob_idxs,
                            min_leaf=self.min_leaf)

    def predict(self, x):
        """ Return the mean of predictions across all trees """
        # call the predict function of each DecisionTree and average
        return np.mean([t.predict(x) for t in self.trees], axis=0)

    def oob_score(self):
        """ Calculate and return each tree's OOB R2 score and the average
            OOB score across all decision trees """
        tree_oob_scores = []
        # find the oob score for each tree and append it to the results
        for tree in self.trees:
            # find the current tree's oob labels and predictions
            tree_oob_labels = self.y[tree.oob_idxs]
            tree_oob_preds = tree.predict(self.x.iloc[tree.oob_idxs].values)
            # calculate the R2 score of the oob predictions for this tree
            tree_oob_r2 = r2_score(tree_oob_labels, tree_oob_preds)
            tree_oob_scores.append(tree_oob_r2)
        tree_oob_scores = np.array(tree_oob_scores)
        # average the oob scores across all trees
        avg_oob_score = np.mean(tree_oob_scores)
        return tree_oob_scores, avg_oob_score

    def bootstrap_samples(self, idxs):
        """ Return bootstrapped sample indices and the out-of-bag indices """
        # sample (with replacement) from idxs to form the bootstrap sample
        sample_idxs = np.random.randint(0, len(idxs), size=self.sample_size)
        bootstrap_idxs = idxs[sample_idxs]
        # out-of-bag (OOB) samples are those never drawn into the bootstrap
        i = np.arange(self.sample_size)
        oob_i = np.array([ind for ind in i if ind not in sample_idxs])
        oob_idxs = idxs[oob_i]
        return bootstrap_idxs, oob_idxs

    def feature_importances(self):
        """ Find permutation feature importances by shuffling each feature
            and measuring the drop in score relative to the baseline. """
        # baseline r2 score - all shuffled features are compared against this
        baseline_score = r2_score(self.y, self.predict(self.x.values))
        # dictionary to store feature importances
        feat_importances = {}
        columns = self.x.columns
        # iterate through each column, shuffle it and re-score the predictions
        for feat_column in columns:
            # shuffle only the current column (assign the permutation back to
            # the copy so the change is reflected regardless of dtype)
            temp_df = self.x.copy()
            temp_df[feat_column] = np.random.permutation(
                temp_df[feat_column].values)
            # find the new R2 score with the shuffled feature
            shuffled_score = r2_score(self.y, self.predict(temp_df.values))
            # the relative drop in score represents the feature's importance
            feat_score = (baseline_score - shuffled_score) / baseline_score
            feat_importances[feat_column] = feat_score
        importance_df = pd.DataFrame.from_dict(feat_importances,
                                               orient='index',
                                               columns=['Importance'])
        return importance_df.sort_values('Importance', ascending=False)
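A minimal usage sketch is shown below. It assumes a compatible DecisionTree class has been defined, and that x_train / x_valid (pandas DataFrames) and y_train (NumPy array) are placeholder names for your own data:

# minimal usage sketch - names x_train, x_valid and y_train are placeholders
rf = RandomForest(x_train, y_train, num_trees=20, sample_size=500,
                  feature_proportion=0.6, min_leaf=5, bootstrap=True)

preds = rf.predict(x_valid.values)       # mean prediction across all trees
tree_scores, avg_oob = rf.oob_score()    # per-tree and average OOB R2
                                         # (only meaningful with bootstrap=True)
importances = rf.feature_importances()   # permutation importances, sorted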