Last active
May 31, 2022 22:37
-
-
Save tjvananne/5b6444ad4c44240d2eaba0179b1dfb27 to your computer and use it in GitHub Desktop.
Target Shuffling in R - iris data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Target Shuffling | |
#' Author: Taylor Van Anne | |
#' | |
#' Note: this is just my interpretation of what target shuffling means | |
#' to me. I think there are a few different ways to actually conduct | |
#' the shuffling, but this is a single approach. | |
#' | |
#' A different approach than what I did here would be to shuffle the | |
#' entire target variable before the train/test split. I chose to | |
#' instead shuffle only within the test label values (after splitting | |
#' the label values into train/test) | |
# load libraries | |
library(randomForest) | |
library(ggplot2) | |
# this is the number of iterations of model building | |
num_iters <- 100 | |
# allocating numeric vector space to store our results as we loop | |
results <- numeric(num_iters) | |
results_shuffled <- numeric(num_iters) | |
# setting a random seed for reproducibility | |
set.seed(4) | |
# begin the loop: | |
for(i in 1:num_iters) { | |
# replicate the iris data | |
myiris <- iris | |
# report out every tenth iteration | |
if(i %% 10 == 0) {print(paste0("iteration: ", i))} | |
# capture labels in character vector, remove label from x-data | |
myiris_labels <- myiris$Species | |
myiris$Species <- NULL | |
# identify train/test split | |
indx_train <- sample(1:nrow(myiris), floor(.7 * nrow(myiris))) | |
indx_test <- setdiff(1:nrow(myiris), indx_train) | |
# split features (x) into train and test | |
myiris_train <- myiris[indx_train, ] | |
myiris_test <- myiris[indx_test, ] | |
# split labels (y) into train, test, and store a shuffled version of test y values as well | |
y_train <- myiris_labels[indx_train] | |
y_test <- myiris_labels[indx_test] | |
y_test_shuffled <- y_test[sample(1:length(y_test), length(y_test))] | |
# build model based on real y values, then one based on shuffled y values | |
myrf <- randomForest(x=myiris_train, y=y_train, xtest=myiris_test, ytest=y_test, keep.forest = TRUE) | |
myrf_shuffled <- randomForest(x=myiris_train, y=y_train, xtest=myiris_test, ytest=y_test_shuffled, keep.forest = TRUE) | |
# make predictions based on real y values, then based on the model that saw shuffled y values | |
myrf_preds <- predict(myrf, myiris_test) | |
myrf_preds_shuffled <- predict(myrf_shuffled, myiris_test) | |
rm(myrf, myrf_shuffled) | |
# determine accuracy of each model | |
myrf_accuracy <- sum(myrf_preds == y_test, na.rm=T) / length(y_test) | |
myrf_accuracy_shuffled <- sum(myrf_preds_shuffled == y_test_shuffled, na.rm=T) / length(y_test_shuffled) | |
# store the accuracy in the pre-allocated numeric vector space | |
results[i] <- myrf_accuracy | |
results_shuffled[i] <- myrf_accuracy_shuffled | |
} | |
# label which results came from which experiment and combine into one data frame | |
df_results <- data.frame(accuracy=results, type='y_test') | |
df_results_shuffled <- data.frame(accuracy=results_shuffled, type='y_shuffled') | |
df_all <- rbind(df_results, df_results_shuffled) | |
# plot the density distribution of each group | |
ggplot(df_all, aes(x=accuracy, fill=type)) + | |
geom_density(alpha=0.4) + | |
theme_bw(base_size=16) + | |
ggtitle("Model vs Target-Shuffled Model") | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# input shuffling | |
# exploring this concept | |
# based on this video: https://www.youtube.com/watch?v=OlHW7frH3ug | |
# just found another one: https://www.youtube.com/watch?v=tD8HZuWqIQw | |
# libraries: | |
library(randomForest) | |
library(ggplot2) | |
# data set: | |
myiris <- iris | |
# function definitions: | |
shuffle_this_col <- function(param_df, param_col) { | |
# shuffles a column in the dataframe | |
# example: shuffle_this_col(myiris, 'Sepal.Width') | |
col_vals <- param_df[, param_col] | |
shuf_vals <- col_vals[sample(1:length(col_vals), length(col_vals))] | |
param_df[, param_col] <- shuf_vals | |
return(param_df) | |
} | |
# "config" for experiment | |
iter_per_var <- 100 | |
target_var <- 'Species' | |
vars_to_shuf <- setdiff(names(myiris), target_var) | |
set.seed(1776) | |
# scores will be model accuracy score, exp_desc is description of the experiment | |
scores <- numeric(iter_per_var * length(vars_to_shuf) * 2) | |
exp_desc <- character(iter_per_var * length(vars_to_shuf) * 2) | |
#' explaining the pre-allocated space above: I want `iter_per_var` number of | |
#' iterations per shuffled variable, so those are multiplied. I'd also like | |
#' to keep a "base-case" non-shuffled score in there as well, so that is why | |
#' I multiply by 2 at the end there. | |
# for each variable to shuffle, for number of iterations, build models and test | |
# old school incrementer for "simplicity" (I'm an old soul) | |
count <- 1 | |
for(i in 1:length(vars_to_shuf)) { | |
# now we only have to do this subset once | |
this_var <- vars_to_shuf[i] | |
for(j in 1:iter_per_var) { | |
if(j %% 10 == 0) {print(paste0("iteration: ", j, " -- for variable: ", this_var))} | |
# isolate a copy of the data | |
myiris <- iris | |
# capture labels in character vector, remove label from x-data | |
myiris_labels <- myiris$Species | |
myiris$Species <- NULL | |
myiris_shuf <- shuffle_this_col(myiris, this_var) | |
# identify train/test split | |
indx_train <- sample(1:nrow(myiris), floor(.7 * nrow(myiris))) | |
indx_test <- setdiff(1:nrow(myiris), indx_train) | |
# split features (x) into train and test | |
myiris_train <- myiris[indx_train, ] | |
myiris_train_shuf <- myiris_shuf[indx_train, ] | |
myiris_test <- myiris[indx_test, ] | |
myiris_test_shuf <- myiris_shuf[indx_test, ] # <----- should we do this? (try it with and without) | |
# split labels (y) into train, test, and store a shuffled version of test y values as well | |
y_train <- myiris_labels[indx_train] | |
y_test <- myiris_labels[indx_test] | |
# build model based on real y values, then one based on shuffled y values | |
myrf <- randomForest(x=myiris_train, y=y_train, xtest=myiris_test, ytest=y_test, keep.forest = TRUE) | |
myrf_shuffled <- randomForest(x=myiris_train_shuf, y=y_train, xtest=myiris_test_shuf, ytest=y_test, keep.forest = TRUE) | |
# make predictions based on real y values, then based on the model that saw shuffled y values | |
myrf_preds <- predict(myrf, myiris_test) | |
myrf_preds_shuffled <- predict(myrf_shuffled, myiris_test_shuf) | |
rm(myrf, myrf_shuffled) | |
# determine accuracy of each model | |
myrf_accuracy <- sum(myrf_preds == y_test, na.rm=T) / length(y_test) | |
myrf_accuracy_shuffled <- sum(myrf_preds_shuffled == y_test, na.rm=T) / length(y_test) | |
# capture the "baseline" scores | |
scores[count] <- myrf_accuracy | |
exp_desc[count] <- 'baseline' | |
# mid loop increment -- it makes sense, trust me. | |
count <- count + 1 | |
# capture the "shuffled scores" | |
scores[count] <- myrf_accuracy_shuffled | |
exp_desc[count] <- this_var | |
# end of loop increment | |
count <- count + 1 | |
} # end inner for (j) | |
} # end outter for (i) | |
results_all <- data.frame(scores=scores, type=exp_desc) | |
ggplot(data=results_all, aes(x=scores, fill=type)) + | |
geom_density(alpha=0.3) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment