tjvananne · May 31, 2022 22:37
diff --git a/aaa_target_shuffling.R b/aaa_target_shuffling.R


 #' Target Shuffling 
 #' Author: Taylor Van Anne
 #' 
 #' Note: this is just my interpretation of what target shuffling means
 #' to me. I think there are a few different ways to actually conduct 
 #' the shuffling, but this is a single approach. 
 #' 
 #' A different approach than what I did here would be to shuffle the
 #' entire target variable before the train/test split. I chose to
 #' instead shuffle only within the test label values (after splitting
 #' the label values into train/test)


 # load libraries
 library(randomForest)
 library(ggplot2)

 # this is the number of iterations of model building
 num_iters <- 100

 # allocating numeric vector space to store our results as we loop
 results <- numeric(num_iters)
 results_shuffled <- numeric(num_iters)

 # setting a random seed for reproducibility
 set.seed(4)


 # begin the loop:
 for(i in 1:num_iters) {
    
    # replicate the iris data
    myiris <- iris
    
    # report out every tenth iteration
    if(i %% 10 == 0) {print(paste0("iteration: ", i))}
    
    # capture labels in character vector, remove label from x-data
    myiris_labels <- myiris$Species
    myiris$Species <- NULL
    
    # identify train/test split
    indx_train <- sample(1:nrow(myiris), floor(.7 * nrow(myiris)))
    indx_test <- setdiff(1:nrow(myiris), indx_train)
    
    # split features (x) into train and test
    myiris_train <- myiris[indx_train, ]
    myiris_test <- myiris[indx_test, ]
    
    # split labels (y) into train, test, and store a shuffled version of test y values as well
    y_train <- myiris_labels[indx_train]
    y_test <- myiris_labels[indx_test]
    y_test_shuffled <- y_test[sample(1:length(y_test), length(y_test))]
    
    # build model based on real y values, then one based on shuffled y values
    myrf <- randomForest(x=myiris_train, y=y_train, xtest=myiris_test, ytest=y_test, keep.forest = TRUE)
    myrf_shuffled <- randomForest(x=myiris_train, y=y_train, xtest=myiris_test, ytest=y_test_shuffled, keep.forest = TRUE)
    
    # make predictions based on real y values, then based on the model that saw shuffled y values
    myrf_preds <- predict(myrf, myiris_test)
    myrf_preds_shuffled <- predict(myrf_shuffled, myiris_test)    
    rm(myrf, myrf_shuffled)
    
    # determine accuracy of each model
    myrf_accuracy <- sum(myrf_preds == y_test, na.rm=T) / length(y_test)
    myrf_accuracy_shuffled <- sum(myrf_preds_shuffled == y_test_shuffled, na.rm=T) / length(y_test_shuffled)
    
    # store the accuracy in the pre-allocated numeric vector space
    results[i] <- myrf_accuracy
    results_shuffled[i] <- myrf_accuracy_shuffled
 }


 # label which results came from which experiment and combine into one data frame
 df_results <- data.frame(accuracy=results, type='y_test')
 df_results_shuffled <- data.frame(accuracy=results_shuffled, type='y_shuffled')
 df_all <- rbind(df_results, df_results_shuffled)


 # plot the density distribution of each group
 ggplot(df_all, aes(x=accuracy, fill=type)) +
    geom_density(alpha=0.4) +
    theme_bw(base_size=16) +
    ggtitle("Model vs Target-Shuffled Model")






diff --git a/zzz_input_shuffling.R b/zzz_input_shuffling.R


 # input shuffling
 # exploring this concept
 # based on this video: https://www.youtube.com/watch?v=OlHW7frH3ug
 # just found another one: https://www.youtube.com/watch?v=tD8HZuWqIQw 


 # libraries:
 library(randomForest)
 library(ggplot2)


 # data set:
 myiris <- iris


 # function definitions:

    shuffle_this_col <- function(param_df, param_col) {
        # shuffles a column in the dataframe
        # example: shuffle_this_col(myiris, 'Sepal.Width')
                
        col_vals <- param_df[, param_col]
        shuf_vals <- col_vals[sample(1:length(col_vals), length(col_vals))]
        param_df[, param_col] <- shuf_vals
        
        return(param_df)
    }

    

 # "config" for experiment
 iter_per_var <- 100
 target_var <- 'Species'
 vars_to_shuf <- setdiff(names(myiris), target_var)
 set.seed(1776)    



 # scores will be model accuracy score, exp_desc is description of the experiment
 scores <- numeric(iter_per_var * length(vars_to_shuf) * 2)
 exp_desc <- character(iter_per_var * length(vars_to_shuf) * 2)

    #' explaining the pre-allocated space above: I want `iter_per_var` number of
    #' iterations per shuffled variable, so those are multiplied. I'd also like
    #' to keep a "base-case" non-shuffled score in there as well, so that is why
    #' I multiply by 2 at the end there.


    
 # for each variable to shuffle, for number of iterations, build models and test
    
 # old school incrementer for "simplicity" (I'm an old soul)
 count <- 1


 for(i in 1:length(vars_to_shuf)) {
    
    # now we only have to do this subset once
    this_var <- vars_to_shuf[i]
    
    
    for(j in 1:iter_per_var) {
        
        if(j %% 10 == 0) {print(paste0("iteration: ", j, " -- for variable: ", this_var))}
        
        # isolate a copy of the data
        myiris <- iris
        
        # capture labels in character vector, remove label from x-data
        myiris_labels <- myiris$Species
        myiris$Species <- NULL
        myiris_shuf <- shuffle_this_col(myiris, this_var)
        
        # identify train/test split
        indx_train <- sample(1:nrow(myiris), floor(.7 * nrow(myiris)))
        indx_test <- setdiff(1:nrow(myiris), indx_train)
        
        # split features (x) into train and test
        myiris_train <- myiris[indx_train, ]
        myiris_train_shuf <- myiris_shuf[indx_train, ]
        myiris_test <- myiris[indx_test, ]
        myiris_test_shuf <- myiris_shuf[indx_test, ]  # <----- should we do this? (try it with and without)
        
        
        # split labels (y) into train, test, and store a shuffled version of test y values as well
        y_train <- myiris_labels[indx_train]
        y_test <- myiris_labels[indx_test]
        
        # build model based on real y values, then one based on shuffled y values
        myrf <- randomForest(x=myiris_train, y=y_train, xtest=myiris_test, ytest=y_test, keep.forest = TRUE)
        myrf_shuffled <- randomForest(x=myiris_train_shuf, y=y_train, xtest=myiris_test_shuf, ytest=y_test, keep.forest = TRUE)
        
        # make predictions based on real y values, then based on the model that saw shuffled y values
        myrf_preds <- predict(myrf, myiris_test)
        myrf_preds_shuffled <- predict(myrf_shuffled, myiris_test_shuf)    
        rm(myrf, myrf_shuffled)
        
        # determine accuracy of each model
        myrf_accuracy <- sum(myrf_preds == y_test, na.rm=T) / length(y_test)
        myrf_accuracy_shuffled <- sum(myrf_preds_shuffled == y_test, na.rm=T) / length(y_test)
        
        # capture the "baseline" scores
        scores[count] <- myrf_accuracy
        exp_desc[count] <- 'baseline'
        
        
        # mid loop increment -- it makes sense, trust me.
        count <- count + 1
        
        # capture the "shuffled scores"
        scores[count] <- myrf_accuracy_shuffled
        exp_desc[count] <- this_var
        
        
        # end of loop increment
        count <- count + 1
        
        
    } # end inner for (j)

 } # end outter for (i)

 results_all <- data.frame(scores=scores, type=exp_desc)

 ggplot(data=results_all, aes(x=scores, fill=type)) +
    geom_density(alpha=0.3)


	#' Target Shuffling
	#' Author: Taylor Van Anne
	#'
	#' Note: this is just my interpretation of what target shuffling means
	#' to me. I think there are a few different ways to actually conduct
	#' the shuffling, but this is a single approach.
	#'
	#' A different approach than what I did here would be to shuffle the
	#' entire target variable before the train/test split. I chose to
	#' instead shuffle only within the test label values (after splitting
	#' the label values into train/test)


	# load libraries
	library(randomForest)
	library(ggplot2)

	# this is the number of iterations of model building
	num_iters <- 100

	# allocating numeric vector space to store our results as we loop
	results <- numeric(num_iters)
	results_shuffled <- numeric(num_iters)

	# setting a random seed for reproducibility
	set.seed(4)


	# begin the loop:
	for(i in 1:num_iters) {

	# replicate the iris data
	myiris <- iris

	# report out every tenth iteration
	if(i %% 10 == 0) {print(paste0("iteration: ", i))}

	# capture labels in character vector, remove label from x-data
	myiris_labels <- myiris$Species
	myiris$Species <- NULL

	# identify train/test split
	indx_train <- sample(1:nrow(myiris), floor(.7 * nrow(myiris)))
	indx_test <- setdiff(1:nrow(myiris), indx_train)

	# split features (x) into train and test
	myiris_train <- myiris[indx_train, ]
	myiris_test <- myiris[indx_test, ]

	# split labels (y) into train, test, and store a shuffled version of test y values as well
	y_train <- myiris_labels[indx_train]
	y_test <- myiris_labels[indx_test]
	y_test_shuffled <- y_test[sample(1:length(y_test), length(y_test))]

	# build model based on real y values, then one based on shuffled y values
	myrf <- randomForest(x=myiris_train, y=y_train, xtest=myiris_test, ytest=y_test, keep.forest = TRUE)
	myrf_shuffled <- randomForest(x=myiris_train, y=y_train, xtest=myiris_test, ytest=y_test_shuffled, keep.forest = TRUE)

	# make predictions based on real y values, then based on the model that saw shuffled y values
	myrf_preds <- predict(myrf, myiris_test)
	myrf_preds_shuffled <- predict(myrf_shuffled, myiris_test)
	rm(myrf, myrf_shuffled)

	# determine accuracy of each model
	myrf_accuracy <- sum(myrf_preds == y_test, na.rm=T) / length(y_test)
	myrf_accuracy_shuffled <- sum(myrf_preds_shuffled == y_test_shuffled, na.rm=T) / length(y_test_shuffled)

	# store the accuracy in the pre-allocated numeric vector space
	results[i] <- myrf_accuracy
	results_shuffled[i] <- myrf_accuracy_shuffled
	}


	# label which results came from which experiment and combine into one data frame
	df_results <- data.frame(accuracy=results, type='y_test')
	df_results_shuffled <- data.frame(accuracy=results_shuffled, type='y_shuffled')
	df_all <- rbind(df_results, df_results_shuffled)


	# plot the density distribution of each group
	ggplot(df_all, aes(x=accuracy, fill=type)) +
	geom_density(alpha=0.4) +
	theme_bw(base_size=16) +
	ggtitle("Model vs Target-Shuffled Model")


	# input shuffling
	# exploring this concept
	# based on this video: https://www.youtube.com/watch?v=OlHW7frH3ug
	# just found another one: https://www.youtube.com/watch?v=tD8HZuWqIQw


	# libraries:
	library(randomForest)
	library(ggplot2)


	# data set:
	myiris <- iris


	# function definitions:

	shuffle_this_col <- function(param_df, param_col) {
	# shuffles a column in the dataframe
	# example: shuffle_this_col(myiris, 'Sepal.Width')

	col_vals <- param_df[, param_col]
	shuf_vals <- col_vals[sample(1:length(col_vals), length(col_vals))]
	param_df[, param_col] <- shuf_vals

	return(param_df)
	}



	# "config" for experiment
	iter_per_var <- 100
	target_var <- 'Species'
	vars_to_shuf <- setdiff(names(myiris), target_var)
	set.seed(1776)



	# scores will be model accuracy score, exp_desc is description of the experiment
	scores <- numeric(iter_per_var * length(vars_to_shuf) * 2)
	exp_desc <- character(iter_per_var * length(vars_to_shuf) * 2)

	#' explaining the pre-allocated space above: I want `iter_per_var` number of
	#' iterations per shuffled variable, so those are multiplied. I'd also like
	#' to keep a "base-case" non-shuffled score in there as well, so that is why
	#' I multiply by 2 at the end there.



	# for each variable to shuffle, for number of iterations, build models and test

	# old school incrementer for "simplicity" (I'm an old soul)
	count <- 1


	for(i in 1:length(vars_to_shuf)) {

	# now we only have to do this subset once
	this_var <- vars_to_shuf[i]


	for(j in 1:iter_per_var) {

	if(j %% 10 == 0) {print(paste0("iteration: ", j, " -- for variable: ", this_var))}

	# isolate a copy of the data
	myiris <- iris

	# capture labels in character vector, remove label from x-data
	myiris_labels <- myiris$Species
	myiris$Species <- NULL
	myiris_shuf <- shuffle_this_col(myiris, this_var)

	# identify train/test split
	indx_train <- sample(1:nrow(myiris), floor(.7 * nrow(myiris)))
	indx_test <- setdiff(1:nrow(myiris), indx_train)

	# split features (x) into train and test
	myiris_train <- myiris[indx_train, ]
	myiris_train_shuf <- myiris_shuf[indx_train, ]
	myiris_test <- myiris[indx_test, ]
	myiris_test_shuf <- myiris_shuf[indx_test, ] # <----- should we do this? (try it with and without)


	# split labels (y) into train, test, and store a shuffled version of test y values as well
	y_train <- myiris_labels[indx_train]
	y_test <- myiris_labels[indx_test]

	# build model based on real y values, then one based on shuffled y values
	myrf <- randomForest(x=myiris_train, y=y_train, xtest=myiris_test, ytest=y_test, keep.forest = TRUE)
	myrf_shuffled <- randomForest(x=myiris_train_shuf, y=y_train, xtest=myiris_test_shuf, ytest=y_test, keep.forest = TRUE)

	# make predictions based on real y values, then based on the model that saw shuffled y values
	myrf_preds <- predict(myrf, myiris_test)
	myrf_preds_shuffled <- predict(myrf_shuffled, myiris_test_shuf)
	rm(myrf, myrf_shuffled)

	# determine accuracy of each model
	myrf_accuracy <- sum(myrf_preds == y_test, na.rm=T) / length(y_test)
	myrf_accuracy_shuffled <- sum(myrf_preds_shuffled == y_test, na.rm=T) / length(y_test)

	# capture the "baseline" scores
	scores[count] <- myrf_accuracy
	exp_desc[count] <- 'baseline'


	# mid loop increment -- it makes sense, trust me.
	count <- count + 1

	# capture the "shuffled scores"
	scores[count] <- myrf_accuracy_shuffled
	exp_desc[count] <- this_var


	# end of loop increment
	count <- count + 1


	} # end inner for (j)

	} # end outter for (i)

	results_all <- data.frame(scores=scores, type=exp_desc)

	ggplot(data=results_all, aes(x=scores, fill=type)) +
	geom_density(alpha=0.3)