adaBoost implementation in R - by yl3394
#=========================================================================
# STAT W4400
# Homework 03
# yl3394, Yanjin Li
# Problem 1: AdaBoost
#=========================================================================
# In this problem I will implement the AdaBoost algorithm in R. The
# algorithm requires two auxiliary functions: one to train and one to
# evaluate the weak learner. A third function then implements the resulting
# boosting classifier. Here, we use decision stumps as our weak learners.
#=========================================================================
#' Trying to make this obvious, but I borrowed this code from this repo right here:
#' https://github.com/yl3394/adaboost-implementation-in-r
#' This is one of the cleanest implementations of AdaBoost in R that I've seen.
#' Thanks to yl3394 for this!
# Also, this link is a good article on the mathematical theory of boosting:
# https://blog.statsbot.co/ensemble-learning-d1dcd548e936
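#
# For reference, the update rules this code follows (the discrete AdaBoost
# form, matching the functions below) are:
#   err_b   = sum_i w_i * 1[c_b(x_i) != y_i] / sum_i w_i
#   alpha_b = log((1 - err_b) / err_b)
#   w_i    <- w_i * exp(alpha_b * 1[c_b(x_i) != y_i])
# and the aggregated classifier is sign( sum_b alpha_b * c_b(x) ).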
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 1. Decision Stump: Weak Learner Training Routine
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# w and y are vectors containing the weights and class labels; the output
# `pars` is a list containing the parameters that specify the resulting
# classifier. The decision stump is trained by between-axis and within-axis
# comparisons.
train <- function(X, w, y){
  n <- nrow(X)
  p <- ncol(X)
  elist <- matrix(nrow = p)
  mlist <- matrix(nrow = p)
  tlist <- matrix(nrow = p)
  for(j in 1:p){
    # Sort data points along dimension j
    indx <- order(X[, j])
    x_j <- X[indx, j]
    # Using a cumulative sum, count the weight when progressively shifting
    # the threshold to the right
    w_cum <- cumsum(w[indx] * y[indx])
    # Handle multiple occurrences of the same x_j value: the threshold must
    # not lie between elements of the same value, so only the last
    # occurrence of each value is a valid split point
    w_cum[duplicated(x_j, fromLast = TRUE)] <- NA
    # Find the optimal threshold and classify accordingly
    # (na.rm = TRUE drops the invalidated duplicate positions)
    m <- max(abs(w_cum), na.rm = TRUE)
    # print(m)   # uncomment for debugging
    maxIndx <- min(which(abs(w_cum) == m))
    mlist[j] <- (w_cum[maxIndx] < 0)*2 - 1
    tlist[j] <- x_j[maxIndx]
    # (... > ...) gives 1 if true, otherwise 0; note the predictions are
    # computed in the original (unsorted) order so they line up with w and y
    c <- ((X[, j] > tlist[j])*2 - 1) * mlist[j]
    elist[j] <- w %*% (c != y)
  }
  m <- min(elist)
  j_star <- min(which(elist == m))
  pars <- list(j = j_star, theta = tlist[j_star], mode = mlist[j_star])
  return(pars)
}
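# A minimal sanity-check sketch (not part of the original gist; the toy data
# below is purely illustrative): on four 1-D points with uniform weights,
# the stump should split at x = 2 and reproduce the labels exactly.
X_toy <- matrix(c(1, 2, 3, 4), ncol = 1)
y_toy <- c(-1, -1, 1, 1)
w_toy <- rep(1/4, 4)
pars_toy <- train(X_toy, w_toy, y_toy)   # expect j = 1, theta = 2, mode = 1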
classify <- function(X, pars){
  j <- pars$j
  t <- pars$theta
  m <- pars$mode
  # The label is `mode` to the right of the threshold and `-mode` on and to
  # the left of it, matching the splitting rule used in train()
  pred <- ifelse(X[, j] > t, m, -m)
  return(pred)
}
agg_class <- function(X, alpha, allPars){
  pred_agg <- numeric(nrow(X))
  M <- length(alpha)
  for(m in 1:M){
    # [[m]] extracts the m-th parameter list itself ([m] would return a
    # length-one sublist, which classify() cannot use)
    pred_agg <- pred_agg + (alpha[m] * classify(X, allPars[[m]]))
  }
  pred_agg[pred_agg >= 0] <- 1
  pred_agg[pred_agg < 0] <- -1
  return(pred_agg)
}
adaBoost <- function(X, y, B){
  n <- nrow(X)
  w <- rep(1/n, times = n)
  alpha <- rep(0, times = B)
  allPars <- rep(list(list()), B)
  for(b in 1:B){
    # Step 1: train the base classifier on the current weights
    allPars[[b]] <- train(X, w, y)
    # Step 2: compute the weighted misclassification error
    misClass <- (y != classify(X, allPars[[b]]))
    e <- as.numeric(w %*% misClass / sum(w))
    # Step 3: compute the voting weight
    alpha[b] <- log((1-e)/e)
    # Step 4: recompute the weights (misclassified points are up-weighted)
    w <- w * exp(alpha[b] * misClass)
  }
  return(list(allPars = allPars, alpha = alpha))
}
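# A minimal end-to-end sketch (not from the original gist; all names below,
# such as X_sim and fit_sim, are illustrative): run adaBoost() on simulated
# 2-D data and check the training error of the aggregated classifier.
set.seed(1)
X_sim <- matrix(rnorm(200 * 2), ncol = 2)
y_sim <- ifelse(X_sim[, 1] + X_sim[, 2] > 0, 1, -1)   # labels in {-1, +1}
fit_sim <- adaBoost(X_sim, y_sim, B = 10)
y_hat <- agg_class(X_sim, fit_sim$alpha, fit_sim$allPars)
mean(y_hat != y_sim)   # training error of the boosted classifier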
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 0. Set-up
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# install.packages("ada")   # run once if not already installed
# rpart and ada are loaded for reference; the custom adaBoost() above does
# not depend on them
library("rpart")
library("ada")
uspsdata <- read.table(file.choose(), header = FALSE)
uspscl <- read.table(file.choose(), header = FALSE)
uspsdata <- as.matrix(uspsdata)
uspscl <- as.matrix(uspscl)
n <- nrow(uspsdata)
## Run AdaBoost on the USPS data
B_max <- 60   # maximum number of base classifiers
nCV <- 5      # number of random 50/50 training/test splits
testErrorRate <- matrix(0, nrow = B_max, ncol = nCV)
trainErrorRate <- matrix(0, nrow = B_max, ncol = nCV)
for(i in 1:nCV){
  # Randomly split the data into training and test halves
  p <- sample.int(n)
  trainIndx <- p[1:round(n/2)]
  testIndx <- p[-(1:round(n/2))]
  ada <- adaBoost(uspsdata[trainIndx, ], uspscl[trainIndx], B_max)
  allPars <- ada$allPars
  alpha <- ada$alpha
  # Determine the error rates as a function of the number of base classifiers
  for(B in 1:B_max){
    c_hat_test <- agg_class(uspsdata[testIndx, ], alpha[1:B], allPars[1:B])
    testErrorRate[B, i] <- mean(uspscl[testIndx] != c_hat_test)
    c_hat_train <- agg_class(uspsdata[trainIndx, ], alpha[1:B], allPars[1:B])
    trainErrorRate[B, i] <- mean(uspscl[trainIndx] != c_hat_train)
  }
}
# Plot the training and test error rates, one line per random split
matplot(trainErrorRate, type = "l", lty = 1:nCV, ylim = c(0, 0.5),
        xlab = "number of base classifiers B", ylab = "training error rate")
matplot(testErrorRate, type = "l", lty = 1:nCV, ylim = c(0, 0.05),
        xlab = "number of base classifiers B", ylab = "test error rate")
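# A small optional extension (not in the original gist): matplot draws one
# error curve per random split, so collapsing the columns with rowMeans
# gives a single summary curve of test error versus B.
plot(rowMeans(testErrorRate), type = "l",
     xlab = "number of base classifiers B", ylab = "mean test error rate")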