tomhopper · October 30, 2017 14:54
diff --git a/central_limit_theorem.R b/central_limit_theorem.R
 ## Demonstration of central limit theorem
 ## Based on code in an anonymous comment to the blog post at \url{https://sas-and-r.blogspot.com/2012/01/example-919-demonstrating-central-limit.html}

 ## Libraries ####
 library(nortest)
 library(dplyr)
 library(ggplot2)

 ## Data used ####
 # right-triangle distribution (peak at 0; minimum at 1)
 data_vec <- rbeta(10000, shape1 = 1, shape2 = 2);

 ## Function ####
 #' @title Multiple comparisons of a given data vector to the normal distribution
 #' @param data_vec Data to sample for calculating means
 #' @param sample_size Sample size to use for calculating means
 #' @return A number representing the proportion of p-values < 0.05
 #' @description Given a data vector and a sample size, computes 200 means 
 #'   based on \code{sample_size}, then compares the means to the normal distribution
 #'  on the hypothesis H0: data_vec is normal using \code{sf.test()} from 
 #'  \code{library(nortest)}. Finally, computes and returns the fraction of p-values
 #'  that reject the null hypothesis. Given a non-normal distribution, the Central
 #'  Limit Theorem tells us that this proportion
 #'  should decrease as \code{sample_size} increases.
 clt <- function(data_vec, sample_size) {
  
  means<-array(0, 200)
  pvals <- array(0, 200)
  
  for(i in 1:200) {
    
    for(j in i:200) { means[j] <- mean(sample(data, sample_size)) }
    pvals[i] <- sf.test(means)$p.val
  }
  length(pvals[pvals<0.05])/200
 }

 ## Run \code{clt()} multiple times for a range of sample sizes. ####
 sample_size <- c(rep(2, times = 100), 
                 rep(10, times = 100), 
                 rep(100, times = 100),
                 rep(1000, times = 100))

 prop_pvalue <- data_frame(size = sample_size,
                          pvalue = rep(NA, length(sample_size)))
 for(x in 1:length(sample_size)) {
  prop_pvalue[x, 2] <- clt(data = data_vec, sample_size = sample_size[x])
 }

 ## Graph results ####
 prop_pvalue %>% 
  mutate(size = as.factor(size)) %>% 
  ggplot(aes(x = size, y = pvalue)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = "Central Limit Theorem",
       x = expression(paste("Sample size, ", italic("n"))),
       y = "Proportion of non-normal distributions") +
  theme_minimal()
	## Demonstration of central limit theorem
	## Based on code in an anonymous comment to the blog post at \url{https://sas-and-r.blogspot.com/2012/01/example-919-demonstrating-central-limit.html}

	## Libraries ####
	library(nortest)
	library(dplyr)
	library(ggplot2)

	## Data used ####
	# right-triangle distribution (peak at 0; minimum at 1)
	data_vec <- rbeta(10000, shape1 = 1, shape2 = 2);

	## Function ####
	#' @title Multiple comparisons of a given data vector to the normal distribution
	#' @param data_vec Data to sample for calculating means
	#' @param sample_size Sample size to use for calculating means
	#' @return A number representing the proportion of p-values < 0.05
	#' @description Given a data vector and a sample size, computes 200 means
	#' based on \code{sample_size}, then compares the means to the normal distribution
	#' on the hypothesis H0: data_vec is normal using \code{sf.test()} from
	#' \code{library(nortest)}. Finally, computes and returns the fraction of p-values
	#' that reject the null hypothesis. Given a non-normal distribution, the Central
	#' Limit Theorem tells us that this proportion
	#' should decrease as \code{sample_size} increases.
	clt <- function(data_vec, sample_size) {

	means<-array(0, 200)
	pvals <- array(0, 200)

	for(i in 1:200) {

	for(j in i:200) { means[j] <- mean(sample(data, sample_size)) }
	pvals[i] <- sf.test(means)$p.val
	}
	length(pvals[pvals<0.05])/200
	}

	## Run \code{clt()} multiple times for a range of sample sizes. ####
	sample_size <- c(rep(2, times = 100),
	rep(10, times = 100),
	rep(100, times = 100),
	rep(1000, times = 100))

	prop_pvalue <- data_frame(size = sample_size,
	pvalue = rep(NA, length(sample_size)))
	for(x in 1:length(sample_size)) {
	prop_pvalue[x, 2] <- clt(data = data_vec, sample_size = sample_size[x])
	}

	## Graph results ####
	prop_pvalue %>%
	mutate(size = as.factor(size)) %>%
	ggplot(aes(x = size, y = pvalue)) +
	geom_boxplot() +
	coord_flip() +
	labs(title = "Central Limit Theorem",
	x = expression(paste("Sample size, ", italic("n"))),
	y = "Proportion of non-normal distributions") +
	theme_minimal()