Created
October 30, 2017 14:54
-
-
Save tomhopper/a4b8bce5182cc742d6b09ea1b66fca7a to your computer and use it in GitHub Desktop.
Demonstration of central limit theorem
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Demonstration of central limit theorem | |
## Based on code in an anonymous comment to the blog post at \url{https://sas-and-r.blogspot.com/2012/01/example-919-demonstrating-central-limit.html} | |
## Libraries #### | |
library(nortest) | |
library(dplyr) | |
library(ggplot2) | |
## Data used #### | |
# right-triangle distribution (peak at 0; minimum at 1) | |
data_vec <- rbeta(10000, shape1 = 1, shape2 = 2); | |
## Function #### | |
#' @title Multiple comparisons of a given data vector to the normal distribution | |
#' @param data_vec Data to sample for calculating means | |
#' @param sample_size Sample size to use for calculating means | |
#' @return A number representing the proportion of p-values < 0.05 | |
#' @description Given a data vector and a sample size, computes 200 means | |
#' based on \code{sample_size}, then compares the means to the normal distribution | |
#' on the hypothesis H0: data_vec is normal using \code{sf.test()} from | |
#' \code{library(nortest)}. Finally, computes and returns the fraction of p-values | |
#' that reject the null hypothesis. Given a non-normal distribution, the Central | |
#' Limit Theorem tells us that this proportion | |
#' should decrease as \code{sample_size} increases. | |
clt <- function(data_vec, sample_size) { | |
means<-array(0, 200) | |
pvals <- array(0, 200) | |
for(i in 1:200) { | |
for(j in i:200) { means[j] <- mean(sample(data, sample_size)) } | |
pvals[i] <- sf.test(means)$p.val | |
} | |
length(pvals[pvals<0.05])/200 | |
} | |
## Run \code{clt()} multiple times for a range of sample sizes. #### | |
sample_size <- c(rep(2, times = 100), | |
rep(10, times = 100), | |
rep(100, times = 100), | |
rep(1000, times = 100)) | |
prop_pvalue <- data_frame(size = sample_size, | |
pvalue = rep(NA, length(sample_size))) | |
for(x in 1:length(sample_size)) { | |
prop_pvalue[x, 2] <- clt(data = data_vec, sample_size = sample_size[x]) | |
} | |
## Graph results #### | |
prop_pvalue %>% | |
mutate(size = as.factor(size)) %>% | |
ggplot(aes(x = size, y = pvalue)) + | |
geom_boxplot() + | |
coord_flip() + | |
labs(title = "Central Limit Theorem", | |
x = expression(paste("Sample size, ", italic("n"))), | |
y = "Proportion of non-normal distributions") + | |
theme_minimal() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment