mike-lawrence · March 24, 2020 12:50
diff --git a/2x2.r b/2x2.r
 library(tidyverse)
 library(broom)
 library(lme4)

 #set default contrasts to halfsum (this matters!)
 halfsum_contrasts = function (...) contr.sum(...) * 0.5
 options(contrasts = c('halfsum_contrasts','contr.poly'))


 run_sim = function(N,K,seed,do_checks=F){
 	set.seed(seed)
 	coefs = c(0,0,0,1)
 	sds = c(3,3,3,3)
 	noise = 4
 	
 	subj_coefs = matrix(
 		rnorm(N*length(coefs),coefs,sds)
 		, nrow = N
 		, byrow = T
 	)
 	
 	if(do_checks){
 		#check that matrix came out right
 		print(apply(subj_coefs,2,mean)) #should be close to coef
 		print(apply(subj_coefs,2,sd)) #should be close to sds
 		print(cor(subj_coefs)) #should be close to identity
 	}
 	
 	#create the condition tibble and contrasts
 	conditions = tidyr::expand_grid(
 		a = factor(c('a_pre','a_post'))
 		, b = factor(c('b_pre','b_post'))
 	)
 	contrasts = model.matrix(
 		data = conditions
 		, object = ~ a*b
 	)
 	
 	#compute each subject's value-per-cell
 	subj_vals = list()
 	for(i in 1:N){
 		subj_vals[[i]] = conditions
 		subj_vals[[i]]$subj = i
 		subj_vals[[i]]$value  = as.numeric(as.matrix(contrasts) %*% subj_coefs[i,])
 	}
 	subj_vals = dplyr::bind_rows(subj_vals)
 	subj_vals$subj = factor(subj_vals$subj)
 	
 	if(do_checks){
 		#compute difference between conditions
 		# (this is why we use halfsum contrasts)
 		subj_vals %>%
 			dplyr::group_by(
 				subj
 			) %>%
 			dplyr::summarise(
 				intercept = mean(value)
 				, ab = (
 						mean(value[(a=='a_post')&(b=='b_post')]) - 
 						mean(value[(a=='a_pre')&(b=='b_post')])
 					) - (
 						mean(value[(a=='a_post')&(b=='b_pre')]) - 
 						mean(value[(a=='a_pre')&(b=='b_pre')])
 					)
 				, a = mean(value[a=='a_post']) - mean(value[a=='a_pre'])
 				, b = mean(value[b=='b_post']) - mean(value[b=='b_pre'])
 			) %>%
 			tidyr::gather(
 				key = key
 				, value = value
 				, -subj
 			) %>%
 			dplyr::group_by(
 				key
 			) %>%
 			dplyr::summarise(
 				mean = mean(value)
 				, sd = sd(value)
 			) #should roughly match coefs and sds
 	}
 	
 	
 	if(do_checks){
 		#compute an lm per participant then check the mean/sd of resulting distributions
 		subj_vals %>%
 			dplyr::group_by(
 				subj
 			) %>%
 			dplyr::summarise(
 				broom::tidy(lm(value~a*b))
 			) %>%
 			dplyr::group_by(
 				term
 			) %>%
 			dplyr::summarise(
 				mean = mean(estimate)
 				, sd = sd(estimate)
 			) #should roughly match coefs and sds
 	}
 	
 	#sample from each subject-by-condition mean with measurement noise
 	subj_vals %>%
 		dplyr::group_by(
 			subj
 			, a
 			, b
 		) %>%
 		dplyr::summarise(
 			value = rnorm(K,value,noise)
 		) ->
 		obs
 	
 	if(do_checks){
 		#compute difference between conditions
 		# (this is why we use halfsum contrasts)
 		obs %>%
 			dplyr::group_by(
 				subj
 			) %>%
 			dplyr::summarise(
 				intercept = mean(value)
 				, ab = (
 					mean(value[(a=='a_post')&(b=='b_post')]) - 
 					mean(value[(a=='a_pre')&(b=='b_post')])
 				) - (
 					mean(value[(a=='a_post')&(b=='b_pre')]) - 
 					mean(value[(a=='a_pre')&(b=='b_pre')])
 				)
 				, a = mean(value[a=='a_post']) - mean(value[a=='a_pre'])
 				, b = mean(value[b=='b_post']) - mean(value[b=='b_pre'])
 			) %>%
 			tidyr::gather(
 				key = key
 				, value = value
 				, -subj
 			) %>%
 			dplyr::group_by(
 				key
 			) %>%
 			dplyr::summarise(
 				mean = mean(value)
 				, sd = sd(value)
 			) #should roughly match coefs and sds
 	}
 	
 	if(do_checks){
 		#compute an lm per participant then check the mean/sd of resulting distributions
 		obs %>%
 			dplyr::group_by(
 				subj
 			) %>%
 			dplyr::summarise(
 				broom::tidy(lm(value~a*b))
 			) %>%
 			dplyr::group_by(
 				term
 			) %>%
 			dplyr::summarise(
 				mean = mean(estimate)
 				, sd = sd(estimate)
 			) #should roughly match coefs and sds
 	}
 	
 	#fit and obtain intervals
 	fit = lme4::lmer(
 		formula = value ~ a*b + (a*b|subj)
 		, data = obs
 	)
 	ci = confint(fit,method='Wald')
 	
 	if(do_checks){
 		print(
 			tibble(
 				N = N
 				, K = K
 				, seed = seed
 				, lo = ci[nrow(ci),1]
 				, hi = ci[nrow(ci),2]
 			)
 		)
 	}
 	
 	return(tibble(
 		bound = c('lo','hi')
 		, value = ci[nrow(ci),]
 	))
 	
 }

 #running once
 run_sim(N=100,K=5,seed=3)

 #running across a grid
 tidyr::expand_grid(
 	N = c(,20,40,80,160)
 	, K = c(5,10,20)
 	, iteration = 1:1e3
 ) %>%
 	dplyr::mutate(
 		seed=1:n()
 	) %>%
 	dplyr::group_by(
 		N
 		, K
 		, iteration
 	) %>%
 	dplyr::summarize(
 		run_sim(
 			N = N
 			, K = K
 			, seed = seed
 		)
 	) ->
 	out


 out %>%
 	tidyr::spread(
 		key = bound
 		, value = value
 	) %>%
 	dplyr::group_by(
 		N
 		, K
 	) %>%
 	dplyr::summarise(
 		hi_less_than_zero = mean(hi<0)
 		, hi_greater_than_zero = mean(hi>0)
 		, lo_less_than_zero = mean(lo<0)
 		, lo_greater_than_zero = mean(lo>0)
 	)
diff --git a/single_predictor.r b/single_predictor.r
 library(tidyverse)
 library(broom)
 library(lme4)

 #set default contrasts to halfsum (this matters!)
 halfsum_contrasts = function (...) contr.sum(...) * 0.5
 options(contrasts = c('halfsum_contrasts','contr.poly'))


 run_sim = function(N,K,seed,do_checks=F){
 	set.seed(seed)
 	coefs = c(0,1)
 	sds = c(1,3)
 	noise = 4

 	subj_coefs = matrix(
 		rnorm(N*length(coefs),coefs,sds)
 		, nrow = N
 		, byrow = T
 	)

 	if(do_checks){
 		#check that matrix came out right
 		print(apply(subj_coefs,2,mean)) #should be close to coef
 		print(apply(subj_coefs,2,sd)) #should be close to sds
 		print(cor(subj_coefs)) #should be close to identity
 	}

 	#create the condition tibble and contrasts
 	conditions = tidyr::expand_grid(
 		a = factor(c('a_pre','a_post'))
 	)
 	contrasts = model.matrix(
 		data = conditions
 		, object = ~ a
 	)

 	#compute each subject's value-per-cell
 	subj_vals = list()
 	for(i in 1:N){
 		subj_vals[[i]] = conditions
 		subj_vals[[i]]$subj = i
 		subj_vals[[i]]$value  = as.numeric(as.matrix(contrasts) %*% subj_coefs[i,])
 	}
 	subj_vals = dplyr::bind_rows(subj_vals)
 	subj_vals$subj = factor(subj_vals$subj)

 	if(do_checks){
 		#compute difference between conditions
 		# (this is why we use halfsum contrasts)
 		subj_vals %>%
 			dplyr::group_by(
 				subj
 			) %>%
 			dplyr::summarise(
 				intercept = mean(value)
 				, a = mean(value[a=='a_post']) - mean(value[a=='a_pre'])
 			) %>%
 			tidyr::gather(
 				key = key
 				, value = value
 				, -subj
 			) %>%
 			dplyr::group_by(
 				key
 			) %>%
 			dplyr::summarise(
 				mean = mean(value)
 				, sd = sd(value)
 			) #should roughly match coefs and sds
 	}


 	if(do_checks){
 		#compute an lm per participant then check the mean/sd of resulting distributions
 		subj_vals %>%
 			dplyr::group_by(
 				subj
 			) %>%
 			dplyr::summarise(
 				broom::tidy(lm(value~a))
 			) %>%
 			dplyr::group_by(
 				term
 			) %>%
 			dplyr::summarise(
 				mean = mean(estimate)
 				, sd = sd(estimate)
 			) #should roughly match coefs and sds
 	}

 	#sample from each subject-by-condition mean with measurement noise
 	subj_vals %>%
 		dplyr::group_by(
 			subj
 			, a
 		) %>%
 		dplyr::summarise(
 			value = rnorm(K,value,noise)
 		) ->
 		obs

 	if(do_checks){
 		#compute difference between conditions
 		# (this is why we use halfsum contrasts)
 		obs %>%
 			dplyr::group_by(
 				subj
 			) %>%
 			dplyr::summarise(
 				intercept = mean(value)
 				, a = mean(value[a=='a_post']) - mean(value[a=='a_pre'])
 			) %>%
 			tidyr::gather(
 				key = key
 				, value = value
 				, -subj
 			) %>%
 			dplyr::group_by(
 				key
 			) %>%
 			dplyr::summarise(
 				mean = mean(value)
 				, sd = sd(value)
 			) #should roughly match coefs and sds
 	}

 	if(do_checks){
 		#compute an lm per participant then check the mean/sd of resulting distributions
 		obs %>%
 			dplyr::group_by(
 				subj
 			) %>%
 			dplyr::summarise(
 				broom::tidy(lm(value~a))
 			) %>%
 			dplyr::group_by(
 				term
 			) %>%
 			dplyr::summarise(
 				mean = mean(estimate)
 				, sd = sd(estimate)
 			) #should roughly match coefs and sds
 	}

 	#fit and obtain intervals
 	fit = lme4::lmer(
 		formula = value ~ a + (a|subj)
 		, data = obs
 	)
 	ci = confint(fit,method='Wald')

 	if(do_checks){
 		print(
 			tibble(
 				N = N
 				, K = K
 				, seed = seed
 				, lo = ci[nrow(ci),1]
 				, hi = ci[nrow(ci),2]
 			)
 		)
 	}

 	return(tibble(
 		bound = c('lo','hi')
 		, value = ci[nrow(ci),]
 	))

 }

 #running once
 run_sim(N=100,K=5,seed=3)

 #running across a grid
 tidyr::expand_grid(
 	N = c(20,40,80,160)
 	, K = c(5,10,20)
 	, iteration = 1:1e3
 ) %>%
 	dplyr::mutate(
 		seed=1:n()
 	) %>%
 	dplyr::group_by(
 		N
 		, K
 		, iteration
 	) %>%
 	dplyr::summarize(
 		run_sim(
 			N = N
 			, K = K
 			, seed = seed
 		)
 	) ->
 	out


 out %>%
 	tidyr::spread(
 		key = bound
 		, value = value
 	) %>%
 	dplyr::group_by(
 		N
 		, K
 	) %>%
 	dplyr::summarise(
 		hi_less_than_zero = mean(hi<0)
 		, hi_greater_than_zero = mean(hi>0)
 		, lo_less_than_zero = mean(lo<0)
 		, lo_greater_than_zero = mean(lo>0)
 	)
	library(tidyverse)
	library(broom)
	library(lme4)

	#set default contrasts to halfsum (this matters!)
	halfsum_contrasts = function (...) contr.sum(...) * 0.5
	options(contrasts = c('halfsum_contrasts','contr.poly'))


	run_sim = function(N,K,seed,do_checks=F){
	set.seed(seed)
	coefs = c(0,0,0,1)
	sds = c(3,3,3,3)
	noise = 4

	subj_coefs = matrix(
	rnorm(N*length(coefs),coefs,sds)
	, nrow = N
	, byrow = T
	)

	if(do_checks){
	#check that matrix came out right
	print(apply(subj_coefs,2,mean)) #should be close to coef
	print(apply(subj_coefs,2,sd)) #should be close to sds
	print(cor(subj_coefs)) #should be close to identity
	}

	#create the condition tibble and contrasts
	conditions = tidyr::expand_grid(
	a = factor(c('a_pre','a_post'))
	, b = factor(c('b_pre','b_post'))
	)
	contrasts = model.matrix(
	data = conditions
	, object = ~ a*b
	)

	#compute each subject's value-per-cell
	subj_vals = list()
	for(i in 1:N){
	subj_vals[[i]] = conditions
	subj_vals[[i]]$subj = i
	subj_vals[[i]]$value = as.numeric(as.matrix(contrasts) %*% subj_coefs[i,])
	}
	subj_vals = dplyr::bind_rows(subj_vals)
	subj_vals$subj = factor(subj_vals$subj)

	if(do_checks){
	#compute difference between conditions
	# (this is why we use halfsum contrasts)
	subj_vals %>%
	dplyr::group_by(
	subj
	) %>%
	dplyr::summarise(
	intercept = mean(value)
	, ab = (
	mean(value[(a=='a_post')&(b=='b_post')]) -
	mean(value[(a=='a_pre')&(b=='b_post')])
	) - (
	mean(value[(a=='a_post')&(b=='b_pre')]) -
	mean(value[(a=='a_pre')&(b=='b_pre')])
	)
	, a = mean(value[a=='a_post']) - mean(value[a=='a_pre'])
	, b = mean(value[b=='b_post']) - mean(value[b=='b_pre'])
	) %>%
	tidyr::gather(
	key = key
	, value = value
	, -subj
	) %>%
	dplyr::group_by(
	key
	) %>%
	dplyr::summarise(
	mean = mean(value)
	, sd = sd(value)
	) #should roughly match coefs and sds
	}


	if(do_checks){
	#compute an lm per participant then check the mean/sd of resulting distributions
	subj_vals %>%
	dplyr::group_by(
	subj
	) %>%
	dplyr::summarise(
	broom::tidy(lm(value~a*b))
	) %>%
	dplyr::group_by(
	term
	) %>%
	dplyr::summarise(
	mean = mean(estimate)
	, sd = sd(estimate)
	) #should roughly match coefs and sds
	}

	#sample from each subject-by-condition mean with measurement noise
	subj_vals %>%
	dplyr::group_by(
	subj
	, a
	, b
	) %>%
	dplyr::summarise(
	value = rnorm(K,value,noise)
	) ->
	obs

	if(do_checks){
	#compute difference between conditions
	# (this is why we use halfsum contrasts)
	obs %>%
	dplyr::group_by(
	subj
	) %>%
	dplyr::summarise(
	intercept = mean(value)
	, ab = (
	mean(value[(a=='a_post')&(b=='b_post')]) -
	mean(value[(a=='a_pre')&(b=='b_post')])
	) - (
	mean(value[(a=='a_post')&(b=='b_pre')]) -
	mean(value[(a=='a_pre')&(b=='b_pre')])
	)
	, a = mean(value[a=='a_post']) - mean(value[a=='a_pre'])
	, b = mean(value[b=='b_post']) - mean(value[b=='b_pre'])
	) %>%
	tidyr::gather(
	key = key
	, value = value
	, -subj
	) %>%
	dplyr::group_by(
	key
	) %>%
	dplyr::summarise(
	mean = mean(value)
	, sd = sd(value)
	) #should roughly match coefs and sds
	}

	if(do_checks){
	#compute an lm per participant then check the mean/sd of resulting distributions
	obs %>%
	dplyr::group_by(
	subj
	) %>%
	dplyr::summarise(
	broom::tidy(lm(value~a*b))
	) %>%
	dplyr::group_by(
	term
	) %>%
	dplyr::summarise(
	mean = mean(estimate)
	, sd = sd(estimate)
	) #should roughly match coefs and sds
	}

	#fit and obtain intervals
	fit = lme4::lmer(
	formula = value ~ ab + (ab\|subj)
	, data = obs
	)
	ci = confint(fit,method='Wald')

	if(do_checks){
	print(
	tibble(
	N = N
	, K = K
	, seed = seed
	, lo = ci[nrow(ci),1]
	, hi = ci[nrow(ci),2]
	)
	)
	}

	return(tibble(
	bound = c('lo','hi')
	, value = ci[nrow(ci),]
	))

	}

	#running once
	run_sim(N=100,K=5,seed=3)

	#running across a grid
	tidyr::expand_grid(
	N = c(,20,40,80,160)
	, K = c(5,10,20)
	, iteration = 1:1e3
	) %>%
	dplyr::mutate(
	seed=1:n()
	) %>%
	dplyr::group_by(
	N
	, K
	, iteration
	) %>%
	dplyr::summarize(
	run_sim(
	N = N
	, K = K
	, seed = seed
	)
	) ->
	out


	out %>%
	tidyr::spread(
	key = bound
	, value = value
	) %>%
	dplyr::group_by(
	N
	, K
	) %>%
	dplyr::summarise(
	hi_less_than_zero = mean(hi<0)
	, hi_greater_than_zero = mean(hi>0)
	, lo_less_than_zero = mean(lo<0)
	, lo_greater_than_zero = mean(lo>0)
	)