Last active
May 6, 2022 18:50
-
-
Save grantmcdermott/7d8f9ea20d2bbf54d3366f5a72482ad9 to your computer and use it in GitHub Desktop.
Bayesian bootstrap
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Context: https://twitter.com/grant_mcdermott/status/1487528757418102787 | |
library(data.table) | |
library(fixest) | |
bboot = | |
function(object, reps = 100L, cluster = NULL, ...) { | |
fixest_obj = inherits(object, c('fixest', 'fixest_multi')) | |
if (inherits(object, c('lm'))) { | |
Ymat = object$model[, 1] | |
} else if (fixest_obj) { | |
Ymat = model.matrix(object, type = 'lhs', as.matrix = TRUE) | |
} else { | |
stop('\nModel or object class not currently supported.\n') | |
} | |
Xmat = model.matrix(object) | |
n_weights = nrow(Xmat) | |
fmat = NULL | |
if (fixest_obj && !is.null(object$fixef_vars)) { | |
fmat = model.matrix(object, type = 'fixef') | |
} | |
## Have to do a bit of leg work to pull out the clusters and match to | |
## model matrix | |
if (!is.null(cluster)) { | |
if (inherits(cluster, "formula")) { | |
cl_string = strsplit(paste0(cluster)[2], split = ' \\+ ')[[1]] | |
} else { | |
cl_string = paste(cluster) | |
} | |
if (!is.null(fmat) && all(cl_string %in% colnames(fmat))) { | |
cl_mat = fmat[, cl_string] | |
} else if (all(cl_string %in% colnames(Xmat))) { | |
cl_mat = Xmat[, cl_string] | |
} else { | |
DATA = eval(object$call$data) | |
if (all(cl_string %in% names(DATA))) { | |
all_vars = sapply(list(Ymat, Xmat, fmat), colnames) | |
if (inherits(all_vars, 'list')) all_vars = do.call('c', all_vars) | |
all_vars = union(all_vars, cl_string) | |
DATA = data.frame(DATA)[, intersect(colnames(DATA), all_vars)] | |
DATA = DATA[complete.cases(DATA), ] | |
cl_mat = model.matrix(~0+., DATA[, cl_string, drop=FALSE]) | |
} else { | |
stop(paste0('Could not find ', cluster, '. Please provide a valid input.\n')) | |
} | |
} | |
if (!inherits(cl_mat, "matrix")) cl_mat = matrix(cl_mat) | |
n_weights = nrow(unique(cl_mat)) | |
## Keep track of cluster id for consistent weighting within each | |
## cluster later on | |
cl_mat = data.table::as.data.table(cl_mat) | |
cl_mat$cl_id = data.table::frank(cl_mat, ties.method = "dense") | |
} | |
## Pre-allocate space for efficiency | |
wfits = matrix(0, reps, length(object$coefficients)) | |
for (i in 1:reps) { | |
if (is.null(cluster)) { | |
weights = rexp(n_weights, rate = 1) | |
} else { | |
weights = cl_mat[, wt := rexp(1, rate = 1), by = cl_id][, wt] | |
} | |
## Normalise weights | |
## (Unnecessary? https://twitter.com/deaneckles/status/1487506960698200067) | |
# weights = weights / sum(weights) | |
## Demean X and Y matrices if fixed effects are present | |
if (!is.null(fmat)) { | |
Xmat = fixest::demean(X = Xmat, f = fmat, weights = weights) | |
Ymat = fixest::demean(X = Ymat, f = fmat, weights = weights) | |
} | |
## Fit weighted reg | |
wfits[i, ] = lm.wfit(x = Xmat, y = Ymat, w = weights)$coefficients | |
} | |
colnames(wfits) = colnames(Xmat) | |
class(wfits) = "bboot" | |
## Meta attributes | |
attr(wfits, "coefs") = try(coefficients(object), silent = TRUE) | |
attr(wfits, "df") = try(df.residual(object), silent = TRUE) | |
attr(wfits, "se") = try(sqrt(diag(cov(wfits))), silent = TRUE) | |
attr(wfits, "reps") = reps | |
return(wfits) | |
} | |
# | |
## Methods | |
# | |
summary.bboot = function(object, level = 0.95, ...) { | |
alpha = 1 - level | |
lwr = alpha/2 | |
upr = 1-lwr | |
est = attr(object, "coefs") | |
se = attr(object, "se") | |
df = attr(object, "df") | |
cnames = c("Estimate", "Std. Error") | |
tval = as.vector(est)/se | |
if (is.null(df)) df = 0 | |
if (is.finite(df) && df > 0) { | |
pval = 2 * pt(abs(tval), df = df, lower.tail = FALSE) | |
fac = qt(alpha, df = df) | |
cnames = c(cnames, "t value", "Pr(>|t|)") | |
} else { | |
pval = 2 * pnorm(abs(tval), lower.tail = FALSE) | |
fac = qnorm(alpha) | |
cnames = c(cnames, "z value", "Pr(>|z|)") | |
} | |
out = cbind(est, se, tval, pval) | |
colnames(out) = cnames | |
ci = cbind(est + fac * se, est - fac * se) | |
colnames(ci) = c("Lower", "Upper") | |
out = cbind(out, ci) | |
# qtiles = t(apply(object, 2, \(x) c(mean(x), quantile(x, c(lwr, upr))))) | |
# colnames(qtiles) = c("mean", "conf.low", "conf.upper") | |
# attr(out, "quantiles") = qtiles | |
out | |
} | |
print.bboot = function(object, ...) { | |
out = summary(object, ...) | |
cat("Bayesian bootstrap: Standard errors based on", attr(object, "reps"), "reps.", "\n\n") | |
print(out, digits = 4, quote = FALSE, print.gap = 2L) | |
} | |
# | |
## Examples | |
# | |
set.seed(123) | |
mod = lm(mpg ~ wt + hp, mtcars) | |
bb_mod = bboot(mod, reps = 1e3) | |
bb_mod | |
hist(bb_mod[, 'wt'], | |
breaks = 100, | |
border = 'white', | |
main = 'Bayesian bootstrap: wt') | |
# Add cluster variables with a formula | |
set.seed(42) | |
bboot(mod, reps = 1e3, cluster = ~cyl) | |
# Same result for fixest::feols (no FEs) | |
set.seed(42) | |
feols(mpg ~ wt + hp, mtcars) |> | |
bboot(reps = 1e3, cluster = ~cyl) | |
# Incl. FEs is fine too (although adds a bit of overhead) | |
feols(mpg ~ wt + hp | cyl, mtcars, vcov = 'iid') |> | |
bboot(reps = 1e3) | |
# Can combine FEs and clustering as well | |
feols(mpg ~ wt + hp | cyl, mtcars, vcov = ~cyl) |> | |
bboot(reps = 1e3, cluster = ~cyl) |
You don't need to apologize.
Thank you very much for the detailed answer!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi again @Oravishayrizi. Sorry for the long delay between replies. Too many balls in the air...
I spoke to @zeileis about what he does for
sandwhich::vcovBS
and the short answer is just grab the standard Pearson correlation matrix (i.e. produced by applyingcov()
on all of the bootstrapped results). No small sample adjustments, which I think is reasonable, although there's been some discussion about what to do in the presence of few clusters.Anyway, I've adjusted my original
bboot
function above to include an SE attribute based on this approach. It's very simple and just callssqrt(diag(wfits))
. The correspondingsummary/print
methods retrieve this attribute and prints it.Also, since I'm now sticking to what sandwich and lmtest do now, I've also gone ahead and adjusted some other model statistics to be closer in line with your parametric suggestions. The default print method should be what you're looking for. I'm taking some lazy shortcuts, but really this is just a proof-of-concept until Achim or Laurent decide to role something like this into sandwich or fixest. (If not then I might do it for my own ritest package.)