Skip to content

Instantly share code, notes, and snippets.

@tyleransom
Created December 4, 2024 04:20
Show Gist options
  • Save tyleransom/ee05e9e5e98f801b2c02c48a88ed7af0 to your computer and use it in GitHub Desktop.
Save tyleransom/ee05e9e5e98f801b2c02c48a88ed7af0 to your computer and use it in GitHub Desktop.
Code to download and analyze NHANES obesity data
# Required libraries
library(RNHANES)
library(tidyverse)
library(survey)
library(haven)
library(stats)
library(readr)
library(patchwork)
library(scales)
#' Constants and configurations
NHANES_CONFIG <- list(
cycles = c(
#"1971-1975",
#"1976-1980",
#"1988-1994",
"1999-2000",
"2001-2002",
"2003-2004",
"2005-2006",
"2007-2008",
"2009-2010",
"2011-2012",
"2013-2014",
"2015-2016",
"2017-2018",
"2021-2022"
),
cycle_codes = c(
"2001-2002" = "B",
"2003-2004" = "C",
"2005-2006" = "D",
"2007-2008" = "E",
"2009-2010" = "F",
"2011-2012" = "G",
"2013-2014" = "H",
"2015-2016" = "I",
"2017-2018" = "J",
"2021-2022" = "L"
)
)
#' Standard column names for consistent data structure
STANDARD_COLS <- c(
"SEQN",
"BMI",
"CYCLE",
"GENDER",
"AGE",
"WTMEC2YR",
"SDMVPSU",
"SDMVSTRA"
)
#' Process raw NHANES data into standardized format
#' @param data Data frame containing raw NHANES data
#' @param cycle_year String indicating the NHANES cycle
#' @return Processed data frame with standardized columns
standardize_nhanes_data <- function(data, cycle_year) {
data %>%
select(SEQN, RIDAGEYR, RIAGENDR, BMXBMI, WTMEC2YR, SDMVPSU, SDMVSTRA) %>%
rename(
BMI = BMXBMI,
AGE = RIDAGEYR,
GENDER = RIAGENDR
) %>%
mutate(
CYCLE = cycle_year,
GENDER = factor(GENDER, levels = c(1, 2), labels = c("Male", "Female"))
)
}
#' Load BMI data for NHANES 1971-1975
#' @return Processed data frame
load_bmi_data_1971_1975 <- function() {
url <- "https://wwwn.cdc.gov/nchs/data/nhanes1/DU4111.txt"
raw_data <- read_fwf(
file = url,
col_positions = fwf_positions(
start = c(1, 101, 104, 255, 266, 95, 196),
end = c(5, 102, 104, 259, 269, 99, 198),
col_names = c("SEQN", "N1BM0101", "N1BM0104", "N1BM0255", "N1BM0266", "N1BM0095", "N1BM0196")
)
) %>%
mutate(across(everything(), ~as.numeric(str_trim(.x)))) # Convert all to numeric after trimming
processed_data <- raw_data %>%
mutate(
SEQN = as.integer(SEQN), # Ensure SEQN is integer type
BMI = ifelse(!is.na(N1BM0255) & !is.na(N1BM0266) & N1BM0266 > 0,
N1BM0255 * 0.453592 / ((N1BM0266/100)^2),
NA_real_),
CYCLE = "1971-1975",
GENDER = factor(N1BM0104, levels = c(1, 2), labels = c("Male", "Female")),
AGE = N1BM0101,
WTMEC2YR = 1,
SDMVSTRA = N1BM0095,
SDMVPSU = N1BM0196
) %>%
select(all_of(STANDARD_COLS))
return(processed_data)
}
#' Load BMI data for NHANES 1976-1980
load_bmi_data_1976_1980 <- function() {
url <- "https://wwwn.cdc.gov/nchs/data/nhanes2/DU5301.txt"
raw_data <- read_fwf(
file = url,
col_positions = fwf_positions(
start = c(1, 55, 407, 412, 418, 422, 190, 282, 324, 326),
end = c(5, 55, 411, 416, 421, 424, 191, 287, 325, 326),
col_names = c("SEQN", "SEX", "WT1", "WT2", "HT1", "HT2",
"AGE", "WTMEC2YR", "SDMVSTRA", "SDMVPSU")
)
) %>%
mutate(across(everything(), ~ifelse(.x == "" | .x == " ", NA_character_, .x))) %>%
mutate(across(everything(), ~as.numeric(str_trim(.x)))) %>%
mutate(
SEQN = as.integer(SEQN),
# Convert string measurements to numeric, handling implied decimals
WT1 = ifelse(WT1 >= 9999, NA_real_, as.numeric(WT1)/100), # lbs with 2 decimal places
WT2 = ifelse(WT2 >= 9999, NA_real_, as.numeric(WT2)/100), # kg with 2 decimal places
HT1 = ifelse(HT1 >= 9999, NA_real_, as.numeric(HT1)/10), # cm with 1 decimal place
HT2 = ifelse(HT2 >= 999, NA_real_, as.numeric(HT2)/10), # inches with 1 decimal place
# Use kg and cm for BMI calculation
WT = WT2, # Already in kg
HT = HT1, # Already in cm
# Calculate BMI
BMI = ifelse(!is.na(WT) & !is.na(HT) & HT > 0 & WT > 0,
WT / ((HT/100)^2),
NA_real_),
CYCLE = "1976-1980",
GENDER = factor(SEX, levels = c(1, 2), labels = c("Male", "Female"))
) %>%
select(SEQN, BMI, CYCLE, GENDER, AGE, WTMEC2YR, SDMVSTRA, SDMVPSU)
}
#' Load BMI data for NHANES 1988-1994
load_bmi_data_1988_1994 <- function() {
url <- "https://wwwn.cdc.gov/nchs/data/nhanes3/1a/exam.dat"
raw_data <- read_fwf(
file = url,
col_positions = fwf_positions(
start = c(1, 14199, 14203, 115, 13, 335, 333, 12),
end = c(5, 14202, 14206, 117, 13, 337, 334, 19),
col_names = c("SEQN", "BMPWT", "BMPHT", "HSAGEIR", "HSSEX",
"SDPSTRA6", "SDPPSU6", "WTPFEX6")
)
) %>%
mutate(across(everything(), ~as.numeric(str_trim(.x))))
processed_data <- raw_data %>%
mutate(
SEQN = as.integer(SEQN),
BMI = ifelse(!is.na(BMPWT) & !is.na(BMPHT) & BMPHT > 0,
BMPWT / ((BMPHT/100)^2),
NA_real_),
CYCLE = "1988-1994",
GENDER = factor(HSSEX, levels = c(1, 2), labels = c("Male", "Female")),
AGE = HSAGEIR,
SDMVSTRA = SDPSTRA6,
SDMVPSU = SDPPSU6,
WTMEC2YR = WTPFEX6
) %>%
select(SEQN, BMI, CYCLE, GENDER, AGE, WTMEC2YR, SDMVSTRA, SDMVPSU)
}
#' Load BMI data for NHANES 1999-2000
#' @return Processed data frame
load_bmi_data_1999_2000 <- function() {
nhanes_load_data("BMX", "1999-2000", demographics = TRUE) %>%
standardize_nhanes_data("1999-2000")
}
#' Load BMI data for a specific NHANES cycle
#' @param cycle_years String indicating the NHANES cycle
#' @return Processed data frame
load_bmi_data_cycle <- function(cycle_years) {
tryCatch({
cycle_code <- NHANES_CONFIG$cycle_codes[cycle_years]
bmi_url <- sprintf("https://wwwn.cdc.gov/Nchs/Nhanes/%s/BMX_%s.XPT", cycle_years, cycle_code)
demo_url <- sprintf("https://wwwn.cdc.gov/Nchs/Nhanes/%s/DEMO_%s.XPT", cycle_years, cycle_code)
bmi_data <- read_xpt(url(bmi_url))
demo_data <- read_xpt(url(demo_url))
inner_join(bmi_data, demo_data, by = "SEQN") %>%
standardize_nhanes_data(cycle_years)
}, error = function(e) {
warning(sprintf("Error loading data for %s: %s", cycle_years, e$message))
return(NULL)
})
}
#' Calculate summary statistics for BMI data
#' @param data Data frame containing BMI data
#' @return Data frame with summary statistics
calculate_bmi_summary <- function(data) {
data %>%
group_by(CYCLE) %>%
summarize(
n = n(),
mean_bmi = mean(BMI, na.rm = TRUE),
median_bmi = median(BMI, na.rm = TRUE),
sd_bmi = sd(BMI, na.rm = TRUE),
obesity_rate = mean(BMI >= 30, na.rm = TRUE) * 100,
severe_obesity_rate = mean(BMI >= 35, na.rm = TRUE) * 100,
.groups = "drop"
) %>%
arrange(CYCLE)
}
#' Main function to load and process all NHANES BMI data
#' @return List containing processed data and summary statistics
process_nhanes_bmi_data <- function() {
# Load data for special cycles
data_list <- list(
#"1971-1975" = load_bmi_data_1971_1975(),
#"1976-1980" = load_bmi_data_1976_1980(),
#"1988-1994" = load_bmi_data_1988_1994(),
"1999-2000" = load_bmi_data_1999_2000()
)
# Load data for standard cycles
standard_cycles <- setdiff(
NHANES_CONFIG$cycles,
#c("1971-1975", "1976-1980", "1988-1994", "1999-2000")
c("1999-2000")
)
for (cycle in standard_cycles) {
data_list[[cycle]] <- load_bmi_data_cycle(cycle)
}
# Combine all data
all_data <- bind_rows(data_list)
# Calculate summary statistics
summary_stats <- calculate_bmi_summary(all_data)
list(
data = all_data,
summary = summary_stats
)
}
# Construct results
results <- process_nhanes_bmi_data()
all_data <- results$data
# adult obesity rate has fallen
all_data %>% filter(AGE %in% 20:55) %>%
group_by(CYCLE) %>%
summarize(
n = n(),
mean_bmi = mean(BMI, na.rm = TRUE),
median_bmi = median(BMI, na.rm = TRUE),
sd_bmi = sd(BMI, na.rm = TRUE),
obesity_rate = mean(BMI >= 30, na.rm = TRUE) * 100,
severe_obesity_rate = mean(BMI >= 35, na.rm = TRUE) * 100,
.groups = "drop"
) %>%
arrange(CYCLE)
# child obesity rate has risen
all_data %>% filter(AGE < 20) %>%
group_by(CYCLE) %>%
summarize(
n = n(),
mean_bmi = mean(BMI, na.rm = TRUE),
median_bmi = median(BMI, na.rm = TRUE),
sd_bmi = sd(BMI, na.rm = TRUE),
obesity_rate = mean(BMI >= 30, na.rm = TRUE) * 100,
severe_obesity_rate = mean(BMI >= 35, na.rm = TRUE) * 100,
.groups = "drop"
) %>%
arrange(CYCLE)
# age-based analysis
age_stratified_analysis <- all_data %>%
mutate(age_group = cut(AGE, breaks = c(0, 12, 20, 35, 45, 55, 70, 100))) %>%
group_by(CYCLE, age_group) %>%
summarize(
mean_bmi = mean(BMI, na.rm = TRUE),
obesity_rate = mean(BMI >= 30, na.rm = TRUE) * 100,
severe_obesity_rate = mean(BMI >= 35, na.rm = TRUE) * 100,
n = n()
)
# density analysis
upper_quantiles <- all_data %>%
filter(AGE %in% 20:55) %>%
group_by(CYCLE) %>%
summarize(
q75 = quantile(BMI, 0.75, na.rm = TRUE),
q90 = quantile(BMI, 0.90, na.rm = TRUE),
q95 = quantile(BMI, 0.95, na.rm = TRUE)
)
# Raw obesity rates by cycle
raw_obesity <- all_data %>%
filter(AGE %in% 20:55) %>%
group_by(CYCLE) %>%
summarize(
obesity_rate = mean(BMI >= 30, na.rm = TRUE) * 100,
n = n()
)
# Age-stratified obesity rates
age_obesity <- all_data %>%
filter(AGE %in% 20:55) %>%
mutate(age_group = cut(AGE, breaks = c(20, 30, 40, 50, 55))) %>%
group_by(CYCLE, age_group) %>%
summarize(
obesity_rate = mean(BMI >= 30, na.rm = TRUE) * 100,
n = n()
)
# VISUALIZATIONS
# Helper function to convert cycle to year
cycle_to_year <- function(cycle) {
as.numeric(case_when(
#cycle == "1971-1975" ~ "1973",
#cycle == "1976-1980" ~ "1978",
#cycle == "1988-1994" ~ "1991",
TRUE ~ substr(cycle, 6, 9)
))
}
# Common theme elements
theme_nhanes <- function() {
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0),
panel.grid.minor = element_blank(),
plot.title = element_text(face = "bold", size = 12),
legend.position = "bottom"
)
}
# Common x-axis scale
scale_x_nhanes <- function() {
scale_x_continuous(
breaks = c(2000, 2010, 2020),
limits = c(1998, 2025)
)
}
# Helper function for weighted variance
weighted.var <- function(x, w, na.rm = FALSE) {
if (na.rm) {
keep <- !is.na(x) & !is.na(w)
x <- x[keep]
w <- w[keep]
}
sum.w <- sum(w)
w.mean <- sum(w * x) / sum.w
sum(w * (x - w.mean)^2) / (sum.w * (length(w) - 1) / length(w))
}
# For weighted percentiles, use a helper function
weighted_quantile <- function(x, w, probs) {
# Order the data
ord <- order(x)
x <- x[ord]
w <- w[ord]
# Normalize weights to sum to 1
w <- w/sum(w)
# Calculate cumulative weights
cw <- cumsum(w)
# Initialize results vector
result <- numeric(length(probs))
# Calculate weighted quantiles
for (i in seq_along(probs)) {
if (probs[i] < cw[1]) {
result[i] <- x[1]
} else {
result[i] <- x[find.segment(cw, probs[i])]
}
}
return(result)
}
# Helper function for weighted_quantile
find.segment <- function(cw, p) {
which(cw >= p)[1]
}
# 1. Weighted BMI percentiles over time
percentile_plot <- all_data %>%
filter(AGE %in% 20:55) %>%
group_by(CYCLE) %>%
summarize(
p95 = weighted_quantile(BMI[!is.na(BMI)], WTMEC2YR[!is.na(BMI)], 0.95),
p90 = weighted_quantile(BMI[!is.na(BMI)], WTMEC2YR[!is.na(BMI)], 0.90),
p75 = weighted_quantile(BMI[!is.na(BMI)], WTMEC2YR[!is.na(BMI)], 0.75),
p50 = weighted_quantile(BMI[!is.na(BMI)], WTMEC2YR[!is.na(BMI)], 0.50),
p25 = weighted_quantile(BMI[!is.na(BMI)], WTMEC2YR[!is.na(BMI)], 0.25),
year = cycle_to_year(first(CYCLE))
) %>%
pivot_longer(cols = c(p95, p90, p75, p50, p25),
names_to = "percentile",
values_to = "bmi") %>%
ggplot(aes(x = year, y = bmi, color = percentile, group = percentile)) +
geom_line(size = 1) +
geom_point(size = 3) +
theme_nhanes() +
scale_x_nhanes() +
labs(title = "Survey-Weighted BMI Percentiles Over Time (Ages 20-55)",
x = "Year",
y = "BMI",
color = "Percentile",
caption = "Graph created by @tyleransom using NHANES Data") +
scale_color_manual(values = c("p25" = "#984EA3",
"p50" = "#FF7F00",
"p75" = "#4DAF4A",
"p90" = "#377EB8",
"p95" = "#E41A1C"),
labels = c("25th", "50th", "75th", "90th", "95th"))
# 2. Age group obesity trends
age_group_plot <- all_data %>%
filter(AGE %in% 20:55) %>%
mutate(age_group = case_when(
AGE >= 20 & AGE < 30 ~ "20-30",
AGE >= 30 & AGE < 40 ~ "30-40",
AGE >= 40 & AGE < 50 ~ "40-50",
AGE >= 50 & AGE <= 55 ~ "50-55"
)) %>%
group_by(CYCLE, age_group) %>%
summarize(
obesity_rate = weighted.mean(BMI >= 30, WTMEC2YR, na.rm = TRUE) * 100,
n = n(),
year = cycle_to_year(first(CYCLE))
) %>%
ggplot(aes(x = year, y = obesity_rate, color = age_group, group = age_group)) +
geom_line(size = 1) +
geom_point(size = 3) +
theme_nhanes() +
scale_x_nhanes() +
labs(title = "Survey-Weighted Obesity Rates by Age Group Over Time",
x = "Year",
y = "Obesity Rate (%)",
color = "Age Group",
caption = "Graph created by @tyleransom using NHANES Data")
# 3. Children's and Youth's obesity trends
youth_obesity_plot <- all_data %>%
filter(AGE %in% 12:19) %>%
group_by(CYCLE) %>%
summarize(
obesity_rate = weighted.mean(BMI >= 30, WTMEC2YR, na.rm = TRUE) * 100,
severe_obesity_rate = weighted.mean(BMI >= 35, WTMEC2YR, na.rm = TRUE) * 100,
mean_bmi = weighted.mean(BMI, WTMEC2YR, na.rm = TRUE),
sd_bmi = sqrt(weighted.var(BMI, WTMEC2YR, na.rm = TRUE)),
year = cycle_to_year(first(CYCLE))
) %>%
pivot_longer(cols = c(obesity_rate, severe_obesity_rate),
names_to = "metric",
values_to = "rate") %>%
ggplot(aes(x = year, y = rate, color = metric, group = metric)) +
geom_line(size = 1) +
geom_point(size = 3) +
theme_nhanes() +
scale_x_nhanes() +
labs(title = "Survey-Weighted Youth Obesity Trends (ages 12-19)",
x = "Year",
y = "Rate (%)",
color = "Category",
caption = "Graph created by @tyleransom using NHANES Data") +
scale_color_manual(values = c("obesity_rate" = "#E41A1C",
"severe_obesity_rate" = "#377EB8"),
labels = c("Obesity (BMI ≥ 30)",
"Severe Obesity (BMI ≥ 35)"))
child_obesity_plot <- all_data %>%
filter(AGE %in% 3:11) %>%
group_by(CYCLE) %>%
summarize(
obesity_rate = weighted.mean(BMI >= 30, WTMEC2YR, na.rm = TRUE) * 100,
severe_obesity_rate = weighted.mean(BMI >= 35, WTMEC2YR, na.rm = TRUE) * 100,
mean_bmi = weighted.mean(BMI, WTMEC2YR, na.rm = TRUE),
sd_bmi = sqrt(weighted.var(BMI, WTMEC2YR, na.rm = TRUE)),
year = cycle_to_year(first(CYCLE))
) %>%
pivot_longer(cols = c(obesity_rate, severe_obesity_rate),
names_to = "metric",
values_to = "rate") %>%
ggplot(aes(x = year, y = rate, color = metric, group = metric)) +
geom_line(size = 1) +
geom_point(size = 3) +
theme_nhanes() +
scale_x_nhanes() +
labs(title = "Survey-Weighted Child Obesity Trends (ages 3-11)",
x = "Year",
y = "Rate (%)",
color = "Category",
caption = "Graph created by @tyleransom using NHANES Data") +
scale_color_manual(values = c("obesity_rate" = "#E41A1C",
"severe_obesity_rate" = "#377EB8"),
labels = c("Obesity (BMI ≥ 30)",
"Severe Obesity (BMI ≥ 35)"))
# 4. BMI Distribution Change (Density plots for first and last cycle)
distribution_plot <- check_data %>%
group_by(year) %>%
do({
# Calculate weighted kernel density
d <- density(.$BMI,
weights = .$WTMEC2YR/sum(.$WTMEC2YR),
n = 512,
from = 15,
to = 85)
data.frame(
x = d$x,
y = d$y
)
}) %>%
ungroup()
# Convert to wider format for ribbon
distribution_wide <- distribution_plot %>%
pivot_wider(names_from = year,
names_prefix = "y",
values_from = y)
# Create plot with ribbon
distribution_plot <- ggplot(distribution_wide, aes(x = x)) +
geom_ribbon(aes(ymin = 0, ymax = y2018, fill = "2018"), alpha = 0.3) +
geom_ribbon(aes(ymin = 0, ymax = y2022, fill = "2022"), alpha = 0.3) +
geom_line(aes(y = y2018, color = "2018")) +
geom_line(aes(y = y2022, color = "2022")) +
theme_nhanes() +
scale_x_continuous(breaks = seq(20, 80, 10)) +
coord_cartesian(xlim = c(15, 85)) +
scale_y_continuous(labels = scales::number_format(accuracy = 0.01)) +
labs(title = "Survey-Weighted BMI Distribution Change (Ages 20-55)",
x = "BMI",
y = "Density",
fill = "Year",
color = "Year",
caption = "Graph created by @tyleransom using NHANES Data") +
scale_fill_manual(values = c("2018" = "#E41A1C", "2022" = "#377EB8")) +
scale_color_manual(values = c("2018" = "#E41A1C", "2022" = "#377EB8"))
# 5. Overall adult obesity trend (FT replication)
overall_adult_trend <- all_data %>%
filter(AGE %in% 20:55) %>%
group_by(CYCLE) %>%
summarize(
obesity_rate = weighted.mean(BMI >= 30, WTMEC2YR, na.rm = TRUE) * 100,
n = n(),
year = cycle_to_year(first(CYCLE))
) %>%
ggplot(aes(x = year, y = obesity_rate)) +
geom_line(size = 1, color = "#2E74B5") + # FT blue
geom_point(size = 3, color = "#2E74B5") +
theme_nhanes() +
scale_x_nhanes() +
labs(
title = "Survey-Weighted U.S. Adult Obesity Rate (Ages 20-55)",
x = "Year",
y = "Obesity Rate (%)",
caption = "Graph created by @tyleransom using NHANES Data") +
scale_y_continuous(
limits = c(0, 50),
breaks = seq(0, 50, 10)
)
# Combine plots using patchwork
combined_plots <- (percentile_plot + age_group_plot) /
(child_obesity_plot + distribution_plot)
# Save all plots
ggsave("wgt_nhanes_obesity_trends.png", combined_plots,
width = 15, height = 12, units = "in")
ggsave("wgt_ft_obesity_trend.png", overall_adult_trend,
width = 8, height = 6, units = "in")
ggsave("wgt_percentile_trends.png", percentile_plot,
width = 8, height = 6, units = "in")
ggsave("wgt_age_group_trends.png", age_group_plot,
width = 8, height = 6, units = "in")
ggsave("wgt_child_obesity_trends.png", child_obesity_plot,
width = 8, height = 6, units = "in")
ggsave("wgt_youth_obesity_trends.png", youth_obesity_plot,
width = 8, height = 6, units = "in")
ggsave("wgt_bmi_distribution.png", distribution_plot,
width = 8, height = 6, units = "in")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment