tyleransom · December 4, 2024 04:20
diff --git a/script.R b/script.R
 # Required libraries
 library(RNHANES)
 library(tidyverse)
 library(survey)
 library(haven)
 library(stats)
 library(readr)
 library(patchwork)
 library(scales)

 #' Constants and configurations
 NHANES_CONFIG <- list(
    cycles = c(
        #"1971-1975",
        #"1976-1980",
        #"1988-1994",
        "1999-2000",
        "2001-2002",
        "2003-2004",
        "2005-2006",
        "2007-2008",
        "2009-2010",
        "2011-2012",
        "2013-2014",
        "2015-2016",
        "2017-2018",
        "2021-2022"
    ),
    cycle_codes = c(
        "2001-2002" = "B",
        "2003-2004" = "C",
        "2005-2006" = "D",
        "2007-2008" = "E",
        "2009-2010" = "F",
        "2011-2012" = "G",
        "2013-2014" = "H",
        "2015-2016" = "I",
        "2017-2018" = "J",
        "2021-2022" = "L"
    )
 )

 #' Standard column names for consistent data structure
 STANDARD_COLS <- c(
    "SEQN",
    "BMI",
    "CYCLE",
    "GENDER",
    "AGE",
    "WTMEC2YR",
    "SDMVPSU",
    "SDMVSTRA"
 )

 #' Process raw NHANES data into standardized format
 #' @param data Data frame containing raw NHANES data
 #' @param cycle_year String indicating the NHANES cycle
 #' @return Processed data frame with standardized columns
 standardize_nhanes_data <- function(data, cycle_year) {
    data %>%
        select(SEQN, RIDAGEYR, RIAGENDR, BMXBMI, WTMEC2YR, SDMVPSU, SDMVSTRA) %>%
        rename(
            BMI = BMXBMI,
            AGE = RIDAGEYR,
            GENDER = RIAGENDR
        ) %>%
        mutate(
            CYCLE = cycle_year,
            GENDER = factor(GENDER, levels = c(1, 2), labels = c("Male", "Female"))
        )
 }

 #' Load BMI data for NHANES 1971-1975
 #' @return Processed data frame
 load_bmi_data_1971_1975 <- function() {
    url <- "https://wwwn.cdc.gov/nchs/data/nhanes1/DU4111.txt"
    
    raw_data <- read_fwf(
        file = url,
        col_positions = fwf_positions(
            start = c(1, 101, 104, 255, 266, 95, 196),
            end   = c(5, 102, 104, 259, 269, 99, 198),
            col_names = c("SEQN", "N1BM0101", "N1BM0104", "N1BM0255", "N1BM0266", "N1BM0095", "N1BM0196")
        )
    ) %>%
        mutate(across(everything(), ~as.numeric(str_trim(.x)))) # Convert all to numeric after trimming
    
    processed_data <- raw_data %>%
        mutate(
            SEQN = as.integer(SEQN), # Ensure SEQN is integer type
            BMI = ifelse(!is.na(N1BM0255) & !is.na(N1BM0266) & N1BM0266 > 0,
                         N1BM0255 * 0.453592 / ((N1BM0266/100)^2),
                         NA_real_),
            CYCLE = "1971-1975",
            GENDER = factor(N1BM0104, levels = c(1, 2), labels = c("Male", "Female")),
            AGE = N1BM0101,
            WTMEC2YR = 1,
            SDMVSTRA = N1BM0095,
            SDMVPSU = N1BM0196
        ) %>%
        select(all_of(STANDARD_COLS))
    
    return(processed_data)
 }

 #' Load BMI data for NHANES 1976-1980
 load_bmi_data_1976_1980 <- function() {
    url <- "https://wwwn.cdc.gov/nchs/data/nhanes2/DU5301.txt"
    
    raw_data <- read_fwf(
        file = url,
        col_positions = fwf_positions(
            start = c(1,    55,    407,   412,   418,   422,   190,   282,    324,   326),
            end   = c(5,    55,    411,   416,   421,   424,   191,   287,    325,   326),
            col_names = c("SEQN", "SEX", "WT1", "WT2", "HT1", "HT2", 
                          "AGE", "WTMEC2YR", "SDMVSTRA", "SDMVPSU")
        )
    ) %>%
        mutate(across(everything(), ~ifelse(.x == "" | .x == " ", NA_character_, .x))) %>%
        mutate(across(everything(), ~as.numeric(str_trim(.x)))) %>%
        mutate(
            SEQN = as.integer(SEQN),
            # Convert string measurements to numeric, handling implied decimals
            WT1 = ifelse(WT1 >= 9999, NA_real_, as.numeric(WT1)/100),  # lbs with 2 decimal places
            WT2 = ifelse(WT2 >= 9999, NA_real_, as.numeric(WT2)/100),  # kg with 2 decimal places
            HT1 = ifelse(HT1 >= 9999, NA_real_, as.numeric(HT1)/10),   # cm with 1 decimal place
            HT2 = ifelse(HT2 >= 999, NA_real_, as.numeric(HT2)/10),    # inches with 1 decimal place
            # Use kg and cm for BMI calculation
            WT = WT2,  # Already in kg
            HT = HT1,  # Already in cm
            # Calculate BMI
            BMI = ifelse(!is.na(WT) & !is.na(HT) & HT > 0 & WT > 0,
                         WT / ((HT/100)^2),
                         NA_real_),
            CYCLE = "1976-1980",
            GENDER = factor(SEX, levels = c(1, 2), labels = c("Male", "Female"))
        ) %>%
        select(SEQN, BMI, CYCLE, GENDER, AGE, WTMEC2YR, SDMVSTRA, SDMVPSU)
 }

 #' Load BMI data for NHANES 1988-1994
 load_bmi_data_1988_1994 <- function() {
    url <- "https://wwwn.cdc.gov/nchs/data/nhanes3/1a/exam.dat"
    
    raw_data <- read_fwf(
        file = url,
        col_positions = fwf_positions(
            start = c(1,      14199,  14203,  115,    13,     335,    333,    12),
            end   = c(5,      14202,  14206,  117,    13,     337,    334,    19),
            col_names = c("SEQN", "BMPWT", "BMPHT", "HSAGEIR", "HSSEX", 
                          "SDPSTRA6", "SDPPSU6", "WTPFEX6")
        )
    ) %>%
        mutate(across(everything(), ~as.numeric(str_trim(.x))))
    
    processed_data <- raw_data %>%
        mutate(
            SEQN = as.integer(SEQN),
            BMI = ifelse(!is.na(BMPWT) & !is.na(BMPHT) & BMPHT > 0,
                         BMPWT / ((BMPHT/100)^2),
                         NA_real_),
            CYCLE = "1988-1994",
            GENDER = factor(HSSEX, levels = c(1, 2), labels = c("Male", "Female")),
            AGE = HSAGEIR,
            SDMVSTRA = SDPSTRA6,
            SDMVPSU = SDPPSU6,
            WTMEC2YR = WTPFEX6
        ) %>%
        select(SEQN, BMI, CYCLE, GENDER, AGE, WTMEC2YR, SDMVSTRA, SDMVPSU)
 }

 #' Load BMI data for NHANES 1999-2000
 #' @return Processed data frame
 load_bmi_data_1999_2000 <- function() {
    nhanes_load_data("BMX", "1999-2000", demographics = TRUE) %>%
        standardize_nhanes_data("1999-2000")
 }


 #' Load BMI data for a specific NHANES cycle
 #' @param cycle_years String indicating the NHANES cycle
 #' @return Processed data frame
 load_bmi_data_cycle <- function(cycle_years) {
    tryCatch({
        cycle_code <- NHANES_CONFIG$cycle_codes[cycle_years]
        
        bmi_url <- sprintf("https://wwwn.cdc.gov/Nchs/Nhanes/%s/BMX_%s.XPT", cycle_years, cycle_code)
        demo_url <- sprintf("https://wwwn.cdc.gov/Nchs/Nhanes/%s/DEMO_%s.XPT", cycle_years, cycle_code)
        
        bmi_data <- read_xpt(url(bmi_url))
        demo_data <- read_xpt(url(demo_url))
        
        inner_join(bmi_data, demo_data, by = "SEQN") %>%
            standardize_nhanes_data(cycle_years)
    }, error = function(e) {
        warning(sprintf("Error loading data for %s: %s", cycle_years, e$message))
        return(NULL)
    })
 }

 #' Calculate summary statistics for BMI data
 #' @param data Data frame containing BMI data
 #' @return Data frame with summary statistics
 calculate_bmi_summary <- function(data) {
    data %>%
        group_by(CYCLE) %>%
        summarize(
            n = n(),
            mean_bmi = mean(BMI, na.rm = TRUE),
            median_bmi = median(BMI, na.rm = TRUE),
            sd_bmi = sd(BMI, na.rm = TRUE),
            obesity_rate = mean(BMI >= 30, na.rm = TRUE) * 100,
            severe_obesity_rate = mean(BMI >= 35, na.rm = TRUE) * 100,
            .groups = "drop"
        ) %>%
        arrange(CYCLE)
 }

 #' Main function to load and process all NHANES BMI data
 #' @return List containing processed data and summary statistics
 process_nhanes_bmi_data <- function() {
    # Load data for special cycles
    data_list <- list(
        #"1971-1975" = load_bmi_data_1971_1975(),
        #"1976-1980" = load_bmi_data_1976_1980(),
        #"1988-1994" = load_bmi_data_1988_1994(),
        "1999-2000" = load_bmi_data_1999_2000()
    )
    
    # Load data for standard cycles
    standard_cycles <- setdiff(
        NHANES_CONFIG$cycles,
        #c("1971-1975", "1976-1980", "1988-1994", "1999-2000")
        c("1999-2000")
    )
    
    for (cycle in standard_cycles) {
        data_list[[cycle]] <- load_bmi_data_cycle(cycle)
    }
    
    # Combine all data
    all_data <- bind_rows(data_list)
    
    # Calculate summary statistics
    summary_stats <- calculate_bmi_summary(all_data)
    
    list(
        data = all_data,
        summary = summary_stats
    )
 }

 # Construct results
 results <- process_nhanes_bmi_data()

 all_data <- results$data

 # adult obesity rate has fallen
 all_data %>% filter(AGE %in% 20:55) %>%
    group_by(CYCLE) %>%
    summarize(
        n = n(),
        mean_bmi = mean(BMI, na.rm = TRUE),
        median_bmi = median(BMI, na.rm = TRUE),
        sd_bmi = sd(BMI, na.rm = TRUE),
        obesity_rate = mean(BMI >= 30, na.rm = TRUE) * 100,
        severe_obesity_rate = mean(BMI >= 35, na.rm = TRUE) * 100,
        .groups = "drop"
    ) %>%
    arrange(CYCLE)

 # child obesity rate has risen
 all_data %>% filter(AGE < 20) %>%
    group_by(CYCLE) %>%
    summarize(
        n = n(),
        mean_bmi = mean(BMI, na.rm = TRUE),
        median_bmi = median(BMI, na.rm = TRUE),
        sd_bmi = sd(BMI, na.rm = TRUE),
        obesity_rate = mean(BMI >= 30, na.rm = TRUE) * 100,
        severe_obesity_rate = mean(BMI >= 35, na.rm = TRUE) * 100,
        .groups = "drop"
    ) %>%
    arrange(CYCLE)

 # age-based analysis
 age_stratified_analysis <- all_data %>%
    mutate(age_group = cut(AGE, breaks = c(0, 12, 20, 35, 45, 55, 70, 100))) %>%
    group_by(CYCLE, age_group) %>%
    summarize(
        mean_bmi = mean(BMI, na.rm = TRUE),
        obesity_rate = mean(BMI >= 30, na.rm = TRUE) * 100,
        severe_obesity_rate = mean(BMI >= 35, na.rm = TRUE) * 100,
        n = n()
    )

 # density analysis
 upper_quantiles <- all_data %>%
    filter(AGE %in% 20:55) %>%
    group_by(CYCLE) %>%
    summarize(
        q75 = quantile(BMI, 0.75, na.rm = TRUE),
        q90 = quantile(BMI, 0.90, na.rm = TRUE),
        q95 = quantile(BMI, 0.95, na.rm = TRUE)
    )

 # Raw obesity rates by cycle
 raw_obesity <- all_data  %>%
    filter(AGE %in% 20:55) %>%
    group_by(CYCLE) %>%
    summarize(
        obesity_rate = mean(BMI >= 30, na.rm = TRUE) * 100,
        n = n()
    )

 # Age-stratified obesity rates
 age_obesity <- all_data %>%
    filter(AGE %in% 20:55) %>%
    mutate(age_group = cut(AGE, breaks = c(20, 30, 40, 50, 55))) %>%
    group_by(CYCLE, age_group) %>%
    summarize(
        obesity_rate = mean(BMI >= 30, na.rm = TRUE) * 100,
        n = n()
    )







 # VISUALIZATIONS


 # Helper function to convert cycle to year
 cycle_to_year <- function(cycle) {
    as.numeric(case_when(
        #cycle == "1971-1975" ~ "1973",
        #cycle == "1976-1980" ~ "1978",
        #cycle == "1988-1994" ~ "1991",
        TRUE ~ substr(cycle, 6, 9)
    ))
 }

 # Common theme elements
 theme_nhanes <- function() {
    theme_minimal() +
        theme(
            axis.text.x = element_text(angle = 0),
            panel.grid.minor = element_blank(),
            plot.title = element_text(face = "bold", size = 12),
            legend.position = "bottom"
        )
 }

 # Common x-axis scale
 scale_x_nhanes <- function() {
    scale_x_continuous(
        breaks = c(2000, 2010, 2020),
        limits = c(1998, 2025)
    )
 }

 # Helper function for weighted variance
 weighted.var <- function(x, w, na.rm = FALSE) {
    if (na.rm) {
        keep <- !is.na(x) & !is.na(w)
        x <- x[keep]
        w <- w[keep]
    }
    sum.w <- sum(w)
    w.mean <- sum(w * x) / sum.w
    sum(w * (x - w.mean)^2) / (sum.w * (length(w) - 1) / length(w))
 }

 # For weighted percentiles, use a helper function
 weighted_quantile <- function(x, w, probs) {
    # Order the data
    ord <- order(x)
    x <- x[ord]
    w <- w[ord]
    
    # Normalize weights to sum to 1
    w <- w/sum(w)
    
    # Calculate cumulative weights
    cw <- cumsum(w)
    
    # Initialize results vector
    result <- numeric(length(probs))
    
    # Calculate weighted quantiles
    for (i in seq_along(probs)) {
        if (probs[i] < cw[1]) {
            result[i] <- x[1]
        } else {
            result[i] <- x[find.segment(cw, probs[i])]
        }
    }
    
    return(result)
 }

 # Helper function for weighted_quantile
 find.segment <- function(cw, p) {
    which(cw >= p)[1]
 }

 # 1. Weighted BMI percentiles over time
 percentile_plot <- all_data %>%
    filter(AGE %in% 20:55) %>%
    group_by(CYCLE) %>%
    summarize(
        p95 = weighted_quantile(BMI[!is.na(BMI)], WTMEC2YR[!is.na(BMI)], 0.95),
        p90 = weighted_quantile(BMI[!is.na(BMI)], WTMEC2YR[!is.na(BMI)], 0.90),
        p75 = weighted_quantile(BMI[!is.na(BMI)], WTMEC2YR[!is.na(BMI)], 0.75),
        p50 = weighted_quantile(BMI[!is.na(BMI)], WTMEC2YR[!is.na(BMI)], 0.50),
        p25 = weighted_quantile(BMI[!is.na(BMI)], WTMEC2YR[!is.na(BMI)], 0.25),
        year = cycle_to_year(first(CYCLE))
    ) %>%
    pivot_longer(cols = c(p95, p90, p75, p50, p25), 
                 names_to = "percentile", 
                 values_to = "bmi") %>%
    ggplot(aes(x = year, y = bmi, color = percentile, group = percentile)) +
    geom_line(size = 1) +
    geom_point(size = 3) +
    theme_nhanes() +
    scale_x_nhanes() +
    labs(title = "Survey-Weighted BMI Percentiles Over Time (Ages 20-55)",
         x = "Year",
         y = "BMI",
         color = "Percentile",
         caption = "Graph created by @tyleransom using NHANES Data") +
    scale_color_manual(values = c("p25" = "#984EA3",
                                  "p50" = "#FF7F00",
                                  "p75" = "#4DAF4A", 
                                  "p90" = "#377EB8", 
                                  "p95" = "#E41A1C"),
                       labels = c("25th", "50th", "75th", "90th", "95th"))

 # 2. Age group obesity trends
 age_group_plot <- all_data %>%
    filter(AGE %in% 20:55) %>%
    mutate(age_group = case_when(
        AGE >= 20 & AGE < 30 ~ "20-30",
        AGE >= 30 & AGE < 40 ~ "30-40",
        AGE >= 40 & AGE < 50 ~ "40-50",
        AGE >= 50 & AGE <= 55 ~ "50-55"
    )) %>%
    group_by(CYCLE, age_group) %>%
    summarize(
        obesity_rate = weighted.mean(BMI >= 30, WTMEC2YR, na.rm = TRUE) * 100,
        n = n(),
        year = cycle_to_year(first(CYCLE))
    ) %>%
    ggplot(aes(x = year, y = obesity_rate, color = age_group, group = age_group)) +
    geom_line(size = 1) +
    geom_point(size = 3) +
    theme_nhanes() +
    scale_x_nhanes() +
    labs(title = "Survey-Weighted Obesity Rates by Age Group Over Time",
         x = "Year",
         y = "Obesity Rate (%)",
         color = "Age Group",
         caption = "Graph created by @tyleransom using NHANES Data")

 # 3. Children's and Youth's obesity trends
 youth_obesity_plot <- all_data %>%
    filter(AGE %in% 12:19) %>%
    group_by(CYCLE) %>%
    summarize(
        obesity_rate = weighted.mean(BMI >= 30, WTMEC2YR, na.rm = TRUE) * 100,
        severe_obesity_rate = weighted.mean(BMI >= 35, WTMEC2YR, na.rm = TRUE) * 100,
        mean_bmi = weighted.mean(BMI, WTMEC2YR, na.rm = TRUE),
        sd_bmi = sqrt(weighted.var(BMI, WTMEC2YR, na.rm = TRUE)),
        year = cycle_to_year(first(CYCLE))
    ) %>%
    pivot_longer(cols = c(obesity_rate, severe_obesity_rate),
                 names_to = "metric",
                 values_to = "rate") %>%
    ggplot(aes(x = year, y = rate, color = metric, group = metric)) +
    geom_line(size = 1) +
    geom_point(size = 3) +
    theme_nhanes() +
    scale_x_nhanes() +
    labs(title = "Survey-Weighted Youth Obesity Trends (ages 12-19)",
         x = "Year",
         y = "Rate (%)",
         color = "Category",
         caption = "Graph created by @tyleransom using NHANES Data") +
    scale_color_manual(values = c("obesity_rate" = "#E41A1C",
                                  "severe_obesity_rate" = "#377EB8"),
                       labels = c("Obesity (BMI ≥ 30)",
                                  "Severe Obesity (BMI ≥ 35)"))

 child_obesity_plot <- all_data %>%
    filter(AGE %in% 3:11) %>%
    group_by(CYCLE) %>%
    summarize(
        obesity_rate = weighted.mean(BMI >= 30, WTMEC2YR, na.rm = TRUE) * 100,
        severe_obesity_rate = weighted.mean(BMI >= 35, WTMEC2YR, na.rm = TRUE) * 100,
        mean_bmi = weighted.mean(BMI, WTMEC2YR, na.rm = TRUE),
        sd_bmi = sqrt(weighted.var(BMI, WTMEC2YR, na.rm = TRUE)),
        year = cycle_to_year(first(CYCLE))
    ) %>%
    pivot_longer(cols = c(obesity_rate, severe_obesity_rate),
                 names_to = "metric",
                 values_to = "rate") %>%
    ggplot(aes(x = year, y = rate, color = metric, group = metric)) +
    geom_line(size = 1) +
    geom_point(size = 3) +
    theme_nhanes() +
    scale_x_nhanes() +
    labs(title = "Survey-Weighted Child Obesity Trends (ages 3-11)",
         x = "Year",
         y = "Rate (%)",
         color = "Category",
         caption = "Graph created by @tyleransom using NHANES Data") +
    scale_color_manual(values = c("obesity_rate" = "#E41A1C",
                                  "severe_obesity_rate" = "#377EB8"),
                       labels = c("Obesity (BMI ≥ 30)",
                                  "Severe Obesity (BMI ≥ 35)"))

 # 4. BMI Distribution Change (Density plots for first and last cycle)
 distribution_plot <- check_data %>%
    group_by(year) %>%
    do({
        # Calculate weighted kernel density
        d <- density(.$BMI, 
                     weights = .$WTMEC2YR/sum(.$WTMEC2YR),
                     n = 512,
                     from = 15,
                     to = 85)
        
        data.frame(
            x = d$x,
            y = d$y
        )
    }) %>%
    ungroup()

 # Convert to wider format for ribbon
 distribution_wide <- distribution_plot %>%
    pivot_wider(names_from = year, 
                names_prefix = "y",
                values_from = y)

 # Create plot with ribbon
 distribution_plot <- ggplot(distribution_wide, aes(x = x)) +
    geom_ribbon(aes(ymin = 0, ymax = y2018, fill = "2018"), alpha = 0.3) +
    geom_ribbon(aes(ymin = 0, ymax = y2022, fill = "2022"), alpha = 0.3) +
    geom_line(aes(y = y2018, color = "2018")) +
    geom_line(aes(y = y2022, color = "2022")) +
    theme_nhanes() +
    scale_x_continuous(breaks = seq(20, 80, 10)) +
    coord_cartesian(xlim = c(15, 85)) +
    scale_y_continuous(labels = scales::number_format(accuracy = 0.01)) +
    labs(title = "Survey-Weighted BMI Distribution Change (Ages 20-55)",
         x = "BMI",
         y = "Density",
         fill = "Year",
         color = "Year",
         caption = "Graph created by @tyleransom using NHANES Data") +
    scale_fill_manual(values = c("2018" = "#E41A1C", "2022" = "#377EB8")) +
    scale_color_manual(values = c("2018" = "#E41A1C", "2022" = "#377EB8"))

 # 5. Overall adult obesity trend (FT replication)
 overall_adult_trend <- all_data %>%
    filter(AGE %in% 20:55) %>%
    group_by(CYCLE) %>%
    summarize(
        obesity_rate = weighted.mean(BMI >= 30, WTMEC2YR, na.rm = TRUE) * 100,
        n = n(),
        year = cycle_to_year(first(CYCLE))
    ) %>%
    ggplot(aes(x = year, y = obesity_rate)) +
    geom_line(size = 1, color = "#2E74B5") +  # FT blue
    geom_point(size = 3, color = "#2E74B5") +
    theme_nhanes() +
    scale_x_nhanes() +
    labs(
        title = "Survey-Weighted U.S. Adult Obesity Rate (Ages 20-55)",
        x = "Year",
        y = "Obesity Rate (%)",
        caption = "Graph created by @tyleransom using NHANES Data") +
    scale_y_continuous(
        limits = c(0, 50),
        breaks = seq(0, 50, 10)
    )

 # Combine plots using patchwork
 combined_plots <- (percentile_plot + age_group_plot) / 
    (child_obesity_plot + distribution_plot)

 # Save all plots
 ggsave("wgt_nhanes_obesity_trends.png", combined_plots, 
       width = 15, height = 12, units = "in")
 ggsave("wgt_ft_obesity_trend.png", overall_adult_trend, 
       width = 8, height = 6, units = "in")
 ggsave("wgt_percentile_trends.png", percentile_plot, 
       width = 8, height = 6, units = "in")
 ggsave("wgt_age_group_trends.png", age_group_plot, 
       width = 8, height = 6, units = "in")
 ggsave("wgt_child_obesity_trends.png", child_obesity_plot, 
       width = 8, height = 6, units = "in")
 ggsave("wgt_youth_obesity_trends.png", youth_obesity_plot, 
       width = 8, height = 6, units = "in")
 ggsave("wgt_bmi_distribution.png", distribution_plot, 
       width = 8, height = 6, units = "in")
	# Required libraries
	library(RNHANES)
	library(tidyverse)
	library(survey)
	library(haven)
	library(stats)
	library(readr)
	library(patchwork)
	library(scales)

	#' Constants and configurations
	NHANES_CONFIG <- list(
	cycles = c(
	#"1971-1975",
	#"1976-1980",
	#"1988-1994",
	"1999-2000",
	"2001-2002",
	"2003-2004",
	"2005-2006",
	"2007-2008",
	"2009-2010",
	"2011-2012",
	"2013-2014",
	"2015-2016",
	"2017-2018",
	"2021-2022"
	),
	cycle_codes = c(
	"2001-2002" = "B",
	"2003-2004" = "C",
	"2005-2006" = "D",
	"2007-2008" = "E",
	"2009-2010" = "F",
	"2011-2012" = "G",
	"2013-2014" = "H",
	"2015-2016" = "I",
	"2017-2018" = "J",
	"2021-2022" = "L"
	)
	)

	#' Standard column names for consistent data structure
	STANDARD_COLS <- c(
	"SEQN",
	"BMI",
	"CYCLE",
	"GENDER",
	"AGE",
	"WTMEC2YR",
	"SDMVPSU",
	"SDMVSTRA"
	)

	#' Process raw NHANES data into standardized format
	#' @param data Data frame containing raw NHANES data
	#' @param cycle_year String indicating the NHANES cycle
	#' @return Processed data frame with standardized columns
	standardize_nhanes_data <- function(data, cycle_year) {
	data %>%
	select(SEQN, RIDAGEYR, RIAGENDR, BMXBMI, WTMEC2YR, SDMVPSU, SDMVSTRA) %>%
	rename(
	BMI = BMXBMI,
	AGE = RIDAGEYR,
	GENDER = RIAGENDR
	) %>%
	mutate(
	CYCLE = cycle_year,
	GENDER = factor(GENDER, levels = c(1, 2), labels = c("Male", "Female"))
	)
	}

	#' Load BMI data for NHANES 1971-1975
	#' @return Processed data frame
	load_bmi_data_1971_1975 <- function() {
	url <- "https://wwwn.cdc.gov/nchs/data/nhanes1/DU4111.txt"

	raw_data <- read_fwf(
	file = url,
	col_positions = fwf_positions(
	start = c(1, 101, 104, 255, 266, 95, 196),
	end = c(5, 102, 104, 259, 269, 99, 198),
	col_names = c("SEQN", "N1BM0101", "N1BM0104", "N1BM0255", "N1BM0266", "N1BM0095", "N1BM0196")
	)
	) %>%
	mutate(across(everything(), ~as.numeric(str_trim(.x)))) # Convert all to numeric after trimming

	processed_data <- raw_data %>%
	mutate(
	SEQN = as.integer(SEQN), # Ensure SEQN is integer type
	BMI = ifelse(!is.na(N1BM0255) & !is.na(N1BM0266) & N1BM0266 > 0,
	N1BM0255 * 0.453592 / ((N1BM0266/100)^2),
	NA_real_),
	CYCLE = "1971-1975",
	GENDER = factor(N1BM0104, levels = c(1, 2), labels = c("Male", "Female")),
	AGE = N1BM0101,
	WTMEC2YR = 1,
	SDMVSTRA = N1BM0095,
	SDMVPSU = N1BM0196
	) %>%
	select(all_of(STANDARD_COLS))

	return(processed_data)
	}

	#' Load BMI data for NHANES 1976-1980
	load_bmi_data_1976_1980 <- function() {
	url <- "https://wwwn.cdc.gov/nchs/data/nhanes2/DU5301.txt"

	raw_data <- read_fwf(
	file = url,
	col_positions = fwf_positions(
	start = c(1, 55, 407, 412, 418, 422, 190, 282, 324, 326),
	end = c(5, 55, 411, 416, 421, 424, 191, 287, 325, 326),
	col_names = c("SEQN", "SEX", "WT1", "WT2", "HT1", "HT2",
	"AGE", "WTMEC2YR", "SDMVSTRA", "SDMVPSU")
	)
	) %>%
	mutate(across(everything(), ~ifelse(.x == "" \| .x == " ", NA_character_, .x))) %>%
	mutate(across(everything(), ~as.numeric(str_trim(.x)))) %>%
	mutate(
	SEQN = as.integer(SEQN),
	# Convert string measurements to numeric, handling implied decimals
	WT1 = ifelse(WT1 >= 9999, NA_real_, as.numeric(WT1)/100), # lbs with 2 decimal places
	WT2 = ifelse(WT2 >= 9999, NA_real_, as.numeric(WT2)/100), # kg with 2 decimal places
	HT1 = ifelse(HT1 >= 9999, NA_real_, as.numeric(HT1)/10), # cm with 1 decimal place
	HT2 = ifelse(HT2 >= 999, NA_real_, as.numeric(HT2)/10), # inches with 1 decimal place
	# Use kg and cm for BMI calculation
	WT = WT2, # Already in kg
	HT = HT1, # Already in cm
	# Calculate BMI
	BMI = ifelse(!is.na(WT) & !is.na(HT) & HT > 0 & WT > 0,
	WT / ((HT/100)^2),
	NA_real_),
	CYCLE = "1976-1980",
	GENDER = factor(SEX, levels = c(1, 2), labels = c("Male", "Female"))
	) %>%
	select(SEQN, BMI, CYCLE, GENDER, AGE, WTMEC2YR, SDMVSTRA, SDMVPSU)
	}

	#' Load BMI data for NHANES 1988-1994
	load_bmi_data_1988_1994 <- function() {
	url <- "https://wwwn.cdc.gov/nchs/data/nhanes3/1a/exam.dat"

	raw_data <- read_fwf(
	file = url,
	col_positions = fwf_positions(
	start = c(1, 14199, 14203, 115, 13, 335, 333, 12),
	end = c(5, 14202, 14206, 117, 13, 337, 334, 19),
	col_names = c("SEQN", "BMPWT", "BMPHT", "HSAGEIR", "HSSEX",
	"SDPSTRA6", "SDPPSU6", "WTPFEX6")
	)
	) %>%
	mutate(across(everything(), ~as.numeric(str_trim(.x))))

	processed_data <- raw_data %>%
	mutate(
	SEQN = as.integer(SEQN),
	BMI = ifelse(!is.na(BMPWT) & !is.na(BMPHT) & BMPHT > 0,
	BMPWT / ((BMPHT/100)^2),
	NA_real_),
	CYCLE = "1988-1994",
	GENDER = factor(HSSEX, levels = c(1, 2), labels = c("Male", "Female")),
	AGE = HSAGEIR,
	SDMVSTRA = SDPSTRA6,
	SDMVPSU = SDPPSU6,
	WTMEC2YR = WTPFEX6
	) %>%
	select(SEQN, BMI, CYCLE, GENDER, AGE, WTMEC2YR, SDMVSTRA, SDMVPSU)
	}

	#' Load BMI data for NHANES 1999-2000
	#' @return Processed data frame
	load_bmi_data_1999_2000 <- function() {
	nhanes_load_data("BMX", "1999-2000", demographics = TRUE) %>%
	standardize_nhanes_data("1999-2000")
	}


	#' Load BMI data for a specific NHANES cycle
	#' @param cycle_years String indicating the NHANES cycle
	#' @return Processed data frame
	load_bmi_data_cycle <- function(cycle_years) {
	tryCatch({
	cycle_code <- NHANES_CONFIG$cycle_codes[cycle_years]

	bmi_url <- sprintf("https://wwwn.cdc.gov/Nchs/Nhanes/%s/BMX_%s.XPT", cycle_years, cycle_code)
	demo_url <- sprintf("https://wwwn.cdc.gov/Nchs/Nhanes/%s/DEMO_%s.XPT", cycle_years, cycle_code)

	bmi_data <- read_xpt(url(bmi_url))
	demo_data <- read_xpt(url(demo_url))

	inner_join(bmi_data, demo_data, by = "SEQN") %>%
	standardize_nhanes_data(cycle_years)
	}, error = function(e) {
	warning(sprintf("Error loading data for %s: %s", cycle_years, e$message))
	return(NULL)
	})
	}

	#' Calculate summary statistics for BMI data
	#' @param data Data frame containing BMI data
	#' @return Data frame with summary statistics
	calculate_bmi_summary <- function(data) {
	data %>%
	group_by(CYCLE) %>%
	summarize(
	n = n(),
	mean_bmi = mean(BMI, na.rm = TRUE),
	median_bmi = median(BMI, na.rm = TRUE),
	sd_bmi = sd(BMI, na.rm = TRUE),
	obesity_rate = mean(BMI >= 30, na.rm = TRUE) * 100,
	severe_obesity_rate = mean(BMI >= 35, na.rm = TRUE) * 100,
	.groups = "drop"
	) %>%
	arrange(CYCLE)
	}

	#' Main function to load and process all NHANES BMI data
	#' @return List containing processed data and summary statistics
	process_nhanes_bmi_data <- function() {
	# Load data for special cycles
	data_list <- list(
	#"1971-1975" = load_bmi_data_1971_1975(),
	#"1976-1980" = load_bmi_data_1976_1980(),
	#"1988-1994" = load_bmi_data_1988_1994(),
	"1999-2000" = load_bmi_data_1999_2000()
	)

	# Load data for standard cycles
	standard_cycles <- setdiff(
	NHANES_CONFIG$cycles,
	#c("1971-1975", "1976-1980", "1988-1994", "1999-2000")
	c("1999-2000")
	)

	for (cycle in standard_cycles) {
	data_list[[cycle]] <- load_bmi_data_cycle(cycle)
	}

	# Combine all data
	all_data <- bind_rows(data_list)

	# Calculate summary statistics
	summary_stats <- calculate_bmi_summary(all_data)

	list(
	data = all_data,
	summary = summary_stats
	)
	}

	# Construct results
	results <- process_nhanes_bmi_data()

	all_data <- results$data

	# adult obesity rate has fallen
	all_data %>% filter(AGE %in% 20:55) %>%
	group_by(CYCLE) %>%
	summarize(
	n = n(),
	mean_bmi = mean(BMI, na.rm = TRUE),
	median_bmi = median(BMI, na.rm = TRUE),
	sd_bmi = sd(BMI, na.rm = TRUE),
	obesity_rate = mean(BMI >= 30, na.rm = TRUE) * 100,
	severe_obesity_rate = mean(BMI >= 35, na.rm = TRUE) * 100,
	.groups = "drop"
	) %>%
	arrange(CYCLE)

	# child obesity rate has risen
	all_data %>% filter(AGE < 20) %>%
	group_by(CYCLE) %>%
	summarize(
	n = n(),
	mean_bmi = mean(BMI, na.rm = TRUE),
	median_bmi = median(BMI, na.rm = TRUE),
	sd_bmi = sd(BMI, na.rm = TRUE),
	obesity_rate = mean(BMI >= 30, na.rm = TRUE) * 100,
	severe_obesity_rate = mean(BMI >= 35, na.rm = TRUE) * 100,
	.groups = "drop"
	) %>%
	arrange(CYCLE)

	# age-based analysis
	age_stratified_analysis <- all_data %>%
	mutate(age_group = cut(AGE, breaks = c(0, 12, 20, 35, 45, 55, 70, 100))) %>%
	group_by(CYCLE, age_group) %>%
	summarize(
	mean_bmi = mean(BMI, na.rm = TRUE),
	obesity_rate = mean(BMI >= 30, na.rm = TRUE) * 100,
	severe_obesity_rate = mean(BMI >= 35, na.rm = TRUE) * 100,
	n = n()
	)

	# density analysis
	upper_quantiles <- all_data %>%
	filter(AGE %in% 20:55) %>%
	group_by(CYCLE) %>%
	summarize(
	q75 = quantile(BMI, 0.75, na.rm = TRUE),
	q90 = quantile(BMI, 0.90, na.rm = TRUE),
	q95 = quantile(BMI, 0.95, na.rm = TRUE)
	)

	# Raw obesity rates by cycle
	raw_obesity <- all_data %>%
	filter(AGE %in% 20:55) %>%
	group_by(CYCLE) %>%
	summarize(
	obesity_rate = mean(BMI >= 30, na.rm = TRUE) * 100,
	n = n()
	)

	# Age-stratified obesity rates
	age_obesity <- all_data %>%
	filter(AGE %in% 20:55) %>%
	mutate(age_group = cut(AGE, breaks = c(20, 30, 40, 50, 55))) %>%
	group_by(CYCLE, age_group) %>%
	summarize(
	obesity_rate = mean(BMI >= 30, na.rm = TRUE) * 100,
	n = n()
	)







	# VISUALIZATIONS


	# Helper function to convert cycle to year
	cycle_to_year <- function(cycle) {
	as.numeric(case_when(
	#cycle == "1971-1975" ~ "1973",
	#cycle == "1976-1980" ~ "1978",
	#cycle == "1988-1994" ~ "1991",
	TRUE ~ substr(cycle, 6, 9)
	))
	}

	# Common theme elements
	theme_nhanes <- function() {
	theme_minimal() +
	theme(
	axis.text.x = element_text(angle = 0),
	panel.grid.minor = element_blank(),
	plot.title = element_text(face = "bold", size = 12),
	legend.position = "bottom"
	)
	}

	# Common x-axis scale
	scale_x_nhanes <- function() {
	scale_x_continuous(
	breaks = c(2000, 2010, 2020),
	limits = c(1998, 2025)
	)
	}

	# Helper function for weighted variance
	weighted.var <- function(x, w, na.rm = FALSE) {
	if (na.rm) {
	keep <- !is.na(x) & !is.na(w)
	x <- x[keep]
	w <- w[keep]
	}
	sum.w <- sum(w)
	w.mean <- sum(w * x) / sum.w
	sum(w * (x - w.mean)^2) / (sum.w * (length(w) - 1) / length(w))
	}

	# For weighted percentiles, use a helper function
	weighted_quantile <- function(x, w, probs) {
	# Order the data
	ord <- order(x)
	x <- x[ord]
	w <- w[ord]

	# Normalize weights to sum to 1
	w <- w/sum(w)

	# Calculate cumulative weights
	cw <- cumsum(w)

	# Initialize results vector
	result <- numeric(length(probs))

	# Calculate weighted quantiles
	for (i in seq_along(probs)) {
	if (probs[i] < cw[1]) {
	result[i] <- x[1]
	} else {
	result[i] <- x[find.segment(cw, probs[i])]
	}
	}

	return(result)
	}

	# Helper function for weighted_quantile
	find.segment <- function(cw, p) {
	which(cw >= p)[1]
	}

	# 1. Weighted BMI percentiles over time
	percentile_plot <- all_data %>%
	filter(AGE %in% 20:55) %>%
	group_by(CYCLE) %>%
	summarize(
	p95 = weighted_quantile(BMI[!is.na(BMI)], WTMEC2YR[!is.na(BMI)], 0.95),
	p90 = weighted_quantile(BMI[!is.na(BMI)], WTMEC2YR[!is.na(BMI)], 0.90),
	p75 = weighted_quantile(BMI[!is.na(BMI)], WTMEC2YR[!is.na(BMI)], 0.75),
	p50 = weighted_quantile(BMI[!is.na(BMI)], WTMEC2YR[!is.na(BMI)], 0.50),
	p25 = weighted_quantile(BMI[!is.na(BMI)], WTMEC2YR[!is.na(BMI)], 0.25),
	year = cycle_to_year(first(CYCLE))
	) %>%
	pivot_longer(cols = c(p95, p90, p75, p50, p25),
	names_to = "percentile",
	values_to = "bmi") %>%
	ggplot(aes(x = year, y = bmi, color = percentile, group = percentile)) +
	geom_line(size = 1) +
	geom_point(size = 3) +
	theme_nhanes() +
	scale_x_nhanes() +
	labs(title = "Survey-Weighted BMI Percentiles Over Time (Ages 20-55)",
	x = "Year",
	y = "BMI",
	color = "Percentile",
	caption = "Graph created by @tyleransom using NHANES Data") +
	scale_color_manual(values = c("p25" = "#984EA3",
	"p50" = "#FF7F00",
	"p75" = "#4DAF4A",
	"p90" = "#377EB8",
	"p95" = "#E41A1C"),
	labels = c("25th", "50th", "75th", "90th", "95th"))

	# 2. Age group obesity trends
	age_group_plot <- all_data %>%
	filter(AGE %in% 20:55) %>%
	mutate(age_group = case_when(
	AGE >= 20 & AGE < 30 ~ "20-30",
	AGE >= 30 & AGE < 40 ~ "30-40",
	AGE >= 40 & AGE < 50 ~ "40-50",
	AGE >= 50 & AGE <= 55 ~ "50-55"
	)) %>%
	group_by(CYCLE, age_group) %>%
	summarize(
	obesity_rate = weighted.mean(BMI >= 30, WTMEC2YR, na.rm = TRUE) * 100,
	n = n(),
	year = cycle_to_year(first(CYCLE))
	) %>%
	ggplot(aes(x = year, y = obesity_rate, color = age_group, group = age_group)) +
	geom_line(size = 1) +
	geom_point(size = 3) +
	theme_nhanes() +
	scale_x_nhanes() +
	labs(title = "Survey-Weighted Obesity Rates by Age Group Over Time",
	x = "Year",
	y = "Obesity Rate (%)",
	color = "Age Group",
	caption = "Graph created by @tyleransom using NHANES Data")

	# 3. Children's and Youth's obesity trends
	youth_obesity_plot <- all_data %>%
	filter(AGE %in% 12:19) %>%
	group_by(CYCLE) %>%
	summarize(
	obesity_rate = weighted.mean(BMI >= 30, WTMEC2YR, na.rm = TRUE) * 100,
	severe_obesity_rate = weighted.mean(BMI >= 35, WTMEC2YR, na.rm = TRUE) * 100,
	mean_bmi = weighted.mean(BMI, WTMEC2YR, na.rm = TRUE),
	sd_bmi = sqrt(weighted.var(BMI, WTMEC2YR, na.rm = TRUE)),
	year = cycle_to_year(first(CYCLE))
	) %>%
	pivot_longer(cols = c(obesity_rate, severe_obesity_rate),
	names_to = "metric",
	values_to = "rate") %>%
	ggplot(aes(x = year, y = rate, color = metric, group = metric)) +
	geom_line(size = 1) +
	geom_point(size = 3) +
	theme_nhanes() +
	scale_x_nhanes() +
	labs(title = "Survey-Weighted Youth Obesity Trends (ages 12-19)",
	x = "Year",
	y = "Rate (%)",
	color = "Category",
	caption = "Graph created by @tyleransom using NHANES Data") +
	scale_color_manual(values = c("obesity_rate" = "#E41A1C",
	"severe_obesity_rate" = "#377EB8"),
	labels = c("Obesity (BMI ≥ 30)",
	"Severe Obesity (BMI ≥ 35)"))

	child_obesity_plot <- all_data %>%
	filter(AGE %in% 3:11) %>%
	group_by(CYCLE) %>%
	summarize(
	obesity_rate = weighted.mean(BMI >= 30, WTMEC2YR, na.rm = TRUE) * 100,
	severe_obesity_rate = weighted.mean(BMI >= 35, WTMEC2YR, na.rm = TRUE) * 100,
	mean_bmi = weighted.mean(BMI, WTMEC2YR, na.rm = TRUE),
	sd_bmi = sqrt(weighted.var(BMI, WTMEC2YR, na.rm = TRUE)),
	year = cycle_to_year(first(CYCLE))
	) %>%
	pivot_longer(cols = c(obesity_rate, severe_obesity_rate),
	names_to = "metric",
	values_to = "rate") %>%
	ggplot(aes(x = year, y = rate, color = metric, group = metric)) +
	geom_line(size = 1) +
	geom_point(size = 3) +
	theme_nhanes() +
	scale_x_nhanes() +
	labs(title = "Survey-Weighted Child Obesity Trends (ages 3-11)",
	x = "Year",
	y = "Rate (%)",
	color = "Category",
	caption = "Graph created by @tyleransom using NHANES Data") +
	scale_color_manual(values = c("obesity_rate" = "#E41A1C",
	"severe_obesity_rate" = "#377EB8"),
	labels = c("Obesity (BMI ≥ 30)",
	"Severe Obesity (BMI ≥ 35)"))

	# 4. BMI Distribution Change (Density plots for first and last cycle)
	distribution_plot <- check_data %>%
	group_by(year) %>%
	do({
	# Calculate weighted kernel density
	d <- density(.$BMI,
	weights = .$WTMEC2YR/sum(.$WTMEC2YR),
	n = 512,
	from = 15,
	to = 85)

	data.frame(
	x = d$x,
	y = d$y
	)
	}) %>%
	ungroup()

	# Convert to wider format for ribbon
	distribution_wide <- distribution_plot %>%
	pivot_wider(names_from = year,
	names_prefix = "y",
	values_from = y)

	# Create plot with ribbon
	distribution_plot <- ggplot(distribution_wide, aes(x = x)) +
	geom_ribbon(aes(ymin = 0, ymax = y2018, fill = "2018"), alpha = 0.3) +
	geom_ribbon(aes(ymin = 0, ymax = y2022, fill = "2022"), alpha = 0.3) +
	geom_line(aes(y = y2018, color = "2018")) +
	geom_line(aes(y = y2022, color = "2022")) +
	theme_nhanes() +
	scale_x_continuous(breaks = seq(20, 80, 10)) +
	coord_cartesian(xlim = c(15, 85)) +
	scale_y_continuous(labels = scales::number_format(accuracy = 0.01)) +
	labs(title = "Survey-Weighted BMI Distribution Change (Ages 20-55)",
	x = "BMI",
	y = "Density",
	fill = "Year",
	color = "Year",
	caption = "Graph created by @tyleransom using NHANES Data") +
	scale_fill_manual(values = c("2018" = "#E41A1C", "2022" = "#377EB8")) +
	scale_color_manual(values = c("2018" = "#E41A1C", "2022" = "#377EB8"))

	# 5. Overall adult obesity trend (FT replication)
	overall_adult_trend <- all_data %>%
	filter(AGE %in% 20:55) %>%
	group_by(CYCLE) %>%
	summarize(
	obesity_rate = weighted.mean(BMI >= 30, WTMEC2YR, na.rm = TRUE) * 100,
	n = n(),
	year = cycle_to_year(first(CYCLE))
	) %>%
	ggplot(aes(x = year, y = obesity_rate)) +
	geom_line(size = 1, color = "#2E74B5") + # FT blue
	geom_point(size = 3, color = "#2E74B5") +
	theme_nhanes() +
	scale_x_nhanes() +
	labs(
	title = "Survey-Weighted U.S. Adult Obesity Rate (Ages 20-55)",
	x = "Year",
	y = "Obesity Rate (%)",
	caption = "Graph created by @tyleransom using NHANES Data") +
	scale_y_continuous(
	limits = c(0, 50),
	breaks = seq(0, 50, 10)
	)

	# Combine plots using patchwork
	combined_plots <- (percentile_plot + age_group_plot) /
	(child_obesity_plot + distribution_plot)

	# Save all plots
	ggsave("wgt_nhanes_obesity_trends.png", combined_plots,
	width = 15, height = 12, units = "in")
	ggsave("wgt_ft_obesity_trend.png", overall_adult_trend,
	width = 8, height = 6, units = "in")
	ggsave("wgt_percentile_trends.png", percentile_plot,
	width = 8, height = 6, units = "in")
	ggsave("wgt_age_group_trends.png", age_group_plot,
	width = 8, height = 6, units = "in")
	ggsave("wgt_child_obesity_trends.png", child_obesity_plot,
	width = 8, height = 6, units = "in")
	ggsave("wgt_youth_obesity_trends.png", youth_obesity_plot,
	width = 8, height = 6, units = "in")
	ggsave("wgt_bmi_distribution.png", distribution_plot,
	width = 8, height = 6, units = "in")