kylebutts · October 24, 2025 17:07
diff --git a/simulate_bipartite.R b/simulate_bipartite.R

 ## Adapated from bipartitepandas python package
 #' Simulate Bipartite Labor Market Network
 #'
 #' Generates panel data for a bipartite network of workers and firms with
 #' assortative matching, mobility, and AKM wage structure.
 #'
 #' @param n_workers Integer. Number of workers (default: 10000)
 #' @param n_time Integer. Panel length in time periods (default: 5)
 #' @param firm_size Numeric. Average firm size per period (default: 50)
 #' @param n_firm_types Integer. Number of firm types (default: 10)
 #' @param n_worker_types Integer. Number of worker types (default: 5)
 #' @param p_move Numeric. Mobility probability per period (default: 0.5)
 #' @param c_sort Numeric. Assortative sorting strength (default: 1)
 #' @param c_netw Numeric. Network effect strength (default: 1)
 #' @param c_sig Numeric. Sorting/network shock volatility (default: 1)
 #' @param alpha_sig Numeric. Worker fixed effect volatility (default: 1)
 #' @param psi_sig Numeric. Firm fixed effect volatility (default: 1)
 #' @param w_sig Numeric. Wage shock volatility (default: 1)
 #' @param x_sig Numeric. Covariate volatility (default: 1)
 #' @param y1_beta Numeric. Covariate effect on y1 (default: 0.5)
 #' @param y1_sig Numeric. y1 error volatility (default: 1)
 #' @param y2_beta Numeric. Covariate effect on y2 (default: 0.3)
 #' @param y2_sig Numeric. y2 error volatility (default: 1)
 #'
 #' @return Data frame with columns:
 #'   \item{unit}{Worker identifier}
 #'   \item{firm_id}{Firm identifier}
 #'   \item{wage}{Original log wage (AKM structure)}
 #'   \item{y1}{First outcome variable}
 #'   \item{y2}{Second outcome variable}
 #'   \item{x}{Covariate}
 #'   \item{time}{Time period}
 #'   \item{worker_type}{Worker type}
 #'   \item{firm_type}{Firm type}
 #'   \item{worker_fe}{Worker fixed effect}
 #'   \item{firm_fe}{Firm fixed effect}
 #'
 #' @export
 simulate_bipartite <- function(
  n_workers = 10000L,
  n_time = 5L,
  firm_size = 10L,
  n_firm_types = 5L,
  n_worker_types = 5L,
  p_move = 0.5,
  c_sort = 1,
  c_netw = 1,
  c_sig = 1,
  alpha_sig = 1,
  psi_sig = 1,
  w_sig = 1,
  x_sig = 1,
  y1_beta = 0.5,
  y1_sig = 1,
  y2_beta = 0.3,
  y2_sig = 1
 ) {
  # Parameter validation
  stopifnot(
    "n_workers must be positive integer" = n_workers >= 1,
    "n_time must be positive integer" = n_time >= 1,
    "firm_size must be positive" = firm_size > 0,
    "n_firm_types must be positive integer" = n_firm_types >= 1,
    "n_worker_types must be positive integer" = n_worker_types >= 1,
    "alpha_sig must be non-negative" = alpha_sig >= 0,
    "psi_sig must be non-negative" = psi_sig >= 0,
    "w_sig must be non-negative" = w_sig >= 0,
    "c_sig must be non-negative" = c_sig >= 0,
    "p_move must be in [0,1]" = p_move >= 0 && p_move <= 1,
    "x_sig must be non-negative" = x_sig >= 0,
    "y1_sig must be non-negative" = y1_sig >= 0,
    "y2_sig must be non-negative" = y2_sig >= 0
  )

  # ============================================================================
  # GENERATE FIXED EFFECTS
  # ============================================================================

  # Generate fixed effects using inverse normal CDF
  psi <- qnorm(seq_len(n_firm_types) / (n_firm_types + 1)) * psi_sig
  alpha <- qnorm(seq_len(n_worker_types) / (n_worker_types + 1)) * alpha_sig

  # Compute transition matrices
  G <- array(0, dim = c(n_worker_types, n_firm_types, n_firm_types))
  for (l in seq_len(n_worker_types)) {
    for (k_from in seq_len(n_firm_types)) {
      probs <- dnorm((psi - c_netw * psi[k_from] - c_sort * alpha[l]) / c_sig)
      G[l, k_from, ] <- probs / sum(probs)
    }
  }

  # Compute stationary distributions
  H <- matrix(0, nrow = n_worker_types, ncol = n_firm_types)
  for (l in seq_len(n_worker_types)) {
    eig_decomp <- eigen(t(G[l, , ]))
    stationary_idx <- which.min(abs(eig_decomp$values - 1))
    stationary_vec <- Re(eig_decomp$vectors[, stationary_idx])
    stationary_vec <- abs(stationary_vec) / sum(abs(stationary_vec))
    H[l, ] <- stationary_vec
  }

  # ============================================================================
  # SIMULATE MOBILITY
  # ============================================================================

  # Generate worker types
  worker_types <- sample(seq_len(n_worker_types), n_workers, replace = TRUE)

  # Initialize mobility matrices
  firm_types <- matrix(0L, nrow = n_workers, ncol = n_time)
  spell_ids <- matrix(0L, nrow = n_workers, ncol = n_time)
  spell_counter <- 1L

  # Simulate worker mobility over time
  for (i in seq_len(n_workers)) {
    l <- worker_types[i]

    # Initial firm placement
    firm_types[i, 1] <- sample(seq_len(n_firm_types), 1, prob = H[l, ])
    spell_ids[i, 1] <- spell_counter
    spell_counter <- spell_counter + 1L

    # Mobility decisions for subsequent periods
    if (n_time > 1) {
      for (t in 2:n_time) {
        if (runif(1) < p_move) {
          # Worker moves firms
          firm_types[i, t] <- sample(
            seq_len(n_firm_types),
            1,
            prob = G[l, firm_types[i, t - 1], ]
          )
          spell_ids[i, t] <- spell_counter
          spell_counter <- spell_counter + 1L
        } else {
          # Worker stays at same firm
          firm_types[i, t] <- firm_types[i, t - 1]
          spell_ids[i, t] <- spell_ids[i, t - 1]
        }
      }
    }
  }

  # ============================================================================
  # CONSTRUCT PANEL
  # ============================================================================
  # Create long-format panel structure
  indiv_id <- rep(seq_len(n_workers), each = n_time)
  time <- rep(seq_len(n_time), times = n_workers)
  worker_type <- rep(worker_types, each = n_time)
  firm_type <- as.vector(t(firm_types))
  spell <- as.vector(t(spell_ids))

  # Compute spell sizes for firm ID assignment
  spell_summary <- aggregate(
    list(spell_size = spell),
    by = list(spell = spell, firm_type = firm_type),
    FUN = length
  )

  # Assign firm IDs (in_worker_typesined from assign_firm_ids)
  firm_ids <- integer(nrow(spell_summary))

  for (k_type in unique(spell_summary$firm_type)) {
    k_mask <- spell_summary$firm_type == k_type
    k_spells <- spell_summary[k_mask, ]

    # Calculate number of firms needed for this firm type
    total_obs <- sum(k_spells$spell_size)
    n_firms <- max(1L, round(total_obs / (firm_size * n_time)))

    # Randomly assign spells to firms
    assigned_ids <- sample(seq_len(n_firms), nrow(k_spells), replace = TRUE)

    # Create contiguous firm IDs
    firm_mapping <- data.frame(
      temp_id = assigned_ids,
      firm_id = seq_along(unique(assigned_ids))[match(
        assigned_ids,
        unique(assigned_ids)
      )]
    )

    firm_ids[k_mask] <- firm_mapping$firm_id[match(
      assigned_ids,
      firm_mapping$temp_id
    )]
  }

  spell_summary$firm_id <- firm_ids

  # Merge firm IDs back to panel
  panel <- data.frame(
    indiv_id = indiv_id,
    year = time,
    worker_type = worker_type,
    firm_type = firm_type,
    spell = spell
  )
  panel <- merge(panel, spell_summary[c("spell", "firm_id")], by = "spell")

  # Generate fixed effects and outcomes
  panel$worker_fe <- alpha[panel$worker_type]
  panel$firm_fe <- psi[panel$firm_type]
  panel$wage <- panel$worker_fe + panel$firm_fe + rnorm(nrow(panel)) * w_sig

  # Generate covariate and additional outcomes
  panel$x1 <- rnorm(nrow(panel)) * x_sig
  panel$y <- panel$worker_fe +
    panel$firm_fe +
    y1_beta * panel$x +
    rnorm(nrow(panel)) * y1_sig
  panel$y2 <- panel$worker_fe +
    panel$firm_fe +
    y2_beta * panel$x +
    rnorm(nrow(panel)) * y2_sig

  # Sort and return final dataset
  panel <- panel[order(panel$indiv_id, panel$year), ]

  return(panel[c(
    "indiv_id",
    "firm_id",
    "wage",
    "y",
    "y2",
    "x1",
    "year",
    "worker_type",
    "firm_type",
    "worker_fe",
    "firm_fe"
  )])
 }

	## Adapated from bipartitepandas python package
	#' Simulate Bipartite Labor Market Network
	#'
	#' Generates panel data for a bipartite network of workers and firms with
	#' assortative matching, mobility, and AKM wage structure.
	#'
	#' @param n_workers Integer. Number of workers (default: 10000)
	#' @param n_time Integer. Panel length in time periods (default: 5)
	#' @param firm_size Numeric. Average firm size per period (default: 50)
	#' @param n_firm_types Integer. Number of firm types (default: 10)
	#' @param n_worker_types Integer. Number of worker types (default: 5)
	#' @param p_move Numeric. Mobility probability per period (default: 0.5)
	#' @param c_sort Numeric. Assortative sorting strength (default: 1)
	#' @param c_netw Numeric. Network effect strength (default: 1)
	#' @param c_sig Numeric. Sorting/network shock volatility (default: 1)
	#' @param alpha_sig Numeric. Worker fixed effect volatility (default: 1)
	#' @param psi_sig Numeric. Firm fixed effect volatility (default: 1)
	#' @param w_sig Numeric. Wage shock volatility (default: 1)
	#' @param x_sig Numeric. Covariate volatility (default: 1)
	#' @param y1_beta Numeric. Covariate effect on y1 (default: 0.5)
	#' @param y1_sig Numeric. y1 error volatility (default: 1)
	#' @param y2_beta Numeric. Covariate effect on y2 (default: 0.3)
	#' @param y2_sig Numeric. y2 error volatility (default: 1)
	#'
	#' @return Data frame with columns:
	#' \item{unit}{Worker identifier}
	#' \item{firm_id}{Firm identifier}
	#' \item{wage}{Original log wage (AKM structure)}
	#' \item{y1}{First outcome variable}
	#' \item{y2}{Second outcome variable}
	#' \item{x}{Covariate}
	#' \item{time}{Time period}
	#' \item{worker_type}{Worker type}
	#' \item{firm_type}{Firm type}
	#' \item{worker_fe}{Worker fixed effect}
	#' \item{firm_fe}{Firm fixed effect}
	#'
	#' @export
	simulate_bipartite <- function(
	n_workers = 10000L,
	n_time = 5L,
	firm_size = 10L,
	n_firm_types = 5L,
	n_worker_types = 5L,
	p_move = 0.5,
	c_sort = 1,
	c_netw = 1,
	c_sig = 1,
	alpha_sig = 1,
	psi_sig = 1,
	w_sig = 1,
	x_sig = 1,
	y1_beta = 0.5,
	y1_sig = 1,
	y2_beta = 0.3,
	y2_sig = 1
	) {
	# Parameter validation
	stopifnot(
	"n_workers must be positive integer" = n_workers >= 1,
	"n_time must be positive integer" = n_time >= 1,
	"firm_size must be positive" = firm_size > 0,
	"n_firm_types must be positive integer" = n_firm_types >= 1,
	"n_worker_types must be positive integer" = n_worker_types >= 1,
	"alpha_sig must be non-negative" = alpha_sig >= 0,
	"psi_sig must be non-negative" = psi_sig >= 0,
	"w_sig must be non-negative" = w_sig >= 0,
	"c_sig must be non-negative" = c_sig >= 0,
	"p_move must be in [0,1]" = p_move >= 0 && p_move <= 1,
	"x_sig must be non-negative" = x_sig >= 0,
	"y1_sig must be non-negative" = y1_sig >= 0,
	"y2_sig must be non-negative" = y2_sig >= 0
	)

	# ============================================================================
	# GENERATE FIXED EFFECTS
	# ============================================================================

	# Generate fixed effects using inverse normal CDF
	psi <- qnorm(seq_len(n_firm_types) / (n_firm_types + 1)) * psi_sig
	alpha <- qnorm(seq_len(n_worker_types) / (n_worker_types + 1)) * alpha_sig

	# Compute transition matrices
	G <- array(0, dim = c(n_worker_types, n_firm_types, n_firm_types))
	for (l in seq_len(n_worker_types)) {
	for (k_from in seq_len(n_firm_types)) {
	probs <- dnorm((psi - c_netw * psi[k_from] - c_sort * alpha[l]) / c_sig)
	G[l, k_from, ] <- probs / sum(probs)
	}
	}

	# Compute stationary distributions
	H <- matrix(0, nrow = n_worker_types, ncol = n_firm_types)
	for (l in seq_len(n_worker_types)) {
	eig_decomp <- eigen(t(G[l, , ]))
	stationary_idx <- which.min(abs(eig_decomp$values - 1))
	stationary_vec <- Re(eig_decomp$vectors[, stationary_idx])
	stationary_vec <- abs(stationary_vec) / sum(abs(stationary_vec))
	H[l, ] <- stationary_vec
	}

	# ============================================================================
	# SIMULATE MOBILITY
	# ============================================================================

	# Generate worker types
	worker_types <- sample(seq_len(n_worker_types), n_workers, replace = TRUE)

	# Initialize mobility matrices
	firm_types <- matrix(0L, nrow = n_workers, ncol = n_time)
	spell_ids <- matrix(0L, nrow = n_workers, ncol = n_time)
	spell_counter <- 1L

	# Simulate worker mobility over time
	for (i in seq_len(n_workers)) {
	l <- worker_types[i]

	# Initial firm placement
	firm_types[i, 1] <- sample(seq_len(n_firm_types), 1, prob = H[l, ])
	spell_ids[i, 1] <- spell_counter
	spell_counter <- spell_counter + 1L

	# Mobility decisions for subsequent periods
	if (n_time > 1) {
	for (t in 2:n_time) {
	if (runif(1) < p_move) {
	# Worker moves firms
	firm_types[i, t] <- sample(
	seq_len(n_firm_types),
	1,
	prob = G[l, firm_types[i, t - 1], ]
	)
	spell_ids[i, t] <- spell_counter
	spell_counter <- spell_counter + 1L
	} else {
	# Worker stays at same firm
	firm_types[i, t] <- firm_types[i, t - 1]
	spell_ids[i, t] <- spell_ids[i, t - 1]
	}
	}
	}
	}

	# ============================================================================
	# CONSTRUCT PANEL
	# ============================================================================
	# Create long-format panel structure
	indiv_id <- rep(seq_len(n_workers), each = n_time)
	time <- rep(seq_len(n_time), times = n_workers)
	worker_type <- rep(worker_types, each = n_time)
	firm_type <- as.vector(t(firm_types))
	spell <- as.vector(t(spell_ids))

	# Compute spell sizes for firm ID assignment
	spell_summary <- aggregate(
	list(spell_size = spell),
	by = list(spell = spell, firm_type = firm_type),
	FUN = length
	)

	# Assign firm IDs (in_worker_typesined from assign_firm_ids)
	firm_ids <- integer(nrow(spell_summary))

	for (k_type in unique(spell_summary$firm_type)) {
	k_mask <- spell_summary$firm_type == k_type
	k_spells <- spell_summary[k_mask, ]

	# Calculate number of firms needed for this firm type
	total_obs <- sum(k_spells$spell_size)
	n_firms <- max(1L, round(total_obs / (firm_size * n_time)))

	# Randomly assign spells to firms
	assigned_ids <- sample(seq_len(n_firms), nrow(k_spells), replace = TRUE)

	# Create contiguous firm IDs
	firm_mapping <- data.frame(
	temp_id = assigned_ids,
	firm_id = seq_along(unique(assigned_ids))[match(
	assigned_ids,
	unique(assigned_ids)
	)]
	)

	firm_ids[k_mask] <- firm_mapping$firm_id[match(
	assigned_ids,
	firm_mapping$temp_id
	)]
	}

	spell_summary$firm_id <- firm_ids

	# Merge firm IDs back to panel
	panel <- data.frame(
	indiv_id = indiv_id,
	year = time,
	worker_type = worker_type,
	firm_type = firm_type,
	spell = spell
	)
	panel <- merge(panel, spell_summary[c("spell", "firm_id")], by = "spell")

	# Generate fixed effects and outcomes
	panel$worker_fe <- alpha[panel$worker_type]
	panel$firm_fe <- psi[panel$firm_type]
	panel$wage <- panel$worker_fe + panel$firm_fe + rnorm(nrow(panel)) * w_sig

	# Generate covariate and additional outcomes
	panel$x1 <- rnorm(nrow(panel)) * x_sig
	panel$y <- panel$worker_fe +
	panel$firm_fe +
	y1_beta * panel$x +
	rnorm(nrow(panel)) * y1_sig
	panel$y2 <- panel$worker_fe +
	panel$firm_fe +
	y2_beta * panel$x +
	rnorm(nrow(panel)) * y2_sig

	# Sort and return final dataset
	panel <- panel[order(panel$indiv_id, panel$year), ]

	return(panel[c(
	"indiv_id",
	"firm_id",
	"wage",
	"y",
	"y2",
	"x1",
	"year",
	"worker_type",
	"firm_type",
	"worker_fe",
	"firm_fe"
	)])
	}