mcshaz · July 16, 2020 21:03
diff --git a/oneSampleSurvival.R b/oneSampleSurvival.R
 # see http://hedwig.mgh.harvard.edu/biostatistics/node/30 for the original paper and an excel spreadsheet with macros
 # the spreadsheet does much the same in VBA, but with considerably more code
 # unlike the vba spreadsheet, this also takes into acount the fraction of age in years at entry and exit (in creating per patient risk)
 # if the age is greater than the maximum data (101 years old), each subsequent year is a repeat of the final annual risk available
 library(survival)

 oneSampleSurvival <- function(df, description) {
  # NZ data between 0 and 100 for each year
  # 10 years of population and death data was averaged (2009-2018) for each year of age until 89 years old
  # data is then repeated in 5 yearly blocks as more granular data was hard to obtain
  # mortality at 0 years excludes mortality before 28 days (not relevant)
  maleMortality <- c(0.001843361228536, 0.000426012398328, 0.000205162390909, 0.000159933000221, 0.000131106236558, 0.000109847867736, 7.54877026981029E-05, 0.000104275266973, 8.46092661589332E-05, 0.000105655560873, 9.83646791092508E-05, 9.85469708619511E-05, 0.000107171040591, 0.000136745787619, 0.000211693246977, 0.000409680498681, 0.00047908243582, 0.00068725838029, 0.000792558666334, 0.00084616668714, 0.000794068160984, 0.000907776641227, 0.000800008665019, 0.000828096770177, 0.000806997128346, 0.000840005277716, 0.000802987629156, 0.000825605870321, 0.000765479018619, 0.000763137039117, 0.000806246482394, 0.000903749143428, 0.000924492304174, 0.000857232269108, 0.00092020098225, 0.000998230025349, 0.001085433512175, 0.001027818977332, 0.001122918474693, 0.00119988265766, 0.00137785701308, 0.001442949518615, 0.00153845943647, 0.00164270871642, 0.00188256662831, 0.002023312990214, 0.002030541981486, 0.002272865187846, 0.002607033588397, 0.002696261347275, 0.002969802371852, 0.003330445145048, 0.003450994308472, 0.003766824109133, 0.004382106432976, 0.004619214946603, 0.00507076963448, 0.005168789456058, 0.006025729167394, 0.006502154100548, 0.006825708117681, 0.00758116723691, 0.008280373380387, 0.009053044757868, 0.00965343239435, 0.010853634625976, 0.012058271659544, 0.013235643983316, 0.014237071816918, 0.016323379469387, 0.01765435043422, 0.019773974552901, 0.021931935366736, 0.024825087322466, 0.027006375178927, 0.029820680952757, 0.033902362368077, 0.03837648923878, 0.042932432116485, 0.046894126378063, 0.054108256726034, 0.060411289976837, 0.068653770714218, 0.077960340417016, 0.088759284493938, 0.099283037280737, 0.112244152400086, 0.128687561933202, 0.144916978613787, 0.157930508606968, 0.229411764705882, 0.229411764705882, 0.229411764705882, 0.229411764705882, 0.229411764705882, 0.362290502793296, 0.362290502793296, 0.362290502793296, 0.362290502793296, 0.362290502793296, 0.4025)
  femaleMortality <- c(0.001440725276587, 0.000378599537757, 0.00021709924675, 9.97502891438584E-05, 9.85006425048784E-05, 8.92050090982164E-05, 8.95206096251493E-05, 7.96114330270264E-05, 6.92762143799792E-05, 0.000101432919309, 5.08040463311748E-05, 9.22608680532037E-05, 0.000122942485987, 0.000153294565916, 0.000172502569892, 0.000260513869301, 0.000255523432279, 0.000368255538044, 0.000356330469863, 0.000267139191191, 0.000358230732596, 0.000369833546194, 0.000305476109726, 0.000319588939803, 0.000297956887714, 0.000363010043724, 0.000362859918462, 0.000432002032646, 0.000321026034948, 0.000361625954016, 0.00040452437153, 0.000431745162574, 0.000512770712544, 0.00054317376104, 0.000488063712367, 0.000486563128689, 0.00057339122496, 0.00072400070304, 0.00072060329081, 0.000754517692419, 0.000849463968011, 0.000831645042867, 0.001042677971853, 0.001132957559664, 0.001207180709918, 0.00131585417895, 0.001513946743796, 0.001606617997058, 0.001723479078906, 0.001933808631927, 0.002226989986555, 0.002231269823877, 0.002626438738956, 0.002696431092572, 0.002966902852488, 0.003080837093037, 0.003413401888315, 0.003427147560834, 0.004010391720794, 0.004217567593114, 0.004606013461916, 0.005031607028952, 0.005503746741787, 0.006108782143772, 0.006845816438536, 0.007141344621113, 0.007955395638506, 0.009127032877493, 0.010042400111416, 0.010803760162666, 0.012057728392066, 0.01323141371974, 0.014562716593054, 0.015873768931767, 0.018880147634345, 0.02056309402693, 0.022568564901875, 0.025811635811795, 0.029461369991122, 0.032920253916872, 0.038426882965907, 0.042458470397799, 0.050308505460449, 0.05592788582796, 0.063922777668064, 0.074695945529539, 0.084828676018797, 0.095857321745606, 0.110779421454886, 0.129589230549757, 0.185325602140946, 0.185325602140946, 0.185325602140946, 0.185325602140946, 0.185325602140946, 0.320109190172884, 0.320109190172884, 0.320109190172884, 0.320109190172884, 0.320109190172884, 0.444805194805195)
  
  getYrMortality <- function(startAgeYr, endAgeYr, isMale){
    if (isMale) {
      lookupVector <- maleMortality
    } else {
      lookupVector <- femaleMortality
    }
    startIndx <- as.integer(startAgeYr) + 1L # add 1 as mortality begins at 0 years of age (died <28d excluded)
    endIndx <- as.integer(endAgeYr) + 1L
    if (endIndx > length(lookupVector)) {
      lastIndx <- length(lookupVector)
      returnVar <- c(lookupVector[startIndx:lastIndx], rep(lookupVector[lastIndx], endIndx - lastIndx))
    } else {
      returnVar <- lookupVector[startIndx:endIndx]
    }
    return(returnVar)
  }
  
  smrProportionalHazard <- function(observed, expected, ci = 95) {
    stat <- (observed - expected) ^ 2 / expected
    
    smr <- observed / expected
    
    chiterm <- qchisq((100 + ci) / 200, 1)
    ub <- smr + chiterm / (2 * expected) + ((chiterm * (4 * observed + chiterm)) ^ 0.5) / (2 * expected)
    lb <- smr + chiterm / (2 * expected) - ((chiterm * (4 * observed + chiterm)) ^ 0.5) / (2 * expected)
    
    return(c("smr" = smr,
            "p_value" = 1 - pchisq(stat, 1),
            "lower_ci" = lb,
            "upper_ci" = ub))
  }
  
  riskOfDeath <- by(df, seq_len(nrow(df)), function(rw){
    # extracts mortality data for each year of age between entry and exit date
    # first and last date will be fractionally reduced according to how much of thay age year was remaining
    startAgeYr <- as.numeric(rw$entry - rw$dob) / 365.25
    startProportion <- startAgeYr %% 1
    startAgeYr <- as.integer(startAgeYr)
    
    endAgeYr <- as.numeric(rw$exit - rw$dob) / 365.25
    endProportion <- endAgeYr %% 1
    endAgeYr <- as.integer(endAgeYr)
    
    returnVar <- getYrMortality(startAgeYr, endAgeYr, rw$male)
    
    if (startAgeYr == endAgeYr) {
      returnVar <- returnVar * (endProportion - startProportion)
    } else {
      returnVar[1] <- returnVar[1] * (1 - startProportion)
      len <- length(returnVar)
      returnVar[len] <- returnVar[len] * endProportion
    }

    return(sum(returnVar))
  })
  
  #main output
  cat(description, "(using proportional hazards assumption)", "\n")
  totalDeaths <- sum(df$dead)
  totalRisk <- sum(riskOfDeath)
  cat("observed / expected =", totalDeaths, "/", totalRisk, "\n")
  print(smrProportionalHazard(totalDeaths, totalRisk))
  
  df$yrsOldAtEntry <- as.numeric(df$entry - df$dob) / 365.25
  df$yrsInStudy <- as.numeric(df$exit - df$entry) / 365.25
  
  # create reference for population hazard
  maxYears <- ceiling(max(df$yrsInStudy))
  hazards <- by(df, seq_len(nrow(df)), function(rw){
    mortVector <- getYrMortality(rw$yrsOldAtEntry, rw$yrsOldAtEntry + maxYears, rw$male)
    return(exp(-cumsum(mortVector)))
  })
  hazards <- c(1, rowMeans(simplify2array(hazards)))
  survData <- survfit(Surv(df$yrsInStudy, df$dead) ~ 1, data = df)
  lowerY <- floor(survData$lower[length(survData$lower)] * 100) / 100
  par(cex.sub = 0.8)
  plot(survData, 
       xlab = "Years from op", 
       ylab = "Survival probability",
       ylim = c(lowerY, 1),
       sub = description)
  lines(0:(as.integer(maxYears) + 1L), hazards, col="grey")
  legend("bottomleft",
         inset = 0.05,
         legend = c("general pop. matched for age/sex", "cardiac", "95% conf. int."),
         col = c("grey", "black", "black"),
         lty = c(1, 1, 2))
 }

 df <- byfirstsurg[, c("Male", "Dead", "DateofBirth", "DateOfEvent", "DatelastKnown")]
 # rename variables to required dataframe names for oneSampleSurvival function
 names(df) <- c("male", "dead", "dob", "entry", "exit")
 df$male <- df$male == 1
 df$dead <- df$dead == 1

 # looking at subsets of the data - trying to identify where/which subset the proportional hazard assumption might hold
 oneSampleSurvival (df[df$exit - df$entry > 30,], "surviving >30 days from operation")

 oneSampleSurvival (df[df$exit - df$entry > 30 & df$entry - df$dob > 365.25 * 15,], "surviving >30 days and >14 yr old at operation")

 oneSampleSurvival (df[df$exit - df$entry > 365,], "surviving at least 1 yr from operation")
	# see http://hedwig.mgh.harvard.edu/biostatistics/node/30 for the original paper and an excel spreadsheet with macros
	# the spreadsheet does much the same in VBA, but with considerably more code
	# unlike the vba spreadsheet, this also takes into acount the fraction of age in years at entry and exit (in creating per patient risk)
	# if the age is greater than the maximum data (101 years old), each subsequent year is a repeat of the final annual risk available
	library(survival)

	oneSampleSurvival <- function(df, description) {
	# NZ data between 0 and 100 for each year
	# 10 years of population and death data was averaged (2009-2018) for each year of age until 89 years old
	# data is then repeated in 5 yearly blocks as more granular data was hard to obtain
	# mortality at 0 years excludes mortality before 28 days (not relevant)
	maleMortality <- c(0.001843361228536, 0.000426012398328, 0.000205162390909, 0.000159933000221, 0.000131106236558, 0.000109847867736, 7.54877026981029E-05, 0.000104275266973, 8.46092661589332E-05, 0.000105655560873, 9.83646791092508E-05, 9.85469708619511E-05, 0.000107171040591, 0.000136745787619, 0.000211693246977, 0.000409680498681, 0.00047908243582, 0.00068725838029, 0.000792558666334, 0.00084616668714, 0.000794068160984, 0.000907776641227, 0.000800008665019, 0.000828096770177, 0.000806997128346, 0.000840005277716, 0.000802987629156, 0.000825605870321, 0.000765479018619, 0.000763137039117, 0.000806246482394, 0.000903749143428, 0.000924492304174, 0.000857232269108, 0.00092020098225, 0.000998230025349, 0.001085433512175, 0.001027818977332, 0.001122918474693, 0.00119988265766, 0.00137785701308, 0.001442949518615, 0.00153845943647, 0.00164270871642, 0.00188256662831, 0.002023312990214, 0.002030541981486, 0.002272865187846, 0.002607033588397, 0.002696261347275, 0.002969802371852, 0.003330445145048, 0.003450994308472, 0.003766824109133, 0.004382106432976, 0.004619214946603, 0.00507076963448, 0.005168789456058, 0.006025729167394, 0.006502154100548, 0.006825708117681, 0.00758116723691, 0.008280373380387, 0.009053044757868, 0.00965343239435, 0.010853634625976, 0.012058271659544, 0.013235643983316, 0.014237071816918, 0.016323379469387, 0.01765435043422, 0.019773974552901, 0.021931935366736, 0.024825087322466, 0.027006375178927, 0.029820680952757, 0.033902362368077, 0.03837648923878, 0.042932432116485, 0.046894126378063, 0.054108256726034, 0.060411289976837, 0.068653770714218, 0.077960340417016, 0.088759284493938, 0.099283037280737, 0.112244152400086, 0.128687561933202, 0.144916978613787, 0.157930508606968, 0.229411764705882, 0.229411764705882, 0.229411764705882, 0.229411764705882, 0.229411764705882, 0.362290502793296, 0.362290502793296, 0.362290502793296, 0.362290502793296, 0.362290502793296, 0.4025)
	femaleMortality <- c(0.001440725276587, 0.000378599537757, 0.00021709924675, 9.97502891438584E-05, 9.85006425048784E-05, 8.92050090982164E-05, 8.95206096251493E-05, 7.96114330270264E-05, 6.92762143799792E-05, 0.000101432919309, 5.08040463311748E-05, 9.22608680532037E-05, 0.000122942485987, 0.000153294565916, 0.000172502569892, 0.000260513869301, 0.000255523432279, 0.000368255538044, 0.000356330469863, 0.000267139191191, 0.000358230732596, 0.000369833546194, 0.000305476109726, 0.000319588939803, 0.000297956887714, 0.000363010043724, 0.000362859918462, 0.000432002032646, 0.000321026034948, 0.000361625954016, 0.00040452437153, 0.000431745162574, 0.000512770712544, 0.00054317376104, 0.000488063712367, 0.000486563128689, 0.00057339122496, 0.00072400070304, 0.00072060329081, 0.000754517692419, 0.000849463968011, 0.000831645042867, 0.001042677971853, 0.001132957559664, 0.001207180709918, 0.00131585417895, 0.001513946743796, 0.001606617997058, 0.001723479078906, 0.001933808631927, 0.002226989986555, 0.002231269823877, 0.002626438738956, 0.002696431092572, 0.002966902852488, 0.003080837093037, 0.003413401888315, 0.003427147560834, 0.004010391720794, 0.004217567593114, 0.004606013461916, 0.005031607028952, 0.005503746741787, 0.006108782143772, 0.006845816438536, 0.007141344621113, 0.007955395638506, 0.009127032877493, 0.010042400111416, 0.010803760162666, 0.012057728392066, 0.01323141371974, 0.014562716593054, 0.015873768931767, 0.018880147634345, 0.02056309402693, 0.022568564901875, 0.025811635811795, 0.029461369991122, 0.032920253916872, 0.038426882965907, 0.042458470397799, 0.050308505460449, 0.05592788582796, 0.063922777668064, 0.074695945529539, 0.084828676018797, 0.095857321745606, 0.110779421454886, 0.129589230549757, 0.185325602140946, 0.185325602140946, 0.185325602140946, 0.185325602140946, 0.185325602140946, 0.320109190172884, 0.320109190172884, 0.320109190172884, 0.320109190172884, 0.320109190172884, 0.444805194805195)

	getYrMortality <- function(startAgeYr, endAgeYr, isMale){
	if (isMale) {
	lookupVector <- maleMortality
	} else {
	lookupVector <- femaleMortality
	}
	startIndx <- as.integer(startAgeYr) + 1L # add 1 as mortality begins at 0 years of age (died <28d excluded)
	endIndx <- as.integer(endAgeYr) + 1L
	if (endIndx > length(lookupVector)) {
	lastIndx <- length(lookupVector)
	returnVar <- c(lookupVector[startIndx:lastIndx], rep(lookupVector[lastIndx], endIndx - lastIndx))
	} else {
	returnVar <- lookupVector[startIndx:endIndx]
	}
	return(returnVar)
	}

	smrProportionalHazard <- function(observed, expected, ci = 95) {
	stat <- (observed - expected) ^ 2 / expected

	smr <- observed / expected

	chiterm <- qchisq((100 + ci) / 200, 1)
	ub <- smr + chiterm / (2 * expected) + ((chiterm * (4 * observed + chiterm)) ^ 0.5) / (2 * expected)
	lb <- smr + chiterm / (2 * expected) - ((chiterm * (4 * observed + chiterm)) ^ 0.5) / (2 * expected)

	return(c("smr" = smr,
	"p_value" = 1 - pchisq(stat, 1),
	"lower_ci" = lb,
	"upper_ci" = ub))
	}

	riskOfDeath <- by(df, seq_len(nrow(df)), function(rw){
	# extracts mortality data for each year of age between entry and exit date
	# first and last date will be fractionally reduced according to how much of thay age year was remaining
	startAgeYr <- as.numeric(rw$entry - rw$dob) / 365.25
	startProportion <- startAgeYr %% 1
	startAgeYr <- as.integer(startAgeYr)

	endAgeYr <- as.numeric(rw$exit - rw$dob) / 365.25
	endProportion <- endAgeYr %% 1
	endAgeYr <- as.integer(endAgeYr)

	returnVar <- getYrMortality(startAgeYr, endAgeYr, rw$male)

	if (startAgeYr == endAgeYr) {
	returnVar <- returnVar * (endProportion - startProportion)
	} else {
	returnVar[1] <- returnVar[1] * (1 - startProportion)
	len <- length(returnVar)
	returnVar[len] <- returnVar[len] * endProportion
	}

	return(sum(returnVar))
	})

	#main output
	cat(description, "(using proportional hazards assumption)", "\n")
	totalDeaths <- sum(df$dead)
	totalRisk <- sum(riskOfDeath)
	cat("observed / expected =", totalDeaths, "/", totalRisk, "\n")
	print(smrProportionalHazard(totalDeaths, totalRisk))

	df$yrsOldAtEntry <- as.numeric(df$entry - df$dob) / 365.25
	df$yrsInStudy <- as.numeric(df$exit - df$entry) / 365.25

	# create reference for population hazard
	maxYears <- ceiling(max(df$yrsInStudy))
	hazards <- by(df, seq_len(nrow(df)), function(rw){
	mortVector <- getYrMortality(rw$yrsOldAtEntry, rw$yrsOldAtEntry + maxYears, rw$male)
	return(exp(-cumsum(mortVector)))
	})
	hazards <- c(1, rowMeans(simplify2array(hazards)))
	survData <- survfit(Surv(df$yrsInStudy, df$dead) ~ 1, data = df)
	lowerY <- floor(survData$lower[length(survData$lower)] * 100) / 100
	par(cex.sub = 0.8)
	plot(survData,
	xlab = "Years from op",
	ylab = "Survival probability",
	ylim = c(lowerY, 1),
	sub = description)
	lines(0:(as.integer(maxYears) + 1L), hazards, col="grey")
	legend("bottomleft",
	inset = 0.05,
	legend = c("general pop. matched for age/sex", "cardiac", "95% conf. int."),
	col = c("grey", "black", "black"),
	lty = c(1, 1, 2))
	}

	df <- byfirstsurg[, c("Male", "Dead", "DateofBirth", "DateOfEvent", "DatelastKnown")]
	# rename variables to required dataframe names for oneSampleSurvival function
	names(df) <- c("male", "dead", "dob", "entry", "exit")
	df$male <- df$male == 1
	df$dead <- df$dead == 1

	# looking at subsets of the data - trying to identify where/which subset the proportional hazard assumption might hold
	oneSampleSurvival (df[df$exit - df$entry > 30,], "surviving >30 days from operation")

	oneSampleSurvival (df[df$exit - df$entry > 30 & df$entry - df$dob > 365.25 * 15,], "surviving >30 days and >14 yr old at operation")

	oneSampleSurvival (df[df$exit - df$entry > 365,], "surviving at least 1 yr from operation")