Population_Study/Scripts/Mortality_Rate_Over_Time_Simulation_Function.r
2025-10-22 17:36:31 -06:00

52 lines
5.8 KiB
R

##Intermediate functions to simulate death rates in future years, for particular population groups.
#The code uses death rates by age and sex group data to simulate future death rates in Lincoln county (Imparted to Kemmerer). This is repeated over a larger number of simulation, so that each Monte Carlo run that simulates actual deaths has a corresponding time specific death rate projection accounting for variation. For example given variance in death rate trends the death rate of young people will range in the future, the simulated path of this is linked to each Monte Carlo run for particular death cutoff points given a evolution of population age. Therefore the number of Monte Carlo runs where deaths in the population are made has to have a indexed time variance run.
#####Packages
library(parallel) #Speed up processing
cl <- detectCores()-1
#Load the data cleaned in the data loading script. Originally collected from the NIH HDPulse Wyoming Mortality Tables, and saved as csv files for each sex and age combination for all Wyoming counties.
LN_FILE_LOC <- "./Data/Cleaned_Data/Lincoln_Mortality_Rate.Rds"
if(!file.exists(paste0(LN_FILE_LOC))){source("Scripts/Data_Load.r"); LN_FILE_LOC <- "./Data/Cleaned_Data/Lincoln_Mortality_Rate.Rds"} #Run the data cleaning script if this required data is missing.
LIN_MORTALITY <- readRDS(LN_FILE_LOC)
#A function which projects how the death rate of each group will change relative to the mean with time. This pulls from the NIH five year trend and trend standard deviation.
TREND_SIM <- function(DEMO_GROUP,NUM_YEARS_FORWARD=50,RAND_SEED=NA){
#DEMO_GROUP: The current demographic cohort to study. For example 35 year old men, or 70 year old women
#NUM_YEARS_FORWARD: The number of years to project into the future. This must match the vector length of the Monte Carlo such such that each forecasted year has a matching death rate
#RAND_SEED: Optionally pass in a seed to fix the simulation outcomes, so that the results can be replicated in the future.
if(!is.na(RAND_SEED)){set.seed(RAND_SEED)}#Is the seed if requested
RES <- c()
C_YEAR_VALUE <- 0
#In each of the forecast years, grab a random distribution from the trend. This must be added to the last draw so that there can be drift that accounts for last years increase/decrease in the death rate.
for(i in 1:NUM_YEARS_FORWARD){
RES[length(RES)+1] <- C_YEAR_VALUE
C_YEAR_VALUE <- rnorm(1,mean=DEMO_GROUP$Trend,sd=DEMO_GROUP$Trend_SD)+C_YEAR_VALUE
}
return(RES)
}
#A function to simulate the absolute mortality rate of the Lincoln county population group (age,sex combination) in any future year. This combines the year to year variation and the time trend observed. This is a intermediate step to the final simulation
DEATH_RATE_DEMO_GROUP_SIM <- function(DEMO_GROUP,NUM_YEARS_FORWARD=50,RAND_SEED=NA){
if(!is.na(RAND_SEED)){set.seed(RAND_SEED)} #Set a random seed to make reproducible if requested
return(rnorm(NUM_YEARS_FORWARD,mean=DEMO_GROUP$Death_Rate,sd=DEMO_GROUP$Rate_SD))+TREND_SIM(DEMO_GROUP,NUM_YEARS_FORWARD,RAND_SEED)#Pull from the observed yearly variance of group death rates for each year. Add to this the simulated mortality rate addition due to trend for that future year.
}
#The key function of this Monte Carlo subset, that runs the mortality rate simulation and saves all of the simulations to an output file. These are saved to file, to free up ram in the main Monte Carlo. The indexed results can be extracted from disk and the allowing the particular simulation results need to be used, and the deleted from memory.
#This also speeds up the simulation by allowing the time changes in mortality rate to be run separately from the other results reducing overhead and complexity.
MORTALITY_RATE_SIMULATION<- function(NUM_SIMS,NUM_YEARS_TO_SIMULATE,POP_DATA=LIN_MORTALITY,SAVE_LOC="./Data/Simulated_Data_Sets/MORTALITY_MONTE_CARLO.Rds",RERUN=FALSE){
#NUM_SIMS: The number of time paths to be simulated. This is different from the number of years because each simulation must simulate all years in the set.
#NUM_YEARS_TO_SIMULATE: The number of years each Monte Carlo run will go out to. For example each single simulation may require values out to 2030, if the current year is 2025 this value is 5.
#POP_DATA: The mortality data for the study population. For each combination of sex and age, this includes the mean and SD of the mortality rate, and the trend in mortality over time.
#SAVE_LOC: What directory should the Monte Carlo results be saved to?
#RERUN: If true the simulation will be rerun, even if existing results are saved to file.
#Skip the simulation if it has already been run. Can take along time. This allows the main simulation to pull from this file conditional on a first completion
if(RERUN | !file.exists(SAVE_LOC)){
#A function which runs a single population cohort simulation. A sapply is used to create a matrix of results, with column numbers representing each individual simulation, and row numbers representing each year. Using a matrix vastly speeds up memory recall compared to a tibble or data frame.
SINGLE_GROUP_SIM_DEATH_RATE <- function(x){sapply(1:NUM_SIMS,DEATH_RATE_DEMO_GROUP_SIM,DEMO_GROUP=POP_DATA[x,],NUM_YEARS_FORWARD=NUM_YEARS_TO_SIMULATE)}
#Run each population group simulation in parallel, save the matrix of population cohorts (sex,age) in a list. The index of the list can be used to recall particular demographic group simulations as the index corresponds to the row number of the demographic data file. From there the simulated matrix can be recalled to find particular summation numbers by column number, and years by row number.
TEMP <- mclapply(1:nrow(POP_DATA),SINGLE_GROUP_SIM_DEATH_RATE, mc.cores = detectCores()-1 )
saveRDS(TEMP,SAVE_LOC,compress=FALSE)
} else{print("Death rate of population groups, by year simulation already saved on file. Skipping simulation.")}
}