Population_Study/1_Run_Full_Simulation.r
2025-12-01 17:21:19 -07:00

155 lines
11 KiB
R

#####Packages
library(tidyverse) #Cleaning data
library(fixest) #Estimating a model of birth rates, to provide variance in the birth rate Monte Carlo using a fixed effect model.
library(forecast) #Fore ARIMA migration simulations
library(parallel)
library(uuid) #To add a index to each batch
####If the prelimnary data needs to be reloaded run the supplied bash script to download, process, and generate all needed data sets for the Monte Carlo population Simulation. Otherwise skip this step to save time
RELOAD_DATA <- FALSE
if(RELOAD_DATA){system("bash Prelim_Process.sh")}
#Load custom functions needed for the simulation
source("Scripts/Load_Custom_Functions/Migration_Simulation_Functions.r")
source("Scripts/Load_Custom_Functions/Birth_Simulation_Functions.r")
source("Scripts/Load_Custom_Functions/Increment_Data_Year.r")
source("Scripts/Load_Custom_Functions/Single_Age_Mortality_Trend_Simulation.r")
#######Preliminary Model Inputs
YEARS_AHEAD <- 10
NUM_SIMULATIONS <- 10^4
ST_YEAR <- 2017
################################Load Data
DEMO <- readRDS("Data/Intermediate_Inputs/Starting_Demographic_Data_Sets_of_Monte_Carlo/2016_Starting_Kemmerer_Diamondville_Demographics_Matrix.Rds")
BIRTH_MOD <- readRDS("Data/Intermediate_Inputs/Birth_Regressions/Birth_Regression_2016.Rds")
#Must add region as a factor with multiple levels for predict to work. Seems to check for multiple levels although that is not needed econometrics.
BIRTH_DATA <- readRDS("Data/Intermediate_Inputs/Birth_Regressions/Regression_Data/Birth_Simulation_Key_Starting_Points.Rds") %>% mutate(Region=factor(Region)) %>% filter(Region=='Kemmerer & Diamondville',Year==2016)
MIGRATION_ARIMA <- readRDS("Data/Intermediate_Inputs/Migration_ARIMA_Models/Kemmerer_Diamondville_Net_Migration_ARIMA_2016.Rds")
MIGRATION_ODDS <- readRDS("Data/Intermediate_Inputs/Migration_Trends/Migration_Age_Probability_Zero_to_85.Rds")
MIGRATION_MATRIX <- simulate(nsim=NUM_SIMULATIONS,MIGRATION_ARIMA,n=YEARS_AHEAD)
MIGRATION_MATRIX <- do.call(cbind, mclapply(1:NUM_SIMULATIONS,function(x)(as.vector(simulate(nsim=YEARS_AHEAD,MIGRATION_ARIMA) )),mc.cores = detectCores()-1))
rownames(MIGRATION_MATRIX) <- ST_YEAR:(ST_YEAR+YEARS_AHEAD-1)
colnames(MIGRATION_MATRIX) <- 1:NUM_SIMULATIONS
#Data for death rate trends
SINGLE_AGE_MODS <- readRDS("Data/Intermediate_Inputs/Mortality_Regression_Data/Single_Sex_Age_Time_Series_Regression_2016.Rds")
BOUNDS <- readRDS("Data/Intermediate_Inputs/Mortality_Regression_Data/Single_Sex_Age_Bounds_for_Predictions.Rds")
MAX_MALE <- BOUNDS %>% filter(Sex=='Male') %>% pull(MAX_RATE)
MIN_MALE <- BOUNDS %>% filter(Sex=='Male') %>% pull(MIN_RATE)
MAX_FEMALE <- BOUNDS %>% filter(Sex=='Female') %>% pull(MAX_RATE)
MIN_FEMALE <- BOUNDS %>% filter(Sex=='Female') %>% pull(MIN_RATE)
MIN_GAP <- BOUNDS %>% filter(Sex=='Male') %>% pull(MIN_MALE_FEMALE_GAP)
MAX_GAP <- BOUNDS %>% filter(Sex=='Male') %>% pull(MAX_MALE_FEMALE_GAP)
#Adjusted for 2016 by filtering REMOVE LATER
BASELINE_AGE_ADJUST_MEN <- readRDS("Data/Cleaned_Data/Mortality_Data/RDS/Single_Sex_Age_Population_in_2000.Rds") %>% filter(Sex=='Male',Year<=2016) %>% pull(Percent_of_Population)
BASELINE_AGE_ADJUST_WOMEN <- readRDS("Data/Cleaned_Data/Mortality_Data/RDS/Single_Sex_Age_Population_in_2000.Rds") %>% filter(Sex=='Female',Year<=2016) %>% pull(Percent_of_Population)
#Adjust to just women popualtion (Not all population percent
BASELINE_AGE_ADJUST_WOMEN <- BASELINE_AGE_ADJUST_WOMEN/sum(BASELINE_AGE_ADJUST_WOMEN )
BASELINE_AGE_ADJUST_MEN <- BASELINE_AGE_ADJUST_MEN/ sum(BASELINE_AGE_ADJUST_MEN )
MOD_MEN_ALL <- readRDS("Data/Intermediate_Inputs/Age_Mortality_ARIMA_Models/ARIMA_US_Men_Mortality_by_Age_2016.Rds")
MOD_WOMEN_ALL <- readRDS("Data/Intermediate_Inputs/Age_Mortality_ARIMA_Models/ARIMA_US_Women_Mortality_by_Age_2016.Rds")
MOD_LIN_MEN <- readRDS("Data/Intermediate_Inputs/Age_Mortality_ARIMA_Models/ARIMA_Lincoln_Men_Mortality_by_Age_2016.Rds")
MOD_LIN_WOMEN <- readRDS("Data/Intermediate_Inputs/Age_Mortality_ARIMA_Models/ARIMA_Lincoln_Women_Mortality_by_Age_2016.Rds")
XREG <- cbind(rep(0.0001,YEARS_AHEAD+1),rep(0.0001,YEARS_AHEAD+1)) #Empty data set to simulate in the future
XREG <- ts(XREG,start=ST_YEAR,frequency=1)
SIMULATED_MORTALITY_DATA_SET <- MAKE_EMPTY(ST_YEAR,ST_YEAR+YEARS_AHEAD,MOD_LIN_MEN,MOD_LIN_WOMEN,MOD_MEN_ALL,MOD_WOMEN_ALL,XREG)
MORTALITY_SIMULATION <- AGE_DIST(SINGLE_AGE_MODS,SIMULATED_MORTALITY_DATA_SET ,MAX_MALE,MAX_FEMALE,MIN_MALE,MIN_FEMALE,MAX_GAP,MIN_GAP,BASELINE_AGE_ADJUST_MEN,BASELINE_AGE_ADJUST_WOMEN)
#####################START YEAR BY SIMULATIONS
CURRENT_YEARS_AHEAD <- 1
CURRENT_SIM_NUM <- 58
ORIG_DEMO <- DEMO
DEMO <- DEMOGRAPHICS_AFTER_MIGRATION(DEMO, MIGRATION_MATRIX[CURRENT_YEARS_AHEAD ,CURRENT_SIM_NUM],MIGRATION_ODDS )
TOTAL_MIGRATION <- sum(DEMO-ORIG_DEMO)
BIRTH_DATA$Year <- BIRTH_DATA$Year+1
BIRTH_DATA$Lag_Two_Births <- BIRTH_DATA$Lag_Births
BIRTH_DATA$Lag_Births <- BIRTH_DATA$Births
BIRTH_DATA$Births <- NA
##We grab one year earlier than the window because they are one year older this year. Because the ages are from 0-85, row 18 is year 17, but one year is added making it 18 years in the current year. The birth windows are 18-28 for women and 18-30 for men.
BIRTH_DATA$Min_Birth_Group <- min(sum(DEMO[18:30,1]),sum(DEMO[18:28,2]))
NEW_BORNS <- BIRTH_SIM(BIRTH_MOD,BIRTH_DATA)
TOTAL_BIRTHS <- sum(NEW_BORNS)
BIRTH_DATA[,"Births"] <-TOTAL_BIRTHS
DEMO <- INCREMENT_AGES(DEMO,NEW_BORNS)
MALE_DEATHS <- sapply(1:86,function(x){rbinom(1,DEMO[x,1],MORTALITY_SIMULATION[[1]][x,CURRENT_YEARS_AHEAD])})
FEMALE_DEATHS <- sapply(1:86,function(x){rbinom(1,DEMO[x,2],MORTALITY_SIMULATION[[2]][x,CURRENT_YEARS_AHEAD])})
TOTAL_DEATHS <- sum(MALE_DEATHS+FEMALE_DEATHS)
DEMO[,"Num_Male"] <- DEMO[,"Num_Male"] -MALE_DEATHS
DEMO[,"Num_Female"] <- DEMO[,"Num_Female"] -MALE_DEATHS
#List of values needed for the next run or for reporting a result
list(DEMO,BIRTH_DATA,c(TOTAL_BIRTHS,TOTAL_DEATHS,TOTAL_MIGRATION))
#####User Configuration Values
KEMMER_SIM <- TRUE #Wether the simulation should predict Kemmerer (and Diamondville) or Lincoln County as a whole. TRUE, is Kemmerer False is Lincoln
START_SIM_YEAR <- 2025 #The first year to simulate
NUM_SIMULATIONS <- 10^5 #Number of Monte Carlo Simulations to run
RERUN_MORTALITY_SIMULATION <- TRUE #Rerun the Monte Carlo simulation of future mortality rates (not actual deaths) even if a Rds file of a mortality rates exists. This can be used to speed up runs when FALSE
RERUN_MIGRATION_SIMULATION <- TRUE #Rerun the ARIMA simulations that predict total migration in any year even if a Rds file of a mortality rates exists. This can be used to speed up runs when FALSE
NUM_YEARS_PROJECTED <- 50 #How many years into the future should each Monte Carlo run project to. For example 25 years if starting from 2025 and ending in 2050.
BIRTH_RATE_REG_RESULTS <- "Data/Regression_Results/Birth_Rate_Model.Rds" #Location of the regression used to model variance in birth rates in each Monte Carlo simulation.
START_DEMOGRAPHIC_DATA <- "Data/Cleaned_Data/Start_Year_Demographic_Data_With_Fertility_Groups.Rds" #Location of the data for the first year needing a forecasted birth rate, which aggregates the yearly splits of populations, into a single, year-county data set with variables for key birth prediction (total number of men and women in ages with high fertility rates), and then combines with the data set including births, deaths, migration and population.
AGE_OF_MIGRANT_PROBABILITY <- "Data/Other_Intermediate_Outputs/Migreation_Age_Probablity_One_to_Ninety.csv" #Location of the data which is the result of regression analysis of the age of migrants. That is to say 18 year olds may migrate more than 70 year olds, and this is the distribution by age. Sex was not found to be a major factor
####Run any scripts required before main Monte Carlo
source("Survival_Simulation.r") #Populate a table with a simulation of future mortality rates, for quick recall during the simulation.
#A script contains the code needed to create a feols (fixest) regression of the birth rate given age distribution. Load this saved result or else create it to use in each simulation for gathering variance of births in any given age distribution path of the Monte Carlo.
if(file.exists(BIRTH_RATE_REG_RESULTS)){MOD_BIRTHS <- readRDS(BIRTH_RATE_REG_RESULTS);FIRST_PREDICT_YEAR_POPULATION_DATA <- readRDS(START_DEMOGRAPHIC_DATA)} else{source("Birth_Rate_Regression.r")}
if(file.exists(AGE_OF_MIGRANT_PROBABILITY)){MIG_AGE_DIST <- read.csv(AGE_OF_MIGRANT_PROBABILITY)} else{source("Migration_Regression.r")}
#Rerun the migration simulation if requested
if(RERUN_MIGRATION_SIMULATION ){source("Migration_Regression.r")}
#######################################################Main Monte Carlo
START_DEM_DATA <- readRDS("Data/Cleaned_Data/Lincoln_Demographic_Data.Rds") %>% group_by(County) %>% filter(Year==2023) %>% ungroup %>% select(-County)
MORTALITY_SIMULATION <- readRDS("./Data/Simulated_Data_Sets/MORTALITY_MONTE_CARLO.Rds") #Load the Mortality simulation to speed up simulation
MIGRATION_ARIMA_SIMULATION <- readRDS("./Data/Simulated_Data_Sets/Migration_ARIMA.Rds") #Load the Migration simulation to speed up simulation
#####################!!!!!!!!!!!!!!!!!!!Working on pulling in data from the Data/Cleaned_Data/Intiate_Simulation/ directory which stores a starting point for all groups. Make data much more clean
if(KEMMER_SIM){
LN_POP_ST <- FIRST_PREDICT_YEAR_POPULATION_DATA$Population #Population of Lincoln County
START_DEM_DATA <- readRDS("Data/Cleaned_Data/Kemmerer_Demographic_Data.Rds") %>% select(-County) %>% mutate(County='Lincoln')
FIRST_PREDICT_YEAR_POPULATION_DATA <- readRDS("Data/Cleaned_Data/Kemmerer_Summary_Start_Data.Rds")
KEM_POP_ST <- FIRST_PREDICT_YEAR_POPULATION_DATA$Population
POP_ST_RATIO <- LN_POP_ST/KEM_POP_ST
MIGRATION_ARIMA_SIMS <- round(MIGRATION_ARIMA_SIMS*POP_ST_RATIO) #Downscale County migreation to the city level based on average popualtion
}
#Second run, work on making into loop and saving the output to file (check if that will slow the results). Maybe use a predefined matrix, so that the results can be stored quickly
SINGLE_PATH_SIM <- function(j){
C_RES <- RUN_SINGLE_SIM(MOD_BIRTHS,FIRST_PREDICT_YEAR_POPULATION_DATA,START_DEM_DATA,MORTALITY_SIMULATION,SIM_NUMBER=j,START_OF_SIM=START_SIM_YEAR,MIGRATION_ARIMA_SIMULATION,MIG_AGE_DIST)
RES <- C_RES[[1]]
for(i in 1:NUM_YEARS_PROJECTED){
C_RES <- RUN_SINGLE_SIM(MOD_BIRTHS,C_RES[[3]],C_RES[[2]],MORTALITY_SIMULATION,SIM_NUMBER=j,START_OF_SIM=START_SIM_YEAR,MIGRATION_ARIMA_SIMULATION,MIG_AGE_DIST)
RES <- rbind(RES,C_RES[[1]])
}
return(RES)
}
#Run the full simulation across all simulations simulating changes in demographic, and mortality data.
#Setup save results
RES_DIR <- "./Results"
RAW_SIM_FILE <- paste0(RES_DIR,"/Raw_Simulations.csv")
PERCENTILE_DATA <- paste0(RES_DIR,"/Percentile_Clean_Results.csv")
#Loop all results saving in batches
dir.create(RES_DIR,showWarnings=FALSE)
BATCH_SIZE <- 1000
NUM_RUNS <- ceiling(NUM_SIMULATIONS/BATCH_SIZE)
#Run the loop
for(x in 1:NUM_RUNS) {
BATCH_GUID <- UUIDgenerate()
try(FULL_RESULTS <- mclapply(1:BATCH_SIZE,function(x){try(SINGLE_PATH_SIM(x))},mc.cores = detectCores()-1))
if(exists("FULL_RESULTS")){
FULL_RESULTS <- do.call(rbind,lapply(1:BATCH_SIZE,function(x){FULL_RESULTS[[x]] %>% mutate(SIM_ID=UUIDgenerate())}))
FULL_RESULTS$BATCH_ID <- BATCH_GUID
FULL_RESULTS <- FULL_RESULTS%>% select(BATCH_ID,SIM_ID,everything())
if(x==1){write_csv(FULL_RESULTS,RAW_SIM_FILE)}else {write_csv(FULL_RESULTS,RAW_SIM_FILE,append=TRUE)}
rm(FULL_RESULTS)
gc()
}
}