diff --git a/1_Run_Full_Simulation.r b/1_Run_Full_Simulation.r index 6136abd..efe8324 100644 --- a/1_Run_Full_Simulation.r +++ b/1_Run_Full_Simulation.r @@ -3,6 +3,10 @@ library(tidyverse) #Cleaning data library(fixest) #Estimating a model of birth rates, to provide variance in the birth rate Monte Carlo using a fixed effect model. library(parallel) library(uuid) #To add a index to each batch +####If the prelimnary data needs to be reloaded run the supplied bash script to download, process, and generate all needed data sets for the Monte Carlo popopulation Simulation. Otherwise skip this step to save time +RELOAD_DATA <- TRUE +if(RELOAD_DATA){system("bash Prelim_Process.sh")} + #Load custom functions needed for the simulation source("Scripts/Birth_Simulation_Functions.r") source("Scripts/Monte_Carlo_Functions.r") diff --git a/ARMA_Pop.r b/ARMA_Pop.r deleted file mode 100644 index 64d873e..0000000 --- a/ARMA_Pop.r +++ /dev/null @@ -1,153 +0,0 @@ -######################################DATA WORKING OFF OF DATA SCRIPT!!!!!!!!!!!!!!!!!!! Start there - -#Future things to consider -###Naughton 1 & 2 natural gas conversion -###Add some variance in employment at TerraPower (odds of more or less) -###Probablity of more nuclear or power addtions -###Data center coming to the region - #Vague project with 500 electrions needed https://cowboystatedaily.com/2025/07/06/data-centers-boomer-retirements-make-electrician-the-new-it-job-in-wyoming/ - #Coal mine sale possible alt use of coal https://cowboystatedaily.com/2024/09/13/kemmerer-coal-mine-operation-sold-to-southern-california-real-estate-company/ - #Road project cancelled https://cowboystatedaily.com/2024/09/12/30m-project-to-move-3-miles-of-u-s-30-to-access-kemmerer-coal-stopped/ - #Data center agrement https://cowboystatedaily.com/2025/01/22/kemmerer-officials-hope-terrapower-deal-could-attract-ai-data-center/ -library(tidyverse) -library(forecast) -library(lmtest) - -source("Scripts/Functions.r") -#source("Scripts/Load_Wyoming_Web_Data.r") -DF <- FRED_GET('WYLINC3POP','LN_POP') %>% select(-YEAR) -TS <- 1000*ts(DF,start=c(1970),end=c(2024),frequency=1) -BC <- BoxCox.lambda(TS) -MODEL <- auto.arima(TS, lambda = BC) -forecast(MODEL,h=20) -ARMA_POP <- forecast(MODEL,h=35,fan=TRUE,bootstrap = FALSE,biasadj=FALSE) -plot(ARMA_POP,main="Lincoln County Population Forecast",xlab="Year",ylab="Population") -#####City level - #See data http://eadiv.state.wy.us/pop/ - KEM <- c(3273,3523,3688,3667,3626,3637,3611,3388,3156,3040,3020,3029,2989,2959,2976,2963,2910,2807,2729,2690,2657,2608,2575,2561,2574,2579,2603,2640,2679,2692,2642,2597,2551,2575,2578,2554,2544,2499,2457,2435,2413,2445,2445,2404,2378) - DIAMOND <- c(1000,1070,1114,1101,1082,1078,1063,991,916,876,864,863,847,835,835,827,808,774,748,732,705,695,690,689,695,700,710,723,738,745,731,704,677,667,652,629,613,586,559,540,523,526,527,521,517) - AREA_POP <- KEM+DIAMOND - LN <- c(12177,13254,14031,14110,14111,14319,14384,13658,12875,12552,12625,12975,13124,13329,13759,14073,14206,14099,14114,14338,14621,14697,14858,15117,15539,15917,16429,17013,17629,18082,18083,17946,17822,18148,18346,18473,18766,18899,19042,19379,19658,20174,20690,20909,21000) -# NO_CITY <- c(10095,10392,10747,10944,11043) - YEAR <- 1980:2024 -####Old data addtion:Period Ends in 1970 - #See in part http://eadiv.state.wy.us/demog_data/cntycity_hist.htm - LN_OLD <- c(12487,10894,10286,9023,9018,8640) - KEM_OLD <- c(843,1517,1884,2026,1667,2028,2292) - DIAMOND_OLD <- c(696,726,812,586,415,398,485) - AREA_OLD <- KEM_OLD+DIAMOND_OLD - AREA2 <- c(AREA_OLD,AREA_POP) - LN2 <- c(NA,LN_OLD,LN) - YEAR2 <- c(seq(1910,1980,by=10),1981:2024) - A <- cbind(YEAR2,LN2) %>% as_tibble %>% rename(Population=LN2) %>% mutate(Region='Lincoln County') - B <- cbind(YEAR2,AREA2) %>% as_tibble %>% rename(Population=AREA2) %>% mutate(Region='Kemmerer & Diamondvile') - DATA <- rbind(A,B) %>% rename(Year=YEAR2) - ggplot(aes(x=Year,y=Population,group=Region,color=Region),data=DATA) +geom_line(linewidth=1.5)+scale_x_continuous()+geom_vline(xintercept= 2022, linetype = "dashed", color = "red",size = 1) -###Kemmerer ARMA - KEM_TS <- DATA %>% filter(Year>=1980,Region=='Kemmerer & Diamondvile') %>% pull(Population) %>% ts(start=c(1980),end=c(2024),frequency=1) - BC <- BoxCox.lambda(KEM_TS) - MODEL2 <- auto.arima(KEM_TS, lambda = BC) - ARMA_KEM <- forecast(MODEL2,fan=TRUE,h=40,biasadj=FALSE) - plot(ARMA_KEM,main="Kemmerer & Diamondvile Population",xlab="Year",ylab="Population",ylim=c(0,6000)) -###IMPLAN adjustment - #TerraPower: Commuting SAM -LOW_EMP <- 310.75 #Employees - #TerraPower: No commuting -HIGH_EMP <- 325 #Employees - #Employment in Licolin -LIN_EMP <- 625 - #Regional (zip code level) ratios -EMPLOYMENT <- 2425.84 -HOUSEHOLDS <- 1850.20 -POPULATION <- 4038 - #Half of Lincolin Employment in some low industried can be added to Kemmerer effect. For example restruants may be scaled based on current output, when new ones will be built. -ADJ_LOCAL_EMP <- 78.15 - - -#85% commute from OUTSIDE Kemmer -#15% local labor - -RATIO <- POPULATION/EMPLOYMENT -#Note that number in household will be much larger. This acounts for average commuting rates by looking at population compared to employment -LOW_GROWTH <- RATIO*LOW_EMP -RATIO*HIGH_EMP -MID_GROWTH <- RATIO*(LOW_EMP+ADJ_LOCAL_EMP) -HIGH_GROWTH <- RATIO*(HIGH_EMP+ADJ_LOCAL_EMP) -MID_GROWTH -HIGH_GROWTH - -############## -TS <- EMP %>% select(EMP) %>% ts(start=c(1990),end=c(2024),frequency=1) -BC <- BoxCox.lambda(TS) -MODEL2 <- auto.arima(TS, lambda = BC) -MODEL2 -ARMA_EMP <- forecast(MODEL2,h=35,fan=TRUE,bootstrap = TRUE,npaths=100000,biasadj=FALSE) -plot(ARMA_EMP,main="Lincoln County Employment",xlab="Year",ylab="Employed") - -########## -DATA <- FRED_GET('BPPRIV056023','PRIV_HOUSING') %>% inner_join(FRED_GET('WYLINC3POP','LN_POP')) %>% inner_join(FRED_GET('ATNHPIUS56023A','HOUSE_PRICE_INDEX')) %>% left_join(FRED_GET('LAUCN560230000000005','EMP')) %>%left_join(FRED_GET('DCOILWTICO','WTI')) %>%left_join(FRED_GET('PCOALAUUSDM','COAL')) %>% mutate(LN_POP=1000*LN_POP) %>% select(-YEAR) %>% ts() %>% log() %>% diff() -grangertest(PRIV_HOUSING~HOUSE_PRICE_INDEX,data=DATA,order=1) -grangertest(HOUSE_PRICE_INDEX~PRIV_HOUSING,data=DATA,order=2) -grangertest(LN_POP~PRIV_HOUSING,data=DATA,order=1) -grangertest(HOUSE_PRICE_INDEX~COAL,data=DATA,order=2) -grangertest(LN_POP~WTI,data=DATA,order=1) -grangertest(COAL~WTI,data=DATA,order=2) -grangertest(WTI~COAL,data=DATA,order=2) -grangertest(LN_POP~COAL,data=DATA,order=1) -grangertest(EMP~LN_POP,data=DATA,order=2) - - -grangertest(PRIV_HOUSING~LN_POP,data=DATA,order=2) ## Signficant -grangertest(EMP~LN_POP,data=DATA,order=2) ## Signficant -grangertest(EMP~PRIV_HOUSING,data=DATA,order=2) ##Signficant -grangertest(EMP~LN_POP,data=DATA,order=2) ##Signficant -grangertest(PRIV_HOUSING~COAL,data=DATA,order=1) ##Signficant -grangertest(PRIV_HOUSING~COAL,data=DATA,order=2) ##Signficant -grangertest(PRIV_HOUSING~COAL,data=DATA,order=3) ##Signficant -grangertest(EMP~COAL,data=DATA,order=1) ##Signficant - - - - - - -grangertest(LN_POP~EMP,data=DATA,order=2) -grangertest(PRIV_HOUSING~EMP,data=DATA,order=2) - - - - - - -plot(DATA) -library(BVAR) -MOD <- bvar(DATA,lags=1, n_draw=40000) -opt_irf <- bv_irf(horizon = 25, identification = TRUE) -plot(irf(MOD,opt_irf,conf_bands = c(0.05, 0.1,0.15)),area=TRUE) - - - -head(DATA) - - - - -DATA -# ATNHPIUS56023A Housing price index -# Median income MHIWY56023A052NCEN -# Employed Persons in Lincoln LAUCN560230000000005 - -#####Plan and ideas -#1) Review IMPLAN for industry multipliers -#2) Review IMPLAN for employment to population multipliers (imparted) -#3) Find a list of all planned new projects -#4) Use the IMPLAN multipliers for each sector to estimate total change -#5) Develop survey to estimate likelihood of new projects -#6) Compare to the ARMA percentile -#7) Adjust the ARMA up assuming some of these outputs are known. -#8) Occupancy rate from IMPLAN as a housing cap when projecting -#9) Model housing construciton rate (Maybe) -#10) Employment rate by age in IMPLAN -####Other ideas, develop larger plan? Maybe look at decline in other industries as a proportion of employment - -###Seperate out Kemmer and Diamondville? http://eadiv.state.wy.us/pop/wyc&sc40.htm diff --git a/Old/1_Run_Full_Simulation.r b/Old/1_Run_Full_Simulation.r new file mode 100644 index 0000000..6136abd --- /dev/null +++ b/Old/1_Run_Full_Simulation.r @@ -0,0 +1,78 @@ +#####Packages +library(tidyverse) #Cleaning data +library(fixest) #Estimating a model of birth rates, to provide variance in the birth rate Monte Carlo using a fixed effect model. +library(parallel) +library(uuid) #To add a index to each batch +#Load custom functions needed for the simulation +source("Scripts/Birth_Simulation_Functions.r") +source("Scripts/Monte_Carlo_Functions.r") +source("Scripts/Migration_Simulation_Functions.r") + +#####User Configuration Values +KEMMER_SIM <- TRUE #Wether the simulation should predict Kemmerer (and Diamondville) or Lincoln County as a whole. TRUE, is Kemmerer False is Lincoln +START_SIM_YEAR <- 2025 #The first year to simulate + NUM_SIMULATIONS <- 10^5 #Number of Monte Carlo Simulations to run + RERUN_MORTALITY_SIMULATION <- TRUE #Rerun the Monte Carlo simulation of future mortality rates (not actual deaths) even if a Rds file of a mortality rates exists. This can be used to speed up runs when FALSE + RERUN_MIGRATION_SIMULATION <- TRUE #Rerun the ARIMA simulations that predict total migration in any year even if a Rds file of a mortality rates exists. This can be used to speed up runs when FALSE + + NUM_YEARS_PROJECTED <- 50 #How many years into the future should each Monte Carlo run project to. For example 25 years if starting from 2025 and ending in 2050. + BIRTH_RATE_REG_RESULTS <- "Data/Regression_Results/Birth_Rate_Model.Rds" #Location of the regression used to model variance in birth rates in each Monte Carlo simulation. + START_DEMOGRAPHIC_DATA <- "Data/Cleaned_Data/Start_Year_Demographic_Data_With_Fertility_Groups.Rds" #Location of the data for the first year needing a forecasted birth rate, which aggregates the yearly splits of populations, into a single, year-county data set with variables for key birth prediction (total number of men and women in ages with high fertility rates), and then combines with the data set including births, deaths, migration and population. + AGE_OF_MIGRANT_PROBABILITY <- "Data/Other_Intermediate_Outputs/Migreation_Age_Probablity_One_to_Ninety.csv" #Location of the data which is the result of regression analysis of the age of migrants. That is to say 18 year olds may migrate more than 70 year olds, and this is the distribution by age. Sex was not found to be a major factor +####Run any scripts required before main Monte Carlo + source("Survival_Simulation.r") #Populate a table with a simulation of future mortality rates, for quick recall during the simulation. + #A script contains the code needed to create a feols (fixest) regression of the birth rate given age distribution. Load this saved result or else create it to use in each simulation for gathering variance of births in any given age distribution path of the Monte Carlo. + if(file.exists(BIRTH_RATE_REG_RESULTS)){MOD_BIRTHS <- readRDS(BIRTH_RATE_REG_RESULTS);FIRST_PREDICT_YEAR_POPULATION_DATA <- readRDS(START_DEMOGRAPHIC_DATA)} else{source("Birth_Rate_Regression.r")} + if(file.exists(AGE_OF_MIGRANT_PROBABILITY)){MIG_AGE_DIST <- read.csv(AGE_OF_MIGRANT_PROBABILITY)} else{source("Migration_Regression.r")} + + #Rerun the migration simulation if requested +if(RERUN_MIGRATION_SIMULATION ){source("Migration_Regression.r")} +#######################################################Main Monte Carlo +START_DEM_DATA <- readRDS("Data/Cleaned_Data/Lincoln_Demographic_Data.Rds") %>% group_by(County) %>% filter(Year==2023) %>% ungroup %>% select(-County) +MORTALITY_SIMULATION <- readRDS("./Data/Simulated_Data_Sets/MORTALITY_MONTE_CARLO.Rds") #Load the Mortality simulation to speed up simulation +MIGRATION_ARIMA_SIMULATION <- readRDS("./Data/Simulated_Data_Sets/Migration_ARIMA.Rds") #Load the Migration simulation to speed up simulation + +#####################!!!!!!!!!!!!!!!!!!!Working on pulling in data from the Data/Cleaned_Data/Intiate_Simulation/ directory which stores a starting point for all groups. Make data much more clean +if(KEMMER_SIM){ + LN_POP_ST <- FIRST_PREDICT_YEAR_POPULATION_DATA$Population #Population of Lincoln County + START_DEM_DATA <- readRDS("Data/Cleaned_Data/Kemmerer_Demographic_Data.Rds") %>% select(-County) %>% mutate(County='Lincoln') + FIRST_PREDICT_YEAR_POPULATION_DATA <- readRDS("Data/Cleaned_Data/Kemmerer_Summary_Start_Data.Rds") + KEM_POP_ST <- FIRST_PREDICT_YEAR_POPULATION_DATA$Population + POP_ST_RATIO <- LN_POP_ST/KEM_POP_ST + MIGRATION_ARIMA_SIMS <- round(MIGRATION_ARIMA_SIMS*POP_ST_RATIO) #Downscale County migreation to the city level based on average popualtion +} +#Second run, work on making into loop and saving the output to file (check if that will slow the results). Maybe use a predefined matrix, so that the results can be stored quickly + +SINGLE_PATH_SIM <- function(j){ + C_RES <- RUN_SINGLE_SIM(MOD_BIRTHS,FIRST_PREDICT_YEAR_POPULATION_DATA,START_DEM_DATA,MORTALITY_SIMULATION,SIM_NUMBER=j,START_OF_SIM=START_SIM_YEAR,MIGRATION_ARIMA_SIMULATION,MIG_AGE_DIST) + RES <- C_RES[[1]] + for(i in 1:NUM_YEARS_PROJECTED){ + C_RES <- RUN_SINGLE_SIM(MOD_BIRTHS,C_RES[[3]],C_RES[[2]],MORTALITY_SIMULATION,SIM_NUMBER=j,START_OF_SIM=START_SIM_YEAR,MIGRATION_ARIMA_SIMULATION,MIG_AGE_DIST) + RES <- rbind(RES,C_RES[[1]]) + } + return(RES) +} +#Run the full simulation across all simulations simulating changes in demographic, and mortality data. + #Setup save results + RES_DIR <- "./Results" + RAW_SIM_FILE <- paste0(RES_DIR,"/Raw_Simulations.csv") + PERCENTILE_DATA <- paste0(RES_DIR,"/Percentile_Clean_Results.csv") + #Loop all results saving in batches + dir.create(RES_DIR,showWarnings=FALSE) + BATCH_SIZE <- 1000 + NUM_RUNS <- ceiling(NUM_SIMULATIONS/BATCH_SIZE) + #Run the loop +for(x in 1:NUM_RUNS) { + BATCH_GUID <- UUIDgenerate() + try(FULL_RESULTS <- mclapply(1:BATCH_SIZE,function(x){try(SINGLE_PATH_SIM(x))},mc.cores = detectCores()-1)) + if(exists("FULL_RESULTS")){ + FULL_RESULTS <- do.call(rbind,lapply(1:BATCH_SIZE,function(x){FULL_RESULTS[[x]] %>% mutate(SIM_ID=UUIDgenerate())})) + FULL_RESULTS$BATCH_ID <- BATCH_GUID + FULL_RESULTS <- FULL_RESULTS%>% select(BATCH_ID,SIM_ID,everything()) + if(x==1){write_csv(FULL_RESULTS,RAW_SIM_FILE)}else {write_csv(FULL_RESULTS,RAW_SIM_FILE,append=TRUE)} + rm(FULL_RESULTS) + gc() +} +} + + diff --git a/Scripts/1C_Download_and_Process_Demographic_Data.r b/Scripts/1C_Download_and_Process_Demographic_Data.r index 961616e..3e62049 100644 --- a/Scripts/1C_Download_and_Process_Demographic_Data.r +++ b/Scripts/1C_Download_and_Process_Demographic_Data.r @@ -74,8 +74,12 @@ DEM_DATA <- rbind(DEM_2020,DEM_DATA) %>% ungroup %>% arrange(Year,Age) %>% uniq if(!exists("SAVE_DEMO_LOC")){SAVE_DEMO_LOC <-"./Data/Cleaned_Data/Demographic_Sex_Age_Data"} CSV_SAVE <- paste0(SAVE_DEMO_LOC,"/CSV") RDS_SAVE <- paste0(SAVE_DEMO_LOC,"/RDS") - - #Save files for all county demographics + #Location for starting demographic data that initiates a simulation (different periods included for robustness checks) + if(!exists("STARTING_SIM_DATA_SAVE_LOC")){STARTING_SIM_DATA_SAVE_LOC <-"./Data/Intermediate_Inputs/Starting_Demographic_Data_Sets_of_Monte_Carlo/"} +dir.create(STARTING_SIM_DATA_SAVE_LOC, recursive = TRUE, showWarnings = FALSE) + + + #Save files for all county demographic cleaned data dir.create(CSV_SAVE , recursive = TRUE, showWarnings = FALSE) dir.create(RDS_SAVE , recursive = TRUE, showWarnings = FALSE) saveRDS(DEM_DATA,paste0(RDS_SAVE,"/All_Wyoming_Counties_Demographics.Rds" )) @@ -86,19 +90,19 @@ DEM_DATA <- rbind(DEM_2020,DEM_DATA) %>% ungroup %>% arrange(Year,Age) %>% uniq MOST_RECENT_YEAR <- max(LIN_DEM$Year) LIN_MAT <- LIN_DEM %>% filter(Year==MOST_RECENT_YEAR) %>% select(Num_Male,Num_Female) %>% as.matrix rownames(LIN_MAT) <- 0:85 - saveRDS(LIN_MAT,paste0(RDS_SAVE,paste0("/",MOST_RECENT_YEAR,"_Starting_Lincoln_County_Demographics_Matrix.Rds") )) + saveRDS(LIN_MAT,paste0(STARTING_SIM_DATA_SAVE_LOC,MOST_RECENT_YEAR,"_Starting_Lincoln_County_Demographics_Matrix.Rds") ) LIN_MAT_OLD <- LIN_DEM %>% filter(Year==2016) %>% select(Num_Male,Num_Female) %>% as.matrix #2016 to match the mid value of Kemmerer data years LIN_MAT_OLD[86,] <-colSums(LIN_MAT_OLD[ 86:nrow(LIN_MAT_OLD ),]) LIN_MAT_OLD <- LIN_MAT_OLD[1:86,] #Combine 85+ into one group to match the death stastics data rownames(LIN_MAT_OLD) <- 0:85 - saveRDS(LIN_MAT_OLD,paste0(RDS_SAVE,"/2016_Starting_Lincoln_County_Demographics_Matrix.Rds" )) + saveRDS(LIN_MAT_OLD,paste0(STARTING_SIM_DATA_SAVE_LOC,"2016_Starting_Lincoln_County_Demographics_Matrix.Rds" )) LIN_MAT_VERY_OLD <- LIN_DEM %>% filter(Year==2025-40) %>% select(Num_Male,Num_Female) %>% as.matrix #1985 to match 40 years of estiamtes or the end date. LIN_MAT_VERY_OLD[86,] <-colSums(LIN_MAT_VERY_OLD[ 86:nrow(LIN_MAT_VERY_OLD ),]) LIN_MAT_VERY_OLD <- LIN_MAT_VERY_OLD[1:86,] #Combine 85+ into one group to match the death stastics data rownames(LIN_MAT_VERY_OLD) <- 0:85 - saveRDS(LIN_MAT_VERY_OLD,paste0(RDS_SAVE,"/1985_Starting_Lincoln_County_Demographics_Matrix.Rds" )) + saveRDS(LIN_MAT_VERY_OLD,paste0(STARTING_SIM_DATA_SAVE_LOC,"1985_Starting_Lincoln_County_Demographics_Matrix.Rds" )) saveRDS(LIN_DEM,paste0(RDS_SAVE,"/Full_Lincoln_County_Demographics.Rds" )) diff --git a/Scripts/1D_Use_ACS_Census_Data_to_Estimate_Kemmerer_Demographics.r b/Scripts/1D_Use_ACS_Census_Data_to_Estimate_Kemmerer_Demographics.r index 889856d..f0e95c2 100644 --- a/Scripts/1D_Use_ACS_Census_Data_to_Estimate_Kemmerer_Demographics.r +++ b/Scripts/1D_Use_ACS_Census_Data_to_Estimate_Kemmerer_Demographics.r @@ -70,9 +70,12 @@ write_csv(KEM_DEMO_DATA,paste0(CSV_SAVE,"/Kemmerer_Diamondville_Demographics.csv #Other Lincoln area (not Kemmerer) save saveRDS(OTHER_LIN_DEMO_DATA,paste0(RDS_SAVE,"/Other_Lincoln_Demographics.Rds" )) write_csv(OTHER_LIN_DEMO_DATA,paste0(CSV_SAVE,"/Other_Lincoln_Demographics.csv" )) -#Matrix save -saveRDS(KEM_DEMO_MAT,paste0(RDS_SAVE,"/",MAX_YEAR,"_Starting_Kemmerer_Diamondville_Demographics_Matrix.Rds") ) -saveRDS(KEM_DEMO_OLD_MAT,paste0(RDS_SAVE,"/",MED_YEAR,"_Starting_Kemmerer_Diamondville_Demographics_Matrix.Rds" )) -saveRDS(OTHER_LIN_DEM_MAT,paste0(RDS_SAVE,"/",MAX_YEAR,"_Starting_Other_Demographics_Matrix.Rds" )) -saveRDS(OTHER_LIN_DEM_OLD_MAT,paste0(RDS_SAVE,"/",MED_YEAR,"_Starting_Other_Demographics_Matrix.Rds" )) +#Matrix save of starting Monte Carlo Data sets +if(!exists("STARTING_SIM_DATA_SAVE_LOC")){STARTING_SIM_DATA_SAVE_LOC <-"./Data/Intermediate_Inputs/Starting_Demographic_Data_Sets_of_Monte_Carlo/"} +dir.create(STARTING_SIM_DATA_SAVE_LOC, recursive = TRUE, showWarnings = FALSE) + +saveRDS(KEM_DEMO_MAT,paste0(STARTING_SIM_DATA_SAVE_LOC,MAX_YEAR,"_Starting_Kemmerer_Diamondville_Demographics_Matrix.Rds") ) +saveRDS(KEM_DEMO_OLD_MAT,paste0(STARTING_SIM_DATA_SAVE_LOC,MED_YEAR,"_Starting_Kemmerer_Diamondville_Demographics_Matrix.Rds" )) +saveRDS(OTHER_LIN_DEM_MAT,paste0(STARTING_SIM_DATA_SAVE_LOC,MAX_YEAR,"_Starting_Other_Demographics_Matrix.Rds" )) +saveRDS(OTHER_LIN_DEM_OLD_MAT,paste0(STARTING_SIM_DATA_SAVE_LOC,MED_YEAR,"_Starting_Other_Demographics_Matrix.Rds" )) diff --git a/Scripts/2B_Impart_Deaths_and_Migration_to_Subregions.r b/Scripts/2B_Impart_Deaths_and_Migration_to_Subregions.r index 1904950..6476f0e 100644 --- a/Scripts/2B_Impart_Deaths_and_Migration_to_Subregions.r +++ b/Scripts/2B_Impart_Deaths_and_Migration_to_Subregions.r @@ -3,8 +3,8 @@ library(tidyverse) #setwd("../") KEM_POP_LOC <- "Data/Cleaned_Data/Population_Data/RDS/Kemmerer_Diamondville_Population_Data.Rds" OTHER_POP_LOC <- "Data/Cleaned_Data/Population_Data/RDS/Other_Lincoln_Population_Data.Rds" -KEM_POP_LOC_CSV <- "Data/Cleaned_Data/Population_Data/RDS/Kemmerer_Diamondville_Population_Data.csv" -OTHER_POP_LOC_CSV <- "Data/Cleaned_Data/Population_Data/RDS/Other_Lincoln_Population_Data.csv" +KEM_POP_LOC_CSV <- "Data/Cleaned_Data/Population_Data/CSV/Kemmerer_Diamondville_Population_Data.csv" +OTHER_POP_LOC_CSV <- "Data/Cleaned_Data/Population_Data/CSV/Other_Lincoln_Population_Data.csv" MORT <- readRDS("Data/Cleaned_Data/Mortality_Rate_Data/RDS/Lincoln_County_Mortality_Rates.Rds") KEM_DEM <-readRDS("Data/Cleaned_Data/Demographic_Sex_Age_Data/RDS/Kemmerer_Diamondville_Demographics.Rds") @@ -35,12 +35,15 @@ LIN_POP <- LIN_POP %>% left_join(LIN_PRED) %>% mutate(Deaths=ifelse(is.na(Deat KEM_POP <- readRDS(KEM_POP_LOC) KEM_DEATHS <- LIN_POP %>% select(Year,Deaths) %>% left_join(Death_Adj) %>% left_join(KEM_PRED) %>% mutate(Deaths=round(Deaths*Kem_Death_Ratio)) %>% select(Year,Deaths) KEM_POP <- KEM_POP%>% select(-Deaths) %>% left_join(KEM_DEATHS) %>% mutate(BD=Population+Births-Deaths) %>% mutate(Missing=Population-lag(BD),Missing=lead(Missing),Migration=ifelse(is.na(Migration),Missing,Migration))%>% mutate(NEXT=Population+Births-Deaths+Migration)%>% arrange(desc(Year)) %>% select(colnames(LIN_POP)) +#Rond up any values to match the fact that only whole people can exists +KEM_POP$Population <- round(KEM_POP$Population) ###Estimate the number of deaths in the other parts of Lincoln (not Kemmerer) based on the total deaths, and the predicted share of deaths #Find migration based on the remaining missing population (after deaths,and births) OTHER_POP <- readRDS(OTHER_POP_LOC) OTHER_DEATHS <- LIN_POP %>% select(Year,Deaths) %>% left_join(Death_Adj) %>% left_join(OTHER_PRED) %>% mutate(Deaths=round(Deaths*Other_Death_Ratio)) %>% select(Year,Deaths) OTHER_POP <- OTHER_POP %>% select(-Deaths) %>% left_join(OTHER_DEATHS) %>% mutate(BD=Population+Births-Deaths) %>% mutate(Missing=Population-lag(BD),Missing=lead(Missing),Migration=ifelse(is.na(Migration),Missing,Migration))%>% arrange(desc(Year)) %>% mutate(NEXT=Population+Births-Deaths+Migration)%>% select(colnames(LIN_POP)) %>% arrange(desc(Year)) +OTHER_POP$Population <- round(OTHER_POP$Population) #Save outputs saveRDS(KEM_POP,KEM_POP_LOC) saveRDS(OTHER_POP,OTHER_POP_LOC) diff --git a/Scripts/2C_Migration_by_Age_Regression.r b/Scripts/2C_Migration_by_Age_Regression.r index bcb9eae..0376ae2 100644 --- a/Scripts/2C_Migration_by_Age_Regression.r +++ b/Scripts/2C_Migration_by_Age_Regression.r @@ -103,7 +103,7 @@ MIG_AGE_DIST <- MIG_AGE_DIST[1:85] MIG_AGE_DIST <- c(MIG_AGE_DIST[1],MIG_AGE_DIST)%>% as.vector MIG_AGE_DIST <- MIG_AGE_DIST/sum(MIG_AGE_DIST) #Create Location to probability results -if(!exists("RES_SAVE_LOC")){RES_SAVE_LOC <- "./Data/Intermediate_Inputs/"} +if(!exists("RES_SAVE_LOC")){RES_SAVE_LOC <- "./Data/Intermediate_Inputs/Migration_Trends/"} dir.create(RES_SAVE_LOC , recursive = TRUE, showWarnings = FALSE) write.csv(MIG_AGE_DIST ,paste0(RES_SAVE_LOC,"Migration_Age_Probablity_Zero_to_85.csv"),row.names=FALSE) diff --git a/Scripts/2D_Overall_Migration_ARIMA.r b/Scripts/2D_Overall_Migration_ARIMA.r index 309bbf9..df59d42 100644 --- a/Scripts/2D_Overall_Migration_ARIMA.r +++ b/Scripts/2D_Overall_Migration_ARIMA.r @@ -1,15 +1,14 @@ library(forecast) library(tidyverse) +library(texreg) #setwd("../") ####Work on overall migration trends #Could use code cleanup after trying things, but have but I have a working ARIMA to model Lincoln county migration -POP_DATA <- readRDS("Data/Cleaned_Data/Population_Data/RDS/Full_Lincoln_County_Population_Data.Rds") %>% mutate(Migration=Migration/Population) +POP_DATA <- readRDS("Data/Cleaned_Data/Population_Data/RDS/Full_Lincoln_County_Population_Data.Rds") POP_DATA_TEST <- readRDS("Data/Cleaned_Data/Population_Data/RDS/Full_Lincoln_County_Population_Data.Rds") %>% mutate(Migration=Migration/Population) POP_KEM_DATA <- readRDS("Data/Cleaned_Data/Population_Data/RDS/Kemmerer_Diamondville_Population_Data.Rds") POP_OTHER_DATA <- readRDS("Data/Cleaned_Data/Population_Data/RDS/Other_Lincoln_Population_Data.Rds") -hist(POP_OTHER_DATA$Migration/POP_OTHER_DATA$Population) -hist(POP_KEM_DATA$Migration/POP_KEM_DATA$Population) TS_DATA <- POP_DATA %>% mutate(In_Migration=ifelse(Migration>0,1,0)) %>% group_by(County) %>% arrange(County,Year) %>% mutate(Prev_Pop=lag(Population)) %>% ungroup @@ -29,9 +28,6 @@ ST_YEAR_OTHER <- min(pull(TS_OTHER_DATA %>% filter(!is.na(Migration)),Year)) END_YEAR_OTHER <- max(pull(TS_OTHER_DATA %>% filter(!is.na(Migration)),Year)) -#GRAPH_DATA <- TS_DATA %>% filter(!is.na(Migration)) -#GRAPH_DATA_LN <- TS_DATA %>% filter(!is.na(Migration),County=="Lincoln") -#ggplot(GRAPH_DATA,aes(x=Year,y=Migration/Prev_Pop,group=County,color=County))+geom_point()+geom_line(data=GRAPH_DATA_LN) TS_WIDE <- TS_DATA %>% dplyr::select(Year,County,Migration) %>% pivot_wider(values_from=Migration,names_from=County) %>% arrange(Year) %>% filter(Year>ST_YEAR+1,Year<=END_YEAR) %>%ts(start=c(ST_YEAR+1),frequency=1) TS_KEM_WIDE <- TS_KEM_DATA %>% dplyr::select(Year,County,Migration) %>% pivot_wider(values_from=Migration,names_from=County) %>% arrange(Year) %>% filter(Year>ST_YEAR+1,Year<=END_YEAR) %>%ts(start=c(ST_YEAR+1),frequency=1) @@ -41,7 +37,6 @@ TS_OTHER_WIDE <- TS_OTHER_DATA %>% dplyr::select(Year,County,Migration) %>% pivo LN <- TS_DATA %>% dplyr::select(Year,County,Migration) %>% pivot_wider(values_from=Migration,names_from=County) %>% arrange(Year) %>% dplyr::select(Lincoln,Year) %>% filter(Year>=ST_YEAR,Year<=END_YEAR) %>% dplyr::select(-Year) %>%ts(start=c(ST_YEAR),frequency=1) KEM <- TS_KEM_DATA %>% dplyr::select(Year,Region,Migration) %>% pivot_wider(values_from=Migration,names_from=Region) %>% arrange(Year) %>% dplyr::select('Kemmerer & Diamondville',Year) %>% filter(Year>=ST_YEAR_KEM,Year<=END_YEAR_KEM) %>% dplyr::select(-Year) %>%ts(start=c(ST_YEAR_KEM),frequency=1) -TS_OTHER_DATA OTHER <- TS_OTHER_DATA %>% dplyr::select(Year,Region,Migration) %>% pivot_wider(values_from=Migration,names_from=Region) %>% arrange(Year) %>% dplyr::select('Lincoln Other'=Lincoln_Other,Year) %>% filter(Year>=ST_YEAR_OTHER,Year<=END_YEAR_OTHER) %>% dplyr::select(-Year) %>%ts(start=c(ST_YEAR_OTHER),frequency=1) #Create an ARIMA of Migration so the number of people migrating can be simulated @@ -49,24 +44,73 @@ OTHER <- TS_OTHER_DATA %>% dplyr::select(Year,Region,Migration) %>% pivot_wider #library(tseries) #adf.test(LN,k=1) #Stationary with one lag, otherwise not stationary #kpss.test(LN) #Stationary,default of program and has some model fit improvements -MOD <- auto.arima(LN,stationary=TRUE) +MOD <- auto.arima(LN) MOD_KEM <- auto.arima(KEM) MOD_OTHER <- auto.arima(OTHER) -plot(forecast(MOD )) -plot(forecast(MOD_KEM )) -plot(forecast(MOD_OTHER )) -plot(forecast(MOD,abs(KEM)) -#summary(MOD) -#Validity tests -#autoplot(MOD) -#acf(resid(MOD)) -#pacf(resid(MOD)) -# adf.test(resid(MOD)) -#checkresiduals(MOD) -#Save the resulting model outputs, will need to be changed if looking at other counties -#saveRDS(MOD,"Data/Regression_Results/LN_ARIMA_MODEL.Rds") +##Because the Lincoln county simulation has more data, it will show trends more easily. As a proxy for the subregions adjust the Kemmerer and Diamondville data such that the magnitude of the mean migration is the same portion of the magnitude of the Lincoln County data. Note that the ARIMA for Kemmerer ONLY includes a mean value so this is a reasonable way to model overall migration. In a similar way asses all non Kemmerer net migration to the other regions. This does understate migration in years where the sub-regions have opposite direction of net migrations, but works well when assuming the two regions share a portion of total county migration +LN_ADJ_KEM <- mean(abs(LN))/mean(abs(KEM)) +LN_ADJ_OTHER <- 1-1/LN_ADJ_KEM -MIGRATION_ARIMA_SIMS <- (do.call(cbind,mclapply(1:NUM_SIMULATIONS,function(x){as.numeric(round(simulate(MOD,future=TRUE, nsim=NUM_YEARS_PROJECTED)))},mc.cores =detectCores()-1)))#testing a multiple run simulation could use parallel process -saveRDS(MIGRATION_ARIMA_SIMS,"Data/Simulated_Data_Sets/Migration_ARIMA.Rds") -write.csv(MIGRATION_ARIMA_SIMS,row.names=FALSE,"Data/Simulated_Data_Sets/Migration_ARIMA.csv") +#Downshift the sub-region data +KEM_NEW <- LN/LN_ADJ_KEM +OTHER_NEW <- LN/LN_ADJ_OTHER +#Create new models for migration forecasts +MOD_KEM_ADJ <- auto.arima(KEM_NEW ,stationary=TRUE) +MOD_OTHER_ADJ <- auto.arima(OTHER_NEW,stationary=TRUE) +#Save the models for use in the Monte Carlo simulation + if(!exists("SAVE_LOC_ARIMA_MODELS")){SAVE_LOC_ARIMA_MODELS <-"./Data/Intermediate_Inputs/Migration_ARIMA_Models/"} + dir.create(SAVE_LOC_ARIMA_MODELS, recursive = TRUE, showWarnings = FALSE) +saveRDS(MOD,paste0(SAVE_LOC_ARIMA_MODELS,"Full_Lincoln_County_Net_Migration_ARIMA.Rds")) +saveRDS(MOD_KEM_ADJ,paste0(SAVE_LOC_ARIMA_MODELS,"Kemmerer_Diamondville_Net_Migration_ARIMA.Rds")) +saveRDS(MOD_OTHER_ADJ,paste0(SAVE_LOC_ARIMA_MODELS,"Other_Lincoln_Net_Migration_ARIMA.Rds")) + + + +##Save model summary results and create useful figures + if(!exists("SAVE_LOC_ARIMA_FIGURES")){SAVE_LOC_ARIMA_FIGURES <-"./Results/Migration_ARIMA/"} + dir.create(SAVE_LOC_ARIMA_FIGURES, recursive = TRUE, showWarnings = FALSE) + + +DICT <- list("ar1"="Autoregressive (1 Year)","ma1"="Moving Average (1 Year)","intercept"="Average Migration") +FILE_NAME <- "Migration_ARIMA_Tables" +REG_TABLE_LOC <- paste0(SAVE_LOC_ARIMA_FIGURES,FILE_NAME,".tex") +sink(REG_TABLE_LOC) +cat("\\documentclass[border=0pt]{article}","\n","\\pagestyle{empty}","\n","\\usepackage{booktabs,dcolumn}","\n","\\begin{document}") +texreg(l=list(MOD,MOD_KEM),custom.model.names=c("\\textbf{Lincoln County}","\\textbf{Kemmerer \\& Diamondville}"),table=FALSE,use.packages=FALSE,booktabs=TRUE,dcolumn=TRUE,caption.above=TRUE,custom.coef.map=DICT) +cat("\n","\\end{document}") +sink() +system(paste0("pdflatex ",REG_TABLE_LOC)) +system(paste0("pdfcrop --margins '5 5 5 5' ",FILE_NAME,".pdf ",SAVE_LOC_ARIMA_FIGURES,FILE_NAME,".pdf")) +file.remove(list.files(pattern=FILE_NAME)) + +####Lincoln Total Results +png(paste0(SAVE_LOC_ARIMA_FIGURES,"Lincoln_County_ARIMA_Migration_Forecast.png"), res = 600, height = 12, width=16, units = "in") + plot(forecast(MOD,h=2050-2023),xlab="Year","ylab"="Net Migration",main="Lincoln County Net Migration ARIMA Forcast") +dev.off() + +png(paste0(SAVE_LOC_ARIMA_FIGURES,"Lincoln_County_Migration_ARIMA_Residual_Checks.png"), res = 600, height = 12, width=16, units = "in") + checkresiduals(MOD) +dev.off() + +#####Kemmerer Results +png(paste0(SAVE_LOC_ARIMA_FIGURES,"Kemmerer_and_Diamondville_ARIMA_Migration_Forecast.png"), res = 600, height = 12, width=16, units = "in") + plot(forecast(MOD_KEM,h=2050-2023),xlab="Year","ylab"="Net Migration",main="Lincoln County Net Migration ARIMA Forcast") +dev.off() + +png(paste0(SAVE_LOC_ARIMA_FIGURES,"Kemmerer_and_Diamondville_Migration_ARIMA_Residual_Checks.png"), res = 600, height = 12, width=16, units = "in") + checkresiduals(MOD_KEM) +dev.off() + +#####Rest of Lincoln Results +png(paste0(SAVE_LOC_ARIMA_FIGURES,"Lincoln_Other_Areas_ARIMA_Migration_Forecast.png"), res = 600, height = 12, width=16, units = "in") + plot(forecast(MOD_OTHER,h=2050-2023),xlab="Year","ylab"="Net Migration",main="Kemmerer & Diamondville Net Migration ARIMA Forcast") +dev.off() + +png(paste0(SAVE_LOC_ARIMA_FIGURES,"Lincoln_Other_Areas_Migration_ARIMA_Residual_Checks.png"), res = 600, height = 12, width=16, units = "in") + checkresiduals(MOD_OTHER) +dev.off() + + + +#MIGRATION_ARIMA_SIMS <- (do.call(cbind,mclapply(1:NUM_SIMULATIONS,function(x){as.numeric(round(simulate(MOD,future=TRUE, nsim=NUM_YEARS_PROJECTED)))},mc.cores =detectCores()-1)))#testing a multiple run simulation could use parallel process diff --git a/Scripts/2E_Move_Current_Demographic_Data_to_Current_Year.r b/Scripts/2E_Move_Current_Demographic_Data_to_Current_Year.r new file mode 100644 index 0000000..22b909d --- /dev/null +++ b/Scripts/2E_Move_Current_Demographic_Data_to_Current_Year.r @@ -0,0 +1,19 @@ +#Script to increment the migration, and death data to start simulation in 2024, projecting 2025 +ODDS_MIGRATE <- readRDS("Data/Intermediate_Inputs/Migration_Trends/Migration_Age_Probablity_Zero_to_85.Rds") +KEM_CURRENT <- readRDS("Data/Intermediate_Inputs/Starting_Demographic_Data_Sets_of_Monte_Carlo/2023_Starting_Kemmerer_Diamondville_Demographics_Matrix.Rds") +KEM_CURRENT_POP <- readRDS("Data/Cleaned_Data/Population_Data/RDS/Kemmerer_Diamondville_Population_Data.Rds") +num_migrated <- KEM_CURRENT_POP %>% filter(Year==2023) %>% pull(Migration) %>% abs + +LENGTH <- length(KEM_CURRENT) + +for(x in 1:10^5){ +KEM_NEW <- KEM_CURRENT +for(i in 1:num_migrated){ + MIGRATED <- sample(1:LENGTH,1,prob=1-(1-ODDS_MIGRATE)^KEM_CURRENT) + KEM_NEW[MIGRATED] + KEM_NEW[MIGRATED] <- KEM_NEW[MIGRATED]-1 +} + if(x==1){RES <- KEM_CURRENT-KEM_NEW} else{RES<- cbind(RES,KEM_NEW-KEM_CURRENT)} +} +plot((rowMeans(RES))) +