Finished ARIMA, starting simulation starting work

This commit is contained in:
Alex Gebben Work 2025-11-17 17:15:48 -07:00
parent 99063bec79
commit 3941c5d6ea
9 changed files with 192 additions and 190 deletions

View File

@ -3,6 +3,10 @@ library(tidyverse) #Cleaning data
library(fixest) #Estimating a model of birth rates, to provide variance in the birth rate Monte Carlo using a fixed effect model.
library(parallel)
library(uuid) #To add a index to each batch
####If the prelimnary data needs to be reloaded run the supplied bash script to download, process, and generate all needed data sets for the Monte Carlo popopulation Simulation. Otherwise skip this step to save time
RELOAD_DATA <- TRUE
if(RELOAD_DATA){system("bash Prelim_Process.sh")}
#Load custom functions needed for the simulation
source("Scripts/Birth_Simulation_Functions.r")
source("Scripts/Monte_Carlo_Functions.r")

View File

@ -1,153 +0,0 @@
######################################DATA WORKING OFF OF DATA SCRIPT!!!!!!!!!!!!!!!!!!! Start there
#Future things to consider
###Naughton 1 & 2 natural gas conversion
###Add some variance in employment at TerraPower (odds of more or less)
###Probablity of more nuclear or power addtions
###Data center coming to the region
#Vague project with 500 electrions needed https://cowboystatedaily.com/2025/07/06/data-centers-boomer-retirements-make-electrician-the-new-it-job-in-wyoming/
#Coal mine sale possible alt use of coal https://cowboystatedaily.com/2024/09/13/kemmerer-coal-mine-operation-sold-to-southern-california-real-estate-company/
#Road project cancelled https://cowboystatedaily.com/2024/09/12/30m-project-to-move-3-miles-of-u-s-30-to-access-kemmerer-coal-stopped/
#Data center agrement https://cowboystatedaily.com/2025/01/22/kemmerer-officials-hope-terrapower-deal-could-attract-ai-data-center/
library(tidyverse)
library(forecast)
library(lmtest)
source("Scripts/Functions.r")
#source("Scripts/Load_Wyoming_Web_Data.r")
DF <- FRED_GET('WYLINC3POP','LN_POP') %>% select(-YEAR)
TS <- 1000*ts(DF,start=c(1970),end=c(2024),frequency=1)
BC <- BoxCox.lambda(TS)
MODEL <- auto.arima(TS, lambda = BC)
forecast(MODEL,h=20)
ARMA_POP <- forecast(MODEL,h=35,fan=TRUE,bootstrap = FALSE,biasadj=FALSE)
plot(ARMA_POP,main="Lincoln County Population Forecast",xlab="Year",ylab="Population")
#####City level
#See data http://eadiv.state.wy.us/pop/
KEM <- c(3273,3523,3688,3667,3626,3637,3611,3388,3156,3040,3020,3029,2989,2959,2976,2963,2910,2807,2729,2690,2657,2608,2575,2561,2574,2579,2603,2640,2679,2692,2642,2597,2551,2575,2578,2554,2544,2499,2457,2435,2413,2445,2445,2404,2378)
DIAMOND <- c(1000,1070,1114,1101,1082,1078,1063,991,916,876,864,863,847,835,835,827,808,774,748,732,705,695,690,689,695,700,710,723,738,745,731,704,677,667,652,629,613,586,559,540,523,526,527,521,517)
AREA_POP <- KEM+DIAMOND
LN <- c(12177,13254,14031,14110,14111,14319,14384,13658,12875,12552,12625,12975,13124,13329,13759,14073,14206,14099,14114,14338,14621,14697,14858,15117,15539,15917,16429,17013,17629,18082,18083,17946,17822,18148,18346,18473,18766,18899,19042,19379,19658,20174,20690,20909,21000)
# NO_CITY <- c(10095,10392,10747,10944,11043)
YEAR <- 1980:2024
####Old data addtion:Period Ends in 1970
#See in part http://eadiv.state.wy.us/demog_data/cntycity_hist.htm
LN_OLD <- c(12487,10894,10286,9023,9018,8640)
KEM_OLD <- c(843,1517,1884,2026,1667,2028,2292)
DIAMOND_OLD <- c(696,726,812,586,415,398,485)
AREA_OLD <- KEM_OLD+DIAMOND_OLD
AREA2 <- c(AREA_OLD,AREA_POP)
LN2 <- c(NA,LN_OLD,LN)
YEAR2 <- c(seq(1910,1980,by=10),1981:2024)
A <- cbind(YEAR2,LN2) %>% as_tibble %>% rename(Population=LN2) %>% mutate(Region='Lincoln County')
B <- cbind(YEAR2,AREA2) %>% as_tibble %>% rename(Population=AREA2) %>% mutate(Region='Kemmerer & Diamondvile')
DATA <- rbind(A,B) %>% rename(Year=YEAR2)
ggplot(aes(x=Year,y=Population,group=Region,color=Region),data=DATA) +geom_line(linewidth=1.5)+scale_x_continuous()+geom_vline(xintercept= 2022, linetype = "dashed", color = "red",size = 1)
###Kemmerer ARMA
KEM_TS <- DATA %>% filter(Year>=1980,Region=='Kemmerer & Diamondvile') %>% pull(Population) %>% ts(start=c(1980),end=c(2024),frequency=1)
BC <- BoxCox.lambda(KEM_TS)
MODEL2 <- auto.arima(KEM_TS, lambda = BC)
ARMA_KEM <- forecast(MODEL2,fan=TRUE,h=40,biasadj=FALSE)
plot(ARMA_KEM,main="Kemmerer & Diamondvile Population",xlab="Year",ylab="Population",ylim=c(0,6000))
###IMPLAN adjustment
#TerraPower: Commuting SAM
LOW_EMP <- 310.75 #Employees
#TerraPower: No commuting
HIGH_EMP <- 325 #Employees
#Employment in Licolin
LIN_EMP <- 625
#Regional (zip code level) ratios
EMPLOYMENT <- 2425.84
HOUSEHOLDS <- 1850.20
POPULATION <- 4038
#Half of Lincolin Employment in some low industried can be added to Kemmerer effect. For example restruants may be scaled based on current output, when new ones will be built.
ADJ_LOCAL_EMP <- 78.15
#85% commute from OUTSIDE Kemmer
#15% local labor
RATIO <- POPULATION/EMPLOYMENT
#Note that number in household will be much larger. This acounts for average commuting rates by looking at population compared to employment
LOW_GROWTH <- RATIO*LOW_EMP
RATIO*HIGH_EMP
MID_GROWTH <- RATIO*(LOW_EMP+ADJ_LOCAL_EMP)
HIGH_GROWTH <- RATIO*(HIGH_EMP+ADJ_LOCAL_EMP)
MID_GROWTH
HIGH_GROWTH
##############
TS <- EMP %>% select(EMP) %>% ts(start=c(1990),end=c(2024),frequency=1)
BC <- BoxCox.lambda(TS)
MODEL2 <- auto.arima(TS, lambda = BC)
MODEL2
ARMA_EMP <- forecast(MODEL2,h=35,fan=TRUE,bootstrap = TRUE,npaths=100000,biasadj=FALSE)
plot(ARMA_EMP,main="Lincoln County Employment",xlab="Year",ylab="Employed")
##########
DATA <- FRED_GET('BPPRIV056023','PRIV_HOUSING') %>% inner_join(FRED_GET('WYLINC3POP','LN_POP')) %>% inner_join(FRED_GET('ATNHPIUS56023A','HOUSE_PRICE_INDEX')) %>% left_join(FRED_GET('LAUCN560230000000005','EMP')) %>%left_join(FRED_GET('DCOILWTICO','WTI')) %>%left_join(FRED_GET('PCOALAUUSDM','COAL')) %>% mutate(LN_POP=1000*LN_POP) %>% select(-YEAR) %>% ts() %>% log() %>% diff()
grangertest(PRIV_HOUSING~HOUSE_PRICE_INDEX,data=DATA,order=1)
grangertest(HOUSE_PRICE_INDEX~PRIV_HOUSING,data=DATA,order=2)
grangertest(LN_POP~PRIV_HOUSING,data=DATA,order=1)
grangertest(HOUSE_PRICE_INDEX~COAL,data=DATA,order=2)
grangertest(LN_POP~WTI,data=DATA,order=1)
grangertest(COAL~WTI,data=DATA,order=2)
grangertest(WTI~COAL,data=DATA,order=2)
grangertest(LN_POP~COAL,data=DATA,order=1)
grangertest(EMP~LN_POP,data=DATA,order=2)
grangertest(PRIV_HOUSING~LN_POP,data=DATA,order=2) ## Signficant
grangertest(EMP~LN_POP,data=DATA,order=2) ## Signficant
grangertest(EMP~PRIV_HOUSING,data=DATA,order=2) ##Signficant
grangertest(EMP~LN_POP,data=DATA,order=2) ##Signficant
grangertest(PRIV_HOUSING~COAL,data=DATA,order=1) ##Signficant
grangertest(PRIV_HOUSING~COAL,data=DATA,order=2) ##Signficant
grangertest(PRIV_HOUSING~COAL,data=DATA,order=3) ##Signficant
grangertest(EMP~COAL,data=DATA,order=1) ##Signficant
grangertest(LN_POP~EMP,data=DATA,order=2)
grangertest(PRIV_HOUSING~EMP,data=DATA,order=2)
plot(DATA)
library(BVAR)
MOD <- bvar(DATA,lags=1, n_draw=40000)
opt_irf <- bv_irf(horizon = 25, identification = TRUE)
plot(irf(MOD,opt_irf,conf_bands = c(0.05, 0.1,0.15)),area=TRUE)
head(DATA)
DATA
# ATNHPIUS56023A Housing price index
# Median income MHIWY56023A052NCEN
# Employed Persons in Lincoln LAUCN560230000000005
#####Plan and ideas
#1) Review IMPLAN for industry multipliers
#2) Review IMPLAN for employment to population multipliers (imparted)
#3) Find a list of all planned new projects
#4) Use the IMPLAN multipliers for each sector to estimate total change
#5) Develop survey to estimate likelihood of new projects
#6) Compare to the ARMA percentile
#7) Adjust the ARMA up assuming some of these outputs are known.
#8) Occupancy rate from IMPLAN as a housing cap when projecting
#9) Model housing construciton rate (Maybe)
#10) Employment rate by age in IMPLAN
####Other ideas, develop larger plan? Maybe look at decline in other industries as a proportion of employment
###Seperate out Kemmer and Diamondville? http://eadiv.state.wy.us/pop/wyc&sc40.htm

View File

@ -0,0 +1,78 @@
#####Packages
library(tidyverse) #Cleaning data
library(fixest) #Estimating a model of birth rates, to provide variance in the birth rate Monte Carlo using a fixed effect model.
library(parallel)
library(uuid) #To add a index to each batch
#Load custom functions needed for the simulation
source("Scripts/Birth_Simulation_Functions.r")
source("Scripts/Monte_Carlo_Functions.r")
source("Scripts/Migration_Simulation_Functions.r")
#####User Configuration Values
KEMMER_SIM <- TRUE #Wether the simulation should predict Kemmerer (and Diamondville) or Lincoln County as a whole. TRUE, is Kemmerer False is Lincoln
START_SIM_YEAR <- 2025 #The first year to simulate
NUM_SIMULATIONS <- 10^5 #Number of Monte Carlo Simulations to run
RERUN_MORTALITY_SIMULATION <- TRUE #Rerun the Monte Carlo simulation of future mortality rates (not actual deaths) even if a Rds file of a mortality rates exists. This can be used to speed up runs when FALSE
RERUN_MIGRATION_SIMULATION <- TRUE #Rerun the ARIMA simulations that predict total migration in any year even if a Rds file of a mortality rates exists. This can be used to speed up runs when FALSE
NUM_YEARS_PROJECTED <- 50 #How many years into the future should each Monte Carlo run project to. For example 25 years if starting from 2025 and ending in 2050.
BIRTH_RATE_REG_RESULTS <- "Data/Regression_Results/Birth_Rate_Model.Rds" #Location of the regression used to model variance in birth rates in each Monte Carlo simulation.
START_DEMOGRAPHIC_DATA <- "Data/Cleaned_Data/Start_Year_Demographic_Data_With_Fertility_Groups.Rds" #Location of the data for the first year needing a forecasted birth rate, which aggregates the yearly splits of populations, into a single, year-county data set with variables for key birth prediction (total number of men and women in ages with high fertility rates), and then combines with the data set including births, deaths, migration and population.
AGE_OF_MIGRANT_PROBABILITY <- "Data/Other_Intermediate_Outputs/Migreation_Age_Probablity_One_to_Ninety.csv" #Location of the data which is the result of regression analysis of the age of migrants. That is to say 18 year olds may migrate more than 70 year olds, and this is the distribution by age. Sex was not found to be a major factor
####Run any scripts required before main Monte Carlo
source("Survival_Simulation.r") #Populate a table with a simulation of future mortality rates, for quick recall during the simulation.
#A script contains the code needed to create a feols (fixest) regression of the birth rate given age distribution. Load this saved result or else create it to use in each simulation for gathering variance of births in any given age distribution path of the Monte Carlo.
if(file.exists(BIRTH_RATE_REG_RESULTS)){MOD_BIRTHS <- readRDS(BIRTH_RATE_REG_RESULTS);FIRST_PREDICT_YEAR_POPULATION_DATA <- readRDS(START_DEMOGRAPHIC_DATA)} else{source("Birth_Rate_Regression.r")}
if(file.exists(AGE_OF_MIGRANT_PROBABILITY)){MIG_AGE_DIST <- read.csv(AGE_OF_MIGRANT_PROBABILITY)} else{source("Migration_Regression.r")}
#Rerun the migration simulation if requested
if(RERUN_MIGRATION_SIMULATION ){source("Migration_Regression.r")}
#######################################################Main Monte Carlo
START_DEM_DATA <- readRDS("Data/Cleaned_Data/Lincoln_Demographic_Data.Rds") %>% group_by(County) %>% filter(Year==2023) %>% ungroup %>% select(-County)
MORTALITY_SIMULATION <- readRDS("./Data/Simulated_Data_Sets/MORTALITY_MONTE_CARLO.Rds") #Load the Mortality simulation to speed up simulation
MIGRATION_ARIMA_SIMULATION <- readRDS("./Data/Simulated_Data_Sets/Migration_ARIMA.Rds") #Load the Migration simulation to speed up simulation
#####################!!!!!!!!!!!!!!!!!!!Working on pulling in data from the Data/Cleaned_Data/Intiate_Simulation/ directory which stores a starting point for all groups. Make data much more clean
if(KEMMER_SIM){
LN_POP_ST <- FIRST_PREDICT_YEAR_POPULATION_DATA$Population #Population of Lincoln County
START_DEM_DATA <- readRDS("Data/Cleaned_Data/Kemmerer_Demographic_Data.Rds") %>% select(-County) %>% mutate(County='Lincoln')
FIRST_PREDICT_YEAR_POPULATION_DATA <- readRDS("Data/Cleaned_Data/Kemmerer_Summary_Start_Data.Rds")
KEM_POP_ST <- FIRST_PREDICT_YEAR_POPULATION_DATA$Population
POP_ST_RATIO <- LN_POP_ST/KEM_POP_ST
MIGRATION_ARIMA_SIMS <- round(MIGRATION_ARIMA_SIMS*POP_ST_RATIO) #Downscale County migreation to the city level based on average popualtion
}
#Second run, work on making into loop and saving the output to file (check if that will slow the results). Maybe use a predefined matrix, so that the results can be stored quickly
SINGLE_PATH_SIM <- function(j){
C_RES <- RUN_SINGLE_SIM(MOD_BIRTHS,FIRST_PREDICT_YEAR_POPULATION_DATA,START_DEM_DATA,MORTALITY_SIMULATION,SIM_NUMBER=j,START_OF_SIM=START_SIM_YEAR,MIGRATION_ARIMA_SIMULATION,MIG_AGE_DIST)
RES <- C_RES[[1]]
for(i in 1:NUM_YEARS_PROJECTED){
C_RES <- RUN_SINGLE_SIM(MOD_BIRTHS,C_RES[[3]],C_RES[[2]],MORTALITY_SIMULATION,SIM_NUMBER=j,START_OF_SIM=START_SIM_YEAR,MIGRATION_ARIMA_SIMULATION,MIG_AGE_DIST)
RES <- rbind(RES,C_RES[[1]])
}
return(RES)
}
#Run the full simulation across all simulations simulating changes in demographic, and mortality data.
#Setup save results
RES_DIR <- "./Results"
RAW_SIM_FILE <- paste0(RES_DIR,"/Raw_Simulations.csv")
PERCENTILE_DATA <- paste0(RES_DIR,"/Percentile_Clean_Results.csv")
#Loop all results saving in batches
dir.create(RES_DIR,showWarnings=FALSE)
BATCH_SIZE <- 1000
NUM_RUNS <- ceiling(NUM_SIMULATIONS/BATCH_SIZE)
#Run the loop
for(x in 1:NUM_RUNS) {
BATCH_GUID <- UUIDgenerate()
try(FULL_RESULTS <- mclapply(1:BATCH_SIZE,function(x){try(SINGLE_PATH_SIM(x))},mc.cores = detectCores()-1))
if(exists("FULL_RESULTS")){
FULL_RESULTS <- do.call(rbind,lapply(1:BATCH_SIZE,function(x){FULL_RESULTS[[x]] %>% mutate(SIM_ID=UUIDgenerate())}))
FULL_RESULTS$BATCH_ID <- BATCH_GUID
FULL_RESULTS <- FULL_RESULTS%>% select(BATCH_ID,SIM_ID,everything())
if(x==1){write_csv(FULL_RESULTS,RAW_SIM_FILE)}else {write_csv(FULL_RESULTS,RAW_SIM_FILE,append=TRUE)}
rm(FULL_RESULTS)
gc()
}
}

View File

@ -74,8 +74,12 @@ DEM_DATA <- rbind(DEM_2020,DEM_DATA) %>% ungroup %>% arrange(Year,Age) %>% uniq
if(!exists("SAVE_DEMO_LOC")){SAVE_DEMO_LOC <-"./Data/Cleaned_Data/Demographic_Sex_Age_Data"}
CSV_SAVE <- paste0(SAVE_DEMO_LOC,"/CSV")
RDS_SAVE <- paste0(SAVE_DEMO_LOC,"/RDS")
#Save files for all county demographics
#Location for starting demographic data that initiates a simulation (different periods included for robustness checks)
if(!exists("STARTING_SIM_DATA_SAVE_LOC")){STARTING_SIM_DATA_SAVE_LOC <-"./Data/Intermediate_Inputs/Starting_Demographic_Data_Sets_of_Monte_Carlo/"}
dir.create(STARTING_SIM_DATA_SAVE_LOC, recursive = TRUE, showWarnings = FALSE)
#Save files for all county demographic cleaned data
dir.create(CSV_SAVE , recursive = TRUE, showWarnings = FALSE)
dir.create(RDS_SAVE , recursive = TRUE, showWarnings = FALSE)
saveRDS(DEM_DATA,paste0(RDS_SAVE,"/All_Wyoming_Counties_Demographics.Rds" ))
@ -86,19 +90,19 @@ DEM_DATA <- rbind(DEM_2020,DEM_DATA) %>% ungroup %>% arrange(Year,Age) %>% uniq
MOST_RECENT_YEAR <- max(LIN_DEM$Year)
LIN_MAT <- LIN_DEM %>% filter(Year==MOST_RECENT_YEAR) %>% select(Num_Male,Num_Female) %>% as.matrix
rownames(LIN_MAT) <- 0:85
saveRDS(LIN_MAT,paste0(RDS_SAVE,paste0("/",MOST_RECENT_YEAR,"_Starting_Lincoln_County_Demographics_Matrix.Rds") ))
saveRDS(LIN_MAT,paste0(STARTING_SIM_DATA_SAVE_LOC,MOST_RECENT_YEAR,"_Starting_Lincoln_County_Demographics_Matrix.Rds") )
LIN_MAT_OLD <- LIN_DEM %>% filter(Year==2016) %>% select(Num_Male,Num_Female) %>% as.matrix #2016 to match the mid value of Kemmerer data years
LIN_MAT_OLD[86,] <-colSums(LIN_MAT_OLD[ 86:nrow(LIN_MAT_OLD ),])
LIN_MAT_OLD <- LIN_MAT_OLD[1:86,] #Combine 85+ into one group to match the death stastics data
rownames(LIN_MAT_OLD) <- 0:85
saveRDS(LIN_MAT_OLD,paste0(RDS_SAVE,"/2016_Starting_Lincoln_County_Demographics_Matrix.Rds" ))
saveRDS(LIN_MAT_OLD,paste0(STARTING_SIM_DATA_SAVE_LOC,"2016_Starting_Lincoln_County_Demographics_Matrix.Rds" ))
LIN_MAT_VERY_OLD <- LIN_DEM %>% filter(Year==2025-40) %>% select(Num_Male,Num_Female) %>% as.matrix #1985 to match 40 years of estiamtes or the end date.
LIN_MAT_VERY_OLD[86,] <-colSums(LIN_MAT_VERY_OLD[ 86:nrow(LIN_MAT_VERY_OLD ),])
LIN_MAT_VERY_OLD <- LIN_MAT_VERY_OLD[1:86,] #Combine 85+ into one group to match the death stastics data
rownames(LIN_MAT_VERY_OLD) <- 0:85
saveRDS(LIN_MAT_VERY_OLD,paste0(RDS_SAVE,"/1985_Starting_Lincoln_County_Demographics_Matrix.Rds" ))
saveRDS(LIN_MAT_VERY_OLD,paste0(STARTING_SIM_DATA_SAVE_LOC,"1985_Starting_Lincoln_County_Demographics_Matrix.Rds" ))
saveRDS(LIN_DEM,paste0(RDS_SAVE,"/Full_Lincoln_County_Demographics.Rds" ))

View File

@ -70,9 +70,12 @@ write_csv(KEM_DEMO_DATA,paste0(CSV_SAVE,"/Kemmerer_Diamondville_Demographics.csv
#Other Lincoln area (not Kemmerer) save
saveRDS(OTHER_LIN_DEMO_DATA,paste0(RDS_SAVE,"/Other_Lincoln_Demographics.Rds" ))
write_csv(OTHER_LIN_DEMO_DATA,paste0(CSV_SAVE,"/Other_Lincoln_Demographics.csv" ))
#Matrix save
saveRDS(KEM_DEMO_MAT,paste0(RDS_SAVE,"/",MAX_YEAR,"_Starting_Kemmerer_Diamondville_Demographics_Matrix.Rds") )
saveRDS(KEM_DEMO_OLD_MAT,paste0(RDS_SAVE,"/",MED_YEAR,"_Starting_Kemmerer_Diamondville_Demographics_Matrix.Rds" ))
saveRDS(OTHER_LIN_DEM_MAT,paste0(RDS_SAVE,"/",MAX_YEAR,"_Starting_Other_Demographics_Matrix.Rds" ))
saveRDS(OTHER_LIN_DEM_OLD_MAT,paste0(RDS_SAVE,"/",MED_YEAR,"_Starting_Other_Demographics_Matrix.Rds" ))
#Matrix save of starting Monte Carlo Data sets
if(!exists("STARTING_SIM_DATA_SAVE_LOC")){STARTING_SIM_DATA_SAVE_LOC <-"./Data/Intermediate_Inputs/Starting_Demographic_Data_Sets_of_Monte_Carlo/"}
dir.create(STARTING_SIM_DATA_SAVE_LOC, recursive = TRUE, showWarnings = FALSE)
saveRDS(KEM_DEMO_MAT,paste0(STARTING_SIM_DATA_SAVE_LOC,MAX_YEAR,"_Starting_Kemmerer_Diamondville_Demographics_Matrix.Rds") )
saveRDS(KEM_DEMO_OLD_MAT,paste0(STARTING_SIM_DATA_SAVE_LOC,MED_YEAR,"_Starting_Kemmerer_Diamondville_Demographics_Matrix.Rds" ))
saveRDS(OTHER_LIN_DEM_MAT,paste0(STARTING_SIM_DATA_SAVE_LOC,MAX_YEAR,"_Starting_Other_Demographics_Matrix.Rds" ))
saveRDS(OTHER_LIN_DEM_OLD_MAT,paste0(STARTING_SIM_DATA_SAVE_LOC,MED_YEAR,"_Starting_Other_Demographics_Matrix.Rds" ))

View File

@ -3,8 +3,8 @@ library(tidyverse)
#setwd("../")
KEM_POP_LOC <- "Data/Cleaned_Data/Population_Data/RDS/Kemmerer_Diamondville_Population_Data.Rds"
OTHER_POP_LOC <- "Data/Cleaned_Data/Population_Data/RDS/Other_Lincoln_Population_Data.Rds"
KEM_POP_LOC_CSV <- "Data/Cleaned_Data/Population_Data/RDS/Kemmerer_Diamondville_Population_Data.csv"
OTHER_POP_LOC_CSV <- "Data/Cleaned_Data/Population_Data/RDS/Other_Lincoln_Population_Data.csv"
KEM_POP_LOC_CSV <- "Data/Cleaned_Data/Population_Data/CSV/Kemmerer_Diamondville_Population_Data.csv"
OTHER_POP_LOC_CSV <- "Data/Cleaned_Data/Population_Data/CSV/Other_Lincoln_Population_Data.csv"
MORT <- readRDS("Data/Cleaned_Data/Mortality_Rate_Data/RDS/Lincoln_County_Mortality_Rates.Rds")
KEM_DEM <-readRDS("Data/Cleaned_Data/Demographic_Sex_Age_Data/RDS/Kemmerer_Diamondville_Demographics.Rds")
@ -35,12 +35,15 @@ LIN_POP <- LIN_POP %>% left_join(LIN_PRED) %>% mutate(Deaths=ifelse(is.na(Deat
KEM_POP <- readRDS(KEM_POP_LOC)
KEM_DEATHS <- LIN_POP %>% select(Year,Deaths) %>% left_join(Death_Adj) %>% left_join(KEM_PRED) %>% mutate(Deaths=round(Deaths*Kem_Death_Ratio)) %>% select(Year,Deaths)
KEM_POP <- KEM_POP%>% select(-Deaths) %>% left_join(KEM_DEATHS) %>% mutate(BD=Population+Births-Deaths) %>% mutate(Missing=Population-lag(BD),Missing=lead(Missing),Migration=ifelse(is.na(Migration),Missing,Migration))%>% mutate(NEXT=Population+Births-Deaths+Migration)%>% arrange(desc(Year)) %>% select(colnames(LIN_POP))
#Rond up any values to match the fact that only whole people can exists
KEM_POP$Population <- round(KEM_POP$Population)
###Estimate the number of deaths in the other parts of Lincoln (not Kemmerer) based on the total deaths, and the predicted share of deaths
#Find migration based on the remaining missing population (after deaths,and births)
OTHER_POP <- readRDS(OTHER_POP_LOC)
OTHER_DEATHS <- LIN_POP %>% select(Year,Deaths) %>% left_join(Death_Adj) %>% left_join(OTHER_PRED) %>% mutate(Deaths=round(Deaths*Other_Death_Ratio)) %>% select(Year,Deaths)
OTHER_POP <- OTHER_POP %>% select(-Deaths) %>% left_join(OTHER_DEATHS) %>% mutate(BD=Population+Births-Deaths) %>% mutate(Missing=Population-lag(BD),Missing=lead(Missing),Migration=ifelse(is.na(Migration),Missing,Migration))%>% arrange(desc(Year)) %>% mutate(NEXT=Population+Births-Deaths+Migration)%>% select(colnames(LIN_POP)) %>% arrange(desc(Year))
OTHER_POP$Population <- round(OTHER_POP$Population)
#Save outputs
saveRDS(KEM_POP,KEM_POP_LOC)
saveRDS(OTHER_POP,OTHER_POP_LOC)

View File

@ -103,7 +103,7 @@ MIG_AGE_DIST <- MIG_AGE_DIST[1:85]
MIG_AGE_DIST <- c(MIG_AGE_DIST[1],MIG_AGE_DIST)%>% as.vector
MIG_AGE_DIST <- MIG_AGE_DIST/sum(MIG_AGE_DIST)
#Create Location to probability results
if(!exists("RES_SAVE_LOC")){RES_SAVE_LOC <- "./Data/Intermediate_Inputs/"}
if(!exists("RES_SAVE_LOC")){RES_SAVE_LOC <- "./Data/Intermediate_Inputs/Migration_Trends/"}
dir.create(RES_SAVE_LOC , recursive = TRUE, showWarnings = FALSE)
write.csv(MIG_AGE_DIST ,paste0(RES_SAVE_LOC,"Migration_Age_Probablity_Zero_to_85.csv"),row.names=FALSE)

View File

@ -1,15 +1,14 @@
library(forecast)
library(tidyverse)
library(texreg)
#setwd("../")
####Work on overall migration trends
#Could use code cleanup after trying things, but have but I have a working ARIMA to model Lincoln county migration
POP_DATA <- readRDS("Data/Cleaned_Data/Population_Data/RDS/Full_Lincoln_County_Population_Data.Rds") %>% mutate(Migration=Migration/Population)
POP_DATA <- readRDS("Data/Cleaned_Data/Population_Data/RDS/Full_Lincoln_County_Population_Data.Rds")
POP_DATA_TEST <- readRDS("Data/Cleaned_Data/Population_Data/RDS/Full_Lincoln_County_Population_Data.Rds") %>% mutate(Migration=Migration/Population)
POP_KEM_DATA <- readRDS("Data/Cleaned_Data/Population_Data/RDS/Kemmerer_Diamondville_Population_Data.Rds")
POP_OTHER_DATA <- readRDS("Data/Cleaned_Data/Population_Data/RDS/Other_Lincoln_Population_Data.Rds")
hist(POP_OTHER_DATA$Migration/POP_OTHER_DATA$Population)
hist(POP_KEM_DATA$Migration/POP_KEM_DATA$Population)
TS_DATA <- POP_DATA %>% mutate(In_Migration=ifelse(Migration>0,1,0)) %>% group_by(County) %>% arrange(County,Year) %>% mutate(Prev_Pop=lag(Population)) %>% ungroup
@ -29,9 +28,6 @@ ST_YEAR_OTHER <- min(pull(TS_OTHER_DATA %>% filter(!is.na(Migration)),Year))
END_YEAR_OTHER <- max(pull(TS_OTHER_DATA %>% filter(!is.na(Migration)),Year))
#GRAPH_DATA <- TS_DATA %>% filter(!is.na(Migration))
#GRAPH_DATA_LN <- TS_DATA %>% filter(!is.na(Migration),County=="Lincoln")
#ggplot(GRAPH_DATA,aes(x=Year,y=Migration/Prev_Pop,group=County,color=County))+geom_point()+geom_line(data=GRAPH_DATA_LN)
TS_WIDE <- TS_DATA %>% dplyr::select(Year,County,Migration) %>% pivot_wider(values_from=Migration,names_from=County) %>% arrange(Year) %>% filter(Year>ST_YEAR+1,Year<=END_YEAR) %>%ts(start=c(ST_YEAR+1),frequency=1)
TS_KEM_WIDE <- TS_KEM_DATA %>% dplyr::select(Year,County,Migration) %>% pivot_wider(values_from=Migration,names_from=County) %>% arrange(Year) %>% filter(Year>ST_YEAR+1,Year<=END_YEAR) %>%ts(start=c(ST_YEAR+1),frequency=1)
@ -41,7 +37,6 @@ TS_OTHER_WIDE <- TS_OTHER_DATA %>% dplyr::select(Year,County,Migration) %>% pivo
LN <- TS_DATA %>% dplyr::select(Year,County,Migration) %>% pivot_wider(values_from=Migration,names_from=County) %>% arrange(Year) %>% dplyr::select(Lincoln,Year) %>% filter(Year>=ST_YEAR,Year<=END_YEAR) %>% dplyr::select(-Year) %>%ts(start=c(ST_YEAR),frequency=1)
KEM <- TS_KEM_DATA %>% dplyr::select(Year,Region,Migration) %>% pivot_wider(values_from=Migration,names_from=Region) %>% arrange(Year) %>% dplyr::select('Kemmerer & Diamondville',Year) %>% filter(Year>=ST_YEAR_KEM,Year<=END_YEAR_KEM) %>% dplyr::select(-Year) %>%ts(start=c(ST_YEAR_KEM),frequency=1)
TS_OTHER_DATA
OTHER <- TS_OTHER_DATA %>% dplyr::select(Year,Region,Migration) %>% pivot_wider(values_from=Migration,names_from=Region) %>% arrange(Year) %>% dplyr::select('Lincoln Other'=Lincoln_Other,Year) %>% filter(Year>=ST_YEAR_OTHER,Year<=END_YEAR_OTHER) %>% dplyr::select(-Year) %>%ts(start=c(ST_YEAR_OTHER),frequency=1)
#Create an ARIMA of Migration so the number of people migrating can be simulated
@ -49,24 +44,73 @@ OTHER <- TS_OTHER_DATA %>% dplyr::select(Year,Region,Migration) %>% pivot_wider
#library(tseries)
#adf.test(LN,k=1) #Stationary with one lag, otherwise not stationary
#kpss.test(LN) #Stationary,default of program and has some model fit improvements
MOD <- auto.arima(LN,stationary=TRUE)
MOD <- auto.arima(LN)
MOD_KEM <- auto.arima(KEM)
MOD_OTHER <- auto.arima(OTHER)
plot(forecast(MOD ))
plot(forecast(MOD_KEM ))
plot(forecast(MOD_OTHER ))
plot(forecast(MOD,abs(KEM))
#summary(MOD)
#Validity tests
#autoplot(MOD)
#acf(resid(MOD))
#pacf(resid(MOD))
# adf.test(resid(MOD))
#checkresiduals(MOD)
#Save the resulting model outputs, will need to be changed if looking at other counties
#saveRDS(MOD,"Data/Regression_Results/LN_ARIMA_MODEL.Rds")
##Because the Lincoln county simulation has more data, it will show trends more easily. As a proxy for the subregions adjust the Kemmerer and Diamondville data such that the magnitude of the mean migration is the same portion of the magnitude of the Lincoln County data. Note that the ARIMA for Kemmerer ONLY includes a mean value so this is a reasonable way to model overall migration. In a similar way asses all non Kemmerer net migration to the other regions. This does understate migration in years where the sub-regions have opposite direction of net migrations, but works well when assuming the two regions share a portion of total county migration
LN_ADJ_KEM <- mean(abs(LN))/mean(abs(KEM))
LN_ADJ_OTHER <- 1-1/LN_ADJ_KEM
MIGRATION_ARIMA_SIMS <- (do.call(cbind,mclapply(1:NUM_SIMULATIONS,function(x){as.numeric(round(simulate(MOD,future=TRUE, nsim=NUM_YEARS_PROJECTED)))},mc.cores =detectCores()-1)))#testing a multiple run simulation could use parallel process
saveRDS(MIGRATION_ARIMA_SIMS,"Data/Simulated_Data_Sets/Migration_ARIMA.Rds")
write.csv(MIGRATION_ARIMA_SIMS,row.names=FALSE,"Data/Simulated_Data_Sets/Migration_ARIMA.csv")
#Downshift the sub-region data
KEM_NEW <- LN/LN_ADJ_KEM
OTHER_NEW <- LN/LN_ADJ_OTHER
#Create new models for migration forecasts
MOD_KEM_ADJ <- auto.arima(KEM_NEW ,stationary=TRUE)
MOD_OTHER_ADJ <- auto.arima(OTHER_NEW,stationary=TRUE)
#Save the models for use in the Monte Carlo simulation
if(!exists("SAVE_LOC_ARIMA_MODELS")){SAVE_LOC_ARIMA_MODELS <-"./Data/Intermediate_Inputs/Migration_ARIMA_Models/"}
dir.create(SAVE_LOC_ARIMA_MODELS, recursive = TRUE, showWarnings = FALSE)
saveRDS(MOD,paste0(SAVE_LOC_ARIMA_MODELS,"Full_Lincoln_County_Net_Migration_ARIMA.Rds"))
saveRDS(MOD_KEM_ADJ,paste0(SAVE_LOC_ARIMA_MODELS,"Kemmerer_Diamondville_Net_Migration_ARIMA.Rds"))
saveRDS(MOD_OTHER_ADJ,paste0(SAVE_LOC_ARIMA_MODELS,"Other_Lincoln_Net_Migration_ARIMA.Rds"))
##Save model summary results and create useful figures
if(!exists("SAVE_LOC_ARIMA_FIGURES")){SAVE_LOC_ARIMA_FIGURES <-"./Results/Migration_ARIMA/"}
dir.create(SAVE_LOC_ARIMA_FIGURES, recursive = TRUE, showWarnings = FALSE)
DICT <- list("ar1"="Autoregressive (1 Year)","ma1"="Moving Average (1 Year)","intercept"="Average Migration")
FILE_NAME <- "Migration_ARIMA_Tables"
REG_TABLE_LOC <- paste0(SAVE_LOC_ARIMA_FIGURES,FILE_NAME,".tex")
sink(REG_TABLE_LOC)
cat("\\documentclass[border=0pt]{article}","\n","\\pagestyle{empty}","\n","\\usepackage{booktabs,dcolumn}","\n","\\begin{document}")
texreg(l=list(MOD,MOD_KEM),custom.model.names=c("\\textbf{Lincoln County}","\\textbf{Kemmerer \\& Diamondville}"),table=FALSE,use.packages=FALSE,booktabs=TRUE,dcolumn=TRUE,caption.above=TRUE,custom.coef.map=DICT)
cat("\n","\\end{document}")
sink()
system(paste0("pdflatex ",REG_TABLE_LOC))
system(paste0("pdfcrop --margins '5 5 5 5' ",FILE_NAME,".pdf ",SAVE_LOC_ARIMA_FIGURES,FILE_NAME,".pdf"))
file.remove(list.files(pattern=FILE_NAME))
####Lincoln Total Results
png(paste0(SAVE_LOC_ARIMA_FIGURES,"Lincoln_County_ARIMA_Migration_Forecast.png"), res = 600, height = 12, width=16, units = "in")
plot(forecast(MOD,h=2050-2023),xlab="Year","ylab"="Net Migration",main="Lincoln County Net Migration ARIMA Forcast")
dev.off()
png(paste0(SAVE_LOC_ARIMA_FIGURES,"Lincoln_County_Migration_ARIMA_Residual_Checks.png"), res = 600, height = 12, width=16, units = "in")
checkresiduals(MOD)
dev.off()
#####Kemmerer Results
png(paste0(SAVE_LOC_ARIMA_FIGURES,"Kemmerer_and_Diamondville_ARIMA_Migration_Forecast.png"), res = 600, height = 12, width=16, units = "in")
plot(forecast(MOD_KEM,h=2050-2023),xlab="Year","ylab"="Net Migration",main="Lincoln County Net Migration ARIMA Forcast")
dev.off()
png(paste0(SAVE_LOC_ARIMA_FIGURES,"Kemmerer_and_Diamondville_Migration_ARIMA_Residual_Checks.png"), res = 600, height = 12, width=16, units = "in")
checkresiduals(MOD_KEM)
dev.off()
#####Rest of Lincoln Results
png(paste0(SAVE_LOC_ARIMA_FIGURES,"Lincoln_Other_Areas_ARIMA_Migration_Forecast.png"), res = 600, height = 12, width=16, units = "in")
plot(forecast(MOD_OTHER,h=2050-2023),xlab="Year","ylab"="Net Migration",main="Kemmerer & Diamondville Net Migration ARIMA Forcast")
dev.off()
png(paste0(SAVE_LOC_ARIMA_FIGURES,"Lincoln_Other_Areas_Migration_ARIMA_Residual_Checks.png"), res = 600, height = 12, width=16, units = "in")
checkresiduals(MOD_OTHER)
dev.off()
#MIGRATION_ARIMA_SIMS <- (do.call(cbind,mclapply(1:NUM_SIMULATIONS,function(x){as.numeric(round(simulate(MOD,future=TRUE, nsim=NUM_YEARS_PROJECTED)))},mc.cores =detectCores()-1)))#testing a multiple run simulation could use parallel process

View File

@ -0,0 +1,19 @@
#Script to increment the migration, and death data to start simulation in 2024, projecting 2025
ODDS_MIGRATE <- readRDS("Data/Intermediate_Inputs/Migration_Trends/Migration_Age_Probablity_Zero_to_85.Rds")
KEM_CURRENT <- readRDS("Data/Intermediate_Inputs/Starting_Demographic_Data_Sets_of_Monte_Carlo/2023_Starting_Kemmerer_Diamondville_Demographics_Matrix.Rds")
KEM_CURRENT_POP <- readRDS("Data/Cleaned_Data/Population_Data/RDS/Kemmerer_Diamondville_Population_Data.Rds")
num_migrated <- KEM_CURRENT_POP %>% filter(Year==2023) %>% pull(Migration) %>% abs
LENGTH <- length(KEM_CURRENT)
for(x in 1:10^5){
KEM_NEW <- KEM_CURRENT
for(i in 1:num_migrated){
MIGRATED <- sample(1:LENGTH,1,prob=1-(1-ODDS_MIGRATE)^KEM_CURRENT)
KEM_NEW[MIGRATED]
KEM_NEW[MIGRATED] <- KEM_NEW[MIGRATED]-1
}
if(x==1){RES <- KEM_CURRENT-KEM_NEW} else{RES<- cbind(RES,KEM_NEW-KEM_CURRENT)}
}
plot((rowMeans(RES)))