From e60fa6fd53857c9d8d6eae8f56c305849ddc1c0a Mon Sep 17 00:00:00 2001
From: Alex Gebben Work <agebben@uwyo.edu>
Date: Mon, 3 Nov 2025 17:04:15 -0700
Subject: [PATCH] Working on Kemmerer Conversion form Lincoln data

---
 1_Run_Full_Simulation.r              | 19 +++++--
 Birth_Rate_Regression.r              |  2 +-
 Scripts/Birth_Simulation_Functions.r |  1 -
 Scripts/Monte_Carlo_Functions.r      |  3 +-
 Zip_Code.r                           | 84 +++++++++++++---------------
 5 files changed, 57 insertions(+), 52 deletions(-)

diff --git a/1_Run_Full_Simulation.r b/1_Run_Full_Simulation.r
index f871d47..2e152a1 100644
--- a/1_Run_Full_Simulation.r
+++ b/1_Run_Full_Simulation.r
@@ -9,6 +9,8 @@ source("Scripts/Monte_Carlo_Functions.r")
 source("Scripts/Migration_Simulation_Functions.r")
 
 #####User Configuration Values
+KEMMER_SIM <- TRUE #Wether the simulation should predict Kemmerer (and Diamondville) or Lincoln County as a whole. TRUE, is Kemmerer False is Lincoln
+START_SIM_YEAR <- 2025 #The first year to simulate
 	NUM_SIMULATIONS <- 10^5 #Number of Monte Carlo Simulations to run
 	RERUN_MORTALITY_SIMULATION <- TRUE #Rerun the Monte Carlo simulation of future mortality rates (not actual deaths) even if a Rds file of a mortality rates exists. This can be used to speed up runs when FALSE
 	RERUN_MIGRATION_SIMULATION <- TRUE #Rerun the ARIMA simulations that predict total migration in any year even if a Rds file of a mortality rates exists. This can be used to speed up runs when FALSE
@@ -17,7 +19,7 @@ source("Scripts/Migration_Simulation_Functions.r")
 	BIRTH_RATE_REG_RESULTS <- "Data/Regression_Results/Birth_Rate_Model.Rds" #Location of the regression used to model variance in birth rates in each Monte Carlo simulation.
 	START_DEMOGRAPHIC_DATA <- "Data/Cleaned_Data/Start_Year_Demographic_Data_With_Fertility_Groups.Rds" #Location of the data for the first year needing a forecasted birth rate, which aggregates the yearly splits of populations, into a single, year-county data set with variables for key birth prediction (total number of men and women in ages with high fertility rates), and then combines with the data set including births, deaths, migration and population.
 	AGE_OF_MIGRANT_PROBABILITY <- "Data/Other_Intermediate_Outputs/Migreation_Age_Probablity_One_to_Ninety.csv" #Location of the data which is the result of regression analysis of the age of migrants. That is to say 18 year olds may migrate more than 70 year olds, and this is the distribution by age. Sex was not found to be a major factor
-####Run any scirpts required before main Monte Carlo 
+####Run any scripts required before main Monte Carlo 
 	source("Survival_Simulation.r") #Populate a table with a simulation of future mortality rates, for quick recall during the simulation.
 	#A script contains the code needed to create a feols (fixest) regression of the birth rate given age distribution. Load this saved result or else create it to use in each simulation for gathering variance of births in any given age distribution path of the Monte Carlo.
 	if(file.exists(BIRTH_RATE_REG_RESULTS)){MOD_BIRTHS <- readRDS(BIRTH_RATE_REG_RESULTS);FIRST_PREDICT_YEAR_POPULATION_DATA <- readRDS(START_DEMOGRAPHIC_DATA)} else{source("Birth_Rate_Regression.r")}
@@ -25,18 +27,25 @@ source("Scripts/Migration_Simulation_Functions.r")
 
 	#Rerun the migration simulation if requested
 if(RERUN_MIGRATION_SIMULATION ){source("Migration_Regression.r")}
-
 #######################################################Main Monte Carlo
 START_DEM_DATA <- readRDS("Data/Cleaned_Data/Lincoln_Demographic_Data.Rds") %>% group_by(County) %>% filter(Year==2023) %>% ungroup %>% select(-County)
 MORTALITY_SIMULATION <- readRDS("./Data/Simulated_Data_Sets/MORTALITY_MONTE_CARLO.Rds") #Load the Mortality simulation to speed up simulation
 MIGRATION_ARIMA_SIMULATION <- readRDS("./Data/Simulated_Data_Sets/Migration_ARIMA.Rds") #Load the Migration simulation to speed up simulation
+if(KEMMER_SIM){
+	LN_POP_ST <- FIRST_PREDICT_YEAR_POPULATION_DATA$Population #Population of Lincoln County
+ 	START_DEM_DATA <- readRDS("Data/Cleaned_Data/Kemmerer_Demographic_Data.Rds") %>% select(-County) %>% mutate(County='Lincoln')
+	FIRST_PREDICT_YEAR_POPULATION_DATA <-  readRDS("Data/Cleaned_Data/Kemmerer_Summary_Start_Data.Rds")
+	KEM_POP_ST <- FIRST_PREDICT_YEAR_POPULATION_DATA$Population
+	POP_ST_RATIO <- LN_POP_ST/KEM_POP_ST
+	MIGRATION_ARIMA_SIMS <- round(MIGRATION_ARIMA_SIMS*POP_ST_RATIO) #Downscale County migreation to the city level based on average popualtion 
+}
+#Second run, work on making into loop and saving the output to file (check if that will slow the results). Maybe use a predefined matrix, so that the results can be stored quickly
 
-#Second run, work on making into loop and saving the output to file (check if that will slow the results). Maybe use a predifined matrix, so that the results can be stored quirckly
 SINGLE_PATH_SIM <- function(j){
-  C_RES <- RUN_SINGLE_SIM(MOD_BIRTHS,FIRST_PREDICT_YEAR_POPULATION_DATA,START_DEM_DATA,MORTALITY_SIMULATION,SIM_NUMBER=j,START_OF_SIM=2023,MIGRATION_ARIMA_SIMULATION,MIG_AGE_DIST)
+  C_RES <- RUN_SINGLE_SIM(MOD_BIRTHS,FIRST_PREDICT_YEAR_POPULATION_DATA,START_DEM_DATA,MORTALITY_SIMULATION,SIM_NUMBER=j,START_OF_SIM=START_SIM_YEAR,MIGRATION_ARIMA_SIMULATION,MIG_AGE_DIST)
   RES <- C_RES[[1]]
     for(i in 1:NUM_YEARS_PROJECTED){
-      C_RES <- RUN_SINGLE_SIM(MOD_BIRTHS,C_RES[[3]],C_RES[[2]],MORTALITY_SIMULATION,SIM_NUMBER=j,START_OF_SIM=2023,MIGRATION_ARIMA_SIMULATION,MIG_AGE_DIST)
+      C_RES <- RUN_SINGLE_SIM(MOD_BIRTHS,C_RES[[3]],C_RES[[2]],MORTALITY_SIMULATION,SIM_NUMBER=j,START_OF_SIM=START_SIM_YEAR,MIGRATION_ARIMA_SIMULATION,MIG_AGE_DIST)
       RES <- rbind(RES,C_RES[[1]])
     }
   return(RES)
diff --git a/Birth_Rate_Regression.r b/Birth_Rate_Regression.r
index 73c4fce..c43ac4b 100644
--- a/Birth_Rate_Regression.r
+++ b/Birth_Rate_Regression.r
@@ -24,7 +24,7 @@ MOD_BIRTHS <- feols(log(Births)~log(PREV_BIRTH)+log(PREV_TWO_BIRTH)+log(Min_Birt
   RES_DATA$RESID <- resid(MOD_BIRTHS)
   acf(RES_DATA %>% pull(RESID))
   pacf(RES_DATA %>% pull(RESID))
-
+saveRDS(RES_DATA,"Data/Regression_Results/Birth_Regression_Data_Set.Rds")
   saveRDS(MOD_BIRTHS,BIRTH_RATE_REG_RESULTS)
   saveRDS(FIRST_PREDICT_YEAR_POPULATION_DATA,START_DEMOGRAPHIC_DATA) #Save the cleaned data set for later use when starting the simulation.
   #Cleanup data no longer needed, and save some RAM
diff --git a/Scripts/Birth_Simulation_Functions.r b/Scripts/Birth_Simulation_Functions.r
index 073ebcd..768c677 100644
--- a/Scripts/Birth_Simulation_Functions.r
+++ b/Scripts/Birth_Simulation_Functions.r
@@ -2,7 +2,6 @@
 	#Uncomment to test the function step by step
 #REG_MODEL <- MOD_BIRTHS;REG_DATA <- FIRST_PREDICT_YEAR_POPULATION_DATA;NUM_SIMS=1
 BIRTH_SIM <- function(REG_MODEL,REG_DATA,NUM_SIMS=1){
-	predict(REG_MODEL,newdata=REG_DATA)
 	C_PREDICT <- predict(REG_MODEL,REG_DATA,interval = "prediction",level=0.95)
 	PRED_MEAN <- C_PREDICT$fit
 	SE_PRED <- (C_PREDICT$ci_high-C_PREDICT$ci_low)/3.92
diff --git a/Scripts/Monte_Carlo_Functions.r b/Scripts/Monte_Carlo_Functions.r
index 79ca9ac..21d74fe 100644
--- a/Scripts/Monte_Carlo_Functions.r
+++ b/Scripts/Monte_Carlo_Functions.r
@@ -1,5 +1,5 @@
 #Uncomment to check the function line by line
-#	REG_BIRTH_MODEL=MOD_BIRTHS;START_BASIC_DATA=FIRST_PREDICT_YEAR_POPULATION_DATA;START_DETAILED_DATA=START_DEM_DATA;Mortality_Rate_Sim=MORTALITY_SIMULATION;SIM_NUMBER=1;START_OF_SIM_YEAR=2023;Migration_ARIMA_Sim=MIGRATION_ARIMA_SIMULATION;Migration_Age_Distribution=MIG_AGE_DIST
+#	REG_BIRTH_MODEL=MOD_BIRTHS;START_BASIC_DATA=FIRST_PREDICT_YEAR_POPULATION_DATA;START_DETAILED_DATA=START_DEM_DATA;Mortality_Rate_Sim=MORTALITY_SIMULATION;SIM_NUMBER=1;START_OF_SIM_YEAR=2025;Migration_ARIMA_Sim=MIGRATION_ARIMA_SIMULATION;Migration_Age_Distribution=MIG_AGE_DIST
 	#FIRST_PREDICT_YEAR_POPULATION_DATA
 	#START_BASIC_DATA <- C_RES[[3]]
 	#START_DETAILED_DATA <- C_RES[[2]]
@@ -25,6 +25,7 @@ RUN_SINGLE_SIM <- function(REG_BIRTH_MODEL,START_BASIC_DATA,START_DETAILED_DATA,
 	START_DETAILED_DATA[,"Age"] <- START_DETAILED_DATA[,"Age"]+1
 	START_DETAILED_DATA[START_DETAILED_DATA$Age==85,c("Num_Male","Num_Female")] <- START_DETAILED_DATA[START_DETAILED_DATA$Age==86,c("Num_Male","Num_Female")]+START_DETAILED_DATA[START_DETAILED_DATA$Age==85,c("Num_Male","Num_Female")] #Sum the 85 and 86 ages into one row for age 85
 	START_DETAILED_DATA <- START_DETAILED_DATA[START_DETAILED_DATA$Age!=86,] #Anyone older than 85 is lumped into one group remove the 86 group
+	####CURRENT ERROR MISSING COUNTY IN ONE COLUMN
 	START_DETAILED_DATA <- rbind(C_BIRTHS,START_DETAILED_DATA) #Add the new simulated births
 	#Run a preliminary Monte Carlo which estimates the future mortality rate, for each simulation and year of of Monte Carlo Simulation
 
diff --git a/Zip_Code.r b/Zip_Code.r
index 04d52bd..cdd2efe 100644
--- a/Zip_Code.r
+++ b/Zip_Code.r
@@ -1,6 +1,7 @@
 library(tidyverse)
 library(tidycensus)
 library(zipcodeR)
+library(scales) #For a pretty population Pyramid
 source("Scripts/Downshift_Population_Functions.r")
 #Packages to instal on computer if zipcodeR won't install
 #Sudo apt install libssl-dev libudunits2-dev libabsl-dev libcurl4-openssl-dev libgdal-dev cmake libfontconfig1-dev libharfbuzz-dev libfribidi-dev
@@ -47,54 +48,49 @@ AGE_DATA <- AGE_DATA %>%  mutate(Per_Pop=Population/sum(Population)) %>% group_b
 AGE_DATA <- AGE_DATA %>% mutate(Region=ifelse(IN_KEM==1,'Kemmerer','Lincoln')) 
 #AGE_DATA <- AGE_DATA %>% group_by(IN_KEM) %>% mutate(MORE_KEMMER=ifelse(sum(IN_KEM*Per_Pop_Region)>sum((1-IN_KEM)*Per_Pop_Region),1,0)) %>% ungroup 
 
-PLOT <- ggplot(AGE_DATA, aes(x =Ages, y = Per_Pop_Region)) +  geom_line() +  geom_point(aes(color = Region ), size = 5) +  scale_color_brewer(palette = "Set1", direction = 1) +  theme(legend.position = "bottom")+facet_wrap(~Sex,ncol=1)
+#PLOT <- ggplot(AGE_DATA, aes(x =Ages, y = Per_Pop_Region)) +  geom_line() +  geom_point(aes(color = Region ), size = 5) +  scale_color_brewer(palette = "Set1", direction = 1) +  theme(legend.position = "bottom")+facet_wrap(~Sex,ncol=1)
 #PLOT
-LIN_TO_KEMMER_CONVERSION_RATIOS <- INTERPOLATE_COUNTY_AGE_DEMOGRAPHIC_DATA_TO_CITY_LEVEL(AGE_DATA,YEARS_FORWARD=5)
-LIN_TO_KEMMER_CONVERSION_RATIOS 
+LIN_TO_KEMMER_CONVERSION_RATIOS <- INTERPOLATE_COUNTY_AGE_DEMOGRAPHIC_DATA_TO_CITY_LEVEL(AGE_DATA,YEARS_FORWARD=1) %>% pivot_wider(names_from=Sex,values_from=Conversion_Ratio,names_prefix="Convert_")
+
+KEM_DEMO_DATA <- readRDS("Data/Cleaned_Data/Lincoln_Demographic_Data.Rds")  %>% filter(Year==max(Year)) %>% left_join(LIN_TO_KEMMER_CONVERSION_RATIOS ) %>% mutate(Num_Male=Num_Male*Convert_Male,Num_Female=Num_Female*Convert_Female) %>% mutate(County='Kemmerer') %>% select(-Convert_Male,-Convert_Female)
+NUM_KEM <- sum(KEM_DEMO_DATA[,4:5] )
+CURRENT_KEM_POP <- readRDS("Data/Cleaned_Data/Wyoming_City_Population.Rds") %>% filter(City=='Kemmerer' | City=='Diamondville') %>% group_by(City) %>% filter(Year==max(Year)) %>% ungroup %>% group_by(Year) %>% summarize(Population=sum(as.numeric(Population)),County='Kemmerer')
+
+SCALE_KEM_FACTOR <- CURRENT_KEM_POP$Population/NUM_KEM
+KEM_NEW_DEMO_DATA <- KEM_DEMO_DATA
+KEM_NEW_DEMO_DATA[,c("Num_Male","Num_Female")] <- round(SCALE_KEM_FACTOR*KEM_NEW_DEMO_DATA[,c("Num_Male","Num_Female")])
+##Create a simplfied summary of the Kemmerer/Diamondville population estimates, to start the Monte Carlo
+	ST_BIRTH_KEM <- sum((KEM_NEW_DEMO_DATA %>% filter(Age==0))[4:5])
+	ST_BIRTH_TWO_PREV_KEM <- sum((KEM_NEW_DEMO_DATA %>% filter(Age==1))[4:5])
+
+KEM_DEMO_DATA %>% filter(Age==0)
+
+	ST_POP <- sum((KEM_NEW_DEMO_DATA)[4:5])
+	INTIATE_KEMMER <- KEM_NEW_DEMO_DATA%>% mutate(Male_Window=Age>=18 & Age<=30,Female_Window=Age>=18 & Age<=28) %>% group_by(County,Year) %>% summarize(Female_Birth_Group=sum(Num_Female*Female_Window),Male_Birth_Group=sum(Num_Male*Male_Window),Min_Birth_Group=ifelse(Female_Birth_Group<Male_Birth_Group,Female_Birth_Group,Male_Birth_Group),PREV_BIRTH=ST_BIRTH_KEM,PREV_TWO_BIRTH=ST_BIRTH_TWO_PREV_KEM,Population=ST_POP) %>% mutate(Births=NA,Deaths=NA,Migration=NA) %>% select(Year,County,Population,Births,Deaths,Migration,Min_Birth_Group,PREV_BIRTH,PREV_TWO_BIRTH) %>% ungroup
+INTIATE_KEMMER$County <- (readRDS("Data/Regression_Results/Birth_Regression_Data_Set.Rds") %>% filter(County=='Lincoln',Year==2020))$County #Making the factor the same for the regression
+	saveRDS(INTIATE_KEMMER ,"Data/Cleaned_Data/Kemmerer_Summary_Start_Data.Rds")
+	write_csv(INTIATE_KEMMER ,"Data/Cleaned_Data/Kemmerer_Summary_Start_Data.csv")
+
+	saveRDS(KEM_NEW_DEMO_DATA,"Data/Cleaned_Data/Kemmerer_Demographic_Data.Rds")
+	write_csv(KEM_NEW_DEMO_DATA,"Data/Cleaned_Data/Kemmerer_Demographic_Data.csv")
+#sum(KEM_NEW_DEMO_DATA[,4:5])-CURRENT_KEM_POP$Population
+###Make a population Pyramid Graph
+PY_KEM_DATA <- KEM_NEW_DEMO_DATA %>% pivot_longer(cols=c(Num_Male,Num_Female),names_to="Sex",values_to="Population") %>% mutate(Sex=ifelse(Sex=='Num_Male','Male','Female'),Population=ifelse(Sex=='Male',-Population,Population))
+LN_CURRENT_DEMO <- readRDS("Data/Cleaned_Data/Wyoming_County_Demographic_Data.Rds") %>% filter(County=='Lincoln',Year==max(Year))
+
+PY_LN_DATA <- LN_CURRENT_DEMO %>% pivot_longer(cols=c(Num_Male,Num_Female),names_to="Sex",values_to="Population") %>% mutate(Sex=ifelse(Sex=='Num_Male','Male','Female'),Population=ifelse(Sex=='Male',-Population,Population))
+PY_LN_DATA <- PY_LN_DATA %>% left_join(PY_KEM_DATA %>% rename(POP_SHIFT=Population) %>% select(-County)) %>% mutate(Population=Population-POP_SHIFT) %>% select(-POP_SHIFT)
+PY_LN_DATA$Population <- PY_LN_DATA$Population/sum(abs(PY_LN_DATA$Population ))
 
 
+PY_KEM_PER <- PY_KEM_DATA
+PY_KEM_PER$Population <- PY_KEM_PER$Population/sum(abs(PY_KEM_PER$Population ))
+GRAPH_DATA <- full_join(PY_KEM_PER ,PY_LN_DATA)   %>% mutate(Population=ifelse(County=='Kemmerer',abs(Population),-abs(Population)))
 
 
-
-AGE_FORWARD <- AGE_DATA %>% filter(!(Min_Age==0 & Max_Age==Inf))%>% mutate(POP_TOTAL=POP_OUT+POP_KEM,KEM_RATIO=POP_KEM/POP_TOTAL) %>% select(Sex,Min_Age,Max_Age,KEM_RATIO)
-STORE <- AGE_FORWARD %>% filter(Min_Age==0,Max_Age==4)
-AGE_FORWARD <- STORE %>% full_join(AGE_FORWARD %>% mutate(Min_Age=Min_Age+4,Max_Age=Max_Age+4))
-MALE_FORWARD <- AGE_FORWARD %>% filter(Sex=='Male') %>% arrange(Min_Age) %>% filter(KEM_RATIO!=0) %>% mutate(Med_Age=(Min_Age+Max_Age)/2)
-NUM_IN_GROUP <- MALE_FORWARD$Max_Age- MALE_FORWARD$Min_Age+1
-NUM_IN_GROUP[23] <- 1
-MALE_FORWARD$KEM_RATIO
-ggplot(MALE_FORWARD,aes(x=Med_Age,y=KEM_RATIO)) +geom_point()+geom_smooth(span=0.25)
-loess(KEM_RATIO ~ Med_Age,data=MALE_FORWARD,span=0.3 )
-?loess
-FEMALE_FORWARD <- AGE_FORWARD %>% filter(Sex=='Female') %>% arrange(Min_Age)
-MALE_FORWARD 
-
-plot(AGE_FORWARD$KEM_RATIO)
-gg <- ggplot(GRAPH_DATA, aes(x=PER_OUT, xend=PER_KEM, y=MED_AGE, group=Sex)) +
-        geom_dumbbell(color="#a3c4dc",
-                      size=0.75,
-                      point.colour.l="#0e668b") +
-        scale_x_continuous(label=percent) +
-        labs(x=NULL,
-             y=NULL,
-             title="Dumbbell Chart",
-             subtitle="Pct Change: 2013 vs 2014",
-             caption="Source: https://github.com/hrbrmstr/ggalt") +
-        theme(plot.title = element_text(hjust=0.5, face="bold"),
-              plot.background=element_rect(fill="#f7f7f7"),
-              panel.background=element_rect(fill="#f7f7f7"),
-              panel.grid.minor=element_blank(),
-              panel.grid.major.y=element_blank(),
-              panel.grid.major.x=element_line(),
-              axis.ticks=element_blank(),
-              legend.position="top",
-	       panel.border=element_blank())
-ggplot(GRAPH_DATA %>% filter(Sex=='Male')) +geom_point(aes(x=Med_Age,y=PER_KEM),color='blue') +geom_point(aes(x=Med_Age,y=PER_OUT)) +geom_smooth(aes(x=Med_Age,y=PER_OUT),color='black',span=SPAN)+geom_smooth(aes(x=Med_Age,y=PER_KEM),span=SPAN)
-ggplot(GRAPH_DATA %>% filter(Sex=='Female')) +geom_point(aes(x=Med_Age,y=PER_KEM),color='blue') +geom_point(aes(x=Med_Age,y=PER_OUT)) +geom_smooth(aes(x=Med_Age,y=PER_OUT),color='black',span=SPAN)+geom_smooth(aes(x=Med_Age,y=PER_KEM),span=SPAN)
+RANGE <- c(-0.015,pretty(range(GRAPH_DATA$Population),n=8))
+LAB <- 	percent(abs(RANGE),accuracy=0.1 )
+ POP_PYRAMID <- ggplot(GRAPH_DATA,aes(y=factor(Age),x=Population,fill=County))+geom_col() +scale_x_continuous(breaks  = RANGE,labels = LAB)+facet_grid(~Sex)
+POP_PYRAMID 
 
 
-
-
-
-AGE_DATA %>% pivot_wider(names_from=c(Sex,Min_Age,Max_Age,Med_Age,IN_KEM),values_from=Population) 
-