library(tidyverse) library(fixest) #setwd("../") ##########################Model Population Trends ##Run Regression #Pull in Demographic data and create categories for key groups in the regression, male/female population with high fertility, children under one and two (but not zero). This data is broken down by each age group so aggregate to the county, year level for the final regression. #Fertility age bounds were informed by the regression found in the file ./Scripts/Other_Analysis/Select_Range_of_Male_Female_Fertility.r Which qualitatively supports that the number of people in these age ranges (18-28 Women, 18-30 Men) have the most significance in predicting birth rates. These two are combined into one variable which represent the minimum number of people in the key fertility window between the sexes, this is the binding fertility constraint and has more explanatory power than including either the number of men or women in the fertility window alone, providing a good trade off for including more variables or reducing variance. if(!exists("DEMOGRAPHIC_COUNTY_LOC")){DEMOGRAPHIC_COUNTY_LOC <- "./Data/Cleaned_Data/Demographic_Sex_Age_Data/RDS/All_Wyoming_Counties_Demographics.Rds"} if(!exists("DEMOGRAPHIC_KEM_LOC")){DEMOGRAPHIC_KEM_LOC <- "./Data/Cleaned_Data/Demographic_Sex_Age_Data/RDS/Kemmerer_Diamondville_Demographics.Rds"} if(!exists("DEMOGRAPHIC_OTHER_LIN_LOC")){DEMOGRAPHIC_OTHER_LIN_LOC <- "./Data/Cleaned_Data/Demographic_Sex_Age_Data/RDS/Other_Lincoln_Demographics.Rds"} if(!exists("POPULATION_COUNTY_LOC")){POPULATION_COUNTY_LOC <- "./Data/Cleaned_Data/Population_Data/RDS/All_Wyoming_County_Populations.Rds"} if(!exists("POPULATION_CITY_LOC")){POPULATION_CITY_LOC <- "./Data/Cleaned_Data/Population_Data/RDS/All_Wyoming_City_Populations.Rds"} #Function to make the data consistent for each data set used to run a birth simulation in the Monte Carlo ADD_BIRTH_GROUP_DATA <- function(DEMO_DATA){ return(DEMO_DATA %>% mutate(Male_Window=Age>=18 & Age<=30,Female_Window=Age>=18 & Age<=28) %>% group_by(County,Region,Year) %>% summarize(Female_Birth_Group=sum(Num_Female*Female_Window,na.rm=TRUE),Male_Birth_Group=sum(Num_Male*Male_Window,na.rm=TRUE),Min_Birth_Group=ifelse(Female_Birth_Group% ungroup%>% select(-Female_Birth_Group,-Male_Birth_Group)) } ####################Create data set for regression of birth rates using available county data (not sub-regions like Kemmerer) DEMOGRAPHIC_COUNTY_DATA <- readRDS(DEMOGRAPHIC_COUNTY_LOC) COUNTY_POP <- readRDS(POPULATION_COUNTY_LOC) FILL_IN_DATA <- DEMOGRAPHIC_COUNTY_DATA %>% mutate(POP=Num_Male+Num_Female,BIRTHS=ifelse(Age==0,POP,0)) %>% group_by(County,Region,Year) %>% summarize(BIRTHS=sum(BIRTHS)) %>% arrange(County,Year) #Join and replace missing records COUNTY_BIRTH_DATA <- COUNTY_POP %>% left_join(FILL_IN_DATA) %>% mutate(Births=ifelse(is.na(Births),BIRTHS,Births)) %>% select(-BIRTHS) %>% select(Year,County,Region,everything()) %>% mutate(Region=County) COUNTY_BIRTH_DATA <- COUNTY_BIRTH_DATA %>% left_join(ADD_BIRTH_GROUP_DATA(DEMOGRAPHIC_COUNTY_DATA)) ####################Create same data set but for only the Kemmerer Diamondville area KEM_RAW_DEMOGRAPHIC_DATA <- readRDS(DEMOGRAPHIC_KEM_LOC) KEM_DEMO_DATA <- KEM_RAW_DEMOGRAPHIC_DATA %>% mutate(POP=Num_Male+Num_Female,Births=ifelse(Age==0,POP,0)) %>% group_by(Year,County,Region) %>% summarize(Births=sum(Births)) KEM_POP_DATA <- readRDS(POPULATION_CITY_LOC)%>% rename(Region=City) %>% filter(Region %in% c("Kemmerer","Diamondville")) %>% group_by(Year) %>% mutate(Population=sum(Population,na.rm=TRUE)) %>% ungroup %>% mutate(Region='Kemmerer & Diamondville') %>% unique %>% ungroup KEM_BIRTH_DATA <- KEM_POP_DATA %>% left_join(KEM_DEMO_DATA) KEM_BIRTH_DATA <- KEM_BIRTH_DATA %>% left_join(ADD_BIRTH_GROUP_DATA(KEM_RAW_DEMOGRAPHIC_DATA)) ####################Create same data set but for only parts of Lincoln not in the Kemmerer Diamondville area OTHER_RAW_DEMOGRAPHIC_DATA <- readRDS(DEMOGRAPHIC_OTHER_LIN_LOC) OTHER_DEMO_DATA <- OTHER_RAW_DEMOGRAPHIC_DATA %>% mutate(POP=Num_Male+Num_Female,Births=ifelse(Age==0,POP,0)) %>% group_by(Year,County,Region) %>% summarize(Births=sum(Births)) OTHER_POP_DATA <- readRDS(POPULATION_CITY_LOC)%>% rename(Region=City) %>% filter(!(Region %in% c("Kemmerer","Diamondville")),County=='Lincoln') %>% group_by(Year) %>% mutate(Population=sum(Population,na.rm=TRUE)) %>% ungroup %>% mutate(Region='Lincoln_Other') %>% unique %>% ungroup OTHER_BIRTH_DATA <- OTHER_POP_DATA %>% left_join(OTHER_DEMO_DATA) OTHER_BIRTH_DATA <- OTHER_BIRTH_DATA %>% left_join(ADD_BIRTH_GROUP_DATA(OTHER_RAW_DEMOGRAPHIC_DATA)) ####################################################3 ###Because the total of the Kemmerer and Other Lincoln should sum to the births in the entire county, we assume that the ratio of children under the age of 1, in each area is the same percentage as the total births in the county. The total births in the county is then taken from the Wyoming eadiv estimates at the county level. This makes the two sum to the proper amount REGIONAL_BIRTH_ADJUST <- KEM_BIRTH_DATA %>% rbind(OTHER_BIRTH_DATA) %>% group_by(Year) %>% filter(!is.na(Births)) %>% mutate(PER_COUNTY_BIRTHS=Births/sum(Births)) %>% group_by(Region,Year) %>% select(Year,County,Region,PER_COUNTY_BIRTHS) ADJUSTED_KEM_STUB_DATA <- COUNTY_BIRTH_DATA %>% filter(Region=='Lincoln') %>% select(Year,County,Births) %>% inner_join(REGIONAL_BIRTH_ADJUST) %>% mutate(Births=round(Births*PER_COUNTY_BIRTHS)) %>% select(-PER_COUNTY_BIRTHS) KEM_BIRTH_DATA <- KEM_BIRTH_DATA %>% select(-Births) %>% left_join(ADJUSTED_KEM_STUB_DATA ) OTHER_BIRTH_DATA <- OTHER_BIRTH_DATA %>% select(-Births) %>% left_join(ADJUSTED_KEM_STUB_DATA) REG_DATA <- rbind(COUNTY_BIRTH_DATA,rbind(KEM_BIRTH_DATA,OTHER_BIRTH_DATA) %>% mutate(Deaths=NA,Migration=NA) %>% select(colnames(COUNTY_BIRTH_DATA))) REG_DATA <- REG_DATA %>% group_by(Region) %>% arrange(Year) %>% mutate(Lag_Births=lag(Births),Lag_Two_Births=lag(Births,2)) %>% ungroup %>% arrange(County,Region,Year) REG_REDUCED_DATA <- REG_DATA %>% filter(!is.na(Births),!is.na(Lag_Two_Births),!is.na(Min_Birth_Group),!is.na(Lag_Births),!is.na(Region)) %>% select(Year,Region,Min_Birth_Group,Births,Lag_Births,Lag_Two_Births) ###Predict the number of Births MOD_BIRTHS <- feols(log(Births)~log(Lag_Births)+log(Lag_Two_Births)+log(Min_Birth_Group)+Year*Region,cluster=~Year+Region, data=REG_REDUCED_DATA ) #Higher AIC but worse acf ####Alternate models with different years of data to test to model against a counterfactual REG_DATA_2016 <- REG_REDUCED_DATA %>% filter(Year<=2016) MOD_BIRTHS_2016 <- feols(log(Births)~log(Lag_Births)+log(Lag_Two_Births)+log(Min_Birth_Group)+Year*Region,cluster=~Year+Region, data=REG_DATA_2016 ) REG_DATA_1985 <- REG_REDUCED_DATA %>% filter(Year<=1985) MOD_BIRTHS_1985 <- feols(log(Births)~log(Lag_Births)+log(Lag_Two_Births)+log(Min_Birth_Group)+Year*Region,cluster=~Year+Region, data=REG_DATA_1985 ) ###Models to more easily show the results #Easier to read for a LaTex Table DICT <- c("log(Lag_Births)"="Births Last Years (log)","log(Lag_Two_Births)"="Births Two Years Ago (log)","LN"='Kemmerer Area','log(Births)'='Births (log)','log(Min_Birth_Group)'='Child Rearing Aged Adults (log)','Region'="County" ) REG_VIEW_DATA <- REG_REDUCED_DATA %>% mutate(LN=ifelse(Region=='Kemmerer & Diamondville',1,0)) MOD_VIEW_BIRTHS <- feols(log(Births)~log(Lag_Births)+log(Lag_Two_Births)+log(Min_Birth_Group)+LN*Year+Year*Region|Region,cluster=~Year+Region, data=REG_VIEW_DATA ) #Higher AIC but worse acf REG_VIEW_DATA_2016 <- REG_VIEW_DATA %>% filter(Year<=2016) MOD_VIEW_BIRTHS_2016 <- feols(log(Births)~log(Lag_Births)+log(Lag_Two_Births)+log(Min_Birth_Group)+LN*Year+Year*Region|Region,cluster=~Year+Region, data=REG_VIEW_DATA_2016 ) REG_VIEW_DATA_1985 <- REG_VIEW_DATA%>% filter(Year<=1985) MOD_VIEW_BIRTHS_1985 <- feols(log(Births)~log(Lag_Births)+log(Lag_Two_Births)+log(Min_Birth_Group)+LN*Year+Year*Region|Region,cluster=~Year+Region, data=REG_VIEW_DATA_1985 ) 1 ###Prelim information for fixest table NOTES <- c("Natural log used for all variables besides years, and counties","Kemmerer Area: Includes both Kemmerer and Diamondville","Child Rearing Adults: Minimum of all women aged 18-28 or men aged 18-30.","Kemmerer data from the American Community Survey Data starts in 2009") LAB <- c("Kem.","Kem. Pre 2016","Lincoln Pre 1985") HEAD <- list(c("All Data","Pre 2016","Pre 1985")) #Current year prediction #exp(predict(MOD_BIRTHS,REG_DATA %>% filter(Region=='Kemmerer & Diamondville',Year==2023))) #REG_DATA %>% filter(Region=='Kemmerer & Diamondville',Year==2023) #MOD_BIRTHS_ALT <- feols(log(Births)~log(Lag_Births)+log(Min_Birth_Group)+Year*Region,cluster=~Year+Region, data=REG_REDUCED_DATA ) #AIC(MOD_BIRTHS)% filter(Region=='Kemmerer & Diamondville') %>% arrange(Year) png(paste0(SAVE_FIG_LOC,"/Kemmerer_ACF.png"), width = 12, height = 8, units = "in", res = 600) acf(C_TEMP$RESID,main='ACF of Kemmerer & Diamondville Birth Estimate Residuals',xlab="Lag (Years)") dev.off() png(paste0(SAVE_FIG_LOC,"/Kemmerer_PACF.png"), width = 12, height = 8, units = "in", res = 600) pacf(C_TEMP$RESID,main='PACF of Kemmerer & Diamondville Birth Estimate Residuals',xlab="Lag (Years)") dev.off() #Lincoln total ACF/PACF C_TEMP <- TEMP %>% filter(Region=='Lincoln') %>% arrange(Year) png(paste0(SAVE_FIG_LOC,"/Lincoln_ACF.png"), width = 12, height = 8, units = "in", res = 600) acf(C_TEMP$RESID,main='ACF of Lincoln County Birth Estimate Residuals',xlab="Lag (Years)") dev.off() png(paste0(SAVE_FIG_LOC,"/Lincoln_PACF.png"), width = 12, height = 8, units = "in", res = 600) pacf(C_TEMP$RESID,main='PACF of Lincoln County Birth Estimate Residuals',xlab="Lag (Years)") dev.off() #Lincoln Other (Not Kemmerer) ACF/PACF C_TEMP <- TEMP %>% filter(Region=='Lincoln_Other') %>% arrange(Year) png(paste0(SAVE_FIG_LOC,"/Lincoln_Other_Areas_ACF.png"), width = 12, height = 8, units = "in", res = 600) acf(C_TEMP$RESID,main='ACF of Other Parts of Lincoln County Birth Estimate Residuals',xlab="Lag (Years)") dev.off() png(paste0(SAVE_FIG_LOC,"/Lincoln_Other_Areas_PACF.png"), width = 12, height = 8, units = "in", res = 600) pacf(C_TEMP$RESID,main='PACF of Other Parts of Lincoln County Birth Estimate Residuals',xlab="Lag (Years)") dev.off() ####Create data stubs to start a simulation. That is predict the births from this most recent year. Include records from various years of potential interest ST_REG_DATA <- REG_REDUCED_DATA %>% filter(Region=='Lincoln') %>% filter(Year==max(Year)) %>% rbind(REG_REDUCED_DATA %>% filter(Region=='Kemmerer & Diamondville') %>% filter(Year==max(Year))) %>% rbind(REG_REDUCED_DATA %>% filter(Region=='Lincoln_Other') %>% filter(Year==max(Year))) %>% rbind(REG_REDUCED_DATA %>% filter(Region=='Lincoln') %>% filter(Year==2016)) %>% rbind(REG_REDUCED_DATA %>% filter(Region=='Kemmerer & Diamondville') %>% filter(Year==2016)) %>% rbind(REG_REDUCED_DATA %>% filter(Region=='Lincoln_Other') %>% filter(Year==2016)) %>% rbind(REG_REDUCED_DATA %>% filter(Region=='Lincoln') %>% filter(Year==1985)) if(!exists("SAVE_REG_LOC")){SAVE_REG_LOC <- "Data/Intermediate_Inputs/Birth_Regressions"} dir.create(SAVE_REG_LOC , recursive = TRUE, showWarnings = FALSE) saveRDS(MOD_BIRTHS,paste0(SAVE_REG_LOC,"/Birth_Regression.Rds")) saveRDS(MOD_BIRTHS_2016,paste0(SAVE_REG_LOC,"/Birth_Regression_2016.Rds")) saveRDS(MOD_BIRTHS_1985,paste0(SAVE_REG_LOC,"/Birth_Regression_1985.Rds")) if(!exists("SAVE_DATA_LOC")){SAVE_DATA_LOC <- "Data/Intermediate_Inputs/Birth_Regressions/Regression_Data"} dir.create(SAVE_DATA_LOC , recursive = TRUE, showWarnings = FALSE) saveRDS(ST_REG_DATA,paste0(SAVE_DATA_LOC,"/Birth_Simulation_Key_Starting_Points.Rds")) write_csv(ST_REG_DATA,paste0(SAVE_DATA_LOC,"/Birth_Simulation_Key_Starting_Points.csv")) saveRDS(REG_REDUCED_DATA,paste0(SAVE_DATA_LOC,"/Birth_Regression_Input_Data.Rds")) write_csv(REG_REDUCED_DATA,paste0(SAVE_DATA_LOC,"/Birth_Regression_Input_Data.csv"))