88 lines
7.8 KiB
R
88 lines
7.8 KiB
R
library(tidyverse)
|
|
library(fixest)
|
|
#setwd("../")
|
|
##########################Model Population Trends
|
|
##Run Regression
|
|
#Pull in Demographic data and create categories for key groups in the regression, male/female population with high fertility, children under one and two (but not zero). This data is broken down by each age group so aggregate to the county, year level for the final regression.
|
|
#Fertility age bounds were informed by the regression found in the file ./Scripts/Other_Analysis/Select_Range_of_Male_Female_Fertility.r Which qualitatively supports that the number of people in these age ranges (18-28 Women, 18-30 Men) have the most significance in predicting birth rates. These two are combined into one variable which represent the minimum number of people in the key fertility window between the sexes, this is the binding fertility constraint and has more explanatory power than including either the number of men or women in the fertility window alone, providing a good trade off for including more variables or reducing variance.
|
|
if(!exists("DEMOGRAPHIC_COUNTY_LOC")){DEMOGRAPHIC_COUNTY_LOC <- "./Data/Cleaned_Data/Demographic_Sex_Age_Data/RDS/All_Wyoming_Counties_Demographics.Rds"}
|
|
if(!exists("DEMOGRAPHIC_KEM_LOC")){DEMOGRAPHIC_KEM_LOC <- "./Data/Cleaned_Data/Demographic_Sex_Age_Data/RDS/Kemmerer_Diamondville_Demographics.Rds"}
|
|
if(!exists("DEMOGRAPHIC_OTHER_LIN_LOC")){DEMOGRAPHIC_OTHER_LIN_LOC <- "./Data/Cleaned_Data/Demographic_Sex_Age_Data/RDS/Other_Lincoln_Demographics.Rds"}
|
|
|
|
if(!exists("POPULATION_COUNTY_LOC")){POPULATION_COUNTY_LOC <- "./Data/Cleaned_Data/Population_Data/RDS/All_Wyoming_County_Populations.Rds"}
|
|
if(!exists("POPULATION_CITY_LOC")){POPULATION_CITY_LOC <- "./Data/Cleaned_Data/Population_Data/RDS/All_Wyoming_City_Populations.Rds"}
|
|
|
|
|
|
#Function to make the data consistent for each data set used to run a birth simulation in the Monte Carlo
|
|
#DEMO_DATA <- readRDS(DEMOGRAPHIC_KEM_LOC);POP_DATA <- readRDS(POPULATION_CITY_LOC)
|
|
MAKE_REG_DATA <- function(DEMO_DATA){
|
|
return(DEMO_DATA %>% mutate(Male_Window=Age>=18 & Age<=30,Female_Window=Age>=18 & Age<=28) %>% group_by(County,Region,Year) %>% summarize(Female_Birth_Group=sum(Num_Female*Female_Window,na.rm=TRUE),Male_Birth_Group=sum(Num_Male*Male_Window,na.rm=TRUE),Min_Birth_Group=ifelse(Female_Birth_Group<Male_Birth_Group,Female_Birth_Group,Male_Birth_Group)) %>% ungroup)
|
|
}
|
|
DEMOGRAPHIC_COUNTY_DATA <- readRDS(DEMOGRAPHIC_COUNTY_LOC)
|
|
COUNTY_POP <- readRDS(POPULATION_COUNTY_LOC)
|
|
REG_DATA <- readRDS(POPULATION_COUNTY_LOC) %>% full_join(MAKE_REG_DATA(DEMOGRAPHIC_COUNTY_DATA))
|
|
REG_DATA <- REG_DATA %>% group_by(County,Region) %>% mutate(PREV_BIRTH=lag(Births),PREV_TWO_BIRTH=lag(Births,2)) %>% ungroup
|
|
REG_DATA <- REG_DATA %>% select(-Female_Birth_Group,-Male_Birth_Group)%>% mutate(Region=County) %>% select(Year,County,Region,everything())
|
|
#Store the data set of only the first year needing a birth forecast, to start the birth Monte Carlo simulations.
|
|
###Some of the years are missing births, previous births etc. Where missing fill this in by assuming all age zero children in the demographic data (DEMOGRAPHIC_LOC) were born in the last year. This makes a more complete data set. Some test find a near perfect 1 to 1 with this method
|
|
#Data to fill in the missing records
|
|
FILL_IN_DATA <- DEMOGRAPHIC_COUNTY_DATA %>% mutate(POP=Num_Male+Num_Female,BIRTHS=ifelse(Age==0,POP,0)) %>% group_by(County,Region,Year) %>% summarize(BIRTHS=sum(BIRTHS)) %>% arrange(County,Year) %>% mutate(ALT=lag(BIRTHS),ALT2=lag(BIRTHS,2)) %>% ungroup
|
|
#Join and replace missing records
|
|
REG_DATA <- REG_DATA %>% left_join(FILL_IN_DATA ) %>% mutate(Births=ifelse(is.na(Births),BIRTHS,Births),PREV_BIRTH=ifelse(is.na(PREV_BIRTH),ALT,PREV_BIRTH),PREV_TWO_BIRTH=ifelse(is.na(PREV_TWO_BIRTH),ALT2,PREV_TWO_BIRTH)) %>% select(-BIRTHS,-ALT,-ALT2) %>% select(Year,County,Region,everything()) %>% mutate(Region=County)
|
|
ST_LIN_REG <- REG_DATA %>% filter(County=="Lincoln",Year==2024)
|
|
|
|
####################Create same data set but for only the Kemmerer Diamondville area
|
|
KEM_DEMO_DATA <- readRDS(DEMOGRAPHIC_KEM_LOC)%>% mutate(POP=Num_Male+Num_Female,Births=ifelse(Age==0,POP,0)) %>% group_by(Year,County,Region) %>% summarize(Births=sum(Births)) %>% ungroup %>% arrange(Region,Year) %>% mutate(PREV_BIRTH=lag(Births),PREV_TWO_BIRTH=lag(Births,2)) %>% ungroup %>% mutate(Deaths=NA,Migration=NA) %>% left_join(MAKE_REG_DATA(readRDS(DEMOGRAPHIC_KEM_LOC)))
|
|
|
|
KEM_POP_DATA <- readRDS(POPULATION_CITY_LOC)%>% rename(Region=City) %>% filter(Region %in% c("Kemmerer","Diamondville")) %>% group_by(Year) %>% mutate(Population=sum(Population,na.rm=TRUE)) %>% ungroup %>% mutate(Region='Kemmerer & Diamondville') %>% unique %>% ungroup
|
|
|
|
KEM_REG_DATA <- KEM_POP_DATA %>% left_join(KEM_DEMO_DATA) %>% select(colnames(REG_DATA))
|
|
KEM_REG_DATA
|
|
ST_KEM_REG <- KEM_REG_DATA[KEM_REG_DATA$Year==2023,]
|
|
ST_KEM_REG$Year <-2024
|
|
KEM_REG_DATA %>% tail
|
|
###The starting entry to predict births in the next period based on the current population
|
|
ST_KEM_REG[,"Population"] <- KEM_REG_DATA[KEM_REG_DATA$Year==2024,] %>% pull("Population")
|
|
####################Create same data set but for only parts of Lincoln not in the Kemmerer Diamondville area
|
|
OTHER_DEMO_DATA <- readRDS(DEMOGRAPHIC_OTHER_LIN_LOC)%>% mutate(POP=Num_Male+Num_Female,Births=ifelse(Age==0,POP,0)) %>% group_by(Year,County,Region) %>% summarize(Births=sum(Births)) %>% ungroup %>% arrange(Region,Year) %>% mutate(PREV_BIRTH=lag(Births),PREV_TWO_BIRTH=lag(Births,2)) %>% ungroup %>% mutate(Deaths=NA,Migration=NA) %>% left_join(MAKE_REG_DATA(readRDS(DEMOGRAPHIC_OTHER_LIN_LOC)))
|
|
|
|
|
|
|
|
OTHER_POP_DATA <- readRDS(POPULATION_CITY_LOC)%>% rename(Region=City) %>% filter(!(Region %in% c("Kemmerer","Diamondville")),County=='Lincoln') %>% group_by(Year) %>% mutate(Population=sum(Population,na.rm=TRUE)) %>% ungroup %>% mutate(Region='Lincoln_Other') %>% unique %>% ungroup
|
|
|
|
OTHER_REG_DATA <- OTHER_POP_DATA %>% left_join(OTHER_DEMO_DATA) %>% select(colnames(REG_DATA))
|
|
ST_OTHER_REG <- OTHER_REG_DATA[OTHER_REG_DATA$Year==2023,]
|
|
ST_OTHER_REG$Year <-2024
|
|
###The starting entry to predict births in the next period based on the current population
|
|
ST_OTHER_REG[,"Population"] <- OTHER_REG_DATA[OTHER_REG_DATA$Year==2024,] %>% pull("Population")
|
|
|
|
####################################################3
|
|
REG_DATA <- REG_DATA %>% rbind(OTHER_REG_DATA) %>% rbind(KEM_REG_DATA)
|
|
|
|
###Predict the number of Births
|
|
MOD_BIRTHS <- feols(log(Births)~log(PREV_BIRTH)+log(PREV_TWO_BIRTH)+log(Min_Birth_Group)+Year*Region,cluster=~Year+Region, data=REG_DATA ) #Higher AIC but worse acf
|
|
ST_OTHER_REG
|
|
#Current year prediction
|
|
exp(predict(MOD_BIRTHS,newdata=ST_KEM_REG)) #Kemmerer births
|
|
exp(predict(MOD_BIRTHS,newdata=ST_OTHER_REG)) #Other Lincoln births
|
|
exp(predict(MOD_BIRTHS,newdata=ST_LIN_REG)) #All Lincoln births
|
|
#Note that due to useing diffrent data sets Lincoln is NOT colinear with Kemmere+Other Lincoln. Either result can be downshifted by the amount of diffrence
|
|
ADJUST_RESULTS_FACTOR <- (exp(predict(MOD_BIRTHS,newdata=ST_KEM_REG))+exp(predict(MOD_BIRTHS,newdata=ST_OTHER_REG)))/exp(predict(MOD_BIRTHS,newdata=ST_LIN_REG))
|
|
|
|
ADJUST_RESULTS_FACTOR
|
|
#MOD_BIRTHS_ALT <- feols(log(Births)~log(PREV_BIRTH)+log(Min_Birth_Group)+Year*County,cluster=~Year+County, data=REG_DATA )
|
|
#AIC(MOD_BIRTHS)<AIC(MOD_BIRTHS_ALT)
|
|
#Optional: Review the ACF and PACF for validity. Model made on October 24nd appears to have uncorrelated lags of residuals accept year three.
|
|
#acf(resid(MOD_BIRTHS))
|
|
#acf(resid(MOD_BIRTHS_ALT))
|
|
if(!exists("SAVE_REG_LOC")){SAVE_REG_LOC <- "Data/Intermediate_Inputs"}
|
|
dir.create(SAVE_REG_LOC , recursive = TRUE, showWarnings = FALSE)
|
|
|
|
|
|
saveRDS(REG_DATA,SAVE_REG_LOC(paste0(SAVE_REG_LOC,"/Birth_Regression_Data_Set.Rds")))
|
|
saveRDS(FIRST_PREDICT_YEAR_POPULATION_DATA,START_DEMOGRAPHIC_DATA) #Save the cleaned data set for later use when starting the simulation.
|
|
#Cleanup data no longer needed, and save some RAM
|
|
rm(POP_DATA,DEMOGRAPHIC_DATA,REG_DATA)
|
|
gc()
|
|
|