Population_Study/Birth_Rate_Regression.r

##########################Model Population Trends
##Run Regression
#Pull in Demographic data and create categories for key groups in the regression, male/female population with high fertility, children under one and two (but not zero). This data is broken down by each age group so aggregate to the county, year level for the final regression.
#Fertility age bounds were informed by the regression found in the file ./Scripts/Other_Analysis/Select_Range_of_Male_Female_Fertility.r Which qualitatively supports that the number of people in these age ranges (18-28 Women, 18-30 Men) have the most significance in predicting birth rates. These two are combined into one variable which represent the minimum number of people in the key fertility window between the sexes, this is the binding fertility constraint and has more explanatory power than including either the number of men or women in the fertility window alone, providing a good trade off for including more variables or reducing variance.
  DEMOGRAPHIC_DATA <- readRDS("Data/Cleaned_Data/Wyoming_County_Demographic_Data.Rds") %>% mutate(Male_Window=Age>=18 & Age<=30,Female_Window=Age>=18 & Age<=28) %>% group_by(County,Year) %>% summarize(Female_Birth_Group=sum(Num_Female*Female_Window),Male_Birth_Group=sum(Num_Male*Male_Window),Min_Birth_Group=ifelse(Female_Birth_Group<Male_Birth_Group,Female_Birth_Group,Male_Birth_Group))
  #Extract the population trend data to connect with demographics (Population,births,deaths)
  POP_DATA <- readRDS("Data/Cleaned_Data/Wyoming_County_Population.Rds") %>% mutate(LN=ifelse(County=="Lincoln",1,0))
  #Merger the two data sets and drop any records that cannot be used in the regression (this makes the "predict" function output the right number of records)
  REG_DATA <- POP_DATA %>% full_join(DEMOGRAPHIC_DATA)
  REG_DATA <- REG_DATA %>% group_by(County) %>% mutate(PREV_BIRTH=lag(Births),PREV_TWO_BIRTH=lag(Births,2)) %>% ungroup %>% filter(!is.na(PREV_TWO_BIRTH),!is.na(Min_Birth_Group))
  REG_DATA$County <- factor(REG_DATA$County)
  FIRST_PREDICT_YEAR_POPULATION_DATA <-  REG_DATA %>% filter(Year==2023,County=='Lincoln') %>% select(-LN,-Female_Birth_Group,-Male_Birth_Group) #Store the data set of only the first year needing a birth forecast, to start the birth Monte Carlo simulations.
  REG_DATA <- REG_DATA %>% filter(!is.na(Births)) #Remove any values with missing births for a simpler regression which includes only complete data


###Predict the number of Births
  MOD_BIRTHS <- feols(log(Births)~log(PREV_BIRTH)+log(PREV_TWO_BIRTH)+log(Min_Birth_Group)+Year*County,cluster=~Year+County, data=REG_DATA )
 #Optional: Review the ACF and PACF for validity. Model made on October 22nd appears to have uncorrelated lags of residuals.
  #RES_DATA <- REG_DATA #Data to create visuals with, without changing the main file. Can be used for ggplot, or residual tests
  #RES_DATA$RESID <- resid(MOD_BIRTHS)
  #acf(RES_DATA %>% pull(RESID))
  #pacf(RES_DATA %>% pull(RESID))

  saveRDS(MOD_BIRTHS,BIRTH_RATE_REG_RESULTS)
  saveRDS(FIRST_PREDICT_YEAR_POPULATION_DATA,START_DEMOGRAPHIC_DATA) #Save the cleaned data set for later use when starting the simulation.
  #Cleanup data no longer needed, and save some RAM
	  rm(POP_DATA,DEMOGRAPHIC_DATA,REG_DATA)
	  gc()