library(tidyverse)
library(fixest)
#setwd("../")
##########################Model Population Trends
##Run Regression 
#Pull in Demographic data and create categories for key groups in the regression, male/female population with high fertility, children under one and two (but not zero). This data is broken down by each age group so aggregate to the county, year level for the final regression.
#Fertility age bounds were informed by the regression found in the file ./Scripts/Other_Analysis/Select_Range_of_Male_Female_Fertility.r Which qualitatively supports that the number of people in these age ranges (18-28 Women, 18-30 Men) have the most significance in predicting birth rates. These two are combined into one variable which represent the minimum number of people in the key fertility window between the sexes, this is the binding fertility constraint and has more explanatory power than including either the number of men or women in the fertility window alone, providing a good trade off for including more variables or reducing variance.
if(!exists("DEMOGRAPHIC_COUNTY_LOC")){DEMOGRAPHIC_COUNTY_LOC <- "./Data/Cleaned_Data/Demographic_Sex_Age_Data/RDS/All_Wyoming_Counties_Demographics.Rds"}
if(!exists("DEMOGRAPHIC_KEM_LOC")){DEMOGRAPHIC_KEM_LOC <- "./Data/Cleaned_Data/Demographic_Sex_Age_Data/RDS/Kemmerer_Diamondville_Demographics.Rds"}
if(!exists("DEMOGRAPHIC_OTHER_LIN_LOC")){DEMOGRAPHIC_OTHER_LIN_LOC <- "./Data/Cleaned_Data/Demographic_Sex_Age_Data/RDS/Other_Lincoln_Demographics.Rds"}

if(!exists("POPULATION_COUNTY_LOC")){POPULATION_COUNTY_LOC <- "./Data/Cleaned_Data/Population_Data/RDS/All_Wyoming_County_Populations.Rds"}
if(!exists("POPULATION_CITY_LOC")){POPULATION_CITY_LOC <- "./Data/Cleaned_Data/Population_Data/RDS/All_Wyoming_City_Populations.Rds"}


#Function to make the data consistent for each data set used to run a birth simulation in the Monte Carlo
ADD_BIRTH_GROUP_DATA <- function(DEMO_DATA){
	return(DEMO_DATA %>% mutate(Male_Window=Age>=18 & Age<=30,Female_Window=Age>=18 & Age<=28) %>% group_by(County,Region,Year) %>% summarize(Female_Birth_Group=sum(Num_Female*Female_Window,na.rm=TRUE),Male_Birth_Group=sum(Num_Male*Male_Window,na.rm=TRUE),Min_Birth_Group=ifelse(Female_Birth_Group<Male_Birth_Group,Female_Birth_Group,Male_Birth_Group)) %>% ungroup%>% select(-Female_Birth_Group,-Male_Birth_Group))
}
####################Create data set for regression of birth rates using available county data (not sub-regions like Kemmerer) 
DEMOGRAPHIC_COUNTY_DATA <- readRDS(DEMOGRAPHIC_COUNTY_LOC)
COUNTY_POP <- readRDS(POPULATION_COUNTY_LOC) 
FILL_IN_DATA <- DEMOGRAPHIC_COUNTY_DATA %>% mutate(POP=Num_Male+Num_Female,BIRTHS=ifelse(Age==0,POP,0)) %>% group_by(County,Region,Year)  %>% summarize(BIRTHS=sum(BIRTHS)) %>% arrange(County,Year) 
#Join and replace missing records
COUNTY_BIRTH_DATA <- COUNTY_POP %>% left_join(FILL_IN_DATA)  %>% mutate(Births=ifelse(is.na(Births),BIRTHS,Births)) %>% select(-BIRTHS) %>% select(Year,County,Region,everything()) %>% mutate(Region=County)
COUNTY_BIRTH_DATA <- COUNTY_BIRTH_DATA  %>% left_join(ADD_BIRTH_GROUP_DATA(DEMOGRAPHIC_COUNTY_DATA))

####################Create same data set but for only the Kemmerer Diamondville area 
KEM_RAW_DEMOGRAPHIC_DATA <- readRDS(DEMOGRAPHIC_KEM_LOC)
KEM_DEMO_DATA <-   KEM_RAW_DEMOGRAPHIC_DATA %>% mutate(POP=Num_Male+Num_Female,Births=ifelse(Age==0,POP,0)) %>% group_by(Year,County,Region) %>% summarize(Births=sum(Births))
KEM_POP_DATA <- readRDS(POPULATION_CITY_LOC)%>% rename(Region=City) %>% filter(Region %in% c("Kemmerer","Diamondville")) %>% group_by(Year) %>% mutate(Population=sum(Population,na.rm=TRUE)) %>% ungroup  %>% mutate(Region='Kemmerer & Diamondville') %>% unique %>% ungroup
KEM_BIRTH_DATA <- KEM_POP_DATA %>% left_join(KEM_DEMO_DATA)
KEM_BIRTH_DATA <- KEM_BIRTH_DATA %>% left_join(ADD_BIRTH_GROUP_DATA(KEM_RAW_DEMOGRAPHIC_DATA)) 

####################Create same data set but for only parts of Lincoln not in the Kemmerer Diamondville area 
OTHER_RAW_DEMOGRAPHIC_DATA <- readRDS(DEMOGRAPHIC_OTHER_LIN_LOC)
OTHER_DEMO_DATA <-  OTHER_RAW_DEMOGRAPHIC_DATA %>% mutate(POP=Num_Male+Num_Female,Births=ifelse(Age==0,POP,0)) %>% group_by(Year,County,Region) %>% summarize(Births=sum(Births)) 
OTHER_POP_DATA <- readRDS(POPULATION_CITY_LOC)%>% rename(Region=City) %>% filter(!(Region %in% c("Kemmerer","Diamondville")),County=='Lincoln') %>% group_by(Year) %>% mutate(Population=sum(Population,na.rm=TRUE)) %>% ungroup  %>% mutate(Region='Lincoln_Other') %>% unique %>% ungroup
OTHER_BIRTH_DATA <- OTHER_POP_DATA %>% left_join(OTHER_DEMO_DATA)
OTHER_BIRTH_DATA <- 	OTHER_BIRTH_DATA %>% left_join(ADD_BIRTH_GROUP_DATA(OTHER_RAW_DEMOGRAPHIC_DATA))  

####################################################3
###Because the total of the Kemmerer and Other Lincoln should sum to the births in the entire county, we assume that the ratio of children under the age of 1, in each area is the same percentage as the total births in the county. The total births in the county is then taken from the Wyoming eadiv estimates at the county level. This makes the two sum to the proper amount 
REGIONAL_BIRTH_ADJUST <-  KEM_BIRTH_DATA %>% rbind(OTHER_BIRTH_DATA) %>% group_by(Year) %>% filter(!is.na(Births)) %>% mutate(PER_COUNTY_BIRTHS=Births/sum(Births))  %>% group_by(Region,Year) %>% select(Year,County,Region,PER_COUNTY_BIRTHS)

ADJUSTED_KEM_STUB_DATA <-  COUNTY_BIRTH_DATA %>% filter(Region=='Lincoln') %>% select(Year,County,Births) %>% inner_join(REGIONAL_BIRTH_ADJUST) %>% mutate(Births=round(Births*PER_COUNTY_BIRTHS)) %>% select(-PER_COUNTY_BIRTHS) 
KEM_BIRTH_DATA <- KEM_BIRTH_DATA %>% select(-Births) %>% left_join(ADJUSTED_KEM_STUB_DATA ) 
OTHER_BIRTH_DATA <- OTHER_BIRTH_DATA %>% select(-Births) %>% left_join(ADJUSTED_KEM_STUB_DATA)
REG_DATA <- rbind(COUNTY_BIRTH_DATA,rbind(KEM_BIRTH_DATA,OTHER_BIRTH_DATA) %>% mutate(Deaths=NA,Migration=NA) %>% select(colnames(COUNTY_BIRTH_DATA)))
REG_DATA <- REG_DATA %>% group_by(Region) %>% arrange(Year) %>% mutate(Lag_Births=lag(Births),Lag_Two_Births=lag(Births,2))  %>% ungroup %>% arrange(County,Region,Year)
REG_REDUCED_DATA <- REG_DATA %>% filter(!is.na(Births),!is.na(Lag_Two_Births),!is.na(Min_Birth_Group),!is.na(Lag_Births),!is.na(Region)) %>% select(Year,Region,Min_Birth_Group,Births,Lag_Births,Lag_Two_Births)


###Predict the number of Births
MOD_BIRTHS <- feols(log(Births)~log(Lag_Births)+log(Lag_Two_Births)+log(Min_Birth_Group)+Year*Region,cluster=~Year+Region, data=REG_REDUCED_DATA ) #Higher AIC but worse acf

####Alternate models with different years of data to test to model against a counterfactual
REG_DATA_2016 <- REG_REDUCED_DATA %>% filter(Year<=2016)
MOD_BIRTHS_2016 <- feols(log(Births)~log(Lag_Births)+log(Lag_Two_Births)+log(Min_Birth_Group)+Year*Region,cluster=~Year+Region, data=REG_DATA_2016 ) 
REG_DATA_1985 <- REG_REDUCED_DATA %>% filter(Year<=1985)
MOD_BIRTHS_1985 <- feols(log(Births)~log(Lag_Births)+log(Lag_Two_Births)+log(Min_Birth_Group)+Year*Region,cluster=~Year+Region, data=REG_DATA_1985 ) 


###Models to more easily show the results
#Easier to read for a LaTex Table
DICT <- c("log(Lag_Births)"="Births Last Years (log)","log(Lag_Two_Births)"="Births Two Years Ago (log)","LN"='Kemmerer Area','log(Births)'='Births (log)','log(Min_Birth_Group)'='Child Rearing Aged Adults (log)','Region'="County" )

REG_VIEW_DATA <- REG_REDUCED_DATA %>% mutate(LN=ifelse(Region=='Kemmerer & Diamondville',1,0))
 MOD_VIEW_BIRTHS <- feols(log(Births)~log(Lag_Births)+log(Lag_Two_Births)+log(Min_Birth_Group)+LN*Year+Year*Region|Region,cluster=~Year+Region, data=REG_VIEW_DATA ) #Higher AIC but worse acf
REG_VIEW_DATA_2016 <- REG_VIEW_DATA %>% filter(Year<=2016)
MOD_VIEW_BIRTHS_2016 <- feols(log(Births)~log(Lag_Births)+log(Lag_Two_Births)+log(Min_Birth_Group)+LN*Year+Year*Region|Region,cluster=~Year+Region, data=REG_VIEW_DATA_2016 ) 
REG_VIEW_DATA_1985 <- REG_VIEW_DATA%>% filter(Year<=1985)
MOD_VIEW_BIRTHS_1985 <- feols(log(Births)~log(Lag_Births)+log(Lag_Two_Births)+log(Min_Birth_Group)+LN*Year+Year*Region|Region,cluster=~Year+Region, data=REG_VIEW_DATA_1985 ) 
1
###Prelim information for fixest table
NOTES <- c("Natural log used for all variables besides years, and counties","Kemmerer Area: Includes both Kemmerer and Diamondville","Child Rearing Adults: Minimum of all women aged 18-28 or men aged 18-30.","Kemmerer data from the American Community Survey Data starts in 2009")
LAB <- c("Kem.","Kem. Pre 2016","Lincoln Pre 1985")
HEAD <- list(c("All Data","Pre 2016","Pre 1985"))
#Current year prediction
	#exp(predict(MOD_BIRTHS,REG_DATA %>% filter(Region=='Kemmerer & Diamondville',Year==2023)))
	#REG_DATA %>% filter(Region=='Kemmerer & Diamondville',Year==2023)

#MOD_BIRTHS_ALT <- feols(log(Births)~log(Lag_Births)+log(Min_Birth_Group)+Year*Region,cluster=~Year+Region, data=REG_REDUCED_DATA )
#AIC(MOD_BIRTHS)<AIC(MOD_BIRTHS_ALT) #Better AIC for the two lag version
 #Optional: Review the ACF and PACF for validity. Model made on Nov 11 appears to have uncorrelated lags of residuals when looking at individual counties or regions. 
  if(!exists("SAVE_FIG_LOC")){SAVE_FIG_LOC <- "./Results/Birth_Regression"}
 dir.create(SAVE_FIG_LOC , recursive = TRUE, showWarnings = FALSE)
 REG_TABLE_LOC <- paste0(SAVE_FIG_LOC,"/Birth_Rate_Regressions.png")
 REG_TABLE_TEX_LOC <- paste0(SAVE_FIG_LOC,"/Birth_Rate_Regressions.tex")
 #For some reason it needs to be run twice to save the image
try(etable(MOD_VIEW_BIRTHS,MOD_VIEW_BIRTHS_2016,MOD_VIEW_BIRTHS_1985,headers=HEAD,group=list("County Trends"="Year "),style.tex=style.tex(yesNo="$\\checkmark$"),dict=DICT,notes=NOTES,file=REG_TABLE_TEX_LOC ,export=REG_TABLE_LOC,replace=TRUE))
try(etable(MOD_VIEW_BIRTHS,MOD_VIEW_BIRTHS_2016,MOD_VIEW_BIRTHS_1985,headers=HEAD,group=list("County Trends"="Year "),style.tex=style.tex(yesNo="$\\checkmark$"),dict=DICT,notes=NOTES,file=REG_TABLE_TEX_LOC ,export=REG_TABLE_LOC,replace=TRUE))


TEMP <- REG_REDUCED_DATA
	TEMP$RESID <- resid(MOD_BIRTHS)
#Kemmerer ACF/PACF
C_TEMP <- TEMP %>% filter(Region=='Kemmerer & Diamondville') %>% arrange(Year)
    png(paste0(SAVE_FIG_LOC,"/Kemmerer_ACF.png"), width = 12, height = 8, units = "in", res = 600)
	acf(C_TEMP$RESID,main='ACF of Kemmerer & Diamondville Birth Estimate Residuals',xlab="Lag (Years)")
     dev.off()
     png(paste0(SAVE_FIG_LOC,"/Kemmerer_PACF.png"), width = 12, height = 8, units = "in", res = 600)
	pacf(C_TEMP$RESID,main='PACF of Kemmerer & Diamondville Birth Estimate Residuals',xlab="Lag (Years)")
     dev.off()
#Lincoln total ACF/PACF
C_TEMP <- TEMP %>% filter(Region=='Lincoln') %>% arrange(Year)
    png(paste0(SAVE_FIG_LOC,"/Lincoln_ACF.png"), width = 12, height = 8, units = "in", res = 600)
	acf(C_TEMP$RESID,main='ACF of Lincoln County Birth Estimate Residuals',xlab="Lag (Years)")
    dev.off()
    png(paste0(SAVE_FIG_LOC,"/Lincoln_PACF.png"), width = 12, height = 8, units = "in", res = 600)
	pacf(C_TEMP$RESID,main='PACF of Lincoln County Birth Estimate Residuals',xlab="Lag (Years)")
    dev.off()
#Lincoln Other (Not Kemmerer) ACF/PACF
C_TEMP <- TEMP %>% filter(Region=='Lincoln_Other') %>% arrange(Year)
    png(paste0(SAVE_FIG_LOC,"/Lincoln_Other_Areas_ACF.png"), width = 12, height = 8, units = "in", res = 600)
	acf(C_TEMP$RESID,main='ACF of Other Parts of Lincoln County Birth Estimate Residuals',xlab="Lag (Years)")
    dev.off()
    png(paste0(SAVE_FIG_LOC,"/Lincoln_Other_Areas_PACF.png"), width = 12, height = 8, units = "in", res = 600)
	pacf(C_TEMP$RESID,main='PACF of Other Parts of Lincoln County Birth Estimate Residuals',xlab="Lag (Years)")
    dev.off()

####Create data stubs to start a simulation. That is predict the births from this most recent year. Include records from various years of potential interest
ST_REG_DATA <- REG_REDUCED_DATA %>% filter(Region=='Lincoln') %>% filter(Year==max(Year)) %>% rbind(REG_REDUCED_DATA %>% filter(Region=='Kemmerer & Diamondville') %>% filter(Year==max(Year))) %>% rbind(REG_REDUCED_DATA %>% filter(Region=='Lincoln_Other') %>% filter(Year==max(Year))) %>% rbind(REG_REDUCED_DATA %>% filter(Region=='Lincoln') %>% filter(Year==2016)) %>% rbind(REG_REDUCED_DATA %>% filter(Region=='Kemmerer & Diamondville') %>% filter(Year==2016)) %>% rbind(REG_REDUCED_DATA %>% filter(Region=='Lincoln_Other') %>% filter(Year==2016)) %>% rbind(REG_REDUCED_DATA %>% filter(Region=='Lincoln') %>% filter(Year==1985))


  if(!exists("SAVE_REG_LOC")){SAVE_REG_LOC <- "Data/Intermediate_Inputs/Birth_Regressions"}
 dir.create(SAVE_REG_LOC , recursive = TRUE, showWarnings = FALSE)
  saveRDS(MOD_BIRTHS,paste0(SAVE_REG_LOC,"/Birth_Regression.Rds"))
  saveRDS(MOD_BIRTHS_2016,paste0(SAVE_REG_LOC,"/Birth_Regression_2016.Rds"))
  saveRDS(MOD_BIRTHS_1985,paste0(SAVE_REG_LOC,"/Birth_Regression_1985.Rds"))
 if(!exists("SAVE_DATA_LOC")){SAVE_DATA_LOC <- "Data/Intermediate_Inputs/Birth_Regressions/Regression_Data"}
	 dir.create(SAVE_DATA_LOC , recursive = TRUE, showWarnings = FALSE)
  saveRDS(ST_REG_DATA,paste0(SAVE_DATA_LOC,"/Birth_Simulation_Key_Starting_Points.Rds"))
  write_csv(ST_REG_DATA,paste0(SAVE_DATA_LOC,"/Birth_Simulation_Key_Starting_Points.csv"))
  saveRDS(REG_REDUCED_DATA,paste0(SAVE_DATA_LOC,"/Birth_Regression_Input_Data.Rds"))
  write_csv(REG_REDUCED_DATA,paste0(SAVE_DATA_LOC,"/Birth_Regression_Input_Data.csv"))