Working on birth reg, prelim data

2025-11-10 17:17:24 -07:00 · 2025-11-10 17:17:24 -07:00 · f20749c434
commit f20749c434
parent 9d0ba09898
3 changed files with 61 additions and 37 deletions
--- a/Scripts/1A_Download_and_Process_Population_Data.r
+++ b/Scripts/1A_Download_and_Process_Population_Data.r
@ -86,10 +86,25 @@ COLUMNS <- c(1,which(TBL[ST-2,] %in% 1970:2025))
 NAMES <- TBL[4,COLUMNS][-1]
 TBL <- TBL[ST:END,COLUMNS ]
 colnames(TBL) <- c("County",NAMES)
+###Assign each city a county, this is used later to add in a county back to the city but is not used immediately
+	COUNTY_LOCATION <-  which(grepl(" County",TBL %>% pull(County)) & !grepl("Balance of",TBL %>% pull(County)))
+	RES <- c()

+	for(i in 1:nrow(TBL)){
+		RES[i] <- gsub(" County","",TBL[COUNTY_LOCATION[max(which(COUNTY_LOCATION<=i))],] %>% pull(County) )
+	}
+	CITY_TO_COUNTY <- cbind(TBL$County,RES) %>% as_tibble
+	CITY_TO_COUNTY <-	rbind(CITY_TO_COUNTY,c("Frannie","Big_Horn"))  #Manually adding a defunct city in earlier data
+	CITY_TO_COUNTY <-	rbind(CITY_TO_COUNTY,c('Wind_River_Res.','Wind_River_Res.'))  #Add a reservation which spans two counties and can be labled as a single region
+
+
+	colnames(CITY_TO_COUNTY) <- c("City","County")
+	CITY_TO_COUNTY$City <- gsub(" ","_",gsub("Balance of","Unincorporated",gsub(" County","",gsub(" city","",gsub(" town","",CITY_TO_COUNTY$City,ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE))
+###########Continue with data collection
 TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population))
 TBL$County <- gsub("  "," ",gsub("\n","",gsub("\r","",TBL %>% pull(County))))

+
 COUNTY_POP<- TBL[grep("COUNTY",TBL %>% pull(County),ignore.case=TRUE),]
 COUNTY_POP<-	COUNTY_POP[grep("Balance",COUNTY_POP%>% pull(County),invert=TRUE,ignore.case=TRUE),]
 COUNTY_POP$County <- gsub(" ","_",gsub(" County","",COUNTY_POP$County))
@ -307,22 +322,29 @@ sink()

 	KEM_OLD <- c(843,1517,1884,2026,1667,2028,2292) #1910 forward until 1970
 	Year <-  seq(1910,1970,by=10)
-	TEMP <-	cbind(Year,rep("kemmerer",7),KEM_OLD)
+	TEMP <-	cbind(Year,rep("Kemmerer",7),KEM_OLD)
 	colnames(TEMP ) <- c("Year","City","Population")
 	TEMP <- as_tibble(TEMP)
 	CITY_POP <- rbind(TEMP,CITY_POP) 


 	DIAMOND_OLD <- c(696,726,812,586,415,398,485)
-	TEMP <-	cbind(Year,rep("Diamondvile",7),DIAMOND_OLD)
+	TEMP <-	cbind(Year,rep("Diamondville",7),DIAMOND_OLD)
 	colnames(TEMP ) <- c("Year","City","Population")
 	TEMP <- as_tibble(TEMP)
 	CITY_POP <- rbind(TEMP,CITY_POP)  %>% arrange(City,Year)
 	#Remove empty values, ensure all numeric values are not saved as characters
 	CITY_POP <- CITY_POP %>% filter(!is.na(Population) ) %>% mutate(Population=parse_number(Population),Year=parse_number(Year)) 
+#Fix names
+CITY_POP[grep("Elk_Mtn",CITY_POP$City),"City"]  <- "Elk_Mountain"
+CITY_POP[grep("Frannie",CITY_POP$City),"City"] <- "Frannie"
+CITY_POP[grep("Ft\\._La",CITY_POP$City),"City"] <- "Fort_Laramie"
+
 #Add Other Data
 COUNTY_POP <- COUNTY_POP %>% mutate(Year=as.numeric(Year)) %>% unique
 WY_COUNTY_DATA_SET <- COUNTY_POP %>% left_join(WY_COUNTY_DATA_SET ) %>% mutate(Population=as.numeric(Population)) %>% unique
+CITY_POP <- CITY_POP %>% left_join(CITY_TO_COUNTY) %>% select(Year,City,County,Population)
+CITY_POP[is.na(CITY_POP$County),"County"] <-  str_remove(CITY_POP[is.na(CITY_POP$County),] %>% pull(City),"Unincorporated_") #Fix missing Unincorportaed records without a county.

 ###Save Population Results
 	if(!exists("SAVE_LOC_POP")){SAVE_LOC_POP <-"./Data/Cleaned_Data/Population_Data"}
--- a/Scripts/1D_Use_ACS_Census_Data_to_Estimate_Kemmerer_Demographics.r
+++ b/Scripts/1D_Use_ACS_Census_Data_to_Estimate_Kemmerer_Demographics.r
@ -28,7 +28,7 @@ OLD <-     	DEMO_DATA_ALL %>% filter(Age>=85) %>% group_by(County,IN_KEM,Year,Se
 DEMO_DATA_ALL <- rbind(YOUNG,OLD) %>% arrange(County,Year,IN_KEM,Sex)
 DEMO_DATA_ALL <- DEMO_DATA_ALL %>% mutate(Population=round(Population))  %>% pivot_wider(values_from=Population,names_from=Sex,names_prefix="Num_")
 OTHER_LIN_DEMO_DATA <- DEMO_DATA_ALL  %>% filter(IN_KEM==0) %>% rename(Region=IN_KEM) %>% mutate(Region='Lincoln_Other')
-KEM_DEMO_DATA <- DEMO_DATA_ALL  %>% filter(IN_KEM==1) %>% rename(Region=IN_KEM) %>% mutate(Region='Kemmerer')
+KEM_DEMO_DATA <- DEMO_DATA_ALL  %>% filter(IN_KEM==1) %>% rename(Region=IN_KEM) %>% mutate(Region='Kemmerer & Diamondville')



--- a/Scripts/2A_Birth_Rate_Regression.r
+++ b/Scripts/2A_Birth_Rate_Regression.r
@ -22,52 +22,54 @@ DEMOGRAPHIC_COUNTY_DATA <- readRDS(DEMOGRAPHIC_COUNTY_LOC)
 COUNTY_POP <- readRDS(POPULATION_COUNTY_LOC) 
 REG_DATA <- readRDS(POPULATION_COUNTY_LOC) %>% full_join(MAKE_REG_DATA(DEMOGRAPHIC_COUNTY_DATA))
  REG_DATA <- REG_DATA %>% group_by(County,Region) %>% mutate(PREV_BIRTH=lag(Births),PREV_TWO_BIRTH=lag(Births,2)) %>% ungroup 
-  REG_DATA <-  REG_DATA %>% select(-Female_Birth_Group,-Male_Birth_Group) #Store the data set of only the first year needing a birth forecast, to start the birth Monte Carlo simulations.
+  REG_DATA <-  REG_DATA %>% select(-Female_Birth_Group,-Male_Birth_Group)%>% mutate(Region=County) %>% select(Year,County,Region,everything())
+ #Store the data set of only the first year needing a birth forecast, to start the birth Monte Carlo simulations.
 ###Some of the years are missing births, previous births etc. Where missing fill this in by assuming all age zero children in the demographic  data (DEMOGRAPHIC_LOC) were born in the last year. This makes a more complete data set. Some test find a near perfect 1 to 1 with this method 
  	#Data to fill in the missing records
 	FILL_IN_DATA <- DEMOGRAPHIC_COUNTY_DATA %>% mutate(POP=Num_Male+Num_Female,BIRTHS=ifelse(Age==0,POP,0)) %>% group_by(County,Region,Year)  %>% summarize(BIRTHS=sum(BIRTHS)) %>% arrange(County,Year) %>% mutate(ALT=lag(BIRTHS),ALT2=lag(BIRTHS,2)) %>% ungroup
 #Join and replace missing records
 REG_DATA <- REG_DATA %>% left_join(FILL_IN_DATA )  %>% mutate(Births=ifelse(is.na(Births),BIRTHS,Births),PREV_BIRTH=ifelse(is.na(PREV_BIRTH),ALT,PREV_BIRTH),PREV_TWO_BIRTH=ifelse(is.na(PREV_TWO_BIRTH),ALT2,PREV_TWO_BIRTH)) %>% select(-BIRTHS,-ALT,-ALT2) %>% select(Year,County,Region,everything()) %>% mutate(Region=County)
+ST_LIN_REG <- REG_DATA %>% filter(County=="Lincoln",Year==2024)  
 	
-###Working on Kemmerer data
-DEMOGRAPHIC_KEM_DATA <-  readRDS(DEMOGRAPHIC_KEM_LOC)
-readRDS(POPULATION_CITY_LOC) %>% filter(City %in% c("Kemmerer","Diamondville")) %>% group_by(Year) %>% mutate(Population=sum(Population,na.rm=TRUE)) %>% mutate(City='Kemmerer') %>% rename(Region=City)
-MAKE_REG_DATA(readRDS(DEMOGRAPHIC_KEM_LOC))
-REG_DATA
-readRDS(DEMOGRAPHIC_KEM_LOC)%>% mutate(POP=Num_Male+Num_Female,Births=ifelse(Age==0,POP,0)) %>% group_by(County,Region,Year)  %>% summarize(Births=sum(Births)) %>% arrange(County,Year) %>% mutate(PREV_BIRTH=lag(Births),PREV_TWO_BIRTH=lag(Births,2)) %>% ungroup
+####################Create same data set but for only the Kemmerer Diamondville area 
+KEM_DEMO_DATA <-   readRDS(DEMOGRAPHIC_KEM_LOC)%>% mutate(POP=Num_Male+Num_Female,Births=ifelse(Age==0,POP,0)) %>% group_by(Year,County,Region) %>% summarize(Births=sum(Births)) %>% ungroup  %>% arrange(Region,Year) %>% mutate(PREV_BIRTH=lag(Births),PREV_TWO_BIRTH=lag(Births,2)) %>% ungroup %>% mutate(Deaths=NA,Migration=NA) %>% left_join(MAKE_REG_DATA(readRDS(DEMOGRAPHIC_KEM_LOC)))
+
+KEM_POP_DATA <- readRDS(POPULATION_CITY_LOC)%>% rename(Region=City) %>% filter(Region %in% c("Kemmerer","Diamondville")) %>% group_by(Year) %>% mutate(Population=sum(Population,na.rm=TRUE)) %>% ungroup  %>% mutate(Region='Kemmerer & Diamondville') %>% unique %>% ungroup
+
+KEM_REG_DATA <- KEM_POP_DATA %>% left_join(KEM_DEMO_DATA) %>% select(colnames(REG_DATA))
+KEM_REG_DATA 
+ST_KEM_REG <- KEM_REG_DATA[KEM_REG_DATA$Year==2023,]
+ST_KEM_REG$Year <-2024
+KEM_REG_DATA %>% tail
+###The starting entry to predict births in the next period based on the current population
+ST_KEM_REG[,"Population"] <- KEM_REG_DATA[KEM_REG_DATA$Year==2024,] %>% pull("Population")
+####################Create same data set but for only parts of Lincoln not in the Kemmerer Diamondville area 
+OTHER_DEMO_DATA <-   readRDS(DEMOGRAPHIC_OTHER_LIN_LOC)%>% mutate(POP=Num_Male+Num_Female,Births=ifelse(Age==0,POP,0)) %>% group_by(Year,County,Region) %>% summarize(Births=sum(Births)) %>% ungroup  %>% arrange(Region,Year) %>% mutate(PREV_BIRTH=lag(Births),PREV_TWO_BIRTH=lag(Births,2)) %>% ungroup %>% mutate(Deaths=NA,Migration=NA) %>% left_join(MAKE_REG_DATA(readRDS(DEMOGRAPHIC_OTHER_LIN_LOC)))


-	 readRDS(DEMOGRAPHIC_KEM_LOC)%>% mutate(Male_Window=Age>=18 & Age<=30,Female_Window=Age>=18 & Age<=28) %>% group_by(County,Year) %>% summarize(Female_Birth_Group=sum(Num_Female*Female_Window,na.rm=TRUE),Male_Birth_Group=sum(Num_Male*Male_Window,na.rm=TRUE),Min_Birth_Group=ifelse(Female_Birth_Group<Male_Birth_Group,Female_Birth_Group,Male_Birth_Group)) %>% ungroup
-DEMOGRAPHIC_DATA 
-TEST <- readRDS(POPULATION_COUNTY_LOC) 
-if(!("Births" %in% colnames(TEST)))
-"Deaths" %in% colnames(TEST)
-"Migration" %in% colnames(TEST)
-"Migration" %in% colnames(TEST)

+OTHER_POP_DATA <- readRDS(POPULATION_CITY_LOC)%>% rename(Region=City) %>% filter(!(Region %in% c("Kemmerer","Diamondville")),County=='Lincoln') %>% group_by(Year) %>% mutate(Population=sum(Population,na.rm=TRUE)) %>% ungroup  %>% mutate(Region='Lincoln_Other') %>% unique %>% ungroup

-readRDS(DEMOGRAPHIC_COUNTY_LOC)
-readRDS(POPULATION_COUNTY_LOC) 
-COUNTY_REG_DATA <-  MAKE_REG_DATA(readRDS(DEMOGRAPHIC_COUNTY_LOC),readRDS(POPULATION_COUNTY_LOC)  )
-readRDS(DEMOGRAPHIC_KEM_LOC)
-readRDS(POPULATION_CITY_LOC) %>% 
-readRDS(DEMOGRAPHIC_KEM_LOC)
-readRDS(DEMOGRAPHIC_KEM_LOC)
-  MAKE_REG_DATA(readRDS(DEMOGRAPHIC_KEM_LOC),readRDS(POPULATION_CITY_LOC)  ) %>% filter(!is.na(Region)) %>% pull(Region) %>% unique
-%>% pull(Region) %>% unique
-%>% filter(Region=='Kemmerer') 
-
-readRDS(POPULATION_CITY_LOC)
-MAKE_REG_DATA(readRDS("Data/Cleaned_Data/Demographic_Sex_Age_Data/RDS/Kemmerer_Diamondville_Demographics.Rds"),readRDS("Data/Cleaned_Data/Population_Data/RDS/All_Wyoming_City_Populations.Rds"))
-
-  #Extract the population trend data to connect with demographics (Population,births,deaths)
-  POP_DATA <- readRDS(POPULATION_LOC) 
-  #Merger the two data sets and drop any records that cannot be used in the regression (this makes the "predict" function output the right number of records)
-  REG_DATA <- POP_DATA %>% full_join(DEMOGRAPHIC_DATA) 
+OTHER_REG_DATA <- OTHER_POP_DATA %>% left_join(OTHER_DEMO_DATA) %>% select(colnames(REG_DATA))
+ST_OTHER_REG <- OTHER_REG_DATA[OTHER_REG_DATA$Year==2023,]
+ST_OTHER_REG$Year <-2024
+###The starting entry to predict births in the next period based on the current population
+ST_OTHER_REG[,"Population"] <- OTHER_REG_DATA[OTHER_REG_DATA$Year==2024,] %>% pull("Population")

+####################################################3
+REG_DATA <- REG_DATA  %>% rbind(OTHER_REG_DATA) %>% rbind(KEM_REG_DATA)

 ###Predict the number of Births
-MOD_BIRTHS <- feols(log(Births)~log(PREV_BIRTH)+log(PREV_TWO_BIRTH)+log(Min_Birth_Group)+Year*County,cluster=~Year+County, data=REG_DATA ) #Higher AIC but worse acf
+MOD_BIRTHS <- feols(log(Births)~log(PREV_BIRTH)+log(PREV_TWO_BIRTH)+log(Min_Birth_Group)+Year*Region,cluster=~Year+Region, data=REG_DATA ) #Higher AIC but worse acf
+ST_OTHER_REG
+#Current year prediction
+exp(predict(MOD_BIRTHS,newdata=ST_KEM_REG)) #Kemmerer births
+exp(predict(MOD_BIRTHS,newdata=ST_OTHER_REG)) #Other Lincoln births
+exp(predict(MOD_BIRTHS,newdata=ST_LIN_REG)) #All Lincoln births
+#Note that due to useing diffrent data sets Lincoln is NOT colinear with Kemmere+Other Lincoln. Either result can be downshifted by the amount of diffrence
+ADJUST_RESULTS_FACTOR <- (exp(predict(MOD_BIRTHS,newdata=ST_KEM_REG))+exp(predict(MOD_BIRTHS,newdata=ST_OTHER_REG)))/exp(predict(MOD_BIRTHS,newdata=ST_LIN_REG))
+
+ADJUST_RESULTS_FACTOR 
 #MOD_BIRTHS_ALT <- feols(log(Births)~log(PREV_BIRTH)+log(Min_Birth_Group)+Year*County,cluster=~Year+County, data=REG_DATA )
 #AIC(MOD_BIRTHS)<AIC(MOD_BIRTHS_ALT)
 #Optional: Review the ACF and PACF for validity. Model made on October 24nd appears to have uncorrelated lags of residuals accept year three.