Working on birth reg, prelim data

This commit is contained in:
Alex Gebben Work 2025-11-10 17:17:24 -07:00
parent 9d0ba09898
commit f20749c434
3 changed files with 61 additions and 37 deletions

View File

@ -86,10 +86,25 @@ COLUMNS <- c(1,which(TBL[ST-2,] %in% 1970:2025))
NAMES <- TBL[4,COLUMNS][-1] NAMES <- TBL[4,COLUMNS][-1]
TBL <- TBL[ST:END,COLUMNS ] TBL <- TBL[ST:END,COLUMNS ]
colnames(TBL) <- c("County",NAMES) colnames(TBL) <- c("County",NAMES)
###Assign each city a county, this is used later to add in a county back to the city but is not used immediately
COUNTY_LOCATION <- which(grepl(" County",TBL %>% pull(County)) & !grepl("Balance of",TBL %>% pull(County)))
RES <- c()
for(i in 1:nrow(TBL)){
RES[i] <- gsub(" County","",TBL[COUNTY_LOCATION[max(which(COUNTY_LOCATION<=i))],] %>% pull(County) )
}
CITY_TO_COUNTY <- cbind(TBL$County,RES) %>% as_tibble
CITY_TO_COUNTY <- rbind(CITY_TO_COUNTY,c("Frannie","Big_Horn")) #Manually adding a defunct city in earlier data
CITY_TO_COUNTY <- rbind(CITY_TO_COUNTY,c('Wind_River_Res.','Wind_River_Res.')) #Add a reservation which spans two counties and can be labled as a single region
colnames(CITY_TO_COUNTY) <- c("City","County")
CITY_TO_COUNTY$City <- gsub(" ","_",gsub("Balance of","Unincorporated",gsub(" County","",gsub(" city","",gsub(" town","",CITY_TO_COUNTY$City,ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE))
###########Continue with data collection
TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population)) TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population))
TBL$County <- gsub(" "," ",gsub("\n","",gsub("\r","",TBL %>% pull(County)))) TBL$County <- gsub(" "," ",gsub("\n","",gsub("\r","",TBL %>% pull(County))))
COUNTY_POP<- TBL[grep("COUNTY",TBL %>% pull(County),ignore.case=TRUE),] COUNTY_POP<- TBL[grep("COUNTY",TBL %>% pull(County),ignore.case=TRUE),]
COUNTY_POP<- COUNTY_POP[grep("Balance",COUNTY_POP%>% pull(County),invert=TRUE,ignore.case=TRUE),] COUNTY_POP<- COUNTY_POP[grep("Balance",COUNTY_POP%>% pull(County),invert=TRUE,ignore.case=TRUE),]
COUNTY_POP$County <- gsub(" ","_",gsub(" County","",COUNTY_POP$County)) COUNTY_POP$County <- gsub(" ","_",gsub(" County","",COUNTY_POP$County))
@ -307,22 +322,29 @@ sink()
KEM_OLD <- c(843,1517,1884,2026,1667,2028,2292) #1910 forward until 1970 KEM_OLD <- c(843,1517,1884,2026,1667,2028,2292) #1910 forward until 1970
Year <- seq(1910,1970,by=10) Year <- seq(1910,1970,by=10)
TEMP <- cbind(Year,rep("kemmerer",7),KEM_OLD) TEMP <- cbind(Year,rep("Kemmerer",7),KEM_OLD)
colnames(TEMP ) <- c("Year","City","Population") colnames(TEMP ) <- c("Year","City","Population")
TEMP <- as_tibble(TEMP) TEMP <- as_tibble(TEMP)
CITY_POP <- rbind(TEMP,CITY_POP) CITY_POP <- rbind(TEMP,CITY_POP)
DIAMOND_OLD <- c(696,726,812,586,415,398,485) DIAMOND_OLD <- c(696,726,812,586,415,398,485)
TEMP <- cbind(Year,rep("Diamondvile",7),DIAMOND_OLD) TEMP <- cbind(Year,rep("Diamondville",7),DIAMOND_OLD)
colnames(TEMP ) <- c("Year","City","Population") colnames(TEMP ) <- c("Year","City","Population")
TEMP <- as_tibble(TEMP) TEMP <- as_tibble(TEMP)
CITY_POP <- rbind(TEMP,CITY_POP) %>% arrange(City,Year) CITY_POP <- rbind(TEMP,CITY_POP) %>% arrange(City,Year)
#Remove empty values, ensure all numeric values are not saved as characters #Remove empty values, ensure all numeric values are not saved as characters
CITY_POP <- CITY_POP %>% filter(!is.na(Population) ) %>% mutate(Population=parse_number(Population),Year=parse_number(Year)) CITY_POP <- CITY_POP %>% filter(!is.na(Population) ) %>% mutate(Population=parse_number(Population),Year=parse_number(Year))
#Fix names
CITY_POP[grep("Elk_Mtn",CITY_POP$City),"City"] <- "Elk_Mountain"
CITY_POP[grep("Frannie",CITY_POP$City),"City"] <- "Frannie"
CITY_POP[grep("Ft\\._La",CITY_POP$City),"City"] <- "Fort_Laramie"
#Add Other Data #Add Other Data
COUNTY_POP <- COUNTY_POP %>% mutate(Year=as.numeric(Year)) %>% unique COUNTY_POP <- COUNTY_POP %>% mutate(Year=as.numeric(Year)) %>% unique
WY_COUNTY_DATA_SET <- COUNTY_POP %>% left_join(WY_COUNTY_DATA_SET ) %>% mutate(Population=as.numeric(Population)) %>% unique WY_COUNTY_DATA_SET <- COUNTY_POP %>% left_join(WY_COUNTY_DATA_SET ) %>% mutate(Population=as.numeric(Population)) %>% unique
CITY_POP <- CITY_POP %>% left_join(CITY_TO_COUNTY) %>% select(Year,City,County,Population)
CITY_POP[is.na(CITY_POP$County),"County"] <- str_remove(CITY_POP[is.na(CITY_POP$County),] %>% pull(City),"Unincorporated_") #Fix missing Unincorportaed records without a county.
###Save Population Results ###Save Population Results
if(!exists("SAVE_LOC_POP")){SAVE_LOC_POP <-"./Data/Cleaned_Data/Population_Data"} if(!exists("SAVE_LOC_POP")){SAVE_LOC_POP <-"./Data/Cleaned_Data/Population_Data"}

View File

@ -28,7 +28,7 @@ OLD <- DEMO_DATA_ALL %>% filter(Age>=85) %>% group_by(County,IN_KEM,Year,Se
DEMO_DATA_ALL <- rbind(YOUNG,OLD) %>% arrange(County,Year,IN_KEM,Sex) DEMO_DATA_ALL <- rbind(YOUNG,OLD) %>% arrange(County,Year,IN_KEM,Sex)
DEMO_DATA_ALL <- DEMO_DATA_ALL %>% mutate(Population=round(Population)) %>% pivot_wider(values_from=Population,names_from=Sex,names_prefix="Num_") DEMO_DATA_ALL <- DEMO_DATA_ALL %>% mutate(Population=round(Population)) %>% pivot_wider(values_from=Population,names_from=Sex,names_prefix="Num_")
OTHER_LIN_DEMO_DATA <- DEMO_DATA_ALL %>% filter(IN_KEM==0) %>% rename(Region=IN_KEM) %>% mutate(Region='Lincoln_Other') OTHER_LIN_DEMO_DATA <- DEMO_DATA_ALL %>% filter(IN_KEM==0) %>% rename(Region=IN_KEM) %>% mutate(Region='Lincoln_Other')
KEM_DEMO_DATA <- DEMO_DATA_ALL %>% filter(IN_KEM==1) %>% rename(Region=IN_KEM) %>% mutate(Region='Kemmerer') KEM_DEMO_DATA <- DEMO_DATA_ALL %>% filter(IN_KEM==1) %>% rename(Region=IN_KEM) %>% mutate(Region='Kemmerer & Diamondville')

View File

@ -19,55 +19,57 @@ MAKE_REG_DATA <- function(DEMO_DATA){
return(DEMO_DATA %>% mutate(Male_Window=Age>=18 & Age<=30,Female_Window=Age>=18 & Age<=28) %>% group_by(County,Region,Year) %>% summarize(Female_Birth_Group=sum(Num_Female*Female_Window,na.rm=TRUE),Male_Birth_Group=sum(Num_Male*Male_Window,na.rm=TRUE),Min_Birth_Group=ifelse(Female_Birth_Group<Male_Birth_Group,Female_Birth_Group,Male_Birth_Group)) %>% ungroup) return(DEMO_DATA %>% mutate(Male_Window=Age>=18 & Age<=30,Female_Window=Age>=18 & Age<=28) %>% group_by(County,Region,Year) %>% summarize(Female_Birth_Group=sum(Num_Female*Female_Window,na.rm=TRUE),Male_Birth_Group=sum(Num_Male*Male_Window,na.rm=TRUE),Min_Birth_Group=ifelse(Female_Birth_Group<Male_Birth_Group,Female_Birth_Group,Male_Birth_Group)) %>% ungroup)
} }
DEMOGRAPHIC_COUNTY_DATA <- readRDS(DEMOGRAPHIC_COUNTY_LOC) DEMOGRAPHIC_COUNTY_DATA <- readRDS(DEMOGRAPHIC_COUNTY_LOC)
COUNTY_POP <- readRDS(POPULATION_COUNTY_LOC) COUNTY_POP <- readRDS(POPULATION_COUNTY_LOC)
REG_DATA <- readRDS(POPULATION_COUNTY_LOC) %>% full_join(MAKE_REG_DATA(DEMOGRAPHIC_COUNTY_DATA)) REG_DATA <- readRDS(POPULATION_COUNTY_LOC) %>% full_join(MAKE_REG_DATA(DEMOGRAPHIC_COUNTY_DATA))
REG_DATA <- REG_DATA %>% group_by(County,Region) %>% mutate(PREV_BIRTH=lag(Births),PREV_TWO_BIRTH=lag(Births,2)) %>% ungroup REG_DATA <- REG_DATA %>% group_by(County,Region) %>% mutate(PREV_BIRTH=lag(Births),PREV_TWO_BIRTH=lag(Births,2)) %>% ungroup
REG_DATA <- REG_DATA %>% select(-Female_Birth_Group,-Male_Birth_Group) #Store the data set of only the first year needing a birth forecast, to start the birth Monte Carlo simulations. REG_DATA <- REG_DATA %>% select(-Female_Birth_Group,-Male_Birth_Group)%>% mutate(Region=County) %>% select(Year,County,Region,everything())
#Store the data set of only the first year needing a birth forecast, to start the birth Monte Carlo simulations.
###Some of the years are missing births, previous births etc. Where missing fill this in by assuming all age zero children in the demographic data (DEMOGRAPHIC_LOC) were born in the last year. This makes a more complete data set. Some test find a near perfect 1 to 1 with this method ###Some of the years are missing births, previous births etc. Where missing fill this in by assuming all age zero children in the demographic data (DEMOGRAPHIC_LOC) were born in the last year. This makes a more complete data set. Some test find a near perfect 1 to 1 with this method
#Data to fill in the missing records #Data to fill in the missing records
FILL_IN_DATA <- DEMOGRAPHIC_COUNTY_DATA %>% mutate(POP=Num_Male+Num_Female,BIRTHS=ifelse(Age==0,POP,0)) %>% group_by(County,Region,Year) %>% summarize(BIRTHS=sum(BIRTHS)) %>% arrange(County,Year) %>% mutate(ALT=lag(BIRTHS),ALT2=lag(BIRTHS,2)) %>% ungroup FILL_IN_DATA <- DEMOGRAPHIC_COUNTY_DATA %>% mutate(POP=Num_Male+Num_Female,BIRTHS=ifelse(Age==0,POP,0)) %>% group_by(County,Region,Year) %>% summarize(BIRTHS=sum(BIRTHS)) %>% arrange(County,Year) %>% mutate(ALT=lag(BIRTHS),ALT2=lag(BIRTHS,2)) %>% ungroup
#Join and replace missing records #Join and replace missing records
REG_DATA <- REG_DATA %>% left_join(FILL_IN_DATA ) %>% mutate(Births=ifelse(is.na(Births),BIRTHS,Births),PREV_BIRTH=ifelse(is.na(PREV_BIRTH),ALT,PREV_BIRTH),PREV_TWO_BIRTH=ifelse(is.na(PREV_TWO_BIRTH),ALT2,PREV_TWO_BIRTH)) %>% select(-BIRTHS,-ALT,-ALT2) %>% select(Year,County,Region,everything()) %>% mutate(Region=County) REG_DATA <- REG_DATA %>% left_join(FILL_IN_DATA ) %>% mutate(Births=ifelse(is.na(Births),BIRTHS,Births),PREV_BIRTH=ifelse(is.na(PREV_BIRTH),ALT,PREV_BIRTH),PREV_TWO_BIRTH=ifelse(is.na(PREV_TWO_BIRTH),ALT2,PREV_TWO_BIRTH)) %>% select(-BIRTHS,-ALT,-ALT2) %>% select(Year,County,Region,everything()) %>% mutate(Region=County)
ST_LIN_REG <- REG_DATA %>% filter(County=="Lincoln",Year==2024)
###Working on Kemmerer data ####################Create same data set but for only the Kemmerer Diamondville area
DEMOGRAPHIC_KEM_DATA <- readRDS(DEMOGRAPHIC_KEM_LOC) KEM_DEMO_DATA <- readRDS(DEMOGRAPHIC_KEM_LOC)%>% mutate(POP=Num_Male+Num_Female,Births=ifelse(Age==0,POP,0)) %>% group_by(Year,County,Region) %>% summarize(Births=sum(Births)) %>% ungroup %>% arrange(Region,Year) %>% mutate(PREV_BIRTH=lag(Births),PREV_TWO_BIRTH=lag(Births,2)) %>% ungroup %>% mutate(Deaths=NA,Migration=NA) %>% left_join(MAKE_REG_DATA(readRDS(DEMOGRAPHIC_KEM_LOC)))
readRDS(POPULATION_CITY_LOC) %>% filter(City %in% c("Kemmerer","Diamondville")) %>% group_by(Year) %>% mutate(Population=sum(Population,na.rm=TRUE)) %>% mutate(City='Kemmerer') %>% rename(Region=City)
MAKE_REG_DATA(readRDS(DEMOGRAPHIC_KEM_LOC)) KEM_POP_DATA <- readRDS(POPULATION_CITY_LOC)%>% rename(Region=City) %>% filter(Region %in% c("Kemmerer","Diamondville")) %>% group_by(Year) %>% mutate(Population=sum(Population,na.rm=TRUE)) %>% ungroup %>% mutate(Region='Kemmerer & Diamondville') %>% unique %>% ungroup
REG_DATA
readRDS(DEMOGRAPHIC_KEM_LOC)%>% mutate(POP=Num_Male+Num_Female,Births=ifelse(Age==0,POP,0)) %>% group_by(County,Region,Year) %>% summarize(Births=sum(Births)) %>% arrange(County,Year) %>% mutate(PREV_BIRTH=lag(Births),PREV_TWO_BIRTH=lag(Births,2)) %>% ungroup KEM_REG_DATA <- KEM_POP_DATA %>% left_join(KEM_DEMO_DATA) %>% select(colnames(REG_DATA))
KEM_REG_DATA
ST_KEM_REG <- KEM_REG_DATA[KEM_REG_DATA$Year==2023,]
ST_KEM_REG$Year <-2024
KEM_REG_DATA %>% tail
###The starting entry to predict births in the next period based on the current population
ST_KEM_REG[,"Population"] <- KEM_REG_DATA[KEM_REG_DATA$Year==2024,] %>% pull("Population")
####################Create same data set but for only parts of Lincoln not in the Kemmerer Diamondville area
OTHER_DEMO_DATA <- readRDS(DEMOGRAPHIC_OTHER_LIN_LOC)%>% mutate(POP=Num_Male+Num_Female,Births=ifelse(Age==0,POP,0)) %>% group_by(Year,County,Region) %>% summarize(Births=sum(Births)) %>% ungroup %>% arrange(Region,Year) %>% mutate(PREV_BIRTH=lag(Births),PREV_TWO_BIRTH=lag(Births,2)) %>% ungroup %>% mutate(Deaths=NA,Migration=NA) %>% left_join(MAKE_REG_DATA(readRDS(DEMOGRAPHIC_OTHER_LIN_LOC)))
readRDS(DEMOGRAPHIC_KEM_LOC)%>% mutate(Male_Window=Age>=18 & Age<=30,Female_Window=Age>=18 & Age<=28) %>% group_by(County,Year) %>% summarize(Female_Birth_Group=sum(Num_Female*Female_Window,na.rm=TRUE),Male_Birth_Group=sum(Num_Male*Male_Window,na.rm=TRUE),Min_Birth_Group=ifelse(Female_Birth_Group<Male_Birth_Group,Female_Birth_Group,Male_Birth_Group)) %>% ungroup
DEMOGRAPHIC_DATA
TEST <- readRDS(POPULATION_COUNTY_LOC)
if(!("Births" %in% colnames(TEST)))
"Deaths" %in% colnames(TEST)
"Migration" %in% colnames(TEST)
"Migration" %in% colnames(TEST)
OTHER_POP_DATA <- readRDS(POPULATION_CITY_LOC)%>% rename(Region=City) %>% filter(!(Region %in% c("Kemmerer","Diamondville")),County=='Lincoln') %>% group_by(Year) %>% mutate(Population=sum(Population,na.rm=TRUE)) %>% ungroup %>% mutate(Region='Lincoln_Other') %>% unique %>% ungroup
readRDS(DEMOGRAPHIC_COUNTY_LOC) OTHER_REG_DATA <- OTHER_POP_DATA %>% left_join(OTHER_DEMO_DATA) %>% select(colnames(REG_DATA))
readRDS(POPULATION_COUNTY_LOC) ST_OTHER_REG <- OTHER_REG_DATA[OTHER_REG_DATA$Year==2023,]
COUNTY_REG_DATA <- MAKE_REG_DATA(readRDS(DEMOGRAPHIC_COUNTY_LOC),readRDS(POPULATION_COUNTY_LOC) ) ST_OTHER_REG$Year <-2024
readRDS(DEMOGRAPHIC_KEM_LOC) ###The starting entry to predict births in the next period based on the current population
readRDS(POPULATION_CITY_LOC) %>% ST_OTHER_REG[,"Population"] <- OTHER_REG_DATA[OTHER_REG_DATA$Year==2024,] %>% pull("Population")
readRDS(DEMOGRAPHIC_KEM_LOC)
readRDS(DEMOGRAPHIC_KEM_LOC)
MAKE_REG_DATA(readRDS(DEMOGRAPHIC_KEM_LOC),readRDS(POPULATION_CITY_LOC) ) %>% filter(!is.na(Region)) %>% pull(Region) %>% unique
%>% pull(Region) %>% unique
%>% filter(Region=='Kemmerer')
readRDS(POPULATION_CITY_LOC)
MAKE_REG_DATA(readRDS("Data/Cleaned_Data/Demographic_Sex_Age_Data/RDS/Kemmerer_Diamondville_Demographics.Rds"),readRDS("Data/Cleaned_Data/Population_Data/RDS/All_Wyoming_City_Populations.Rds"))
#Extract the population trend data to connect with demographics (Population,births,deaths)
POP_DATA <- readRDS(POPULATION_LOC)
#Merger the two data sets and drop any records that cannot be used in the regression (this makes the "predict" function output the right number of records)
REG_DATA <- POP_DATA %>% full_join(DEMOGRAPHIC_DATA)
####################################################3
REG_DATA <- REG_DATA %>% rbind(OTHER_REG_DATA) %>% rbind(KEM_REG_DATA)
###Predict the number of Births ###Predict the number of Births
MOD_BIRTHS <- feols(log(Births)~log(PREV_BIRTH)+log(PREV_TWO_BIRTH)+log(Min_Birth_Group)+Year*County,cluster=~Year+County, data=REG_DATA ) #Higher AIC but worse acf MOD_BIRTHS <- feols(log(Births)~log(PREV_BIRTH)+log(PREV_TWO_BIRTH)+log(Min_Birth_Group)+Year*Region,cluster=~Year+Region, data=REG_DATA ) #Higher AIC but worse acf
ST_OTHER_REG
#Current year prediction
exp(predict(MOD_BIRTHS,newdata=ST_KEM_REG)) #Kemmerer births
exp(predict(MOD_BIRTHS,newdata=ST_OTHER_REG)) #Other Lincoln births
exp(predict(MOD_BIRTHS,newdata=ST_LIN_REG)) #All Lincoln births
#Note that due to useing diffrent data sets Lincoln is NOT colinear with Kemmere+Other Lincoln. Either result can be downshifted by the amount of diffrence
ADJUST_RESULTS_FACTOR <- (exp(predict(MOD_BIRTHS,newdata=ST_KEM_REG))+exp(predict(MOD_BIRTHS,newdata=ST_OTHER_REG)))/exp(predict(MOD_BIRTHS,newdata=ST_LIN_REG))
ADJUST_RESULTS_FACTOR
#MOD_BIRTHS_ALT <- feols(log(Births)~log(PREV_BIRTH)+log(Min_Birth_Group)+Year*County,cluster=~Year+County, data=REG_DATA ) #MOD_BIRTHS_ALT <- feols(log(Births)~log(PREV_BIRTH)+log(Min_Birth_Group)+Year*County,cluster=~Year+County, data=REG_DATA )
#AIC(MOD_BIRTHS)<AIC(MOD_BIRTHS_ALT) #AIC(MOD_BIRTHS)<AIC(MOD_BIRTHS_ALT)
#Optional: Review the ACF and PACF for validity. Model made on October 24nd appears to have uncorrelated lags of residuals accept year three. #Optional: Review the ACF and PACF for validity. Model made on October 24nd appears to have uncorrelated lags of residuals accept year three.