diff --git a/Scripts/1A_Download_and_Process_Population_Data.r b/Scripts/1A_Download_and_Process_Population_Data.r index 23771e9..3cb4188 100644 --- a/Scripts/1A_Download_and_Process_Population_Data.r +++ b/Scripts/1A_Download_and_Process_Population_Data.r @@ -86,10 +86,25 @@ COLUMNS <- c(1,which(TBL[ST-2,] %in% 1970:2025)) NAMES <- TBL[4,COLUMNS][-1] TBL <- TBL[ST:END,COLUMNS ] colnames(TBL) <- c("County",NAMES) +###Assign each city a county, this is used later to add in a county back to the city but is not used immediately + COUNTY_LOCATION <- which(grepl(" County",TBL %>% pull(County)) & !grepl("Balance of",TBL %>% pull(County))) + RES <- c() + for(i in 1:nrow(TBL)){ + RES[i] <- gsub(" County","",TBL[COUNTY_LOCATION[max(which(COUNTY_LOCATION<=i))],] %>% pull(County) ) + } + CITY_TO_COUNTY <- cbind(TBL$County,RES) %>% as_tibble + CITY_TO_COUNTY <- rbind(CITY_TO_COUNTY,c("Frannie","Big_Horn")) #Manually adding a defunct city in earlier data + CITY_TO_COUNTY <- rbind(CITY_TO_COUNTY,c('Wind_River_Res.','Wind_River_Res.')) #Add a reservation which spans two counties and can be labled as a single region + + + colnames(CITY_TO_COUNTY) <- c("City","County") + CITY_TO_COUNTY$City <- gsub(" ","_",gsub("Balance of","Unincorporated",gsub(" County","",gsub(" city","",gsub(" town","",CITY_TO_COUNTY$City,ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE)) +###########Continue with data collection TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population)) TBL$County <- gsub(" "," ",gsub("\n","",gsub("\r","",TBL %>% pull(County)))) + COUNTY_POP<- TBL[grep("COUNTY",TBL %>% pull(County),ignore.case=TRUE),] COUNTY_POP<- COUNTY_POP[grep("Balance",COUNTY_POP%>% pull(County),invert=TRUE,ignore.case=TRUE),] COUNTY_POP$County <- gsub(" ","_",gsub(" County","",COUNTY_POP$County)) @@ -307,22 +322,29 @@ sink() KEM_OLD <- c(843,1517,1884,2026,1667,2028,2292) #1910 forward until 1970 Year <- seq(1910,1970,by=10) - TEMP <- cbind(Year,rep("kemmerer",7),KEM_OLD) + TEMP <- cbind(Year,rep("Kemmerer",7),KEM_OLD) colnames(TEMP ) <- c("Year","City","Population") TEMP <- as_tibble(TEMP) CITY_POP <- rbind(TEMP,CITY_POP) DIAMOND_OLD <- c(696,726,812,586,415,398,485) - TEMP <- cbind(Year,rep("Diamondvile",7),DIAMOND_OLD) + TEMP <- cbind(Year,rep("Diamondville",7),DIAMOND_OLD) colnames(TEMP ) <- c("Year","City","Population") TEMP <- as_tibble(TEMP) CITY_POP <- rbind(TEMP,CITY_POP) %>% arrange(City,Year) #Remove empty values, ensure all numeric values are not saved as characters CITY_POP <- CITY_POP %>% filter(!is.na(Population) ) %>% mutate(Population=parse_number(Population),Year=parse_number(Year)) +#Fix names +CITY_POP[grep("Elk_Mtn",CITY_POP$City),"City"] <- "Elk_Mountain" +CITY_POP[grep("Frannie",CITY_POP$City),"City"] <- "Frannie" +CITY_POP[grep("Ft\\._La",CITY_POP$City),"City"] <- "Fort_Laramie" + #Add Other Data COUNTY_POP <- COUNTY_POP %>% mutate(Year=as.numeric(Year)) %>% unique WY_COUNTY_DATA_SET <- COUNTY_POP %>% left_join(WY_COUNTY_DATA_SET ) %>% mutate(Population=as.numeric(Population)) %>% unique +CITY_POP <- CITY_POP %>% left_join(CITY_TO_COUNTY) %>% select(Year,City,County,Population) +CITY_POP[is.na(CITY_POP$County),"County"] <- str_remove(CITY_POP[is.na(CITY_POP$County),] %>% pull(City),"Unincorporated_") #Fix missing Unincorportaed records without a county. ###Save Population Results if(!exists("SAVE_LOC_POP")){SAVE_LOC_POP <-"./Data/Cleaned_Data/Population_Data"} diff --git a/Scripts/1D_Use_ACS_Census_Data_to_Estimate_Kemmerer_Demographics.r b/Scripts/1D_Use_ACS_Census_Data_to_Estimate_Kemmerer_Demographics.r index 259457a..2bdeced 100644 --- a/Scripts/1D_Use_ACS_Census_Data_to_Estimate_Kemmerer_Demographics.r +++ b/Scripts/1D_Use_ACS_Census_Data_to_Estimate_Kemmerer_Demographics.r @@ -28,7 +28,7 @@ OLD <- DEMO_DATA_ALL %>% filter(Age>=85) %>% group_by(County,IN_KEM,Year,Se DEMO_DATA_ALL <- rbind(YOUNG,OLD) %>% arrange(County,Year,IN_KEM,Sex) DEMO_DATA_ALL <- DEMO_DATA_ALL %>% mutate(Population=round(Population)) %>% pivot_wider(values_from=Population,names_from=Sex,names_prefix="Num_") OTHER_LIN_DEMO_DATA <- DEMO_DATA_ALL %>% filter(IN_KEM==0) %>% rename(Region=IN_KEM) %>% mutate(Region='Lincoln_Other') -KEM_DEMO_DATA <- DEMO_DATA_ALL %>% filter(IN_KEM==1) %>% rename(Region=IN_KEM) %>% mutate(Region='Kemmerer') +KEM_DEMO_DATA <- DEMO_DATA_ALL %>% filter(IN_KEM==1) %>% rename(Region=IN_KEM) %>% mutate(Region='Kemmerer & Diamondville') diff --git a/Scripts/2A_Birth_Rate_Regression.r b/Scripts/2A_Birth_Rate_Regression.r index c60eeca..d7f89f8 100644 --- a/Scripts/2A_Birth_Rate_Regression.r +++ b/Scripts/2A_Birth_Rate_Regression.r @@ -19,55 +19,57 @@ MAKE_REG_DATA <- function(DEMO_DATA){ return(DEMO_DATA %>% mutate(Male_Window=Age>=18 & Age<=30,Female_Window=Age>=18 & Age<=28) %>% group_by(County,Region,Year) %>% summarize(Female_Birth_Group=sum(Num_Female*Female_Window,na.rm=TRUE),Male_Birth_Group=sum(Num_Male*Male_Window,na.rm=TRUE),Min_Birth_Group=ifelse(Female_Birth_Group% ungroup) } DEMOGRAPHIC_COUNTY_DATA <- readRDS(DEMOGRAPHIC_COUNTY_LOC) -COUNTY_POP <- readRDS(POPULATION_COUNTY_LOC) +COUNTY_POP <- readRDS(POPULATION_COUNTY_LOC) REG_DATA <- readRDS(POPULATION_COUNTY_LOC) %>% full_join(MAKE_REG_DATA(DEMOGRAPHIC_COUNTY_DATA)) REG_DATA <- REG_DATA %>% group_by(County,Region) %>% mutate(PREV_BIRTH=lag(Births),PREV_TWO_BIRTH=lag(Births,2)) %>% ungroup - REG_DATA <- REG_DATA %>% select(-Female_Birth_Group,-Male_Birth_Group) #Store the data set of only the first year needing a birth forecast, to start the birth Monte Carlo simulations. + REG_DATA <- REG_DATA %>% select(-Female_Birth_Group,-Male_Birth_Group)%>% mutate(Region=County) %>% select(Year,County,Region,everything()) + #Store the data set of only the first year needing a birth forecast, to start the birth Monte Carlo simulations. ###Some of the years are missing births, previous births etc. Where missing fill this in by assuming all age zero children in the demographic data (DEMOGRAPHIC_LOC) were born in the last year. This makes a more complete data set. Some test find a near perfect 1 to 1 with this method #Data to fill in the missing records FILL_IN_DATA <- DEMOGRAPHIC_COUNTY_DATA %>% mutate(POP=Num_Male+Num_Female,BIRTHS=ifelse(Age==0,POP,0)) %>% group_by(County,Region,Year) %>% summarize(BIRTHS=sum(BIRTHS)) %>% arrange(County,Year) %>% mutate(ALT=lag(BIRTHS),ALT2=lag(BIRTHS,2)) %>% ungroup #Join and replace missing records REG_DATA <- REG_DATA %>% left_join(FILL_IN_DATA ) %>% mutate(Births=ifelse(is.na(Births),BIRTHS,Births),PREV_BIRTH=ifelse(is.na(PREV_BIRTH),ALT,PREV_BIRTH),PREV_TWO_BIRTH=ifelse(is.na(PREV_TWO_BIRTH),ALT2,PREV_TWO_BIRTH)) %>% select(-BIRTHS,-ALT,-ALT2) %>% select(Year,County,Region,everything()) %>% mutate(Region=County) +ST_LIN_REG <- REG_DATA %>% filter(County=="Lincoln",Year==2024) -###Working on Kemmerer data -DEMOGRAPHIC_KEM_DATA <- readRDS(DEMOGRAPHIC_KEM_LOC) -readRDS(POPULATION_CITY_LOC) %>% filter(City %in% c("Kemmerer","Diamondville")) %>% group_by(Year) %>% mutate(Population=sum(Population,na.rm=TRUE)) %>% mutate(City='Kemmerer') %>% rename(Region=City) -MAKE_REG_DATA(readRDS(DEMOGRAPHIC_KEM_LOC)) -REG_DATA -readRDS(DEMOGRAPHIC_KEM_LOC)%>% mutate(POP=Num_Male+Num_Female,Births=ifelse(Age==0,POP,0)) %>% group_by(County,Region,Year) %>% summarize(Births=sum(Births)) %>% arrange(County,Year) %>% mutate(PREV_BIRTH=lag(Births),PREV_TWO_BIRTH=lag(Births,2)) %>% ungroup +####################Create same data set but for only the Kemmerer Diamondville area +KEM_DEMO_DATA <- readRDS(DEMOGRAPHIC_KEM_LOC)%>% mutate(POP=Num_Male+Num_Female,Births=ifelse(Age==0,POP,0)) %>% group_by(Year,County,Region) %>% summarize(Births=sum(Births)) %>% ungroup %>% arrange(Region,Year) %>% mutate(PREV_BIRTH=lag(Births),PREV_TWO_BIRTH=lag(Births,2)) %>% ungroup %>% mutate(Deaths=NA,Migration=NA) %>% left_join(MAKE_REG_DATA(readRDS(DEMOGRAPHIC_KEM_LOC))) + +KEM_POP_DATA <- readRDS(POPULATION_CITY_LOC)%>% rename(Region=City) %>% filter(Region %in% c("Kemmerer","Diamondville")) %>% group_by(Year) %>% mutate(Population=sum(Population,na.rm=TRUE)) %>% ungroup %>% mutate(Region='Kemmerer & Diamondville') %>% unique %>% ungroup + +KEM_REG_DATA <- KEM_POP_DATA %>% left_join(KEM_DEMO_DATA) %>% select(colnames(REG_DATA)) +KEM_REG_DATA +ST_KEM_REG <- KEM_REG_DATA[KEM_REG_DATA$Year==2023,] +ST_KEM_REG$Year <-2024 +KEM_REG_DATA %>% tail +###The starting entry to predict births in the next period based on the current population +ST_KEM_REG[,"Population"] <- KEM_REG_DATA[KEM_REG_DATA$Year==2024,] %>% pull("Population") +####################Create same data set but for only parts of Lincoln not in the Kemmerer Diamondville area +OTHER_DEMO_DATA <- readRDS(DEMOGRAPHIC_OTHER_LIN_LOC)%>% mutate(POP=Num_Male+Num_Female,Births=ifelse(Age==0,POP,0)) %>% group_by(Year,County,Region) %>% summarize(Births=sum(Births)) %>% ungroup %>% arrange(Region,Year) %>% mutate(PREV_BIRTH=lag(Births),PREV_TWO_BIRTH=lag(Births,2)) %>% ungroup %>% mutate(Deaths=NA,Migration=NA) %>% left_join(MAKE_REG_DATA(readRDS(DEMOGRAPHIC_OTHER_LIN_LOC))) - readRDS(DEMOGRAPHIC_KEM_LOC)%>% mutate(Male_Window=Age>=18 & Age<=30,Female_Window=Age>=18 & Age<=28) %>% group_by(County,Year) %>% summarize(Female_Birth_Group=sum(Num_Female*Female_Window,na.rm=TRUE),Male_Birth_Group=sum(Num_Male*Male_Window,na.rm=TRUE),Min_Birth_Group=ifelse(Female_Birth_Group% ungroup -DEMOGRAPHIC_DATA -TEST <- readRDS(POPULATION_COUNTY_LOC) -if(!("Births" %in% colnames(TEST))) -"Deaths" %in% colnames(TEST) -"Migration" %in% colnames(TEST) -"Migration" %in% colnames(TEST) +OTHER_POP_DATA <- readRDS(POPULATION_CITY_LOC)%>% rename(Region=City) %>% filter(!(Region %in% c("Kemmerer","Diamondville")),County=='Lincoln') %>% group_by(Year) %>% mutate(Population=sum(Population,na.rm=TRUE)) %>% ungroup %>% mutate(Region='Lincoln_Other') %>% unique %>% ungroup -readRDS(DEMOGRAPHIC_COUNTY_LOC) -readRDS(POPULATION_COUNTY_LOC) -COUNTY_REG_DATA <- MAKE_REG_DATA(readRDS(DEMOGRAPHIC_COUNTY_LOC),readRDS(POPULATION_COUNTY_LOC) ) -readRDS(DEMOGRAPHIC_KEM_LOC) -readRDS(POPULATION_CITY_LOC) %>% -readRDS(DEMOGRAPHIC_KEM_LOC) -readRDS(DEMOGRAPHIC_KEM_LOC) - MAKE_REG_DATA(readRDS(DEMOGRAPHIC_KEM_LOC),readRDS(POPULATION_CITY_LOC) ) %>% filter(!is.na(Region)) %>% pull(Region) %>% unique -%>% pull(Region) %>% unique -%>% filter(Region=='Kemmerer') - -readRDS(POPULATION_CITY_LOC) -MAKE_REG_DATA(readRDS("Data/Cleaned_Data/Demographic_Sex_Age_Data/RDS/Kemmerer_Diamondville_Demographics.Rds"),readRDS("Data/Cleaned_Data/Population_Data/RDS/All_Wyoming_City_Populations.Rds")) - - #Extract the population trend data to connect with demographics (Population,births,deaths) - POP_DATA <- readRDS(POPULATION_LOC) - #Merger the two data sets and drop any records that cannot be used in the regression (this makes the "predict" function output the right number of records) - REG_DATA <- POP_DATA %>% full_join(DEMOGRAPHIC_DATA) +OTHER_REG_DATA <- OTHER_POP_DATA %>% left_join(OTHER_DEMO_DATA) %>% select(colnames(REG_DATA)) +ST_OTHER_REG <- OTHER_REG_DATA[OTHER_REG_DATA$Year==2023,] +ST_OTHER_REG$Year <-2024 +###The starting entry to predict births in the next period based on the current population +ST_OTHER_REG[,"Population"] <- OTHER_REG_DATA[OTHER_REG_DATA$Year==2024,] %>% pull("Population") +####################################################3 +REG_DATA <- REG_DATA %>% rbind(OTHER_REG_DATA) %>% rbind(KEM_REG_DATA) ###Predict the number of Births -MOD_BIRTHS <- feols(log(Births)~log(PREV_BIRTH)+log(PREV_TWO_BIRTH)+log(Min_Birth_Group)+Year*County,cluster=~Year+County, data=REG_DATA ) #Higher AIC but worse acf +MOD_BIRTHS <- feols(log(Births)~log(PREV_BIRTH)+log(PREV_TWO_BIRTH)+log(Min_Birth_Group)+Year*Region,cluster=~Year+Region, data=REG_DATA ) #Higher AIC but worse acf +ST_OTHER_REG +#Current year prediction +exp(predict(MOD_BIRTHS,newdata=ST_KEM_REG)) #Kemmerer births +exp(predict(MOD_BIRTHS,newdata=ST_OTHER_REG)) #Other Lincoln births +exp(predict(MOD_BIRTHS,newdata=ST_LIN_REG)) #All Lincoln births +#Note that due to useing diffrent data sets Lincoln is NOT colinear with Kemmere+Other Lincoln. Either result can be downshifted by the amount of diffrence +ADJUST_RESULTS_FACTOR <- (exp(predict(MOD_BIRTHS,newdata=ST_KEM_REG))+exp(predict(MOD_BIRTHS,newdata=ST_OTHER_REG)))/exp(predict(MOD_BIRTHS,newdata=ST_LIN_REG)) + +ADJUST_RESULTS_FACTOR #MOD_BIRTHS_ALT <- feols(log(Births)~log(PREV_BIRTH)+log(Min_Birth_Group)+Year*County,cluster=~Year+County, data=REG_DATA ) #AIC(MOD_BIRTHS)