Working on birth reg, prelim data
This commit is contained in:
parent
9d0ba09898
commit
f20749c434
@ -86,10 +86,25 @@ COLUMNS <- c(1,which(TBL[ST-2,] %in% 1970:2025))
|
||||
NAMES <- TBL[4,COLUMNS][-1]
|
||||
TBL <- TBL[ST:END,COLUMNS ]
|
||||
colnames(TBL) <- c("County",NAMES)
|
||||
###Assign each city a county, this is used later to add in a county back to the city but is not used immediately
|
||||
COUNTY_LOCATION <- which(grepl(" County",TBL %>% pull(County)) & !grepl("Balance of",TBL %>% pull(County)))
|
||||
RES <- c()
|
||||
|
||||
for(i in 1:nrow(TBL)){
|
||||
RES[i] <- gsub(" County","",TBL[COUNTY_LOCATION[max(which(COUNTY_LOCATION<=i))],] %>% pull(County) )
|
||||
}
|
||||
CITY_TO_COUNTY <- cbind(TBL$County,RES) %>% as_tibble
|
||||
CITY_TO_COUNTY <- rbind(CITY_TO_COUNTY,c("Frannie","Big_Horn")) #Manually adding a defunct city in earlier data
|
||||
CITY_TO_COUNTY <- rbind(CITY_TO_COUNTY,c('Wind_River_Res.','Wind_River_Res.')) #Add a reservation which spans two counties and can be labled as a single region
|
||||
|
||||
|
||||
colnames(CITY_TO_COUNTY) <- c("City","County")
|
||||
CITY_TO_COUNTY$City <- gsub(" ","_",gsub("Balance of","Unincorporated",gsub(" County","",gsub(" city","",gsub(" town","",CITY_TO_COUNTY$City,ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE))
|
||||
###########Continue with data collection
|
||||
TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population))
|
||||
TBL$County <- gsub(" "," ",gsub("\n","",gsub("\r","",TBL %>% pull(County))))
|
||||
|
||||
|
||||
COUNTY_POP<- TBL[grep("COUNTY",TBL %>% pull(County),ignore.case=TRUE),]
|
||||
COUNTY_POP<- COUNTY_POP[grep("Balance",COUNTY_POP%>% pull(County),invert=TRUE,ignore.case=TRUE),]
|
||||
COUNTY_POP$County <- gsub(" ","_",gsub(" County","",COUNTY_POP$County))
|
||||
@ -307,22 +322,29 @@ sink()
|
||||
|
||||
KEM_OLD <- c(843,1517,1884,2026,1667,2028,2292) #1910 forward until 1970
|
||||
Year <- seq(1910,1970,by=10)
|
||||
TEMP <- cbind(Year,rep("kemmerer",7),KEM_OLD)
|
||||
TEMP <- cbind(Year,rep("Kemmerer",7),KEM_OLD)
|
||||
colnames(TEMP ) <- c("Year","City","Population")
|
||||
TEMP <- as_tibble(TEMP)
|
||||
CITY_POP <- rbind(TEMP,CITY_POP)
|
||||
|
||||
|
||||
DIAMOND_OLD <- c(696,726,812,586,415,398,485)
|
||||
TEMP <- cbind(Year,rep("Diamondvile",7),DIAMOND_OLD)
|
||||
TEMP <- cbind(Year,rep("Diamondville",7),DIAMOND_OLD)
|
||||
colnames(TEMP ) <- c("Year","City","Population")
|
||||
TEMP <- as_tibble(TEMP)
|
||||
CITY_POP <- rbind(TEMP,CITY_POP) %>% arrange(City,Year)
|
||||
#Remove empty values, ensure all numeric values are not saved as characters
|
||||
CITY_POP <- CITY_POP %>% filter(!is.na(Population) ) %>% mutate(Population=parse_number(Population),Year=parse_number(Year))
|
||||
#Fix names
|
||||
CITY_POP[grep("Elk_Mtn",CITY_POP$City),"City"] <- "Elk_Mountain"
|
||||
CITY_POP[grep("Frannie",CITY_POP$City),"City"] <- "Frannie"
|
||||
CITY_POP[grep("Ft\\._La",CITY_POP$City),"City"] <- "Fort_Laramie"
|
||||
|
||||
#Add Other Data
|
||||
COUNTY_POP <- COUNTY_POP %>% mutate(Year=as.numeric(Year)) %>% unique
|
||||
WY_COUNTY_DATA_SET <- COUNTY_POP %>% left_join(WY_COUNTY_DATA_SET ) %>% mutate(Population=as.numeric(Population)) %>% unique
|
||||
CITY_POP <- CITY_POP %>% left_join(CITY_TO_COUNTY) %>% select(Year,City,County,Population)
|
||||
CITY_POP[is.na(CITY_POP$County),"County"] <- str_remove(CITY_POP[is.na(CITY_POP$County),] %>% pull(City),"Unincorporated_") #Fix missing Unincorportaed records without a county.
|
||||
|
||||
###Save Population Results
|
||||
if(!exists("SAVE_LOC_POP")){SAVE_LOC_POP <-"./Data/Cleaned_Data/Population_Data"}
|
||||
|
||||
@ -28,7 +28,7 @@ OLD <- DEMO_DATA_ALL %>% filter(Age>=85) %>% group_by(County,IN_KEM,Year,Se
|
||||
DEMO_DATA_ALL <- rbind(YOUNG,OLD) %>% arrange(County,Year,IN_KEM,Sex)
|
||||
DEMO_DATA_ALL <- DEMO_DATA_ALL %>% mutate(Population=round(Population)) %>% pivot_wider(values_from=Population,names_from=Sex,names_prefix="Num_")
|
||||
OTHER_LIN_DEMO_DATA <- DEMO_DATA_ALL %>% filter(IN_KEM==0) %>% rename(Region=IN_KEM) %>% mutate(Region='Lincoln_Other')
|
||||
KEM_DEMO_DATA <- DEMO_DATA_ALL %>% filter(IN_KEM==1) %>% rename(Region=IN_KEM) %>% mutate(Region='Kemmerer')
|
||||
KEM_DEMO_DATA <- DEMO_DATA_ALL %>% filter(IN_KEM==1) %>% rename(Region=IN_KEM) %>% mutate(Region='Kemmerer & Diamondville')
|
||||
|
||||
|
||||
|
||||
|
||||
@ -22,52 +22,54 @@ DEMOGRAPHIC_COUNTY_DATA <- readRDS(DEMOGRAPHIC_COUNTY_LOC)
|
||||
COUNTY_POP <- readRDS(POPULATION_COUNTY_LOC)
|
||||
REG_DATA <- readRDS(POPULATION_COUNTY_LOC) %>% full_join(MAKE_REG_DATA(DEMOGRAPHIC_COUNTY_DATA))
|
||||
REG_DATA <- REG_DATA %>% group_by(County,Region) %>% mutate(PREV_BIRTH=lag(Births),PREV_TWO_BIRTH=lag(Births,2)) %>% ungroup
|
||||
REG_DATA <- REG_DATA %>% select(-Female_Birth_Group,-Male_Birth_Group) #Store the data set of only the first year needing a birth forecast, to start the birth Monte Carlo simulations.
|
||||
REG_DATA <- REG_DATA %>% select(-Female_Birth_Group,-Male_Birth_Group)%>% mutate(Region=County) %>% select(Year,County,Region,everything())
|
||||
#Store the data set of only the first year needing a birth forecast, to start the birth Monte Carlo simulations.
|
||||
###Some of the years are missing births, previous births etc. Where missing fill this in by assuming all age zero children in the demographic data (DEMOGRAPHIC_LOC) were born in the last year. This makes a more complete data set. Some test find a near perfect 1 to 1 with this method
|
||||
#Data to fill in the missing records
|
||||
FILL_IN_DATA <- DEMOGRAPHIC_COUNTY_DATA %>% mutate(POP=Num_Male+Num_Female,BIRTHS=ifelse(Age==0,POP,0)) %>% group_by(County,Region,Year) %>% summarize(BIRTHS=sum(BIRTHS)) %>% arrange(County,Year) %>% mutate(ALT=lag(BIRTHS),ALT2=lag(BIRTHS,2)) %>% ungroup
|
||||
#Join and replace missing records
|
||||
REG_DATA <- REG_DATA %>% left_join(FILL_IN_DATA ) %>% mutate(Births=ifelse(is.na(Births),BIRTHS,Births),PREV_BIRTH=ifelse(is.na(PREV_BIRTH),ALT,PREV_BIRTH),PREV_TWO_BIRTH=ifelse(is.na(PREV_TWO_BIRTH),ALT2,PREV_TWO_BIRTH)) %>% select(-BIRTHS,-ALT,-ALT2) %>% select(Year,County,Region,everything()) %>% mutate(Region=County)
|
||||
ST_LIN_REG <- REG_DATA %>% filter(County=="Lincoln",Year==2024)
|
||||
|
||||
###Working on Kemmerer data
|
||||
DEMOGRAPHIC_KEM_DATA <- readRDS(DEMOGRAPHIC_KEM_LOC)
|
||||
readRDS(POPULATION_CITY_LOC) %>% filter(City %in% c("Kemmerer","Diamondville")) %>% group_by(Year) %>% mutate(Population=sum(Population,na.rm=TRUE)) %>% mutate(City='Kemmerer') %>% rename(Region=City)
|
||||
MAKE_REG_DATA(readRDS(DEMOGRAPHIC_KEM_LOC))
|
||||
REG_DATA
|
||||
readRDS(DEMOGRAPHIC_KEM_LOC)%>% mutate(POP=Num_Male+Num_Female,Births=ifelse(Age==0,POP,0)) %>% group_by(County,Region,Year) %>% summarize(Births=sum(Births)) %>% arrange(County,Year) %>% mutate(PREV_BIRTH=lag(Births),PREV_TWO_BIRTH=lag(Births,2)) %>% ungroup
|
||||
####################Create same data set but for only the Kemmerer Diamondville area
|
||||
KEM_DEMO_DATA <- readRDS(DEMOGRAPHIC_KEM_LOC)%>% mutate(POP=Num_Male+Num_Female,Births=ifelse(Age==0,POP,0)) %>% group_by(Year,County,Region) %>% summarize(Births=sum(Births)) %>% ungroup %>% arrange(Region,Year) %>% mutate(PREV_BIRTH=lag(Births),PREV_TWO_BIRTH=lag(Births,2)) %>% ungroup %>% mutate(Deaths=NA,Migration=NA) %>% left_join(MAKE_REG_DATA(readRDS(DEMOGRAPHIC_KEM_LOC)))
|
||||
|
||||
KEM_POP_DATA <- readRDS(POPULATION_CITY_LOC)%>% rename(Region=City) %>% filter(Region %in% c("Kemmerer","Diamondville")) %>% group_by(Year) %>% mutate(Population=sum(Population,na.rm=TRUE)) %>% ungroup %>% mutate(Region='Kemmerer & Diamondville') %>% unique %>% ungroup
|
||||
|
||||
KEM_REG_DATA <- KEM_POP_DATA %>% left_join(KEM_DEMO_DATA) %>% select(colnames(REG_DATA))
|
||||
KEM_REG_DATA
|
||||
ST_KEM_REG <- KEM_REG_DATA[KEM_REG_DATA$Year==2023,]
|
||||
ST_KEM_REG$Year <-2024
|
||||
KEM_REG_DATA %>% tail
|
||||
###The starting entry to predict births in the next period based on the current population
|
||||
ST_KEM_REG[,"Population"] <- KEM_REG_DATA[KEM_REG_DATA$Year==2024,] %>% pull("Population")
|
||||
####################Create same data set but for only parts of Lincoln not in the Kemmerer Diamondville area
|
||||
OTHER_DEMO_DATA <- readRDS(DEMOGRAPHIC_OTHER_LIN_LOC)%>% mutate(POP=Num_Male+Num_Female,Births=ifelse(Age==0,POP,0)) %>% group_by(Year,County,Region) %>% summarize(Births=sum(Births)) %>% ungroup %>% arrange(Region,Year) %>% mutate(PREV_BIRTH=lag(Births),PREV_TWO_BIRTH=lag(Births,2)) %>% ungroup %>% mutate(Deaths=NA,Migration=NA) %>% left_join(MAKE_REG_DATA(readRDS(DEMOGRAPHIC_OTHER_LIN_LOC)))
|
||||
|
||||
|
||||
readRDS(DEMOGRAPHIC_KEM_LOC)%>% mutate(Male_Window=Age>=18 & Age<=30,Female_Window=Age>=18 & Age<=28) %>% group_by(County,Year) %>% summarize(Female_Birth_Group=sum(Num_Female*Female_Window,na.rm=TRUE),Male_Birth_Group=sum(Num_Male*Male_Window,na.rm=TRUE),Min_Birth_Group=ifelse(Female_Birth_Group<Male_Birth_Group,Female_Birth_Group,Male_Birth_Group)) %>% ungroup
|
||||
DEMOGRAPHIC_DATA
|
||||
TEST <- readRDS(POPULATION_COUNTY_LOC)
|
||||
if(!("Births" %in% colnames(TEST)))
|
||||
"Deaths" %in% colnames(TEST)
|
||||
"Migration" %in% colnames(TEST)
|
||||
"Migration" %in% colnames(TEST)
|
||||
|
||||
OTHER_POP_DATA <- readRDS(POPULATION_CITY_LOC)%>% rename(Region=City) %>% filter(!(Region %in% c("Kemmerer","Diamondville")),County=='Lincoln') %>% group_by(Year) %>% mutate(Population=sum(Population,na.rm=TRUE)) %>% ungroup %>% mutate(Region='Lincoln_Other') %>% unique %>% ungroup
|
||||
|
||||
readRDS(DEMOGRAPHIC_COUNTY_LOC)
|
||||
readRDS(POPULATION_COUNTY_LOC)
|
||||
COUNTY_REG_DATA <- MAKE_REG_DATA(readRDS(DEMOGRAPHIC_COUNTY_LOC),readRDS(POPULATION_COUNTY_LOC) )
|
||||
readRDS(DEMOGRAPHIC_KEM_LOC)
|
||||
readRDS(POPULATION_CITY_LOC) %>%
|
||||
readRDS(DEMOGRAPHIC_KEM_LOC)
|
||||
readRDS(DEMOGRAPHIC_KEM_LOC)
|
||||
MAKE_REG_DATA(readRDS(DEMOGRAPHIC_KEM_LOC),readRDS(POPULATION_CITY_LOC) ) %>% filter(!is.na(Region)) %>% pull(Region) %>% unique
|
||||
%>% pull(Region) %>% unique
|
||||
%>% filter(Region=='Kemmerer')
|
||||
|
||||
readRDS(POPULATION_CITY_LOC)
|
||||
MAKE_REG_DATA(readRDS("Data/Cleaned_Data/Demographic_Sex_Age_Data/RDS/Kemmerer_Diamondville_Demographics.Rds"),readRDS("Data/Cleaned_Data/Population_Data/RDS/All_Wyoming_City_Populations.Rds"))
|
||||
|
||||
#Extract the population trend data to connect with demographics (Population,births,deaths)
|
||||
POP_DATA <- readRDS(POPULATION_LOC)
|
||||
#Merger the two data sets and drop any records that cannot be used in the regression (this makes the "predict" function output the right number of records)
|
||||
REG_DATA <- POP_DATA %>% full_join(DEMOGRAPHIC_DATA)
|
||||
OTHER_REG_DATA <- OTHER_POP_DATA %>% left_join(OTHER_DEMO_DATA) %>% select(colnames(REG_DATA))
|
||||
ST_OTHER_REG <- OTHER_REG_DATA[OTHER_REG_DATA$Year==2023,]
|
||||
ST_OTHER_REG$Year <-2024
|
||||
###The starting entry to predict births in the next period based on the current population
|
||||
ST_OTHER_REG[,"Population"] <- OTHER_REG_DATA[OTHER_REG_DATA$Year==2024,] %>% pull("Population")
|
||||
|
||||
####################################################3
|
||||
REG_DATA <- REG_DATA %>% rbind(OTHER_REG_DATA) %>% rbind(KEM_REG_DATA)
|
||||
|
||||
###Predict the number of Births
|
||||
MOD_BIRTHS <- feols(log(Births)~log(PREV_BIRTH)+log(PREV_TWO_BIRTH)+log(Min_Birth_Group)+Year*County,cluster=~Year+County, data=REG_DATA ) #Higher AIC but worse acf
|
||||
MOD_BIRTHS <- feols(log(Births)~log(PREV_BIRTH)+log(PREV_TWO_BIRTH)+log(Min_Birth_Group)+Year*Region,cluster=~Year+Region, data=REG_DATA ) #Higher AIC but worse acf
|
||||
ST_OTHER_REG
|
||||
#Current year prediction
|
||||
exp(predict(MOD_BIRTHS,newdata=ST_KEM_REG)) #Kemmerer births
|
||||
exp(predict(MOD_BIRTHS,newdata=ST_OTHER_REG)) #Other Lincoln births
|
||||
exp(predict(MOD_BIRTHS,newdata=ST_LIN_REG)) #All Lincoln births
|
||||
#Note that due to useing diffrent data sets Lincoln is NOT colinear with Kemmere+Other Lincoln. Either result can be downshifted by the amount of diffrence
|
||||
ADJUST_RESULTS_FACTOR <- (exp(predict(MOD_BIRTHS,newdata=ST_KEM_REG))+exp(predict(MOD_BIRTHS,newdata=ST_OTHER_REG)))/exp(predict(MOD_BIRTHS,newdata=ST_LIN_REG))
|
||||
|
||||
ADJUST_RESULTS_FACTOR
|
||||
#MOD_BIRTHS_ALT <- feols(log(Births)~log(PREV_BIRTH)+log(Min_Birth_Group)+Year*County,cluster=~Year+County, data=REG_DATA )
|
||||
#AIC(MOD_BIRTHS)<AIC(MOD_BIRTHS_ALT)
|
||||
#Optional: Review the ACF and PACF for validity. Model made on October 24nd appears to have uncorrelated lags of residuals accept year three.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user