#################IMPORTANT!!!! CLEAN UP DATA SCRIPT FOR USE #############################Clean up scirpt folders, simulations of deaths should be seperated from this file. Death rate simulation is complicated and should be commented, and turned into a seperate script. ##############WORKING ON REGRESSION OF BIRTHS FOR PREDICTIONS OF GROWTH, THINK ABOUT GOING FROM COUNTY TO CITY, SEPERATE REGRESION FROM DATA COLLOECTION library(rvest) library(tidyverse) library(readxl) #setwd("../") ########County, Death, Birth and Migration Data #Data found on the page http://eadiv.state.wy.us/pop/ PAGE <- read_html("http://eadiv.state.wy.us/pop/BirthDeathMig.htm") NODE <- html_element(PAGE ,"table") TBL <- html_table(NODE) ST <- which(toupper(TBL$X1)=="ALBANY") END <- which(toupper(TBL$X1)=="TOTAL") TYPES <- TBL[ST-2,1] ST_YEAR <- 1971 ALL_DATA <- list() TBL <- TBL[,c(1,which(!is.na(as.numeric(TBL[ST[1],]))))] TBL <- TBL[,-ncol(TBL)] colnames(TBL) <- c("County",(ST_YEAR:(ST_YEAR+ncol(TBL)-1))) TBL$Type <- NA for(i in 1:length(ST)){ TBL[ST[i]:END[i],"Type"]<- as.character(TYPES[i,1]) } TBL[ST[2]:END[2],"Type"] <- as.character(TYPES[2,1]) TBL$Type TBL <- TBL %>% filter(!is.na(Type)) %>% select(County,Type,everything()) GROUP <- colnames(TBL)[-1:-2] Data <- pivot_longer(TBL,all_of(GROUP),names_to="Year",values_to="Pop_Change") Data$County <- ifelse(toupper(Data$County)=="TOTAL","Wyoming",Data$County) WY_COUNTY_DATA_SET <- pivot_wider(Data,names_from=Type,values_from=Pop_Change) %>% rename("Migration"=`Net Migration`) %>% mutate(Year=as.integer(Year),Births=parse_number(Births),Deaths=parse_number(Deaths),Migration=parse_number(Migration)) %>% mutate(Year=Year-1) #Data apears to be one off from populaiton WY_COUNTY_DATA_SET[,"County"] <- gsub(" ","_",WY_COUNTY_DATA_SET %>% pull(County)) ########################City and County Population Data 2020 to 2024 PAGE <- read_html('http://eadiv.state.wy.us/pop/Place-24EST.htm') NODE <- html_element(PAGE ,"table") TBL <- html_table(NODE) ST <- which(toupper(TBL$X1)==toupper("Albany County")) END <- which(toupper(TBL$X1)==toupper("Balance of Weston County")) #More years than are pulled are listed to make more generic COLUMNS <- c(1,which(TBL[ST-2,] %in% 1970:2025)) NAMES <- TBL[4,COLUMNS][-1] TBL <- TBL[ST:END,COLUMNS ] colnames(TBL) <- c("County",NAMES) TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population)) TBL$County <- gsub(" "," ",gsub("\n","",gsub("\r","",TBL %>% pull(County)))) COUNTY_POP<- TBL[grep("COUNTY",TBL %>% pull(County),ignore.case=TRUE),] COUNTY_POP<- COUNTY_POP[grep("Balance",COUNTY_POP%>% pull(County),invert=TRUE,ignore.case=TRUE),] COUNTY_POP$County <- gsub(" ","_",gsub(" County","",COUNTY_POP$County)) CITY_POP <- TBL[sort(c(grep("County",TBL %>% pull(County),invert=TRUE,ignore.case=TRUE),grep("Balance",TBL %>% pull(County),ignore.case=TRUE))),] CITY_POP$County <- gsub(" ","_",gsub("Balance of","Unincorporated",gsub(" County","",gsub(" city","",gsub(" town","",CITY_POP$County,ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE)) CITY_POP <- CITY_POP %>% rename("City"=County) ########################City Population Data 2010 to 2020 PAGE <- read_html('http://eadiv.state.wy.us/pop/sub-est11-19.htm') NODE <- html_element(PAGE ,"table") TBL <- html_table(NODE) ST <- which(toupper(TBL$X1)==toupper("Afton town, Wyoming")) END <- which(toupper(TBL$X1)==toupper("Yoder town, Wyoming")) #More years than are pulled are listed to make more generic COLUMNS <- c(1,which(TBL[ST-1,] %in% 1970:2025)) NAMES <- TBL[3,COLUMNS][-1] TBL <- TBL[ST:END,COLUMNS ] colnames(TBL) <- c("City",NAMES) TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population)) TBL$City <- gsub(" ","_",gsub(" $","",gsub("\r|\n| Wyoming|,| town| city","",TBL$City,ignore.case=TRUE))) TBL <- TBL %>% filter(Year!=2020) CITY_POP <- rbind(TBL,CITY_POP) ########################County Population Data 2010 to 2020 PAGE <- read_html('http://eadiv.state.wy.us/pop/ctyest11-19.htm') NODE <- html_element(PAGE ,"table") TBL <- html_table(NODE) ST <- grep("Albany",TBL$X1) END <- grep("Weston",TBL$X1) #More years than are pulled are listed to make more generic COLUMNS <- c(1,which(TBL[ST-2,] %in% 1970:2025)) NAMES <- TBL[3,COLUMNS][-1] TBL <- TBL[ST:END,COLUMNS ] colnames(TBL) <- c("County",NAMES) TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population)) TBL$County <- gsub(" ","_",gsub(" "," ",gsub(" $","",gsub("\r|\n| Wyoming|,| town| city| County|\\.","",TBL$County,ignore.case=TRUE)))) TBL <- TBL %>% filter(Year!=2020) COUNTY_POP <- rbind(TBL,COUNTY_POP) ########################County and City Population Data 2000 to 2010 PAGE <- read_html('http://eadiv.state.wy.us/pop/sub-est01-09.htm') NODE <- html_element(PAGE ,"table") TBL <- html_table(NODE) ST <- which(toupper(TBL$X1)==toupper("Albany County")) END <- which(toupper(TBL$X1)==toupper("Balance of Weston County")) #More years than are pulled are listed to make more generic COLUMNS <- c(1,which(TBL[ST-4,] %in% 1970:2025)) NAMES <- TBL[4,COLUMNS][-1] TBL <- TBL[ST:END,COLUMNS ] colnames(TBL) <- c("County",NAMES) TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population)) TBL <- TBL %>% filter(Year!=2010) TBL$County <- gsub(" "," ",gsub("\n","",gsub("\r","",TBL %>% pull(County)))) COUNTY_TBL <- TBL[grep("COUNTY",TBL %>% pull(County),ignore.case=TRUE),] COUNTY_TBL <-COUNTY_TBL[grep("Balance",COUNTY_TBL%>% pull(County),invert=TRUE,ignore.case=TRUE),] COUNTY_TBL$County <-gsub("_(pt.)","", gsub(" ","_",gsub(" County","",COUNTY_TBL$County))) CITY_TBL <- TBL[sort(c(grep("County",TBL %>% pull(County),invert=TRUE,ignore.case=TRUE),grep("Balance",TBL %>% pull(County),ignore.case=TRUE))),] CITY_TBL$County <- gsub(" ","_",gsub("Balance of","Unincorporated",gsub(" County","",gsub(" city","",gsub(" town","",CITY_TBL$County,ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE)) CITY_TBL <- CITY_TBL %>% rename("City"=County) CITY_POP <- rbind(CITY_TBL,CITY_POP) #Cleanup names CITY_POP$City <- gsub("LaGrange","La_Grange",CITY_POP$City) COUNTY_POP <- rbind(COUNTY_TBL,COUNTY_POP) ####################County and City Population Data for 1990-2000 if(!file.exists("./Data/Pop_1990s.xls")){download.file('http://eadiv.state.wy.us/pop/c&sc90_00.xls',"./Data/Pop_1990s.xls")} TEMP <- read_xls("Data/Pop_1990s.xls",skip=2)[-1:-4,] colnames(TEMP)[1] <- "County" tail(TEMP) TEMP <- TEMP[1:which(TEMP[,1]=="Wind River Res."),] TEMP <- pivot_longer(TEMP,all_of(colnames(TEMP)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=as.numeric(Population)) TEMP <- TEMP %>% filter(Year!=2000) TEMP_COUNTY <- TEMP[grepl("Cnty",TEMP %>% pull(County),ignore.case=TRUE),] TEMP_COUNTY$County <- gsub(" ","_",gsub(" "," ",gsub(" Cnty","",TEMP_COUNTY$County,ignore.case=TRUE))) TEMP_CITY <- TEMP[grep("Cnty",TEMP %>% pull(County),ignore.case=TRUE,invert=TRUE),] TEMP_CITY$County <- gsub("E_Therm","East_Therm",gsub(" ","_",gsub(" ","",TEMP_CITY %>% pull(County)))) TEMP_CITY <- TEMP_CITY %>% rename(City=County) TEMP_CITY %>% pull(City) %>% unique %>% sort CITY_POP <- rbind(TEMP_CITY,CITY_POP) CITY_POP %>% pull(City) %>% unique %>% sort COUNTY_POP <- rbind(TEMP_COUNTY,COUNTY_POP) TEMP_CITY <- TEMP_CITY %>% filter(Year!=2000) try(rm(TEMP_CITY,TEMP_COUNTY,TEMP)) ####################County and City Population Data for 1980-1990 if(!file.exists("./Data/Pop_1980s.xls")){download.file('http://eadiv.state.wy.us/pop/C&SC8090.xls',"./Data/Pop_1980s.xls")} TEMP <- read_xls("Data/Pop_1980s.xls",skip=2)[-1:-4,] colnames(TEMP)[1] <- "County" TEMP <- TEMP[2:which(TEMP[,1]=="Upton"),1:(min(which(is.na(TEMP[2,])))-1)] TEMP <- pivot_longer(TEMP,all_of(colnames(TEMP)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=as.numeric(Population)) TEMP_COUNTY <- TEMP[grepl("Cty",TEMP %>% pull(County),ignore.case=TRUE),] TEMP_COUNTY$County <- gsub(" ","_",gsub(" "," ",gsub(" Cty","",TEMP_COUNTY$County,ignore.case=TRUE))) TEMP_CITY <- TEMP[grep("Cty",TEMP %>% pull(County),ignore.case=TRUE,invert=TRUE),] TEMP_CITY$County <-gsub("Frannie_","Frannie", gsub("Mtn._View","Mountain_View",gsub("E._Therm","East_Therm",gsub(" ","_",gsub(" ","",TEMP_CITY %>% pull(County)))))) TEMP_CITY <- TEMP_CITY %>% rename(City=County) TEMP_CITY <- TEMP_CITY %>% filter(Year!=1990) TEMP_COUNTY <- TEMP_COUNTY %>% filter(Year!=1990) CITY_POP <- rbind(TEMP_CITY,CITY_POP) COUNTY_POP <- rbind(TEMP_COUNTY,COUNTY_POP) #ggplot(aes(x=Year,y=Population,group=County,color=County),data=COUNTY_POP)+geom_line() try(rm(TEMP_CITY,TEMP_COUNTY,TEMP)) ####################County Population Data for 1970-1980 if(!file.exists("./Data/Pop_1970s.xls")){download.file('http://eadiv.state.wy.us/pop/Cnty7080.xls',"./Data/Pop_1970s.xls")} TEMP <- read_xls("Data/Pop_1970s.xls",skip=2)[-1:-4,] colnames(TEMP)[1] <- "County" TEMP <- TEMP[1:which(TEMP[,1]=="Weston"),] TEMP <- pivot_longer(TEMP,all_of(colnames(TEMP)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=as.numeric(Population)) TEMP$County <- gsub(" ","_",TEMP$County) TEMP <- TEMP %>% filter(Year!=1980) COUNTY_POP <- rbind(TEMP,COUNTY_POP) #ggplot(aes(x=Year,y=Population,group=County,color=County),data=COUNTY_POP)+geom_line() try(rm(TEMP)) ###########Old data addtion:Period Ends in 1970 #See in part http://eadiv.state.wy.us/demog_data/cntycity_hist.htm LN_OLD <- c(12487,10894,10286,9023,9018,8640) #Missing in 1910 Year <- seq(1920,1970,by=10) TEMP <- cbind(Year,rep("Lincoln",6),LN_OLD) colnames(TEMP ) <- c("Year","County","Population") TEMP <- as_tibble(TEMP) COUNTY_POP <- rbind(TEMP,COUNTY_POP) %>% arrange(County,Year) KEM_OLD <- c(843,1517,1884,2026,1667,2028,2292) #1910 forward until 1970 Year <- seq(1910,1970,by=10) TEMP <- cbind(Year,rep("kemmerer",7),KEM_OLD) colnames(TEMP ) <- c("Year","City","Population") TEMP <- as_tibble(TEMP) CITY_POP <- rbind(TEMP,CITY_POP) DIAMOND_OLD <- c(696,726,812,586,415,398,485) TEMP <- cbind(Year,rep("Diamondvile",7),DIAMOND_OLD) colnames(TEMP ) <- c("Year","City","Population") TEMP <- as_tibble(TEMP) CITY_POP <- rbind(TEMP,CITY_POP) %>% arrange(City,Year) #Add Other Data COUNTY_POP <- COUNTY_POP %>% mutate(Year=as.integer(Year)) %>% unique WY_COUNTY_DATA_SET <- COUNTY_POP %>% left_join(WY_COUNTY_DATA_SET ) %>% mutate(Population=as.numeric(Population)) %>% unique ###Save Population Results write_csv(CITY_POP,file="./Data/Cleaned_Data/Wyoming_City_Population.csv") write_csv(WY_COUNTY_DATA_SET ,file="./Data/Cleaned_Data/Wyoming_County_Population.csv") saveRDS(CITY_POP,file="./Data/Cleaned_Data/Wyoming_City_Population.Rds") saveRDS(WY_COUNTY_DATA_SET ,file="./Data/Cleaned_Data/Wyoming_County_Population.Rds") ###################Demographics if(!file.exists("./Data/Demo_Single_Year_2020s.xls")){download.file('http://eadiv.state.wy.us/Pop/CO_SYASEX24.xlsx',"./Data/Demo_Single_Year_2020s.xls")} TEMP <- read_xlsx("./Data/Demo_Single_Year_2020s.xls",skip=2)[,-1] TEMP <- TEMP[1:(min(which(is.na(TEMP[,1])))-1),] TEMP <- TEMP[!grepl("Base",TEMP$YEAR,ignore.case=TRUE),] #There are two population values provided. I believe one is the census baseline, and one is a estimate in July. Keep the later estimate, to line up with the same seasonal collection pattern of the rest of the data TEMP$YEAR <- year(as.Date(substr((TEMP$YEAR),1,8),format="%m/%d/%Y")) colnames(TEMP) <- c("County","Year","Age","Number","Num_Male","Num_Female") TEMP$County <- gsub(" County","",TEMP$County,ignore.case=TRUE) DEM_2020 <- TEMP %>% select(-Number) ###Demogrpahics all DEM_DATA <- read_delim("Data/County_Demographics_Census/wy.1969_2023.singleages.through89.90plus.txt",delim=" ",col_names=c("ID","VALUES"),col_types=list('c','c')) DEM_DATA$Year <- as.integer(substr(DEM_DATA$ID,1,4)) DEM_DATA$fips<- substr(DEM_DATA$ID,7,11) COUNTY_LIST <- read_csv("https://github.com/kjhealy/fips-codes/raw/refs/heads/master/county_fips_master.csv",col_types=list('c','c')) %>% filter(state_abbr=="WY") %>% select(fips,County=county_name) %>% mutate(County=gsub(" ","_",gsub(" County","",County,ignore.case=TRUE))) DEM_DATA <- DEM_DATA %>% left_join(COUNTY_LIST) %>% select(-fips) #16=3 DEM_DATA$Sex <- ifelse(substr(DEM_DATA$VALUES,3,3)==1,"Male","Female") DEM_DATA$Age <- parse_number(substr(DEM_DATA$VALUES,4,5)) DEM_DATA$Number <- parse_number(substr(DEM_DATA$VALUES,6,14)) DEM_DATA <- DEM_DATA %>% select(-ID,-VALUES) DEM_DATA <- DEM_DATA %>% group_by(Year,County,Sex,Age) %>% summarize(Number=sum(Number)) %>% ungroup()#Aggregate to sex and age level #The Wyoming census data seems newer than this data set from SEER cancer data source. Drop any of these records that overlap with the Wyoming data before merging. Arrange so the column order is the same between the two data sets, so they can be easily bound together. DEM_DATA <- pivot_wider(DEM_DATA,names_from=Sex,values_from=Number) %>% rename(Num_Female=Female,Num_Male=Male) %>% select(colnames(DEM_2020)) %>% filter(Year% unique DEM_DATA <- rbind(DEM_2020,DEM_DATA) %>% ungroup %>% arrange(Year,Age) %>% unique ###Save demographic data set LIN_DEM <- DEM_DATA %>% filter(County=='Lincoln') write_csv(LIN_DEM,file="./Data/Cleaned_Data/Lincoln_Demographic_Data.csv") write_csv(DEM_DATA,file="./Data/Cleaned_Data/Wyoming_County_Demographic_Data.csv") saveRDS(LIN_DEM,file="./Data/Cleaned_Data/Lincoln_Demographic_Data.Rds") saveRDS(DEM_DATA,file="./Data/Cleaned_Data/Wyoming_County_Demographic_Data.Rds") #########################################Mortality Rate GET_MORTALITY_DATA <- function(FILE,SEX,LOWER_AGE,UPPER_AGE){ #Create clean mortality rate data #Data gathered from https://hdpulse.nimhd.nih.gov/data-portal/mortality/table?cod=247&cod_options=cod_15&ratetype=aa&ratetype_options=ratetype_2&race=00&race_options=race_6&sex=2&sex_options=sex_3&age=177&age_options=age_11&ruralurban=0&ruralurban_options=ruralurban_3&yeargroup=5&yeargroup_options=year5yearmort_1&statefips=56&statefips_options=area_states&county=56000&county_options=counties_wyoming&comparison=counties_to_us&comparison_options=comparison_counties&radio_comparison=areas&radio_comparison_options=cods_or_areas NAMES <- c("County","FIPS","Death_Rate","Lower_Rate","Upper_Rate","Deaths","Trend_Category","Trend","Lower_Trend","Upper_Trend") DF <- read_csv(FILE,skip=5,col_names=NAMES,col_types=list('c',"i",'d','d','d','d','c','d','d','d')) %>% filter(grepl("County|Wyoming",County)|County=="United States") %>% mutate(Rate_SD=(Upper_Rate-Lower_Rate)/(2*1.96),Trend_SD=(Upper_Trend-Lower_Trend)/(2*1.96)) %>% select(County,Death_Rate,Rate_SD,Trend,Trend_SD) DF$County <- gsub(" County","",DF$County,ignore.case=TRUE) DF[,-1] <- DF[,-1]/100000 WYOMING_TREND <- pull(DF[DF$County=="Wyoming",],"Trend") US_TREND <- pull(DF[DF$County=="United States",],"Trend") WYOMING_RATE <- pull(DF[DF$County=="Wyoming",],"Death_Rate") US_RATE <- pull(DF[DF$County=="United States",],"Death_Rate") DF$Imparted_Trend <- FALSE if(is.na(WYOMING_TREND)){ DF[1,4:5] <- DF[2,4:5] DF[1,6] <- TRUE } DF$Imparted_Rate <- FALSE if(is.na(WYOMING_RATE)){ DF[1,4:5] <- DF[2,2:3] DF[1,6] <- TRUE } WYOMING_BASELINE_TREND <-cbind (DF[1,4:5] ,TRUE) WYOMING_BASELINE_RATE <-DF[1,2:3] for(i in 3:nrow(DF)){ #Impart any missing trends based on higher levels if(is.na(pull(DF[i,],"Trend"))){ DF[i,4:6] <- WYOMING_BASELINE_TREND} #Impart any missing death rates based on higher levels if(is.na(pull(DF[i,],"Death_Rate"))){ DF[i,2:3] <- WYOMING_BASELINE_RATE DF[i,"Imparted_Rate"] <- TRUE } } DF$Sex <- SEX DF$Min_Age <- LOWER_AGE DF$Max_Age <- UPPER_AGE DF <- DF %>% select(County,Sex,Min_Age,Max_Age,Death_Rate,Rate_SD,Imparted_Rate,everything()) return(DF) } MORTALITY_DATA_ALL <- rbind( GET_MORTALITY_DATA("Data/Mortality_Rates/Female/A_Under1.csv","Female",0,0), GET_MORTALITY_DATA("Data/Mortality_Rates/Female/B_1_9.csv","Female",1,9), GET_MORTALITY_DATA("Data/Mortality_Rates/Female/C_10_19.csv","Female",10,19), GET_MORTALITY_DATA("Data/Mortality_Rates/Female/D_20_39.csv","Female",20,39), GET_MORTALITY_DATA("Data/Mortality_Rates/Female/E_40_64.csv","Female",40,64), GET_MORTALITY_DATA("Data/Mortality_Rates/Female/G_65_74.csv","Female",65,74), GET_MORTALITY_DATA("Data/Mortality_Rates/Female/H_75_84.csv","Female",75,84), GET_MORTALITY_DATA("Data/Mortality_Rates/Female/I_85+.csv","Female",85,Inf), GET_MORTALITY_DATA("Data/Mortality_Rates/Male/A_Under1.csv","Male",0,0), GET_MORTALITY_DATA("Data/Mortality_Rates/Male/B_1_9.csv","Male",1,9), GET_MORTALITY_DATA("Data/Mortality_Rates/Male/C_10_19.csv","Male",10,19), GET_MORTALITY_DATA("Data/Mortality_Rates/Male/D_20_39.csv","Male",20,39), GET_MORTALITY_DATA("Data/Mortality_Rates/Male/E_40_64.csv","Male",40,64), GET_MORTALITY_DATA("Data/Mortality_Rates/Male/G_65_74.csv","Male",65,74), GET_MORTALITY_DATA("Data/Mortality_Rates/Male/H_75_84.csv","Male",75,84), GET_MORTALITY_DATA("Data/Mortality_Rates/Male/I_85+.csv","Male",85,Inf) ) LIN_MORTALITY <- MORTALITY_DATA_ALL %>% filter(County=="Lincoln") ###Save Mortality Rate data set write_csv(LIN_MORTALITY,file="./Data/Cleaned_Data/Lincoln_Mortality_Rate.csv") write_csv(MORTALITY_DATA_ALL,file="./Data/Cleaned_Data/Not_Used/Wyoming_County_Mortality_Rate.csv") saveRDS(LIN_MORTALITY,file="./Data/Cleaned_Data/Lincoln_Mortality_Rate.Rds") saveRDS(MORTALITY_DATA_ALL,file="./Data/Cleaned_Data/Not_Used/Wyoming_County_Mortality_Rate.Rds") #Clean all data from memory. Will be loaded in any paticular script as needed rm(list = ls()) gc()