320 lines
18 KiB
R

#################IMPORTANT!!!! CLEAN UP DATA SCRIPT FOR USE
#############################Clean up scirpt folders, simulations of deaths should be seperated from this file. Death rate simulation is complicated and should be commented, and turned into a seperate script.
##############WORKING ON REGRESSION OF BIRTHS FOR PREDICTIONS OF GROWTH, THINK ABOUT GOING FROM COUNTY TO CITY, SEPERATE REGRESION FROM DATA COLLOECTION
library(rvest)
library(tidyverse)
library(readxl)
#setwd("../")
########County, Death, Birth and Migration Data
#Data found on the page http://eadiv.state.wy.us/pop/
PAGE <- read_html("http://eadiv.state.wy.us/pop/BirthDeathMig.htm")
NODE <- html_element(PAGE ,"table")
TBL <- html_table(NODE)
ST <- which(toupper(TBL$X1)=="ALBANY")
END <- which(toupper(TBL$X1)=="TOTAL")
TYPES <- TBL[ST-2,1]
ST_YEAR <- 1971
ALL_DATA <- list()
TBL <- TBL[,c(1,which(!is.na(as.numeric(TBL[ST[1],]))))]
TBL <- TBL[,-ncol(TBL)]
colnames(TBL) <- c("County",(ST_YEAR:(ST_YEAR+ncol(TBL)-1)))
TBL$Type <- NA
for(i in 1:length(ST)){
TBL[ST[i]:END[i],"Type"]<- as.character(TYPES[i,1])
}
TBL[ST[2]:END[2],"Type"] <- as.character(TYPES[2,1])
TBL$Type
TBL <- TBL %>% filter(!is.na(Type)) %>% select(County,Type,everything())
GROUP <- colnames(TBL)[-1:-2]
Data <- pivot_longer(TBL,all_of(GROUP),names_to="Year",values_to="Pop_Change")
Data$County <- ifelse(toupper(Data$County)=="TOTAL","Wyoming",Data$County)
WY_COUNTY_DATA_SET <- pivot_wider(Data,names_from=Type,values_from=Pop_Change) %>% rename("Migration"=`Net Migration`) %>% mutate(Year=as.integer(Year),Births=parse_number(Births),Deaths=parse_number(Deaths),Migration=parse_number(Migration)) %>% mutate(Year=Year-1) #Data apears to be one off from populaiton
WY_COUNTY_DATA_SET[,"County"] <- gsub(" ","_",WY_COUNTY_DATA_SET %>% pull(County))
########################City and County Population Data 2020 to 2024
PAGE <- read_html('http://eadiv.state.wy.us/pop/Place-24EST.htm')
NODE <- html_element(PAGE ,"table")
TBL <- html_table(NODE)
ST <- which(toupper(TBL$X1)==toupper("Albany County"))
END <- which(toupper(TBL$X1)==toupper("Balance of Weston County"))
#More years than are pulled are listed to make more generic
COLUMNS <- c(1,which(TBL[ST-2,] %in% 1970:2025))
NAMES <- TBL[4,COLUMNS][-1]
TBL <- TBL[ST:END,COLUMNS ]
colnames(TBL) <- c("County",NAMES)
TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population))
TBL$County <- gsub(" "," ",gsub("\n","",gsub("\r","",TBL %>% pull(County))))
COUNTY_POP<- TBL[grep("COUNTY",TBL %>% pull(County),ignore.case=TRUE),]
COUNTY_POP<- COUNTY_POP[grep("Balance",COUNTY_POP%>% pull(County),invert=TRUE,ignore.case=TRUE),]
COUNTY_POP$County <- gsub(" ","_",gsub(" County","",COUNTY_POP$County))
CITY_POP <- TBL[sort(c(grep("County",TBL %>% pull(County),invert=TRUE,ignore.case=TRUE),grep("Balance",TBL %>% pull(County),ignore.case=TRUE))),]
CITY_POP$County <- gsub(" ","_",gsub("Balance of","Unincorporated",gsub(" County","",gsub(" city","",gsub(" town","",CITY_POP$County,ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE))
CITY_POP <- CITY_POP %>% rename("City"=County)
########################City Population Data 2010 to 2020
PAGE <- read_html('http://eadiv.state.wy.us/pop/sub-est11-19.htm')
NODE <- html_element(PAGE ,"table")
TBL <- html_table(NODE)
ST <- which(toupper(TBL$X1)==toupper("Afton town, Wyoming"))
END <- which(toupper(TBL$X1)==toupper("Yoder town, Wyoming"))
#More years than are pulled are listed to make more generic
COLUMNS <- c(1,which(TBL[ST-1,] %in% 1970:2025))
NAMES <- TBL[3,COLUMNS][-1]
TBL <- TBL[ST:END,COLUMNS ]
colnames(TBL) <- c("City",NAMES)
TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population))
TBL$City <- gsub(" ","_",gsub(" $","",gsub("\r|\n| Wyoming|,| town| city","",TBL$City,ignore.case=TRUE)))
TBL <- TBL %>% filter(Year!=2020)
CITY_POP <- rbind(TBL,CITY_POP)
########################County Population Data 2010 to 2020
PAGE <- read_html('http://eadiv.state.wy.us/pop/ctyest11-19.htm')
NODE <- html_element(PAGE ,"table")
TBL <- html_table(NODE)
ST <- grep("Albany",TBL$X1)
END <- grep("Weston",TBL$X1)
#More years than are pulled are listed to make more generic
COLUMNS <- c(1,which(TBL[ST-2,] %in% 1970:2025))
NAMES <- TBL[3,COLUMNS][-1]
TBL <- TBL[ST:END,COLUMNS ]
colnames(TBL) <- c("County",NAMES)
TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population))
TBL$County <- gsub(" ","_",gsub(" "," ",gsub(" $","",gsub("\r|\n| Wyoming|,| town| city| County|\\.","",TBL$County,ignore.case=TRUE))))
TBL <- TBL %>% filter(Year!=2020)
COUNTY_POP <- rbind(TBL,COUNTY_POP)
########################County and City Population Data 2000 to 2010
PAGE <- read_html('http://eadiv.state.wy.us/pop/sub-est01-09.htm')
NODE <- html_element(PAGE ,"table")
TBL <- html_table(NODE)
ST <- which(toupper(TBL$X1)==toupper("Albany County"))
END <- which(toupper(TBL$X1)==toupper("Balance of Weston County"))
#More years than are pulled are listed to make more generic
COLUMNS <- c(1,which(TBL[ST-4,] %in% 1970:2025))
NAMES <- TBL[4,COLUMNS][-1]
TBL <- TBL[ST:END,COLUMNS ]
colnames(TBL) <- c("County",NAMES)
TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population))
TBL <- TBL %>% filter(Year!=2010)
TBL$County <- gsub(" "," ",gsub("\n","",gsub("\r","",TBL %>% pull(County))))
COUNTY_TBL <- TBL[grep("COUNTY",TBL %>% pull(County),ignore.case=TRUE),]
COUNTY_TBL <-COUNTY_TBL[grep("Balance",COUNTY_TBL%>% pull(County),invert=TRUE,ignore.case=TRUE),]
COUNTY_TBL$County <-gsub("_(pt.)","", gsub(" ","_",gsub(" County","",COUNTY_TBL$County)))
CITY_TBL <- TBL[sort(c(grep("County",TBL %>% pull(County),invert=TRUE,ignore.case=TRUE),grep("Balance",TBL %>% pull(County),ignore.case=TRUE))),]
CITY_TBL$County <- gsub(" ","_",gsub("Balance of","Unincorporated",gsub(" County","",gsub(" city","",gsub(" town","",CITY_TBL$County,ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE))
CITY_TBL <- CITY_TBL %>% rename("City"=County)
CITY_POP <- rbind(CITY_TBL,CITY_POP)
#Cleanup names
CITY_POP$City <- gsub("LaGrange","La_Grange",CITY_POP$City)
COUNTY_POP <- rbind(COUNTY_TBL,COUNTY_POP)
####################County and City Population Data for 1990-2000
if(!file.exists("./Data/Pop_1990s.xls")){download.file('http://eadiv.state.wy.us/pop/c&sc90_00.xls',"./Data/Pop_1990s.xls")}
TEMP <- read_xls("Data/Pop_1990s.xls",skip=2)[-1:-4,]
colnames(TEMP)[1] <- "County"
tail(TEMP)
TEMP <- TEMP[1:which(TEMP[,1]=="Wind River Res."),]
TEMP <- pivot_longer(TEMP,all_of(colnames(TEMP)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=as.numeric(Population))
TEMP <- TEMP %>% filter(Year!=2000)
TEMP_COUNTY <- TEMP[grepl("Cnty",TEMP %>% pull(County),ignore.case=TRUE),]
TEMP_COUNTY$County <- gsub(" ","_",gsub(" "," ",gsub(" Cnty","",TEMP_COUNTY$County,ignore.case=TRUE)))
TEMP_CITY <- TEMP[grep("Cnty",TEMP %>% pull(County),ignore.case=TRUE,invert=TRUE),]
TEMP_CITY$County <- gsub("E_Therm","East_Therm",gsub(" ","_",gsub(" ","",TEMP_CITY %>% pull(County))))
TEMP_CITY <- TEMP_CITY %>% rename(City=County)
TEMP_CITY %>% pull(City) %>% unique %>% sort
CITY_POP <- rbind(TEMP_CITY,CITY_POP)
CITY_POP %>% pull(City) %>% unique %>% sort
COUNTY_POP <- rbind(TEMP_COUNTY,COUNTY_POP)
TEMP_CITY <- TEMP_CITY %>% filter(Year!=2000)
try(rm(TEMP_CITY,TEMP_COUNTY,TEMP))
####################County and City Population Data for 1980-1990
if(!file.exists("./Data/Pop_1980s.xls")){download.file('http://eadiv.state.wy.us/pop/C&SC8090.xls',"./Data/Pop_1980s.xls")}
TEMP <- read_xls("Data/Pop_1980s.xls",skip=2)[-1:-4,]
colnames(TEMP)[1] <- "County"
TEMP <- TEMP[2:which(TEMP[,1]=="Upton"),1:(min(which(is.na(TEMP[2,])))-1)]
TEMP <- pivot_longer(TEMP,all_of(colnames(TEMP)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=as.numeric(Population))
TEMP_COUNTY <- TEMP[grepl("Cty",TEMP %>% pull(County),ignore.case=TRUE),]
TEMP_COUNTY$County <- gsub(" ","_",gsub(" "," ",gsub(" Cty","",TEMP_COUNTY$County,ignore.case=TRUE)))
TEMP_CITY <- TEMP[grep("Cty",TEMP %>% pull(County),ignore.case=TRUE,invert=TRUE),]
TEMP_CITY$County <-gsub("Frannie_","Frannie", gsub("Mtn._View","Mountain_View",gsub("E._Therm","East_Therm",gsub(" ","_",gsub(" ","",TEMP_CITY %>% pull(County))))))
TEMP_CITY <- TEMP_CITY %>% rename(City=County)
TEMP_CITY <- TEMP_CITY %>% filter(Year!=1990)
TEMP_COUNTY <- TEMP_COUNTY %>% filter(Year!=1990)
CITY_POP <- rbind(TEMP_CITY,CITY_POP)
COUNTY_POP <- rbind(TEMP_COUNTY,COUNTY_POP)
#ggplot(aes(x=Year,y=Population,group=County,color=County),data=COUNTY_POP)+geom_line()
try(rm(TEMP_CITY,TEMP_COUNTY,TEMP))
####################County Population Data for 1970-1980
if(!file.exists("./Data/Pop_1970s.xls")){download.file('http://eadiv.state.wy.us/pop/Cnty7080.xls',"./Data/Pop_1970s.xls")}
TEMP <- read_xls("Data/Pop_1970s.xls",skip=2)[-1:-4,]
colnames(TEMP)[1] <- "County"
TEMP <- TEMP[1:which(TEMP[,1]=="Weston"),]
TEMP <- pivot_longer(TEMP,all_of(colnames(TEMP)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=as.numeric(Population))
TEMP$County <- gsub(" ","_",TEMP$County)
TEMP <- TEMP %>% filter(Year!=1980)
COUNTY_POP <- rbind(TEMP,COUNTY_POP)
#ggplot(aes(x=Year,y=Population,group=County,color=County),data=COUNTY_POP)+geom_line()
try(rm(TEMP))
###########Old data addtion:Period Ends in 1970
#See in part http://eadiv.state.wy.us/demog_data/cntycity_hist.htm
LN_OLD <- c(12487,10894,10286,9023,9018,8640) #Missing in 1910
Year <- seq(1920,1970,by=10)
TEMP <- cbind(Year,rep("Lincoln",6),LN_OLD)
colnames(TEMP ) <- c("Year","County","Population")
TEMP <- as_tibble(TEMP)
COUNTY_POP <- rbind(TEMP,COUNTY_POP) %>% arrange(County,Year)
KEM_OLD <- c(843,1517,1884,2026,1667,2028,2292) #1910 forward until 1970
Year <- seq(1910,1970,by=10)
TEMP <- cbind(Year,rep("kemmerer",7),KEM_OLD)
colnames(TEMP ) <- c("Year","City","Population")
TEMP <- as_tibble(TEMP)
CITY_POP <- rbind(TEMP,CITY_POP)
DIAMOND_OLD <- c(696,726,812,586,415,398,485)
TEMP <- cbind(Year,rep("Diamondvile",7),DIAMOND_OLD)
colnames(TEMP ) <- c("Year","City","Population")
TEMP <- as_tibble(TEMP)
CITY_POP <- rbind(TEMP,CITY_POP) %>% arrange(City,Year)
#Add Other Data
COUNTY_POP <- COUNTY_POP %>% mutate(Year=as.integer(Year)) %>% unique
WY_COUNTY_DATA_SET <- COUNTY_POP %>% left_join(WY_COUNTY_DATA_SET ) %>% mutate(Population=as.numeric(Population)) %>% unique
###Save Population Results
write_csv(CITY_POP,file="./Data/Cleaned_Data/Wyoming_City_Population.csv")
write_csv(WY_COUNTY_DATA_SET ,file="./Data/Cleaned_Data/Wyoming_County_Population.csv")
saveRDS(CITY_POP,file="./Data/Cleaned_Data/Wyoming_City_Population.Rds")
saveRDS(WY_COUNTY_DATA_SET ,file="./Data/Cleaned_Data/Wyoming_County_Population.Rds")
###################Demographics
if(!file.exists("./Data/Demo_Single_Year_2020s.xls")){download.file('http://eadiv.state.wy.us/Pop/CO_SYASEX24.xlsx',"./Data/Demo_Single_Year_2020s.xls")}
TEMP <- read_xlsx("./Data/Demo_Single_Year_2020s.xls",skip=2)[,-1]
TEMP <- TEMP[1:(min(which(is.na(TEMP[,1])))-1),]
TEMP <- TEMP[!grepl("Base",TEMP$YEAR,ignore.case=TRUE),] #There are two population values provided. I believe one is the census baseline, and one is a estimate in July. Keep the later estimate, to line up with the same seasonal collection pattern of the rest of the data
TEMP$YEAR <- year(as.Date(substr((TEMP$YEAR),1,8),format="%m/%d/%Y"))
colnames(TEMP) <- c("County","Year","Age","Number","Num_Male","Num_Female")
TEMP$County <- gsub(" County","",TEMP$County,ignore.case=TRUE)
DEM_2020 <- TEMP %>% select(-Number)
###Demogrpahics all
DEM_DATA <- read_delim("Data/County_Demographics_Census/wy.1969_2023.singleages.through89.90plus.txt",delim=" ",col_names=c("ID","VALUES"),col_types=list('c','c'))
DEM_DATA$Year <- as.integer(substr(DEM_DATA$ID,1,4))
DEM_DATA$fips<- substr(DEM_DATA$ID,7,11)
COUNTY_LIST <- read_csv("https://github.com/kjhealy/fips-codes/raw/refs/heads/master/county_fips_master.csv",col_types=list('c','c')) %>% filter(state_abbr=="WY") %>% select(fips,County=county_name) %>% mutate(County=gsub(" ","_",gsub(" County","",County,ignore.case=TRUE)))
DEM_DATA <- DEM_DATA %>% left_join(COUNTY_LIST) %>% select(-fips)
#16=3
DEM_DATA$Sex <- ifelse(substr(DEM_DATA$VALUES,3,3)==1,"Male","Female")
DEM_DATA$Age <- parse_number(substr(DEM_DATA$VALUES,4,5))
DEM_DATA$Number <- parse_number(substr(DEM_DATA$VALUES,6,14))
DEM_DATA <- DEM_DATA %>% select(-ID,-VALUES)
DEM_DATA <- DEM_DATA %>% group_by(Year,County,Sex,Age) %>% summarize(Number=sum(Number)) %>% ungroup()#Aggregate to sex and age level
#The Wyoming census data seems newer than this data set from SEER cancer data source. Drop any of these records that overlap with the Wyoming data before merging. Arrange so the column order is the same between the two data sets, so they can be easily bound together.
DEM_DATA <- pivot_wider(DEM_DATA,names_from=Sex,values_from=Number) %>% rename(Num_Female=Female,Num_Male=Male) %>% select(colnames(DEM_2020)) %>% filter(Year<min(DEM_2020$Year)) %>% unique
DEM_DATA <- rbind(DEM_2020,DEM_DATA) %>% ungroup %>% arrange(Year,Age) %>% unique
###Save demographic data set
LIN_DEM <- DEM_DATA %>% filter(County=='Lincoln')
write_csv(LIN_DEM,file="./Data/Cleaned_Data/Lincoln_Demographic_Data.csv")
write_csv(DEM_DATA,file="./Data/Cleaned_Data/Wyoming_County_Demographic_Data.csv")
saveRDS(LIN_DEM,file="./Data/Cleaned_Data/Lincoln_Demographic_Data.Rds")
saveRDS(DEM_DATA,file="./Data/Cleaned_Data/Wyoming_County_Demographic_Data.Rds")
#########################################Mortality Rate
GET_MORTALITY_DATA <- function(FILE,SEX,LOWER_AGE,UPPER_AGE){
#Create clean mortality rate data
#Data gathered from https://hdpulse.nimhd.nih.gov/data-portal/mortality/table?cod=247&cod_options=cod_15&ratetype=aa&ratetype_options=ratetype_2&race=00&race_options=race_6&sex=2&sex_options=sex_3&age=177&age_options=age_11&ruralurban=0&ruralurban_options=ruralurban_3&yeargroup=5&yeargroup_options=year5yearmort_1&statefips=56&statefips_options=area_states&county=56000&county_options=counties_wyoming&comparison=counties_to_us&comparison_options=comparison_counties&radio_comparison=areas&radio_comparison_options=cods_or_areas
NAMES <- c("County","FIPS","Death_Rate","Lower_Rate","Upper_Rate","Deaths","Trend_Category","Trend","Lower_Trend","Upper_Trend")
DF <- read_csv(FILE,skip=5,col_names=NAMES,col_types=list('c',"i",'d','d','d','d','c','d','d','d')) %>% filter(grepl("County|Wyoming",County)|County=="United States") %>% mutate(Rate_SD=(Upper_Rate-Lower_Rate)/(2*1.96),Trend_SD=(Upper_Trend-Lower_Trend)/(2*1.96)) %>% select(County,Death_Rate,Rate_SD,Trend,Trend_SD)
DF$County <- gsub(" County","",DF$County,ignore.case=TRUE)
DF[,-1] <- DF[,-1]/100000
WYOMING_TREND <- pull(DF[DF$County=="Wyoming",],"Trend")
US_TREND <- pull(DF[DF$County=="United States",],"Trend")
WYOMING_RATE <- pull(DF[DF$County=="Wyoming",],"Death_Rate")
US_RATE <- pull(DF[DF$County=="United States",],"Death_Rate")
DF$Imparted_Trend <- FALSE
if(is.na(WYOMING_TREND)){
DF[1,4:5] <- DF[2,4:5]
DF[1,6] <- TRUE
}
DF$Imparted_Rate <- FALSE
if(is.na(WYOMING_RATE)){
DF[1,4:5] <- DF[2,2:3]
DF[1,6] <- TRUE
}
WYOMING_BASELINE_TREND <-cbind (DF[1,4:5] ,TRUE)
WYOMING_BASELINE_RATE <-DF[1,2:3]
for(i in 3:nrow(DF)){
#Impart any missing trends based on higher levels
if(is.na(pull(DF[i,],"Trend"))){ DF[i,4:6] <- WYOMING_BASELINE_TREND}
#Impart any missing death rates based on higher levels
if(is.na(pull(DF[i,],"Death_Rate"))){
DF[i,2:3] <- WYOMING_BASELINE_RATE
DF[i,"Imparted_Rate"] <- TRUE
}
}
DF$Sex <- SEX
DF$Min_Age <- LOWER_AGE
DF$Max_Age <- UPPER_AGE
DF <- DF %>% select(County,Sex,Min_Age,Max_Age,Death_Rate,Rate_SD,Imparted_Rate,everything())
return(DF)
}
MORTALITY_DATA_ALL <- rbind(
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/A_Under1.csv","Female",0,0),
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/B_1_9.csv","Female",1,9),
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/C_10_19.csv","Female",10,19),
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/D_20_39.csv","Female",20,39),
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/E_40_64.csv","Female",40,64),
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/G_65_74.csv","Female",65,74),
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/H_75_84.csv","Female",75,84),
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/I_85+.csv","Female",85,Inf),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/A_Under1.csv","Male",0,0),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/B_1_9.csv","Male",1,9),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/C_10_19.csv","Male",10,19),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/D_20_39.csv","Male",20,39),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/E_40_64.csv","Male",40,64),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/G_65_74.csv","Male",65,74),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/H_75_84.csv","Male",75,84),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/I_85+.csv","Male",85,Inf)
)
LIN_MORTALITY <- MORTALITY_DATA_ALL %>% filter(County=="Lincoln")
###Save Mortality Rate data set
write_csv(LIN_MORTALITY,file="./Data/Cleaned_Data/Lincoln_Mortality_Rate.csv")
write_csv(MORTALITY_DATA_ALL,file="./Data/Cleaned_Data/Not_Used/Wyoming_County_Mortality_Rate.csv")
saveRDS(LIN_MORTALITY,file="./Data/Cleaned_Data/Lincoln_Mortality_Rate.Rds")
saveRDS(MORTALITY_DATA_ALL,file="./Data/Cleaned_Data/Not_Used/Wyoming_County_Mortality_Rate.Rds")
#Clean all data from memory. Will be loaded in any paticular script as needed
rm(list = ls())
gc()