Population_Study/Scripts/Data_Load.r

#################IMPORTANT!!!! CLEAN UP DATA SCRIPT FOR USE
#############################Clean up scirpt folders, simulations of deaths should be seperated from this file. Death rate simulation is complicated and should be commented, and turned into a seperate script.
##############WORKING ON REGRESSION OF BIRTHS FOR PREDICTIONS OF GROWTH, THINK ABOUT GOING FROM COUNTY TO CITY, SEPERATE REGRESION FROM DATA COLLOECTION
library(rvest)
library(tidyverse)
library(readxl)
#setwd("../")
########County, Death, Birth and Migration Data
#Data found on the page http://eadiv.state.wy.us/pop/
PAGE <- read_html("http://eadiv.state.wy.us/pop/BirthDeathMig.htm")
NODE <- html_element(PAGE ,"table")
TBL <- html_table(NODE)

ST <- which(toupper(TBL$X1)=="ALBANY")
END <- which(toupper(TBL$X1)=="TOTAL")
TYPES <- TBL[ST-2,1]

ST_YEAR <- 1971
ALL_DATA <- list()
TBL <- TBL[,c(1,which(!is.na(as.numeric(TBL[ST[1],]))))]
TBL <- TBL[,-ncol(TBL)]
colnames(TBL) <- c("County",(ST_YEAR:(ST_YEAR+ncol(TBL)-1)))
TBL$Type  <- NA
for(i in 1:length(ST)){
	TBL[ST[i]:END[i],"Type"]<- as.character(TYPES[i,1])
}
TBL[ST[2]:END[2],"Type"] <-  as.character(TYPES[2,1])
TBL$Type
TBL <- TBL %>% filter(!is.na(Type)) %>% select(County,Type,everything())
GROUP <- colnames(TBL)[-1:-2]
Data <- pivot_longer(TBL,all_of(GROUP),names_to="Year",values_to="Pop_Change")
Data$County <- ifelse(toupper(Data$County)=="TOTAL","Wyoming",Data$County)
WY_COUNTY_DATA_SET <- pivot_wider(Data,names_from=Type,values_from=Pop_Change) %>% rename("Migration"=`Net Migration`) %>% mutate(Year=as.integer(Year),Births=parse_number(Births),Deaths=parse_number(Deaths),Migration=parse_number(Migration))

########################City and County Population Data 2020 to 2024
PAGE <- read_html('http://eadiv.state.wy.us/pop/Place-24EST.htm')
NODE <- html_element(PAGE ,"table")
TBL <- html_table(NODE)

ST <- which(toupper(TBL$X1)==toupper("Albany County"))
END <- which(toupper(TBL$X1)==toupper("Balance of Weston County"))

#More years than are pulled are listed to make more generic
COLUMNS <- c(1,which(TBL[ST-2,] %in% 1970:2025))
NAMES <- TBL[4,COLUMNS][-1]
TBL <- TBL[ST:END,COLUMNS ]
colnames(TBL) <- c("County",NAMES)

TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population))
TBL$County <- gsub("  "," ",gsub("\n","",gsub("\r","",TBL %>% pull(County))))

COUNTY_POP<- TBL[grep("COUNTY",TBL %>% pull(County),ignore.case=TRUE),]
COUNTY_POP<-	COUNTY_POP[grep("Balance",COUNTY_POP%>% pull(County),invert=TRUE,ignore.case=TRUE),]
COUNTY_POP$County <- gsub(" ","_",gsub(" County","",COUNTY_POP$County))
CITY_POP <- TBL[sort(c(grep("County",TBL %>% pull(County),invert=TRUE,ignore.case=TRUE),grep("Balance",TBL %>% pull(County),ignore.case=TRUE))),]
CITY_POP$County <- gsub(" ","_",gsub("Balance of","Unincorporated",gsub(" County","",gsub(" city","",gsub(" town","",CITY_POP$County,ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE))
CITY_POP <- CITY_POP %>% rename("City"=County)
########################City Population Data 2010 to 2020
PAGE <- read_html('http://eadiv.state.wy.us/pop/sub-est11-19.htm')
NODE <- html_element(PAGE ,"table")
TBL <- html_table(NODE)
ST <- which(toupper(TBL$X1)==toupper("Afton town, Wyoming"))
END <- which(toupper(TBL$X1)==toupper("Yoder town, Wyoming"))
#More years than are pulled are listed to make more generic
COLUMNS <- c(1,which(TBL[ST-1,] %in% 1970:2025))
NAMES <- TBL[3,COLUMNS][-1]
TBL <- TBL[ST:END,COLUMNS ]
colnames(TBL) <- c("City",NAMES)
TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population))
TBL$City <- gsub(" ","_",gsub(" $","",gsub("\r|\n| Wyoming|,| town| city","",TBL$City,ignore.case=TRUE)))
TBL <- TBL %>% filter(Year!=2020)
CITY_POP <- rbind(TBL,CITY_POP)
########################County Population Data 2010 to 2020
PAGE <- read_html('http://eadiv.state.wy.us/pop/ctyest11-19.htm')
NODE <- html_element(PAGE ,"table")
TBL <- html_table(NODE)
ST <- grep("Albany",TBL$X1)
END <- grep("Weston",TBL$X1)
#More years than are pulled are listed to make more generic
COLUMNS <- c(1,which(TBL[ST-2,] %in% 1970:2025))
NAMES <- TBL[3,COLUMNS][-1]
TBL <- TBL[ST:END,COLUMNS ]
colnames(TBL) <- c("County",NAMES)
TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population))
TBL$County <-  gsub(" ","_",gsub("  "," ",gsub(" $","",gsub("\r|\n| Wyoming|,| town| city| County|\\.","",TBL$County,ignore.case=TRUE))))
TBL <- TBL %>% filter(Year!=2020)
COUNTY_POP <- rbind(TBL,COUNTY_POP)

########################County and City Population Data 2000 to 2010
PAGE <- read_html('http://eadiv.state.wy.us/pop/sub-est01-09.htm')
NODE <- html_element(PAGE ,"table")
TBL <- html_table(NODE)

ST <- which(toupper(TBL$X1)==toupper("Albany County"))
END <- which(toupper(TBL$X1)==toupper("Balance of Weston County"))

#More years than are pulled are listed to make more generic
COLUMNS <- c(1,which(TBL[ST-4,] %in% 1970:2025))
NAMES <- TBL[4,COLUMNS][-1]
TBL <- TBL[ST:END,COLUMNS ]
colnames(TBL) <- c("County",NAMES)
TBL <-       	pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population))
TBL <- TBL %>% filter(Year!=2010)
TBL$County <- gsub("  "," ",gsub("\n","",gsub("\r","",TBL %>% pull(County))))

COUNTY_TBL <- TBL[grep("COUNTY",TBL %>% pull(County),ignore.case=TRUE),]
COUNTY_TBL <-COUNTY_TBL[grep("Balance",COUNTY_TBL%>% pull(County),invert=TRUE,ignore.case=TRUE),]
COUNTY_TBL$County <-gsub("_(pt.)","", gsub(" ","_",gsub(" County","",COUNTY_TBL$County)))

CITY_TBL <- TBL[sort(c(grep("County",TBL %>% pull(County),invert=TRUE,ignore.case=TRUE),grep("Balance",TBL %>% pull(County),ignore.case=TRUE))),]
CITY_TBL$County <- gsub(" ","_",gsub("Balance of","Unincorporated",gsub(" County","",gsub(" city","",gsub(" town","",CITY_TBL$County,ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE))
CITY_TBL <- CITY_TBL %>% rename("City"=County)
CITY_POP <- rbind(CITY_TBL,CITY_POP)
#Cleanup names
CITY_POP$City <- gsub("LaGrange","La_Grange",CITY_POP$City)
COUNTY_POP <- rbind(COUNTY_TBL,COUNTY_POP)
####################County and City Population Data for 1990-2000
if(!file.exists("./Data/Pop_1990s.xls")){download.file('http://eadiv.state.wy.us/pop/c&sc90_00.xls',"./Data/Pop_1990s.xls")}
TEMP <- read_xls("Data/Pop_1990s.xls",skip=2)[-1:-4,]
colnames(TEMP)[1] <- "County"
tail(TEMP)
TEMP <- TEMP[1:which(TEMP[,1]=="Wind River Res."),]

TEMP <- pivot_longer(TEMP,all_of(colnames(TEMP)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=as.numeric(Population))
TEMP <- TEMP %>% filter(Year!=2000)
TEMP_COUNTY <- TEMP[grepl("Cnty",TEMP %>% pull(County),ignore.case=TRUE),]
TEMP_COUNTY$County <- gsub(" ","_",gsub("  "," ",gsub(" Cnty","",TEMP_COUNTY$County,ignore.case=TRUE)))

TEMP_CITY <- TEMP[grep("Cnty",TEMP %>% pull(County),ignore.case=TRUE,invert=TRUE),]
TEMP_CITY$County <- gsub("E_Therm","East_Therm",gsub(" ","_",gsub("  ","",TEMP_CITY %>% pull(County))))
TEMP_CITY <- TEMP_CITY %>% rename(City=County)
TEMP_CITY %>% pull(City) %>% unique %>% sort
CITY_POP <- rbind(TEMP_CITY,CITY_POP)
CITY_POP %>% pull(City) %>% unique %>% sort
COUNTY_POP <- rbind(TEMP_COUNTY,COUNTY_POP)
TEMP_CITY  <- TEMP_CITY %>% filter(Year!=2000)

try(rm(TEMP_CITY,TEMP_COUNTY,TEMP))
####################County and City Population Data for 1980-1990
if(!file.exists("./Data/Pop_1980s.xls")){download.file('http://eadiv.state.wy.us/pop/C&SC8090.xls',"./Data/Pop_1980s.xls")}
TEMP <- read_xls("Data/Pop_1980s.xls",skip=2)[-1:-4,]
colnames(TEMP)[1] <- "County"
TEMP <- TEMP[2:which(TEMP[,1]=="Upton"),1:(min(which(is.na(TEMP[2,])))-1)]
TEMP <- pivot_longer(TEMP,all_of(colnames(TEMP)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=as.numeric(Population))

TEMP_COUNTY <- TEMP[grepl("Cty",TEMP %>% pull(County),ignore.case=TRUE),]
TEMP_COUNTY$County <- gsub(" ","_",gsub("  "," ",gsub(" Cty","",TEMP_COUNTY$County,ignore.case=TRUE)))

TEMP_CITY <- TEMP[grep("Cty",TEMP %>% pull(County),ignore.case=TRUE,invert=TRUE),]
TEMP_CITY$County <-gsub("Frannie_","Frannie", gsub("Mtn._View","Mountain_View",gsub("E._Therm","East_Therm",gsub(" ","_",gsub("  ","",TEMP_CITY %>% pull(County))))))
TEMP_CITY <- TEMP_CITY %>% rename(City=County)

TEMP_CITY  <- TEMP_CITY %>% filter(Year!=1990)
TEMP_COUNTY  <- TEMP_COUNTY %>% filter(Year!=1990)


CITY_POP <- rbind(TEMP_CITY,CITY_POP)
COUNTY_POP <- rbind(TEMP_COUNTY,COUNTY_POP)

#ggplot(aes(x=Year,y=Population,group=County,color=County),data=COUNTY_POP)+geom_line()
try(rm(TEMP_CITY,TEMP_COUNTY,TEMP))

####################County Population Data for 1970-1980
if(!file.exists("./Data/Pop_1970s.xls")){download.file('http://eadiv.state.wy.us/pop/Cnty7080.xls',"./Data/Pop_1970s.xls")}
TEMP <- read_xls("Data/Pop_1970s.xls",skip=2)[-1:-4,]
colnames(TEMP)[1] <- "County"
TEMP <- TEMP[1:which(TEMP[,1]=="Weston"),]
TEMP <- pivot_longer(TEMP,all_of(colnames(TEMP)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=as.numeric(Population))

TEMP$County <- gsub(" ","_",TEMP$County)
TEMP <- TEMP %>% filter(Year!=1980)

COUNTY_POP <- rbind(TEMP,COUNTY_POP)
#ggplot(aes(x=Year,y=Population,group=County,color=County),data=COUNTY_POP)+geom_line()
try(rm(TEMP))
###########Old data addtion:Period Ends in 1970
	#See in part http://eadiv.state.wy.us/demog_data/cntycity_hist.htm
	LN_OLD <- c(12487,10894,10286,9023,9018,8640) #Missing in 1910
	Year <-  seq(1920,1970,by=10)
	TEMP <-	cbind(Year,rep("Lincoln",6),LN_OLD)
	colnames(TEMP ) <- c("Year","County","Population")
	TEMP <- as_tibble(TEMP)
	COUNTY_POP <- rbind(TEMP,COUNTY_POP) %>% arrange(County,Year)


	KEM_OLD <- c(843,1517,1884,2026,1667,2028,2292) #1910 forward until 1970
	Year <-  seq(1910,1970,by=10)
	TEMP <-	cbind(Year,rep("kemmerer",7),KEM_OLD)
	colnames(TEMP ) <- c("Year","City","Population")
	TEMP <- as_tibble(TEMP)
	CITY_POP <- rbind(TEMP,CITY_POP)


	DIAMOND_OLD <- c(696,726,812,586,415,398,485)
	TEMP <-	cbind(Year,rep("Diamondvile",7),DIAMOND_OLD)
	colnames(TEMP ) <- c("Year","City","Population")
	TEMP <- as_tibble(TEMP)
	CITY_POP <- rbind(TEMP,CITY_POP)  %>% arrange(City,Year)
#Add Other Data
COUNTY_POP <- COUNTY_POP %>% mutate(Year=as.integer(Year)) %>% unique
WY_COUNTY_DATA_SET <- COUNTY_POP %>% left_join(WY_COUNTY_DATA_SET ) %>% mutate(Population=as.numeric(Population)) %>% unique

###Save Population Results
write_csv(CITY_POP,file="./Data/Cleaned_Data/Wyoming_City_Population.csv")
write_csv(WY_COUNTY_DATA_SET ,file="./Data/Cleaned_Data/Wyoming_County_Population.csv")
saveRDS(CITY_POP,file="./Data/Cleaned_Data/Wyoming_City_Population.Rds")
saveRDS(WY_COUNTY_DATA_SET ,file="./Data/Cleaned_Data/Wyoming_County_Population.Rds")


###################Demographics
if(!file.exists("./Data/Demo_Single_Year_2020s.xls")){download.file('http://eadiv.state.wy.us/Pop/CO_SYASEX24.xlsx',"./Data/Demo_Single_Year_2020s.xls")}
TEMP <- read_xlsx("./Data/Demo_Single_Year_2020s.xls",skip=2)[,-1]
TEMP <- TEMP[1:(min(which(is.na(TEMP[,1])))-1),]
TEMP <- TEMP[!grepl("Base",TEMP$YEAR,ignore.case=TRUE),] #There are two population values provided. I believe one is the census baseline, and one is a estimate in July. Keep the later estimate, to line up with the same seasonal collection pattern of the rest of the data
TEMP$YEAR <- year(as.Date(substr((TEMP$YEAR),1,8),format="%m/%d/%Y"))
colnames(TEMP) <- c("County","Year","Age","Number","Num_Male","Num_Female")
TEMP$County <- gsub(" County","",TEMP$County,ignore.case=TRUE)
DEM_2020 <- TEMP %>% select(-Number)
###Demogrpahics all
DEM_DATA <- read_delim("Data/County_Demographics_Census/wy.1969_2023.singleages.through89.90plus.txt",delim="  ",col_names=c("ID","VALUES"),col_types=list('c','c'))
DEM_DATA$Year <- as.integer(substr(DEM_DATA$ID,1,4))
DEM_DATA$fips<- substr(DEM_DATA$ID,7,11)
COUNTY_LIST <- read_csv("https://github.com/kjhealy/fips-codes/raw/refs/heads/master/county_fips_master.csv",col_types=list('c','c')) %>% filter(state_abbr=="WY") %>% select(fips,County=county_name) %>% mutate(County=gsub(" ","_",gsub(" County","",County,ignore.case=TRUE)))
DEM_DATA <- DEM_DATA %>% left_join(COUNTY_LIST) %>% select(-fips)
#16=3
DEM_DATA$Sex <- ifelse(substr(DEM_DATA$VALUES,3,3)==1,"Male","Female")
DEM_DATA$Age <- parse_number(substr(DEM_DATA$VALUES,4,5))
DEM_DATA$Number <- parse_number(substr(DEM_DATA$VALUES,6,14))
DEM_DATA <- DEM_DATA %>% select(-ID,-VALUES)


DEM_DATA <- DEM_DATA %>% group_by(Year,County,Sex,Age) %>% summarize(Number=sum(Number)) %>% ungroup()#Aggregate to sex and age level

#The Wyoming census data seems newer than this data set from SEER cancer data source. Drop any of these records that overlap with the Wyoming data before merging. Arrange so the column order is the same between the two data sets, so they can be easily bound together.
DEM_DATA <- pivot_wider(DEM_DATA,names_from=Sex,values_from=Number) %>% rename(Num_Female=Female,Num_Male=Male) %>% select(colnames(DEM_2020)) %>% filter(Year<min(DEM_2020$Year)) %>% unique
DEM_DATA <- rbind(DEM_2020,DEM_DATA) %>% ungroup %>% arrange(Year,Age)  %>% unique

###Save demographic data set
	LIN_DEM <- DEM_DATA %>% filter(County=='Lincoln')
	write_csv(LIN_DEM,file="./Data/Cleaned_Data/Lincoln_Demographic_Data.csv")
	write_csv(DEM_DATA,file="./Data/Cleaned_Data/Wyoming_County_Demographic_Data.csv")
	saveRDS(LIN_DEM,file="./Data/Cleaned_Data/Lincoln_Demographic_Data.Rds")
	saveRDS(DEM_DATA,file="./Data/Cleaned_Data/Wyoming_County_Demographic_Data.Rds")

#########################################Mortality Rate
GET_MORTALITY_DATA <- function(FILE,SEX,LOWER_AGE,UPPER_AGE){
	#Create clean moratlity rate data
	#Data gathered from https://hdpulse.nimhd.nih.gov/data-portal/mortality/table?cod=247&cod_options=cod_15&ratetype=aa&ratetype_options=ratetype_2&race=00&race_options=race_6&sex=2&sex_options=sex_3&age=177&age_options=age_11&ruralurban=0&ruralurban_options=ruralurban_3&yeargroup=5&yeargroup_options=year5yearmort_1&statefips=56&statefips_options=area_states&county=56000&county_options=counties_wyoming&comparison=counties_to_us&comparison_options=comparison_counties&radio_comparison=areas&radio_comparison_options=cods_or_areas

	NAMES <- c("County","FIPS","Death_Rate","Lower_Rate","Upper_Rate","Deaths","Trend_Category","Trend","Lower_Trend","Upper_Trend")
	DF <- read_csv(FILE,skip=5,col_names=NAMES,col_types=list('c',"i",'d','d','d','d','c','d','d','d')) %>% filter(grepl("County|Wyoming",County)|County=="United States") %>% mutate(Rate_SD=(Upper_Rate-Lower_Rate)/(2*1.96),Trend_SD=(Upper_Trend-Lower_Trend)/(2*1.96)) %>% select(County,Death_Rate,Rate_SD,Trend,Trend_SD)
	DF$County <- gsub(" County","",DF$County,ignore.case=TRUE)
	DF[,-1] <- DF[,-1]/100000
	WYOMING_TREND <- pull(DF[DF$County=="Wyoming",],"Trend")
	US_TREND <- pull(DF[DF$County=="United States",],"Trend")

	WYOMING_RATE <- pull(DF[DF$County=="Wyoming",],"Death_Rate")
	US_RATE <- pull(DF[DF$County=="United States",],"Death_Rate")

	DF$Imparted_Trend <- FALSE
	if(is.na(WYOMING_TREND)){
		DF[1,4:5] <- DF[2,4:5]
		DF[1,6] <- TRUE
	}

	DF$Imparted_Rate <- FALSE
	if(is.na(WYOMING_RATE)){
		DF[1,4:5] <- DF[2,2:3]
		DF[1,6] <- TRUE
	}
	WYOMING_BASELINE_TREND <-cbind (DF[1,4:5] ,TRUE)
	WYOMING_BASELINE_RATE <-DF[1,2:3]

	for(i in 3:nrow(DF)){
		#Impart any missing trends based on higher levels
		if(is.na(pull(DF[i,],"Trend"))){ DF[i,4:6] <- WYOMING_BASELINE_TREND}
		#Impart any missing death rates based on higher levels
		if(is.na(pull(DF[i,],"Death_Rate"))){
			DF[i,2:3] <- WYOMING_BASELINE_RATE
			DF[i,"Imparted_Rate"] <- TRUE
		}
	}
	DF$Sex <- SEX
	DF$Min_Age <- LOWER_AGE
	DF$Max_Age  <- UPPER_AGE
	DF <- DF %>% select(County,Sex,Min_Age,Max_Age,Death_Rate,Rate_SD,Imparted_Rate,everything())
	return(DF)
}
MORTALITY_DATA_ALL <- rbind(
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/A_Under1.csv","Female",0,0),
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/B_1_9.csv","Female",1,9),
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/C_10_19.csv","Female",10,19),
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/D_20_39.csv","Female",20,39),
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/E_40_64.csv","Female",40,64),
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/G_65_74.csv","Female",65,74),
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/H_75_84.csv","Female",75,84),
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/I_85+.csv","Female",85,Inf),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/A_Under1.csv","Male",0,0),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/B_1_9.csv","Male",1,9),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/C_10_19.csv","Male",10,19),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/D_20_39.csv","Male",20,39),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/E_40_64.csv","Male",40,64),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/G_65_74.csv","Male",65,74),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/H_75_84.csv","Male",75,84),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/I_85+.csv","Male",85,Inf)

)
LIN_MORTALITY <- MORTALITY_DATA_ALL  %>% filter(County=="Lincoln")
###Save Mortality Rate data set
	write_csv(LIN_MORTALITY,file="./Data/Cleaned_Data/Lincoln_Mortality_Rate.csv")
	write_csv(MORTALITY_DATA_ALL,file="./Data/Cleaned_Data/Not_Used/Wyoming_County_Mortality_Rate.csv")
	saveRDS(LIN_MORTALITY,file="./Data/Cleaned_Data/Lincoln_Mortality_Rate.Rds")
	saveRDS(MORTALITY_DATA_ALL,file="./Data/Cleaned_Data/Not_Used/Wyoming_County_Mortality_Rate.Rds")

#Clean all data from memory. Will be loaded in any paticular script as needed
rm(list = ls())
gc()