Population_Study/Scripts/Data_Load.r

#################SEE FINAL LINES FOR NEXT STEPS
library(rvest)
library(tidyverse)
library(readxl)
#setwd("../")
########County, Death, Birth and Migration Data
#Data found on the page http://eadiv.state.wy.us/pop/
PAGE <- read_html("http://eadiv.state.wy.us/pop/BirthDeathMig.htm")
NODE <- html_element(PAGE ,"table")
TBL <- html_table(NODE)

ST <- which(toupper(TBL$X1)=="ALBANY")
END <- which(toupper(TBL$X1)=="TOTAL")
TYPES <- TBL[ST-2,1]

ST_YEAR <- 1971
ALL_DATA <- list()
TBL <- TBL[,c(1,which(!is.na(as.numeric(TBL[ST[1],]))))]
TBL <- TBL[,-ncol(TBL)]
colnames(TBL) <- c("County",(ST_YEAR:(ST_YEAR+ncol(TBL)-1)))
TBL$Type  <- NA
for(i in 1:length(ST)){
	TBL[ST[i]:END[i],"Type"]<- as.character(TYPES[i,1])
}
TBL[ST[2]:END[2],"Type"] <-  as.character(TYPES[2,1])
TBL$Type
TBL <- TBL %>% filter(!is.na(Type)) %>% select(County,Type,everything())
GROUP <- colnames(TBL)[-1:-2]
Data <- pivot_longer(TBL,all_of(GROUP),names_to="Year",values_to="Pop_Change")
Data$County <- ifelse(toupper(Data$County)=="TOTAL","Wyoming",Data$County)
WY_COUNTY_DATA_SET <- pivot_wider(Data,names_from=Type,values_from=Pop_Change) %>% rename("Migration"=`Net Migration`) %>% mutate(Year=as.integer(Year),Births=parse_number(Births),Deaths=parse_number(Deaths),Migration=parse_number(Migration))

########################City and County Population Data 2020 to 2024
PAGE <- read_html('http://eadiv.state.wy.us/pop/Place-24EST.htm')
NODE <- html_element(PAGE ,"table")
TBL <- html_table(NODE)

ST <- which(toupper(TBL$X1)==toupper("Albany County"))
END <- which(toupper(TBL$X1)==toupper("Balance of Weston County"))

#More years than are pulled are listed to make more generic
COLUMNS <- c(1,which(TBL[ST-2,] %in% 1970:2025))
NAMES <- TBL[4,COLUMNS][-1]
TBL <- TBL[ST:END,COLUMNS ]
colnames(TBL) <- c("County",NAMES)

TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population))
TBL$County <- gsub("  "," ",gsub("\n","",gsub("\r","",TBL %>% pull(County))))

COUNTY_POP<- TBL[grep("COUNTY",TBL %>% pull(County),ignore.case=TRUE),]
COUNTY_POP<-	COUNTY_POP[grep("Balance",COUNTY_POP%>% pull(County),invert=TRUE,ignore.case=TRUE),]
COUNTY_POP$County <- gsub(" ","_",gsub(" County","",COUNTY_POP$County))
CITY_POP <- TBL[sort(c(grep("County",TBL %>% pull(County),invert=TRUE,ignore.case=TRUE),grep("Balance",TBL %>% pull(County),ignore.case=TRUE))),]
CITY_POP$County <- gsub(" ","_",gsub("Balance of","Unincorporated",gsub(" County","",gsub(" city","",gsub(" town","",CITY_POP$County,ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE))
CITY_POP <- CITY_POP %>% rename("City"=County)
########################City Population Data 2010 to 2020
PAGE <- read_html('http://eadiv.state.wy.us/pop/sub-est11-19.htm')
NODE <- html_element(PAGE ,"table")
TBL <- html_table(NODE)
ST <- which(toupper(TBL$X1)==toupper("Afton town, Wyoming"))
END <- which(toupper(TBL$X1)==toupper("Yoder town, Wyoming"))
#More years than are pulled are listed to make more generic
COLUMNS <- c(1,which(TBL[ST-1,] %in% 1970:2025))
NAMES <- TBL[3,COLUMNS][-1]
TBL <- TBL[ST:END,COLUMNS ]
colnames(TBL) <- c("City",NAMES)
TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population))
TBL$City <- gsub(" ","_",gsub(" $","",gsub("\r|\n| Wyoming|,| town| city","",TBL$City,ignore.case=TRUE)))
CITY_POP <- rbind(TBL,CITY_POP)
########################County Population Data 2010 to 2020
PAGE <- read_html('http://eadiv.state.wy.us/pop/ctyest11-19.htm')
NODE <- html_element(PAGE ,"table")
TBL <- html_table(NODE)
ST <- grep("Albany",TBL$X1)
END <- grep("Weston",TBL$X1)
#More years than are pulled are listed to make more generic
COLUMNS <- c(1,which(TBL[ST-2,] %in% 1970:2025))
NAMES <- TBL[3,COLUMNS][-1]
TBL <- TBL[ST:END,COLUMNS ]
colnames(TBL) <- c("County",NAMES)
TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population))
TBL$County <-  gsub(" ","_",gsub("  "," ",gsub(" $","",gsub("\r|\n| Wyoming|,| town| city| County|\\.","",TBL$County,ignore.case=TRUE))))

COUNTY_POP <- rbind(TBL,COUNTY_POP)

########################County and City Population Data 2000 to 2010
PAGE <- read_html('http://eadiv.state.wy.us/pop/sub-est01-09.htm')
NODE <- html_element(PAGE ,"table")
TBL <- html_table(NODE)

ST <- which(toupper(TBL$X1)==toupper("Albany County"))
END <- which(toupper(TBL$X1)==toupper("Balance of Weston County"))

#More years than are pulled are listed to make more generic
COLUMNS <- c(1,which(TBL[ST-4,] %in% 1970:2025))
NAMES <- TBL[4,COLUMNS][-1]
TBL <- TBL[ST:END,COLUMNS ]
colnames(TBL) <- c("County",NAMES)
TBL <-       	pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population))
TBL$County <- gsub("  "," ",gsub("\n","",gsub("\r","",TBL %>% pull(County))))

COUNTY_TBL <- TBL[grep("COUNTY",TBL %>% pull(County),ignore.case=TRUE),]
COUNTY_TBL <-COUNTY_TBL[grep("Balance",COUNTY_TBL%>% pull(County),invert=TRUE,ignore.case=TRUE),]
COUNTY_TBL$County <-gsub("_(pt.)","", gsub(" ","_",gsub(" County","",COUNTY_TBL$County)))

CITY_TBL <- TBL[sort(c(grep("County",TBL %>% pull(County),invert=TRUE,ignore.case=TRUE),grep("Balance",TBL %>% pull(County),ignore.case=TRUE))),]
CITY_TBL$County <- gsub(" ","_",gsub("Balance of","Unincorporated",gsub(" County","",gsub(" city","",gsub(" town","",CITY_TBL$County,ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE))
CITY_TBL <- CITY_TBL %>% rename("City"=County)
CITY_POP <- rbind(CITY_TBL,CITY_POP)
#Cleanup names
CITY_POP$City <- gsub("LaGrange","La_Grange",CITY_POP$City)
COUNTY_POP <- rbind(COUNTY_TBL,COUNTY_POP)
####################County and City Population Data for 1990-2000
if(!file.exists("./Data/Pop_1990s.xls")){download.file('http://eadiv.state.wy.us/pop/c&sc90_00.xls',"./Data/Pop_1990s.xls")}
TEMP <- read_xls("Data/Pop_1990s.xls",skip=2)[-1:-4,]
colnames(TEMP)[1] <- "County"
tail(TEMP)
TEMP <- TEMP[1:which(TEMP[,1]=="Wind River Res."),]

TEMP <- pivot_longer(TEMP,all_of(colnames(TEMP)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=as.numeric(Population))
TEMP_COUNTY <- TEMP[grepl("Cnty",TEMP %>% pull(County),ignore.case=TRUE),]
TEMP_COUNTY$County <- gsub(" ","_",gsub("  "," ",gsub(" Cnty","",TEMP_COUNTY$County,ignore.case=TRUE)))

TEMP_CITY <- TEMP[grep("Cnty",TEMP %>% pull(County),ignore.case=TRUE,invert=TRUE),]
TEMP_CITY$County <- gsub("E_Therm","East_Therm",gsub(" ","_",gsub("  ","",TEMP_CITY %>% pull(County))))
TEMP_CITY <- TEMP_CITY %>% rename(City=County)
TEMP_CITY %>% pull(City) %>% unique %>% sort
CITY_POP <- rbind(TEMP_CITY,CITY_POP)
CITY_POP %>% pull(City) %>% unique %>% sort
COUNTY_POP <- rbind(TEMP_COUNTY,COUNTY_POP)
#ggplot(aes(x=Year,y=Population,group=County,color=County),data=COUNTY_POP)+geom_line()
try(rm(TEMP_CITY,TEMP_COUNTY,TEMP))
####################County and City Population Data for 1980-1990
if(!file.exists("./Data/Pop_1980s.xls")){download.file('http://eadiv.state.wy.us/pop/C&SC8090.xls',"./Data/Pop_1980s.xls")}
TEMP <- read_xls("Data/Pop_1980s.xls",skip=2)[-1:-4,]
colnames(TEMP)[1] <- "County"
TEMP <- TEMP[2:which(TEMP[,1]=="Upton"),1:(min(which(is.na(TEMP[2,])))-1)]
TEMP <- pivot_longer(TEMP,all_of(colnames(TEMP)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=as.numeric(Population))

TEMP_COUNTY <- TEMP[grepl("Cty",TEMP %>% pull(County),ignore.case=TRUE),]
TEMP_COUNTY$County <- gsub(" ","_",gsub("  "," ",gsub(" Cty","",TEMP_COUNTY$County,ignore.case=TRUE)))

TEMP_CITY <- TEMP[grep("Cty",TEMP %>% pull(County),ignore.case=TRUE,invert=TRUE),]
TEMP_CITY$County <-gsub("Frannie_","Frannie", gsub("Mtn._View","Mountain_View",gsub("E._Therm","East_Therm",gsub(" ","_",gsub("  ","",TEMP_CITY %>% pull(County))))))
TEMP_CITY <- TEMP_CITY %>% rename(City=County)

CITY_POP <- rbind(TEMP_CITY,CITY_POP)
COUNTY_POP <- rbind(TEMP_COUNTY,COUNTY_POP)
#ggplot(aes(x=Year,y=Population,group=County,color=County),data=COUNTY_POP)+geom_line()
try(rm(TEMP_CITY,TEMP_COUNTY,TEMP))

####################County Population Data for 1980-1990
if(!file.exists("./Data/Pop_1970s.xls")){download.file('http://eadiv.state.wy.us/pop/Cnty7080.xls',"./Data/Pop_1970s.xls")}
TEMP <- read_xls("Data/Pop_1970s.xls",skip=2)[-1:-4,]
colnames(TEMP)[1] <- "County"
TEMP <- TEMP[1:which(TEMP[,1]=="Weston"),]
TEMP <- pivot_longer(TEMP,all_of(colnames(TEMP)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=as.numeric(Population))

TEMP$County <- gsub(" ","_",TEMP$County)

COUNTY_POP <- rbind(TEMP,COUNTY_POP)
#ggplot(aes(x=Year,y=Population,group=County,color=County),data=COUNTY_POP)+geom_line()
try(rm(TEMP))
###########Old data addtion:Period Ends in 1970
	#See in part http://eadiv.state.wy.us/demog_data/cntycity_hist.htm
	LN_OLD <- c(12487,10894,10286,9023,9018,8640) #Missing in 1910
	Year <-  seq(1920,1970,by=10)
	TEMP <-	cbind(Year,rep("Lincoln",6),LN_OLD)
	colnames(TEMP ) <- c("Year","County","Population")
	TEMP <- as_tibble(TEMP)
	COUNTY_POP <- rbind(TEMP,COUNTY_POP) %>% arrange(County,Year)


	KEM_OLD <- c(843,1517,1884,2026,1667,2028,2292) #1910 forward until 1970
	Year <-  seq(1910,1970,by=10)
	TEMP <-	cbind(Year,rep("kemmerer",7),KEM_OLD)
	colnames(TEMP ) <- c("Year","City","Population")
	TEMP <- as_tibble(TEMP)
	CITY_POP <- rbind(TEMP,CITY_POP)


	DIAMOND_OLD <- c(696,726,812,586,415,398,485)
	TEMP <-	cbind(Year,rep("Diamondvile",7),DIAMOND_OLD)
	colnames(TEMP ) <- c("Year","City","Population")
	TEMP <- as_tibble(TEMP)
	CITY_POP <- rbind(TEMP,CITY_POP)  %>% arrange(City,Year)
#Add Other Data
COUNTY_POP <- COUNTY_POP %>% mutate(Year=as.integer(Year))
WY_COUNTY_DATA_SET <- COUNTY_POP %>% left_join(WY_COUNTY_DATA_SET )
###Save Population Results
write_csv(CITY_POP,file="./Data/Cleaned_Data/Wyoming_City_Population.csv")
write_csv(WY_COUNTY_DATA_SET ,file="./Data/Cleaned_Data/Wyoming_County_Population.csv")
saveRDS(CITY_POP,file="./Data/Cleaned_Data/Wyoming_City_Population.Rds")
saveRDS(WY_COUNTY_DATA_SET ,file="./Data/Cleaned_Data/Wyoming_County_Population.Rds")


###################Demographics
if(!file.exists("./Data/Demo_Single_Year_2020s.xls")){download.file('http://eadiv.state.wy.us/Pop/CO_SYASEX24.xlsx',"./Data/Demo_Single_Year_2020s.xls")}
TEMP <- read_xlsx("./Data/Demo_Single_Year_2020s.xls",skip=2)[,-1]
TEMP <- TEMP[1:(min(which(is.na(TEMP[,1])))-1),]
TEMP$YEAR <- year(as.Date(substr((TEMP$YEAR),1,8),format="%m/%d/%Y"))
colnames(TEMP) <- c("County","Year","Age","Number","Num_Male","Num_Female")
TEMP$County <- gsub(" County","",TEMP$County,ignore.case=TRUE)
DEM_2020 <- TEMP %>% select(-Year,-Number)
DEM_2020

##Mortality Rate
GET_MORTALITY_DATA <- function(FILE,SEX,LOWER_AGE,UPPER_AGE){
	#Create clean moratlity rate data
	#Data gathered from https://hdpulse.nimhd.nih.gov/data-portal/mortality/table?cod=247&cod_options=cod_15&ratetype=aa&ratetype_options=ratetype_2&race=00&race_options=race_6&sex=2&sex_options=sex_3&age=177&age_options=age_11&ruralurban=0&ruralurban_options=ruralurban_3&yeargroup=5&yeargroup_options=year5yearmort_1&statefips=56&statefips_options=area_states&county=56000&county_options=counties_wyoming&comparison=counties_to_us&comparison_options=comparison_counties&radio_comparison=areas&radio_comparison_options=cods_or_areas

	NAMES <- c("County","FIPS","Death_Rate","Lower_Rate","Upper_Rate","Deaths","Trend_Category","Trend","Lower_Trend","Upper_Trend")
	DF <- read_csv(FILE,skip=5,col_names=NAMES,col_types=list('c',"i",'d','d','d','d','c','d','d','d')) %>% filter(grepl("County|Wyoming",County)|County=="United States") %>% mutate(Rate_SD=(Upper_Rate-Lower_Rate)/(2*1.96),Trend_SD=(Upper_Trend-Lower_Trend)/(2*1.96)) %>% select(County,Death_Rate,Rate_SD,Trend,Trend_SD)
	DF$County <- gsub(" County","",DF$County,ignore.case=TRUE)
	DF[,-1] <- DF[,-1]/100000
	WYOMING_TREND <- pull(DF[DF$County=="Wyoming",],"Trend")
	US_TREND <- pull(DF[DF$County=="United States",],"Trend")

	WYOMING_RATE <- pull(DF[DF$County=="Wyoming",],"Death_Rate")
	US_RATE <- pull(DF[DF$County=="United States",],"Death_Rate")

	DF$Imparted_Trend <- FALSE
	if(is.na(WYOMING_TREND)){
		DF[1,4:5] <- DF[2,4:5]
		DF[1,6] <- TRUE
	}

	DF$Imparted_Rate <- FALSE
	if(is.na(WYOMING_RATE)){
		DF[1,4:5] <- DF[2,2:3]
		DF[1,6] <- TRUE
	}
	WYOMING_BASELINE_TREND <-cbind (DF[1,4:5] ,TRUE)
	WYOMING_BASELINE_RATE <-DF[1,2:3]

	for(i in 3:nrow(DF)){
		#Impart any missing trends based on higher levels
		if(is.na(pull(DF[i,],"Trend"))){ DF[i,4:6] <- WYOMING_BASELINE_TREND}
		#Impart any missing death rates based on higher levels
		if(is.na(pull(DF[i,],"Death_Rate"))){
			DF[i,2:3] <- WYOMING_BASELINE_RATE
			DF[i,"Imparted_Rate"] <- TRUE
		}
	}
	DF$Sex <- SEX
	DF$Min_Age <- LOWER_AGE
	DF$Max_Age  <- UPPER_AGE
	DF <- DF %>% select(County,Sex,Min_Age,Max_Age,Death_Rate,Rate_SD,Imparted_Rate,everything())
	return(DF)
}
MORTALITY_DATA_ALL <- rbind(
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/A_Under1.csv","Female",0,0),
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/B_1_9.csv","Female",1,9),
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/C_10_19.csv","Female",10,19),
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/D_20_39.csv","Female",20,39),
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/E_40_64.csv","Female",40,64),
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/G_65_74.csv","Female",65,74),
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/H_75_84.csv","Female",75,84),
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/I_85+.csv","Female",85,Inf),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/A_Under1.csv","Male",0,0),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/B_1_9.csv","Male",1,9),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/C_10_19.csv","Male",10,19),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/D_20_39.csv","Male",20,39),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/E_40_64.csv","Male",40,64),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/G_65_74.csv","Male",65,74),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/H_75_84.csv","Male",75,84),
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/I_85+.csv","Male",85,Inf)

)
####NEXT STEP JOIN MORTALITY TO POPULATION!!!!!!!!!!!!!!!
LIN_MORTALITY <- MORTALITY_DATA_ALL  %>% filter(County=="Lincoln")
LIN_DEM <- DEM_2020 %>% filter(County=='Lincoln')
CURRENT <- LIN_DEM[1,]
VALUES <- LIN_MORTALITY %>% filter(Sex=="Male",Min_Age<=0,Max_Age>=0) %>% select(Death_Rate,Rate_SD,Trend,Trend_SD)
VALUES
VALUES$RATE_SD
TREND_SIM <- function(NUM_YEARS_FORWARD,VALUES){
	RES <- c()
	C_YEAR_VALUE <- VALUES$Death_Rate
		for(i in 1:100){
			RES[length(RES)+1] <- C_YEAR_VALUE
			C_YEAR_VALUE <- rnorm(1,mean=VALUES$Trend,sd=VALUES$Trend_SD)+C_YEAR_VALUE
		}
	return(RES)
}
TREND_SIM(100,VALUES)


rnorm(50,mean=VALUES$Death_Rate,sd=VALUES$Rate_SD)
YEARS_AHEAD<-10
for(i in 0:YEARS_AHEAD)
rnorm(50,mean=VALUES$Death_Rate,sd=VALUES$Rate_SD)