Population_Study/Scripts/1_Download_and_Process_Population_Data.r
2025-11-06 17:21:41 -07:00

344 lines
19 KiB
R

############################# TODO: Clean up the script folders. The death simulations should be separated from this file: the death-rate simulation is complicated, so it should be commented and moved into its own script.
library(rvest)
library(tidyverse)
library(readxl)
library(curl) #To archive the html files
#setwd("../")
### Raw-data folder layout ----
# SAVE_LOC_RAW may already be defined before this script runs; the default is
# only used when it is missing. Every folder path keeps its trailing "/"
# because file names are appended to these folders with paste0().
if (!exists("SAVE_LOC_RAW")) {
  SAVE_LOC_RAW <- "./Data/Raw_Data/"
}
SAVE_LOC_RAW_POP <- paste0(SAVE_LOC_RAW, "Population/")
RAW_HTML_LOC <- paste0(SAVE_LOC_RAW_POP, "HTML_Population_Files/")
RAW_EXCEL_LOC <- paste0(SAVE_LOC_RAW_POP, "Excel_Population_Files/")
# Create all folders in one pass; folders that already exist are left alone
# and produce no warning.
invisible(lapply(
  c(SAVE_LOC_RAW, SAVE_LOC_RAW_POP, RAW_HTML_LOC, RAW_EXCEL_LOC),
  dir.create,
  recursive = TRUE,
  showWarnings = FALSE
))
## Start the README that documents the raw data. append = FALSE means the
## file is rewritten from scratch on every run.
cat(
  "Most data is supplied by the Wyoming Department of Administration & Information Economic Analysis Division (WIEAD). Their data is either directly pulled from other sources like the census or is interpolated using this data (such as deaths, and migration).\n This folder saves the raw HTML of a few population data source, which were stored as a web table rather than a CSV or excel file so that in the future if the web pages are removed or changed the code will still run, and can be updated with new information.\nSome files are supplied as HTML tables from a webpage, while others are excel files. ",
  file = paste0(SAVE_LOC_RAW_POP, "README_POPULATION_DATA.txt"),
  append = FALSE
)
######## County birth, death and net-migration data ----
# Data found on the page http://eadiv.state.wy.us/pop/
# Website states: Wyoming Economic Analysis Division based on U.S. Census
# Bureau's population estimation and vital stats.
# Result: WY_COUNTY_DATA_SET, one row per County/Year with the columns
# Births, Deaths and Migration.
BIRTH_DEATH_MIGRATION_HTML_LOC <- paste0(RAW_HTML_LOC, "County_Migration_Deaths_Births.html")
# Download the page rather than reading the live site, so a local backup is
# available if it ever goes offline. The download is best effort: when it
# fails, a copy archived by an earlier run is used instead.
try(curl_download("http://eadiv.state.wy.us/pop/BirthDeathMig.htm", destfile = BIRTH_DEATH_MIGRATION_HTML_LOC))
if (!file.exists(BIRTH_DEATH_MIGRATION_HTML_LOC)) {
  stop(
    "Download failed and no archived copy exists at ",
    BIRTH_DEATH_MIGRATION_HTML_LOC,
    call. = FALSE
  )
}
# Add to the notes (the archive is saved with the .html extension).
cat(
  "\n\n 1) Wyoming births, deaths, and net migration by county of residence: 1971 - 2023:
County_Migration_Deaths_Births.html comes from http://eadiv.state.wy.us/pop/BirthDeathMig.htm
Data Type: HTML table
Data Source: WIEAD
Original Source: Wyoming Department of Health and U.S. Census Bureau",
  file = paste0(SAVE_LOC_RAW_POP, "README_POPULATION_DATA.txt"),
  append = TRUE
)
PAGE <- read_html(BIRTH_DEATH_MIGRATION_HTML_LOC)
NODE <- html_element(PAGE, "table")
TBL <- html_table(NODE)
# The single HTML table stacks one sub-table per measure. Each sub-table runs
# from its "Albany" row to its "Total" row, and its label is read from the
# first column two rows above the "Albany" row.
ST <- which(toupper(TBL$X1) == "ALBANY")
END <- which(toupper(TBL$X1) == "TOTAL")
stopifnot(length(ST) >= 1, length(END) >= length(ST))
TYPES <- TBL[ST - 2, 1]
ST_YEAR <- 1971
ALL_DATA <- list() # not used in this section; kept in case later code expects it
# Keep the county column plus every column that holds a number in the first
# "Albany" row. Non-numeric cells become NA here on purpose, so the coercion
# warning is silenced.
TBL <- TBL[, c(1, which(!is.na(suppressWarnings(as.numeric(TBL[ST[1], ])))))]
# Drop the last numeric column.
TBL <- TBL[, -ncol(TBL)]
# One name per column: "County" followed by consecutive years from ST_YEAR.
# (The previous expression produced one name more than there are columns.)
colnames(TBL) <- c("County", ST_YEAR + seq_len(ncol(TBL) - 1) - 1)
# Tag every row of each sub-table with its measure; rows outside the
# sub-tables keep NA and are dropped below.
TBL$Type <- NA_character_
for (i in seq_along(ST)) {
  TBL[ST[i]:END[i], "Type"] <- as.character(TYPES[i, 1])
}
TBL <- TBL %>%
  filter(!is.na(Type)) %>%
  select(County, Type, everything())
GROUP <- colnames(TBL)[-1:-2]
Data <- pivot_longer(TBL, all_of(GROUP), names_to = "Year", values_to = "Pop_Change")
# The "Total" row of each sub-table is relabelled as Wyoming.
Data$County <- ifelse(toupper(Data$County) == "TOTAL", "Wyoming", Data$County)
WY_COUNTY_DATA_SET <- pivot_wider(Data, names_from = Type, values_from = Pop_Change) %>%
  rename("Migration" = `Net Migration`) %>%
  mutate(
    Year = as.integer(Year),
    Births = parse_number(Births),
    Deaths = parse_number(Deaths),
    Migration = parse_number(Migration)
  ) %>%
  mutate(Year = Year - 1) # Data appears to be one off from population
WY_COUNTY_DATA_SET$County <- gsub(" ", "_", WY_COUNTY_DATA_SET$County)
######################## City and county population data, 2020 to 2024 ----
# Produces COUNTY_POP (County, Year, Population) and CITY_POP (City, Year,
# Population) from one archived HTML table.
CITY_POPULATION_A <- paste0(RAW_HTML_LOC, "Wyoming_City_and_County_Population_2020_2024.html")
# Download the file rather than directly use the website, so a backup is
# available if it ever goes offline.
try(curl_download("http://eadiv.state.wy.us/pop/Place-24EST.htm", destfile = CITY_POPULATION_A))
# Add to the notes
cat(
  "\n 2) Wyoming Incorporated Place Population Estimates: April 1, 2020 to July 1, 2024
Wyoming_City_and_County_Population_2020_2024.html comes from http://eadiv.state.wy.us/pop/Place-24EST.htm
Data Type: HTML table
Data Source: WIEAD
Original Source: U.S. Census Bureau, Population Division Wyoming Department of Health and U.S. Census Bureau",
  file = paste0(SAVE_LOC_RAW_POP, "README_POPULATION_DATA.txt"),
  append = TRUE
)
PAGE <- read_html(CITY_POPULATION_A)
NODE <- html_element(PAGE, "table")
TBL <- html_table(NODE)
# The data rows run from "Albany County" to "Balance of Weston County".
ST <- which(toupper(TBL$X1) == toupper("Albany County"))
END <- which(toupper(TBL$X1) == toupper("Balance of Weston County"))
# More years than are pulled are listed to make more generic.
# Year columns are detected two rows above the first data row, while the
# column names are read from row 4.
COLUMNS <- c(1, which(TBL[ST - 2, ] %in% 1970:2025))
NAMES <- TBL[4, COLUMNS][-1]
TBL <- TBL[ST:END, COLUMNS]
colnames(TBL) <- c("County", NAMES)
TBL <- TBL %>%
  pivot_longer(all_of(colnames(TBL)[-1]), names_to = "Year", values_to = "Population") %>%
  mutate(Year = as.integer(Year), Population = parse_number(Population))
# Strip line breaks from the place names.
# NOTE(review): as displayed, the last pattern and its replacement are both a
# single space, so that step changes nothing; confirm whether a double or
# non-breaking space was intended.
TBL$County <- TBL$County %>%
  gsub("\r", "", .) %>%
  gsub("\n", "", .) %>%
  gsub(" ", " ", .)
# Counties: rows that mention "county" but are not "Balance of ..." rows.
COUNTY_POP <- TBL[grep("COUNTY", TBL$County, ignore.case = TRUE), ]
COUNTY_POP <- COUNTY_POP[grep("Balance", COUNTY_POP$County, invert = TRUE, ignore.case = TRUE), ]
COUNTY_POP$County <- COUNTY_POP$County %>%
  gsub(" County", "", .) %>%
  gsub(" ", "_", .)
# Cities: every row that does not mention "county", plus the "Balance of ..."
# rows, kept in their original table order.
CITY_POP <- TBL[sort(c(
  grep("County", TBL$County, invert = TRUE, ignore.case = TRUE),
  grep("Balance", TBL$County, ignore.case = TRUE)
)), ]
# Drop the " town"/" city"/" County" suffixes, relabel "Balance of" rows as
# "Unincorporated", and replace spaces with underscores.
CITY_POP$County <- CITY_POP$County %>%
  gsub(" town", "", ., ignore.case = TRUE) %>%
  gsub(" city", "", ., ignore.case = TRUE) %>%
  gsub(" County", "", ., ignore.case = TRUE) %>%
  gsub("Balance of", "Unincorporated", ., ignore.case = TRUE) %>%
  gsub(" ", "_", .)
CITY_POP <- CITY_POP %>% rename("City" = County)
######################## City population data, 2010 to 2020 ----
# Prepends the 2010-2019 city rows to CITY_POP.
CITY_POPULATION_B <- paste0(RAW_HTML_LOC, "Wyoming_City_and_County_Population_2010_2020.html")
# Download the file rather than directly use the website, so a backup is
# available if it ever goes offline.
try(curl_download("http://eadiv.state.wy.us/pop/sub-est11-19.htm", destfile = CITY_POPULATION_B))
# Add to the notes
cat(
  "\n 3) Intercensal Estimates of the Resident Population for Incorporated Places in Wyoming: April 1, 2010 to April 1, 2020
Data Type: HTML table
Wyoming_City_and_County_Population_2010_2020.html comes from http://eadiv.state.wy.us/pop/sub-est11-19.htm
Data Source: WIEAD
Original Source: Source: U.S. Census Bureau, Population Division",
  file = paste0(SAVE_LOC_RAW_POP, "README_POPULATION_DATA.txt"),
  append = TRUE
)
PAGE <- read_html(CITY_POPULATION_B)
NODE <- html_element(PAGE, "table")
TBL <- html_table(NODE)
# The data rows run from "Afton town, Wyoming" to "Yoder town, Wyoming".
ST <- which(toupper(TBL$X1) == toupper("Afton town, Wyoming"))
END <- which(toupper(TBL$X1) == toupper("Yoder town, Wyoming"))
# More years than are pulled are listed to make more generic.
# Year columns are detected in the row directly above the first data row,
# while the column names are read from row 3.
COLUMNS <- c(1, which(TBL[ST - 1, ] %in% 1970:2025))
NAMES <- TBL[3, COLUMNS][-1]
TBL <- TBL[ST:END, COLUMNS]
colnames(TBL) <- c("City", NAMES)
TBL <- TBL %>%
  pivot_longer(all_of(colnames(TBL)[-1]), names_to = "Year", values_to = "Population") %>%
  mutate(Year = as.integer(Year), Population = parse_number(Population))
# Remove line breaks, commas and the " Wyoming"/" town"/" city" labels, trim
# one trailing space, then replace spaces with underscores.
TBL$City <- TBL$City %>%
  gsub("\r|\n| Wyoming|,| town| city", "", ., ignore.case = TRUE) %>%
  gsub(" $", "", .) %>%
  gsub(" ", "_", .)
# 2020 is dropped here; CITY_POP already holds 2020 onwards.
TBL <- TBL %>% filter(Year != 2020)
CITY_POP <- rbind(TBL, CITY_POP)
######################## County population data, 2010 to 2020 ----
# Prepends the 2010-2019 county rows to COUNTY_POP.
COUNTY_POPULATION_B <- paste0(RAW_HTML_LOC, "Wyoming_County_Population_2010_2020.html")
# Download the file rather than directly use the website, so a backup is
# available if it ever goes offline.
try(curl_download("http://eadiv.state.wy.us/pop/ctyest11-19.htm", destfile = COUNTY_POPULATION_B))
# Add to the notes
cat(
  "\n 4) Intercensal Estimates of the Resident Population for Counties in Wyoming: April 1, 2010 to April 1, 2020
Wyoming_County_Population_2010_2020.html comes from http://eadiv.state.wy.us/pop/ctyest11-19.htm
Data Type: HTML table
Data Source: WIEAD
Original Source: U.S. Census Bureau, Population Division",
  file = paste0(SAVE_LOC_RAW_POP, "README_POPULATION_DATA.txt"),
  append = TRUE
)
PAGE <- read_html(COUNTY_POPULATION_B)
NODE <- html_element(PAGE, "table")
TBL <- html_table(NODE)
# The data rows run from the row matching "Albany" to the row matching
# "Weston" (pattern match on the first column, not an exact comparison).
ST <- grep("Albany", TBL$X1)
END <- grep("Weston", TBL$X1)
# More years than are pulled are listed to make more generic.
# Year columns are detected two rows above the first data row, while the
# column names are read from row 3.
COLUMNS <- c(1, which(TBL[ST - 2, ] %in% 1970:2025))
NAMES <- TBL[3, COLUMNS][-1]
TBL <- TBL[ST:END, COLUMNS]
colnames(TBL) <- c("County", NAMES)
TBL <- TBL %>%
  pivot_longer(all_of(colnames(TBL)[-1]), names_to = "Year", values_to = "Population") %>%
  mutate(Year = as.integer(Year), Population = parse_number(Population))
# Remove line breaks, commas, periods and the " Wyoming"/" town"/" city"/
# " County" labels, trim one trailing space, then replace spaces with
# underscores.
# NOTE(review): as displayed, the third step replaces a single space with a
# single space and so changes nothing; confirm whether a double or
# non-breaking space was intended.
TBL$County <- TBL$County %>%
  gsub("\r|\n| Wyoming|,| town| city| County|\\.", "", ., ignore.case = TRUE) %>%
  gsub(" $", "", .) %>%
  gsub(" ", " ", .) %>%
  gsub(" ", "_", .)
# 2020 is dropped here; COUNTY_POP already holds 2020 onwards.
TBL <- TBL %>% filter(Year != 2020)
COUNTY_POP <- rbind(TBL, COUNTY_POP)
######################## County and city population data, 2000 to 2010 ----
# Prepends the 2000-2009 rows to both CITY_POP and COUNTY_POP.
CITY_POPULATION_C <- paste0(RAW_HTML_LOC, "Wyoming_City_and_County_Population_2000_2009.html")
# Download the file rather than directly use the website, so a backup is
# available if it ever goes offline.
try(curl_download("http://eadiv.state.wy.us/pop/sub-est01-09.htm", destfile = CITY_POPULATION_C))
# Add to the notes
cat(
  "\n 5) Intercensal Estimates of the Resident Population for Cities and Towns of Wyoming: April 1, 2000 to July 1, 2010
Wyoming_City_and_County_Population_2000_2009.html comes from 'http://eadiv.state.wy.us/pop/sub-est01-09.htm'
Data Type: HTML table
Data Source: WIEAD
Original Source: U.S. Census Bureau, Population Division",
  file = paste0(SAVE_LOC_RAW_POP, "README_POPULATION_DATA.txt"),
  append = TRUE
)
PAGE <- read_html(CITY_POPULATION_C)
NODE <- html_element(PAGE, "table")
TBL <- html_table(NODE)
# The data rows run from "Albany County" to "Balance of Weston County".
ST <- which(toupper(TBL$X1) == toupper("Albany County"))
END <- which(toupper(TBL$X1) == toupper("Balance of Weston County"))
# More years than are pulled are listed to make more generic.
# Year columns are detected four rows above the first data row, while the
# column names are read from row 4.
COLUMNS <- c(1, which(TBL[ST - 4, ] %in% 1970:2025))
NAMES <- TBL[4, COLUMNS][-1]
TBL <- TBL[ST:END, COLUMNS]
colnames(TBL) <- c("County", NAMES)
TBL <- TBL %>%
  pivot_longer(all_of(colnames(TBL)[-1]), names_to = "Year", values_to = "Population") %>%
  mutate(Year = as.integer(Year), Population = parse_number(Population))
# 2010 is dropped here; the 2010-2020 tables already supply it.
TBL <- TBL %>% filter(Year != 2010)
# Strip line breaks from the place names.
# NOTE(review): as displayed, the outer pattern and its replacement are both a
# single space, so that step changes nothing; confirm whether a double or
# non-breaking space was intended.
TBL$County <- gsub(" ", " ", gsub("\n", "", gsub("\r", "", TBL$County)))
# Counties: rows that mention "county" but are not "Balance of ..." rows.
COUNTY_TBL <- TBL[grep("COUNTY", TBL$County, ignore.case = TRUE), ]
COUNTY_TBL <- COUNTY_TBL[grep("Balance", COUNTY_TBL$County, invert = TRUE, ignore.case = TRUE), ]
COUNTY_TBL$County <- gsub(" ", "_", gsub(" County", "", COUNTY_TBL$County))
# Remove a literal "_(pt.)" suffix. fixed = TRUE is required: read as a regular
# expression, the parentheses form a group and "." matches any character, so
# the literal text "_(pt.)" would never be matched.
COUNTY_TBL$County <- gsub("_(pt.)", "", COUNTY_TBL$County, fixed = TRUE)
# Cities: every row that does not mention "county", plus the "Balance of ..."
# rows, kept in their original table order.
CITY_TBL <- TBL[sort(c(
  grep("County", TBL$County, invert = TRUE, ignore.case = TRUE),
  grep("Balance", TBL$County, ignore.case = TRUE)
)), ]
# Drop the " town"/" city"/" County" suffixes, relabel "Balance of" rows as
# "Unincorporated", and replace spaces with underscores.
CITY_TBL$County <- CITY_TBL$County %>%
  gsub(" town", "", ., ignore.case = TRUE) %>%
  gsub(" city", "", ., ignore.case = TRUE) %>%
  gsub(" County", "", ., ignore.case = TRUE) %>%
  gsub("Balance of", "Unincorporated", ., ignore.case = TRUE) %>%
  gsub(" ", "_", .)
CITY_TBL <- CITY_TBL %>% rename("City" = County)
CITY_POP <- rbind(CITY_TBL, CITY_POP)
# Cleanup names
CITY_POP$City <- gsub("LaGrange", "La_Grange", CITY_POP$City)
COUNTY_POP <- rbind(COUNTY_TBL, COUNTY_POP)
#################### County and city population data for 1990-2000 ----
# Location to save any raw population files. Most files are saved as HTML,
# but the older files are only available as Excel files.
# Prepends the 1990-1999 rows to both CITY_POP and COUNTY_POP.
POP_FILE_1990 <- paste0(RAW_EXCEL_LOC, "Wyoming_County_Population_1990_2000.xls")
# Only download when no archived copy exists. mode = "wb" is required because
# .xls is a binary format: without it, download.file() uses a text-mode
# transfer on Windows and corrupts the workbook.
try(if (!file.exists(POP_FILE_1990)) {
  download.file("http://eadiv.state.wy.us/pop/c&sc90_00.xls", POP_FILE_1990, mode = "wb")
})
# Add to the notes (the file name now matches the archive saved above).
cat(
  "\n 6) Population for Wyoming, Counties, Cities, and Towns: 1990 to 2000
Wyoming_County_Population_1990_2000.xls comes from 'http://eadiv.state.wy.us/pop/c&sc90_00.xls'
Data Type: Excel file
Data Source: WIEAD
Original Source: U.S. Census Bureau, Population Division",
  file = paste0(SAVE_LOC_RAW_POP, "README_POPULATION_DATA.txt"),
  append = TRUE
)
# Skip the two leading sheet rows, then drop the first four data rows.
TEMP <- read_xls(POP_FILE_1990, skip = 2)[-1:-4, ]
colnames(TEMP)[1] <- "County"
# Keep everything up to and including the "Wind River Res." row.
TEMP <- TEMP[1:which(TEMP[, 1] == "Wind River Res."), ]
TEMP <- TEMP %>%
  pivot_longer(all_of(colnames(TEMP)[-1]), names_to = "Year", values_to = "Population") %>%
  mutate(Year = as.integer(Year), Population = as.numeric(Population))
# 2000 is dropped here; the 2000-2010 table already supplies it.
TEMP <- TEMP %>% filter(Year != 2000)
# County rows are the ones labelled with "Cnty" in this workbook.
TEMP_COUNTY <- TEMP[grepl("Cnty", TEMP$County, ignore.case = TRUE), ]
# NOTE(review): as displayed, the middle step replaces a single space with a
# single space and so changes nothing; confirm whether a double or
# non-breaking space was intended.
TEMP_COUNTY$County <- TEMP_COUNTY$County %>%
  gsub(" Cnty", "", ., ignore.case = TRUE) %>%
  gsub(" ", " ", .) %>%
  gsub(" ", "_", .)
TEMP_CITY <- TEMP[grep("Cnty", TEMP$County, ignore.case = TRUE, invert = TRUE), ]
# NOTE(review): as displayed, the first step removes every single space, which
# would leave nothing for the underscore step to replace; confirm whether the
# first pattern was meant to be a double or non-breaking space.
TEMP_CITY$County <- TEMP_CITY$County %>%
  gsub(" ", "", .) %>%
  gsub(" ", "_", .) %>%
  gsub("E_Therm", "East_Therm", .)
TEMP_CITY <- TEMP_CITY %>% rename(City = County)
CITY_POP <- rbind(TEMP_CITY, CITY_POP)
COUNTY_POP <- rbind(TEMP_COUNTY, COUNTY_POP)
try(rm(TEMP_CITY, TEMP_COUNTY, TEMP))
#################### County and city population data for 1980-1990 ----
# Prepends the 1980-1989 rows to both CITY_POP and COUNTY_POP.
# RAW_EXCEL_LOC already ends in "/", so no extra separator is added (this
# matches how the 1990-2000 file path is built).
POP_FILE_1980 <- paste0(RAW_EXCEL_LOC, "Wyoming_County_Population_1980_1990.xls")
# Only download when no archived copy exists. mode = "wb" is required because
# .xls is a binary format: without it, download.file() uses a text-mode
# transfer on Windows and corrupts the workbook.
try(if (!file.exists(POP_FILE_1980)) {
  download.file("http://eadiv.state.wy.us/pop/C&SC8090.xls", POP_FILE_1980, mode = "wb")
})
# Add to the notes
cat(
  "\n 7) Population for Wyoming, Counties and Municipalities: 1980 to 1990
Wyoming_County_Population_1980_1990.xls comes from 'http://eadiv.state.wy.us/pop/C&SC8090.xls'
Data Type: Excel file
Data Source: WIEAD
Original Source: U.S. Census Bureau, Population Division",
  file = paste0(SAVE_LOC_RAW_POP, "README_POPULATION_DATA.txt"),
  append = TRUE
)
# Skip the two leading sheet rows, then drop the first four data rows.
TEMP <- read_xls(POP_FILE_1980, skip = 2)[-1:-4, ]
colnames(TEMP)[1] <- "County"
# Rows: from the second row through the "Upton" row.
# Columns: everything to the left of the first empty cell in the second row.
TEMP <- TEMP[2:which(TEMP[, 1] == "Upton"), 1:(min(which(is.na(TEMP[2, ]))) - 1)]
TEMP <- TEMP %>%
  pivot_longer(all_of(colnames(TEMP)[-1]), names_to = "Year", values_to = "Population") %>%
  mutate(Year = as.integer(Year), Population = as.numeric(Population))
# 1990 is dropped here; the 1990-2000 workbook already supplies it. Filtering
# once before the county/city split gives the same rows as filtering each
# part afterwards.
TEMP <- TEMP %>% filter(Year != 1990)
# County rows are the ones labelled with "Cty" in this workbook.
TEMP_COUNTY <- TEMP[grepl("Cty", TEMP$County, ignore.case = TRUE), ]
# NOTE(review): as displayed, the middle step replaces a single space with a
# single space and so changes nothing; confirm whether a double or
# non-breaking space was intended.
TEMP_COUNTY$County <- TEMP_COUNTY$County %>%
  gsub(" Cty", "", ., ignore.case = TRUE) %>%
  gsub(" ", " ", .) %>%
  gsub(" ", "_", .)
TEMP_CITY <- TEMP[grep("Cty", TEMP$County, ignore.case = TRUE, invert = TRUE), ]
# Normalise abbreviated town names so they match the later data sets.
# NOTE(review): as displayed, the first step removes every single space, which
# would leave nothing for the underscore step to replace; confirm whether the
# first pattern was meant to be a double or non-breaking space.
TEMP_CITY$County <- TEMP_CITY$County %>%
  gsub(" ", "", .) %>%
  gsub(" ", "_", .) %>%
  gsub("E._Therm", "East_Therm", .) %>%
  gsub("Mtn._View", "Mountain_View", .) %>%
  gsub("Frannie_", "Frannie", .)
TEMP_CITY <- TEMP_CITY %>% rename(City = County)
CITY_POP <- rbind(TEMP_CITY, CITY_POP)
COUNTY_POP <- rbind(TEMP_COUNTY, COUNTY_POP)
#ggplot(aes(x=Year,y=Population,group=County,color=County),data=COUNTY_POP)+geom_line()
try(rm(TEMP_CITY, TEMP_COUNTY, TEMP))
#################### County population data for 1970-1980 ----
# Prepends the 1970-1979 county rows to COUNTY_POP.
# RAW_EXCEL_LOC already ends in "/", so no extra separator is added.
POP_FILE_1970 <- paste0(RAW_EXCEL_LOC, "Wyoming_County_Population_1970_1980.xls")
# Only download when no archived copy exists. mode = "wb" is required because
# .xls is a binary format: without it, download.file() uses a text-mode
# transfer on Windows and corrupts the workbook.
try(if (!file.exists(POP_FILE_1970)) {
  download.file("http://eadiv.state.wy.us/pop/Cnty7080.xls", POP_FILE_1970, mode = "wb")
})
# Add to the notes
cat(
  "\n 8) Wyoming and County Population: 1970 to 1980
Wyoming_County_Population_1970_1980.xls comes from 'http://eadiv.state.wy.us/pop/Cnty7080.xls'
Data Type: Excel file
Data Source: WIEAD
Original Source: U.S. Census Bureau, Population Division",
  file = paste0(SAVE_LOC_RAW_POP, "README_POPULATION_DATA.txt"),
  append = TRUE
)
# The download above is best effort. Instead of repeating it outside try(),
# fail here with a clear message when there is nothing to read.
if (!file.exists(POP_FILE_1970)) {
  stop(
    "Download failed and no archived copy exists at ", POP_FILE_1970,
    call. = FALSE
  )
}
# Skip the two leading sheet rows, then drop the first four data rows.
TEMP <- read_xls(POP_FILE_1970, skip = 2)[-1:-4, ]
colnames(TEMP)[1] <- "County"
# Keep everything up to and including the "Weston" row.
TEMP <- TEMP[1:which(TEMP[, 1] == "Weston"), ]
TEMP <- TEMP %>%
  pivot_longer(all_of(colnames(TEMP)[-1]), names_to = "Year", values_to = "Population") %>%
  mutate(Year = as.integer(Year), Population = as.numeric(Population))
TEMP$County <- gsub(" ", "_", TEMP$County)
# 1980 is dropped here; the 1980-1990 workbook already supplies it.
TEMP <- TEMP %>% filter(Year != 1980)
COUNTY_POP <- rbind(TEMP, COUNTY_POP)
#ggplot(aes(x=Year,y=Population,group=County,color=County),data=COUNTY_POP)+geom_line()
try(rm(TEMP))
########### Old data addition: period ends in 1970 ----
# Adds hand-entered decennial census values for Lincoln County, Kemmerer and
# Diamondville, then finalises CITY_POP and joins the birth/death/migration
# columns onto the county populations.
POP_FILE_OLD <- paste0(RAW_HTML_LOC, "Wyoming_City_and_County_Population_Prior_to_1970.htm")
# The page is archived for reference only; the values below are typed in by
# hand rather than parsed from it.
try(curl_download("http://eadiv.state.wy.us/demog_data/cntycity_hist.htm", destfile = POP_FILE_OLD))
# Add to the notes
cat(
  "\n 9) Historical decennial census population for Wyoming counties, cities, and towns
Wyoming_City_and_County_Population_Prior_to_1970.htm comes from 'http://eadiv.state.wy.us/demog_data/cntycity_hist.htm'
Data Type: HTML Tables
Data Source: WIEAD
Original Source: U.S. Census Bureau, Population Division
Note: Two tables are included complicating extraction. The values are manually entered in R rather than scarped like the other data sets",
  file = paste0(SAVE_LOC_RAW_POP, "README_POPULATION_DATA.txt"),
  append = TRUE
)
# See in part http://eadiv.state.wy.us/demog_data/cntycity_hist.htm
# The tibbles are built with numeric Year/Population directly. Going through
# cbind() produced a character matrix and turned every Year and Population in
# the combined tables into text.
LN_OLD <- c(12487, 10894, 10286, 9023, 9018, 8640) # Missing in 1910
Year <- seq(1920, 1970, by = 10)
TEMP <- tibble(Year = Year, County = "Lincoln", Population = LN_OLD)
COUNTY_POP <- rbind(TEMP, COUNTY_POP) %>% arrange(County, Year)
KEM_OLD <- c(843, 1517, 1884, 2026, 1667, 2028, 2292) # 1910 forward until 1970
Year <- seq(1910, 1970, by = 10)
# The names are spelled "Kemmerer" and "Diamondville" (previously "kemmerer"
# and "Diamondvile") so these rows line up with the town names produced from
# the other sources once the " city"/" town" suffixes are stripped.
TEMP <- tibble(Year = Year, City = "Kemmerer", Population = KEM_OLD)
CITY_POP <- rbind(TEMP, CITY_POP)
DIAMOND_OLD <- c(696, 726, 812, 586, 415, 398, 485)
TEMP <- tibble(Year = Year, City = "Diamondville", Population = DIAMOND_OLD)
CITY_POP <- rbind(TEMP, CITY_POP) %>% arrange(City, Year)
# Remove empty values, ensure all numeric values are not saved as characters
CITY_POP <- CITY_POP %>%
  filter(!is.na(Population)) %>%
  mutate(Population = as.numeric(Population), Year = as.numeric(Year))
# Add Other Data
# unique() drops rows that appear in more than one source with the same value.
COUNTY_POP <- COUNTY_POP %>%
  mutate(Year = as.numeric(Year)) %>%
  unique()
# County and Year are the only columns the two tables share; naming them
# makes the join keys explicit.
WY_COUNTY_DATA_SET <- COUNTY_POP %>%
  left_join(WY_COUNTY_DATA_SET, by = c("County", "Year")) %>%
  mutate(Population = as.numeric(Population)) %>%
  unique()
### Save population results ----
# SAVE_LOC_POP may already be defined before this script runs; the default is
# only used when it is missing.
if (!exists("SAVE_LOC_POP")) {
  SAVE_LOC_POP <- "./Data/Cleaned_Data/Population_Data"
}
CSV_SAVE <- paste0(SAVE_LOC_POP, "/CSV")
RDS_SAVE <- paste0(SAVE_LOC_POP, "/RDS")
invisible(lapply(
  c(CSV_SAVE, RDS_SAVE),
  dir.create,
  recursive = TRUE,
  showWarnings = FALSE
))
# Each table is written twice: as an R-native .Rds file and as a .csv file.
saveRDS(CITY_POP, file.path(RDS_SAVE, "All_Wyoming_City_Populations.Rds"))
write_csv(CITY_POP, file.path(CSV_SAVE, "All_Wyoming_City_Populations.csv"))
saveRDS(WY_COUNTY_DATA_SET, file.path(RDS_SAVE, "All_Wyoming_County_Populations.Rds"))
write_csv(WY_COUNTY_DATA_SET, file.path(CSV_SAVE, "All_Wyoming_County_Populations.csv"))
# Stamp the README with the time of this run.
run_datetime <- format(Sys.time(), "%Y-%m-%d %H:%M:%S")
cat(
  paste0("\n--- Run Date: ", run_datetime, " ---\n"),
  file = paste0(SAVE_LOC_RAW_POP, "README_POPULATION_DATA.txt"),
  append = TRUE
)