Population_Study/Scripts/1_Download_and_Process_Population_Data.r
2025-11-05 16:20:20 -07:00

229 lines
12 KiB
R

#############################Clean up script folders, simulations of deaths should be separated from this file. Death rate simulation is complicated and should be commented, and turned into a separate script.
library(rvest)
library(tidyverse)
library(readxl)
#setwd("../")
###Create Location to Save raw data sets
if(!exists("SAVE_LOC_RAW")){SAVE_LOC_RAW <-"./Data/Raw_Data/"}
dir.create(SAVE_LOC_RAW, recursive = TRUE, showWarnings = FALSE)
########County, Death, Birth and Migration Data
#Data found on the page http://eadiv.state.wy.us/pop/
#Website States: Wyoming Economic Analysis Division based on U.S. Census Bureau's population estimation and vital stats above
PAGE <- read_html("http://eadiv.state.wy.us/pop/BirthDeathMig.htm")
NODE <- html_element(PAGE ,"table")
TBL <- html_table(NODE)
ST <- which(toupper(TBL$X1)=="ALBANY")
END <- which(toupper(TBL$X1)=="TOTAL")
TYPES <- TBL[ST-2,1]
ST_YEAR <- 1971
ALL_DATA <- list()
TBL <- TBL[,c(1,which(!is.na(as.numeric(TBL[ST[1],]))))]
TBL <- TBL[,-ncol(TBL)]
colnames(TBL) <- c("County",(ST_YEAR:(ST_YEAR+ncol(TBL)-1)))
TBL$Type <- NA
for(i in 1:length(ST)){
TBL[ST[i]:END[i],"Type"]<- as.character(TYPES[i,1])
}
TBL[ST[2]:END[2],"Type"] <- as.character(TYPES[2,1])
TBL$Type
TBL <- TBL %>% filter(!is.na(Type)) %>% select(County,Type,everything())
GROUP <- colnames(TBL)[-1:-2]
Data <- pivot_longer(TBL,all_of(GROUP),names_to="Year",values_to="Pop_Change")
Data$County <- ifelse(toupper(Data$County)=="TOTAL","Wyoming",Data$County)
WY_COUNTY_DATA_SET <- pivot_wider(Data,names_from=Type,values_from=Pop_Change) %>% rename("Migration"=`Net Migration`) %>% mutate(Year=as.integer(Year),Births=parse_number(Births),Deaths=parse_number(Deaths),Migration=parse_number(Migration)) %>% mutate(Year=Year-1) #Data appears to be one off from population
WY_COUNTY_DATA_SET[,"County"] <- gsub(" ","_",WY_COUNTY_DATA_SET %>% pull(County))
########################City and County Population Data 2020 to 2024
PAGE <- read_html('http://eadiv.state.wy.us/pop/Place-24EST.htm')
NODE <- html_element(PAGE ,"table")
TBL <- html_table(NODE)
ST <- which(toupper(TBL$X1)==toupper("Albany County"))
END <- which(toupper(TBL$X1)==toupper("Balance of Weston County"))
#More years than are pulled are listed to make more generic
COLUMNS <- c(1,which(TBL[ST-2,] %in% 1970:2025))
NAMES <- TBL[4,COLUMNS][-1]
TBL <- TBL[ST:END,COLUMNS ]
colnames(TBL) <- c("County",NAMES)
TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population))
TBL$County <- gsub(" "," ",gsub("\n","",gsub("\r","",TBL %>% pull(County))))
COUNTY_POP<- TBL[grep("COUNTY",TBL %>% pull(County),ignore.case=TRUE),]
COUNTY_POP<- COUNTY_POP[grep("Balance",COUNTY_POP%>% pull(County),invert=TRUE,ignore.case=TRUE),]
COUNTY_POP$County <- gsub(" ","_",gsub(" County","",COUNTY_POP$County))
CITY_POP <- TBL[sort(c(grep("County",TBL %>% pull(County),invert=TRUE,ignore.case=TRUE),grep("Balance",TBL %>% pull(County),ignore.case=TRUE))),]
CITY_POP$County <- gsub(" ","_",gsub("Balance of","Unincorporated",gsub(" County","",gsub(" city","",gsub(" town","",CITY_POP$County,ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE))
CITY_POP <- CITY_POP %>% rename("City"=County)
########################City Population Data 2010 to 2020
PAGE <- read_html('http://eadiv.state.wy.us/pop/sub-est11-19.htm')
NODE <- html_element(PAGE ,"table")
TBL <- html_table(NODE)
ST <- which(toupper(TBL$X1)==toupper("Afton town, Wyoming"))
END <- which(toupper(TBL$X1)==toupper("Yoder town, Wyoming"))
#More years than are pulled are listed to make more generic
COLUMNS <- c(1,which(TBL[ST-1,] %in% 1970:2025))
NAMES <- TBL[3,COLUMNS][-1]
TBL <- TBL[ST:END,COLUMNS ]
colnames(TBL) <- c("City",NAMES)
TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population))
TBL$City <- gsub(" ","_",gsub(" $","",gsub("\r|\n| Wyoming|,| town| city","",TBL$City,ignore.case=TRUE)))
TBL <- TBL %>% filter(Year!=2020)
CITY_POP <- rbind(TBL,CITY_POP)
########################County Population Data 2010 to 2020
PAGE <- read_html('http://eadiv.state.wy.us/pop/ctyest11-19.htm')
NODE <- html_element(PAGE ,"table")
TBL <- html_table(NODE)
ST <- grep("Albany",TBL$X1)
END <- grep("Weston",TBL$X1)
#More years than are pulled are listed to make more generic
COLUMNS <- c(1,which(TBL[ST-2,] %in% 1970:2025))
NAMES <- TBL[3,COLUMNS][-1]
TBL <- TBL[ST:END,COLUMNS ]
colnames(TBL) <- c("County",NAMES)
TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population))
TBL$County <- gsub(" ","_",gsub(" "," ",gsub(" $","",gsub("\r|\n| Wyoming|,| town| city| County|\\.","",TBL$County,ignore.case=TRUE))))
TBL <- TBL %>% filter(Year!=2020)
COUNTY_POP <- rbind(TBL,COUNTY_POP)
########################County and City Population Data 2000 to 2010
PAGE <- read_html('http://eadiv.state.wy.us/pop/sub-est01-09.htm')
NODE <- html_element(PAGE ,"table")
TBL <- html_table(NODE)
ST <- which(toupper(TBL$X1)==toupper("Albany County"))
END <- which(toupper(TBL$X1)==toupper("Balance of Weston County"))
#More years than are pulled are listed to make more generic
COLUMNS <- c(1,which(TBL[ST-4,] %in% 1970:2025))
NAMES <- TBL[4,COLUMNS][-1]
TBL <- TBL[ST:END,COLUMNS ]
colnames(TBL) <- c("County",NAMES)
TBL <- pivot_longer(TBL,all_of(colnames(TBL)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=parse_number(Population))
TBL <- TBL %>% filter(Year!=2010)
TBL$County <- gsub(" "," ",gsub("\n","",gsub("\r","",TBL %>% pull(County))))
COUNTY_TBL <- TBL[grep("COUNTY",TBL %>% pull(County),ignore.case=TRUE),]
COUNTY_TBL <-COUNTY_TBL[grep("Balance",COUNTY_TBL%>% pull(County),invert=TRUE,ignore.case=TRUE),]
COUNTY_TBL$County <-gsub("_(pt.)","", gsub(" ","_",gsub(" County","",COUNTY_TBL$County)))
CITY_TBL <- TBL[sort(c(grep("County",TBL %>% pull(County),invert=TRUE,ignore.case=TRUE),grep("Balance",TBL %>% pull(County),ignore.case=TRUE))),]
CITY_TBL$County <- gsub(" ","_",gsub("Balance of","Unincorporated",gsub(" County","",gsub(" city","",gsub(" town","",CITY_TBL$County,ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE),ignore.case=TRUE))
CITY_TBL <- CITY_TBL %>% rename("City"=County)
CITY_POP <- rbind(CITY_TBL,CITY_POP)
#Cleanup names
CITY_POP$City <- gsub("LaGrange","La_Grange",CITY_POP$City)
COUNTY_POP <- rbind(COUNTY_TBL,COUNTY_POP)
####################County and City Population Data for 1990-2000
#Location to save any raw population files. Most files are not saved since they are pulled from a html and not a excel file, but older files are only available as excel files
SAVE_LOC_RAW_POP <- paste0(SAVE_LOC_RAW,"/Population")
dir.create(SAVE_LOC_RAW_POP , recursive = TRUE, showWarnings = FALSE)
POP_FILE_1990 <- paste0(SAVE_LOC_RAW_POP,"/Pop_1990s.xls")
if(!file.exists(POP_FILE_1990)){download.file('http://eadiv.state.wy.us/pop/c&sc90_00.xls',POP_FILE_1990)}
TEMP <- read_xls(POP_FILE_1990,skip=2)[-1:-4,]
colnames(TEMP)[1] <- "County"
TEMP <- TEMP[1:which(TEMP[,1]=="Wind River Res."),]
TEMP <- pivot_longer(TEMP,all_of(colnames(TEMP)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=as.numeric(Population))
TEMP <- TEMP %>% filter(Year!=2000)
TEMP_COUNTY <- TEMP[grepl("Cnty",TEMP %>% pull(County),ignore.case=TRUE),]
TEMP_COUNTY$County <- gsub(" ","_",gsub(" "," ",gsub(" Cnty","",TEMP_COUNTY$County,ignore.case=TRUE)))
TEMP_CITY <- TEMP[grep("Cnty",TEMP %>% pull(County),ignore.case=TRUE,invert=TRUE),]
TEMP_CITY$County <- gsub("E_Therm","East_Therm",gsub(" ","_",gsub(" ","",TEMP_CITY %>% pull(County))))
TEMP_CITY <- TEMP_CITY %>% rename(City=County)
TEMP_CITY %>% pull(City) %>% unique %>% sort
CITY_POP <- rbind(TEMP_CITY,CITY_POP)
CITY_POP %>% pull(City) %>% unique %>% sort
COUNTY_POP <- rbind(TEMP_COUNTY,COUNTY_POP)
TEMP_CITY <- TEMP_CITY %>% filter(Year!=2000)
try(rm(TEMP_CITY,TEMP_COUNTY,TEMP))
####################County and City Population Data for 1980-1990
POP_FILE_1980 <- paste0(SAVE_LOC_RAW_POP ,"/Pop_1980s.xls")
if(!file.exists(POP_FILE_1980)){download.file('http://eadiv.state.wy.us/pop/C&SC8090.xls',POP_FILE_1980)}
TEMP <- read_xls(POP_FILE_1980,skip=2)[-1:-4,]
colnames(TEMP)[1] <- "County"
TEMP <- TEMP[2:which(TEMP[,1]=="Upton"),1:(min(which(is.na(TEMP[2,])))-1)]
TEMP <- pivot_longer(TEMP,all_of(colnames(TEMP)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=as.numeric(Population))
TEMP_COUNTY <- TEMP[grepl("Cty",TEMP %>% pull(County),ignore.case=TRUE),]
TEMP_COUNTY$County <- gsub(" ","_",gsub(" "," ",gsub(" Cty","",TEMP_COUNTY$County,ignore.case=TRUE)))
TEMP_CITY <- TEMP[grep("Cty",TEMP %>% pull(County),ignore.case=TRUE,invert=TRUE),]
TEMP_CITY$County <-gsub("Frannie_","Frannie", gsub("Mtn._View","Mountain_View",gsub("E._Therm","East_Therm",gsub(" ","_",gsub(" ","",TEMP_CITY %>% pull(County))))))
TEMP_CITY <- TEMP_CITY %>% rename(City=County)
TEMP_CITY <- TEMP_CITY %>% filter(Year!=1990)
TEMP_COUNTY <- TEMP_COUNTY %>% filter(Year!=1990)
CITY_POP <- rbind(TEMP_CITY,CITY_POP)
COUNTY_POP <- rbind(TEMP_COUNTY,COUNTY_POP)
#ggplot(aes(x=Year,y=Population,group=County,color=County),data=COUNTY_POP)+geom_line()
try(rm(TEMP_CITY,TEMP_COUNTY,TEMP))
####################County Population Data for 1970-1980
POP_FILE_1970 <- paste0(SAVE_LOC_RAW_POP ,"/Pop_1970s.xls")
if(!file.exists(POP_FILE_1970)){download.file('http://eadiv.state.wy.us/pop/Cnty7080.xls',POP_FILE_1970)}
TEMP <- read_xls(POP_FILE_1970,skip=2)[-1:-4,]
colnames(TEMP)[1] <- "County"
TEMP <- TEMP[1:which(TEMP[,1]=="Weston"),]
TEMP <- pivot_longer(TEMP,all_of(colnames(TEMP)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=as.numeric(Population))
TEMP$County <- gsub(" ","_",TEMP$County)
TEMP <- TEMP %>% filter(Year!=1980)
COUNTY_POP <- rbind(TEMP,COUNTY_POP)
#ggplot(aes(x=Year,y=Population,group=County,color=County),data=COUNTY_POP)+geom_line()
try(rm(TEMP))
###########Old data addtion:Period Ends in 1970
#See in part http://eadiv.state.wy.us/demog_data/cntycity_hist.htm
LN_OLD <- c(12487,10894,10286,9023,9018,8640) #Missing in 1910
Year <- seq(1920,1970,by=10)
TEMP <- cbind(Year,rep("Lincoln",6),LN_OLD)
colnames(TEMP ) <- c("Year","County","Population")
TEMP <- as_tibble(TEMP)
COUNTY_POP <- rbind(TEMP,COUNTY_POP) %>% arrange(County,Year)
KEM_OLD <- c(843,1517,1884,2026,1667,2028,2292) #1910 forward until 1970
Year <- seq(1910,1970,by=10)
TEMP <- cbind(Year,rep("kemmerer",7),KEM_OLD)
colnames(TEMP ) <- c("Year","City","Population")
TEMP <- as_tibble(TEMP)
CITY_POP <- rbind(TEMP,CITY_POP)
DIAMOND_OLD <- c(696,726,812,586,415,398,485)
TEMP <- cbind(Year,rep("Diamondvile",7),DIAMOND_OLD)
colnames(TEMP ) <- c("Year","City","Population")
TEMP <- as_tibble(TEMP)
CITY_POP <- rbind(TEMP,CITY_POP) %>% arrange(City,Year)
#Remove empty values, ensure all numeric values are not saved as characters
CITY_POP <- CITY_POP %>% filter(!is.na(Population) ) %>% mutate(Population=parse_number(Population),Year=parse_number(Year))
#Add Other Data
COUNTY_POP <- COUNTY_POP %>% mutate(Year=as.numeric(Year)) %>% unique
WY_COUNTY_DATA_SET <- COUNTY_POP %>% left_join(WY_COUNTY_DATA_SET ) %>% mutate(Population=as.numeric(Population)) %>% unique
###Save Population Results
if(!exists("SAVE_LOC_POP")){SAVE_LOC_POP <-"./Data/Cleaned_Data/Population_Data"}
CSV_SAVE <- paste0(SAVE_LOC_POP,"/CSV")
RDS_SAVE <- paste0(SAVE_LOC_POP,"/RDS")
dir.create(CSV_SAVE, recursive = TRUE, showWarnings = FALSE)
dir.create(RDS_SAVE, recursive = TRUE, showWarnings = FALSE)
saveRDS(CITY_POP,paste0(RDS_SAVE,"/All_Wyoming_City_Populations.Rds" ))
write_csv(CITY_POP,paste0(CSV_SAVE,"/All_Wyoming_City_Populations.csv" ))
saveRDS(WY_COUNTY_DATA_SET,paste0(RDS_SAVE,"/All_Wyoming_County_Populations.Rds" ))
write_csv(WY_COUNTY_DATA_SET,paste0(CSV_SAVE,"/All_Wyoming_County_Populations.csv" ))