diff --git a/.gitignore b/.gitignore index daea6da..087ce63 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ # *.png *.csv +Data/Cleaned_Data/ Results/ #Don't save any major data files on the server, can be regenerated after pulling diff --git a/Data/Cleaned_Data/Wyoming_City_Population.Rds b/Data/Cleaned_Data/Wyoming_City_Population.Rds index c2a078d..1ca7616 100644 Binary files a/Data/Cleaned_Data/Wyoming_City_Population.Rds and b/Data/Cleaned_Data/Wyoming_City_Population.Rds differ diff --git a/Data/Cleaned_Data/Wyoming_County_Population.Rds b/Data/Cleaned_Data/Wyoming_County_Population.Rds index 0691573..43b07ee 100644 Binary files a/Data/Cleaned_Data/Wyoming_County_Population.Rds and b/Data/Cleaned_Data/Wyoming_County_Population.Rds differ diff --git a/Data/Mortality_Rates/Female/A_Under1.csv b/Data/Raw_Data/Mortality_Rates/Female/A_Under1.csv similarity index 100% rename from Data/Mortality_Rates/Female/A_Under1.csv rename to Data/Raw_Data/Mortality_Rates/Female/A_Under1.csv diff --git a/Data/Mortality_Rates/Female/B_1_9.csv b/Data/Raw_Data/Mortality_Rates/Female/B_1_9.csv similarity index 100% rename from Data/Mortality_Rates/Female/B_1_9.csv rename to Data/Raw_Data/Mortality_Rates/Female/B_1_9.csv diff --git a/Data/Mortality_Rates/Female/C_10_19.csv b/Data/Raw_Data/Mortality_Rates/Female/C_10_19.csv similarity index 100% rename from Data/Mortality_Rates/Female/C_10_19.csv rename to Data/Raw_Data/Mortality_Rates/Female/C_10_19.csv diff --git a/Data/Mortality_Rates/Female/D_20_39.csv b/Data/Raw_Data/Mortality_Rates/Female/D_20_39.csv similarity index 100% rename from Data/Mortality_Rates/Female/D_20_39.csv rename to Data/Raw_Data/Mortality_Rates/Female/D_20_39.csv diff --git a/Data/Mortality_Rates/Female/E_40_64.csv b/Data/Raw_Data/Mortality_Rates/Female/E_40_64.csv similarity index 100% rename from Data/Mortality_Rates/Female/E_40_64.csv rename to Data/Raw_Data/Mortality_Rates/Female/E_40_64.csv diff --git a/Data/Mortality_Rates/Female/F_65+.csv b/Data/Raw_Data/Mortality_Rates/Female/F_65+.csv similarity index 100% rename from Data/Mortality_Rates/Female/F_65+.csv rename to Data/Raw_Data/Mortality_Rates/Female/F_65+.csv diff --git a/Data/Mortality_Rates/Female/G_65_74.csv b/Data/Raw_Data/Mortality_Rates/Female/G_65_74.csv similarity index 100% rename from Data/Mortality_Rates/Female/G_65_74.csv rename to Data/Raw_Data/Mortality_Rates/Female/G_65_74.csv diff --git a/Data/Mortality_Rates/Female/H_75_84.csv b/Data/Raw_Data/Mortality_Rates/Female/H_75_84.csv similarity index 100% rename from Data/Mortality_Rates/Female/H_75_84.csv rename to Data/Raw_Data/Mortality_Rates/Female/H_75_84.csv diff --git a/Data/Mortality_Rates/Female/I_85+.csv b/Data/Raw_Data/Mortality_Rates/Female/I_85+.csv similarity index 100% rename from Data/Mortality_Rates/Female/I_85+.csv rename to Data/Raw_Data/Mortality_Rates/Female/I_85+.csv diff --git a/Data/Mortality_Rates/Male/A_Under1.csv b/Data/Raw_Data/Mortality_Rates/Male/A_Under1.csv similarity index 100% rename from Data/Mortality_Rates/Male/A_Under1.csv rename to Data/Raw_Data/Mortality_Rates/Male/A_Under1.csv diff --git a/Data/Mortality_Rates/Male/B_1_9.csv b/Data/Raw_Data/Mortality_Rates/Male/B_1_9.csv similarity index 100% rename from Data/Mortality_Rates/Male/B_1_9.csv rename to Data/Raw_Data/Mortality_Rates/Male/B_1_9.csv diff --git a/Data/Mortality_Rates/Male/C_10_19.csv b/Data/Raw_Data/Mortality_Rates/Male/C_10_19.csv similarity index 100% rename from Data/Mortality_Rates/Male/C_10_19.csv rename to Data/Raw_Data/Mortality_Rates/Male/C_10_19.csv diff --git a/Data/Mortality_Rates/Male/D_20_39.csv b/Data/Raw_Data/Mortality_Rates/Male/D_20_39.csv similarity index 100% rename from Data/Mortality_Rates/Male/D_20_39.csv rename to Data/Raw_Data/Mortality_Rates/Male/D_20_39.csv diff --git a/Data/Mortality_Rates/Male/E_40_64.csv b/Data/Raw_Data/Mortality_Rates/Male/E_40_64.csv similarity index 100% rename from Data/Mortality_Rates/Male/E_40_64.csv rename to Data/Raw_Data/Mortality_Rates/Male/E_40_64.csv diff --git a/Data/Mortality_Rates/Male/G_65_74.csv b/Data/Raw_Data/Mortality_Rates/Male/G_65_74.csv similarity index 100% rename from Data/Mortality_Rates/Male/G_65_74.csv rename to Data/Raw_Data/Mortality_Rates/Male/G_65_74.csv diff --git a/Data/Mortality_Rates/Male/H_75_84.csv b/Data/Raw_Data/Mortality_Rates/Male/H_75_84.csv similarity index 100% rename from Data/Mortality_Rates/Male/H_75_84.csv rename to Data/Raw_Data/Mortality_Rates/Male/H_75_84.csv diff --git a/Data/Mortality_Rates/Male/I_85+.csv b/Data/Raw_Data/Mortality_Rates/Male/I_85+.csv similarity index 100% rename from Data/Mortality_Rates/Male/I_85+.csv rename to Data/Raw_Data/Mortality_Rates/Male/I_85+.csv diff --git a/Data/Raw_Data/Mortality_Rates/README_MORTALITY.txt b/Data/Raw_Data/Mortality_Rates/README_MORTALITY.txt new file mode 100644 index 0000000..eacecbf --- /dev/null +++ b/Data/Raw_Data/Mortality_Rates/README_MORTALITY.txt @@ -0,0 +1,5 @@ +Data files gathered manually from: + +https://hdpulse.nimhd.nih.gov/data-portal/mortality/table?cod=247&cod_options=cod_15&ratetype=aa&ratetype_options=ratetype_2&race=00&race_options=race_6&sex=2&sex_options=sex_3&age=177&age_options=age_11&ruralurban=0&ruralurban_options=ruralurban_3&yeargroup=5&yeargroup_options=year5yearmort_1&statefips=56&statefips_options=area_states&county=56000&county_options=counties_wyoming&comparison=counties_to_us&comparison_options=comparison_counties&radio_comparison=areas&radio_comparison_options=cods_or_areas + +Each file is single age group, so age weighting does not apply despite the variable names diff --git a/Data/Raw_Data/Mortality_Rates/README_MORTALITY_DATA.txt b/Data/Raw_Data/Mortality_Rates/README_MORTALITY_DATA.txt new file mode 100644 index 0000000..58887b8 --- /dev/null +++ b/Data/Raw_Data/Mortality_Rates/README_MORTALITY_DATA.txt @@ -0,0 +1 @@ +[1] "Data files gathered manually from:\n\n\n\n\nhttps://hdpulse.nimhd.nih.gov/data-portal/mortality/table?cod=247&cod_options=cod_15&ratetype=aa&ratetype_options=ratetype_2&race=00&race_options=race_6&sex=2&sex_options=sex_3&age=177&age_options=age_11&ruralurban=0&ruralurban_options=ruralurban_3&yeargroup=5&yeargroup_options=year5yearmort_1&statefips=56&statefips_options=area_states&county=56000&county_options=counties_wyoming&comparison=counties_to_us&comparison_options=comparison_counties&radio_comparison=areas&radio_comparison_options=cods_or_areas\n\n\n\n\nEach file is single age group, so age weighting does not apply despite the variable names" diff --git a/Data/Raw_Data/Population/Pop_1970s.xls b/Data/Raw_Data/Population/Pop_1970s.xls new file mode 100644 index 0000000..d41a466 Binary files /dev/null and b/Data/Raw_Data/Population/Pop_1970s.xls differ diff --git a/Data/Raw_Data/Population/Pop_1980s.xls b/Data/Raw_Data/Population/Pop_1980s.xls new file mode 100644 index 0000000..e1e14fc Binary files /dev/null and b/Data/Raw_Data/Population/Pop_1980s.xls differ diff --git a/Data/Raw_Data/Population/Pop_1990s.xls b/Data/Raw_Data/Population/Pop_1990s.xls new file mode 100644 index 0000000..d8d219d Binary files /dev/null and b/Data/Raw_Data/Population/Pop_1990s.xls differ diff --git a/Demographic_Split.r b/Demographic_Split.r index 1e53026..2354d02 100644 --- a/Demographic_Split.r +++ b/Demographic_Split.r @@ -4,6 +4,32 @@ ACS_FILE_LOC <- "Data/Cleaned_Data/ACS_Census_Demographic_Data.Rds" ACS_YEAR <- 2023 #Year of the ACS data if(!file.exists(ACS_FILE_LOC)){source("Scripts/Load_ACS_Census_Data.r")}else{AGE_DATA <- readRDS(ACS_FILE_LOC)} LIN_DEMOGRAPHICS <- readRDS("Data/Cleaned_Data/Lincoln_Demographic_Data.Rds") %>% filter(Year==ACS_YEAR) +LIN_DEMOGRAPHICS +GET_INTIAL_DATA_SUMMARY <- function(DEMO_DATA,FILE_SAVE_NAME,ST_POP=NA,Area=NA,ST_YEAR=NA,SAVE_DIR='Data/Cleaned_Data/Intiate_Simulation/'){ + ST_BIRTH <- round(sum((DEMO_DATA%>% filter(Age==0))[4:5])) + ST_BIRTH_TWO_PREV <- round(sum((DEMO_DATA%>% filter(Age==1))[4:5])) + if(is.na(ST_YEAR)){ST_YEAR=max(POP_DATA$Year)} + +if(is.na(ST_POP)){ + if(Area=='Kemmerer'){ +ST_POP<- POP_DATA %>% filter(City %in% c('Kemmerer','Diamondville'),Year==ST_YEAR) %>% mutate(Population=as.numeric(Population)) %>% pull(Population) %>% sum() + } else if(Area=='Other Lincoln'){ + ST_POP_KEM<- POP_DATA %>% filter(City %in% c('Kemmerer','Diamondville'),Year==ST_YEAR) %>% mutate(Population=as.numeric(Population)) %>% pull(Population) %>% sum() + + ST_POP_LIN <- readRDS("Data/Cleaned_Data/Wyoming_County_Population.Rds") %>% filter(Year==ST_YEAR,County==Area) %>% pull(Population) #Grab the year of data for the county + ST_POP <- ST_POP_LIN-ST_POP_KEM + + } else{ + ST_POP <- readRDS("Data/Cleaned_Data/Wyoming_County_Population.Rds") %>% filter(Year==ST_YEAR,County==Area) %>% pull(Population) #Grab the year of data for the county + } +} + + INTIATE_DATA <- DEMO_DATA %>% mutate(Male_Window=Age>=18 & Age<=30,Female_Window=Age>=18 & Age<=28) %>% group_by(County,Year) %>% summarize(Female_Birth_Group=sum(Num_Female*Female_Window),Male_Birth_Group=sum(Num_Male*Male_Window),Min_Birth_Group=ifelse(Female_Birth_Group% mutate(Births=NA,Deaths=NA,Migration=NA) %>% select(Year,County,Population,Births,Deaths,Migration,Min_Birth_Group,PREV_BIRTH,PREV_TWO_BIRTH) %>% ungroup + saveRDS(INTIATE_DATA ,paste0(SAVE_DIR,FILE_SAVE_NAME,".Rds")) + write_csv(INTIATE_DATA ,paste0(SAVE_DIR,FILE_SAVE_NAME,".csv")) + return(INTIATE_DATA) +GET_INTIAL_DATA_SUMMARY(LIN_DEMOGRAPHICS,"test") + for(i in 1:nrow(AGE_DATA)){ if(i==1 & exists("RES")){rm(RES)} @@ -31,6 +57,7 @@ KEM_DEMOGRAPHICS <- RES %>% filter(IN_KEM==1) %>% select(Age,Sex,Per_Pop) %>% mu INTIATE_KEMMER <- KEM_DEMOGRAPHICS %>% mutate(Male_Window=Age>=18 & Age<=30,Female_Window=Age>=18 & Age<=28) %>% group_by(County,Year) %>% summarize(Female_Birth_Group=sum(Num_Female*Female_Window),Male_Birth_Group=sum(Num_Male*Male_Window),Min_Birth_Group=ifelse(Female_Birth_Group% mutate(Births=NA,Deaths=NA,Migration=NA) %>% select(Year,County,Population,Births,Deaths,Migration,Min_Birth_Group,PREV_BIRTH,PREV_TWO_BIRTH) %>% ungroup saveRDS(INTIATE_KEMMER ,"Data/Cleaned_Data/Intiate_Simulation/Kemmerer_Summary_Start_Data.Rds") write_csv(INTIATE_KEMMER ,"Data/Cleaned_Data/Intiate_Simulation/Kemmerer_Summary_Start_Data.csv") + ###Create data sets that contain all people not in Loncoln but not Kemmerer or Diamondville, for a seperate comparison run LIN_DEMOGRAPHICS_RECENT <- readRDS("Data/Cleaned_Data/Lincoln_Demographic_Data.Rds") %>% filter(Year==max(Year)) #Grab the most recent year of data for all of the county LIN_OTHER_DEMOGRAPHICS <- LIN_DEMOGRAPHICS_RECENT %>% left_join(KEM_DEMOGRAPHICS %>% select(-County,ADJ_MALE=Num_Male,ADJ_FEMALE=Num_Female)) %>% mutate(Num_Male=Num_Male-ADJ_MALE,Num_Female=Num_Female-ADJ_FEMALE) %>% select(-ADJ_MALE,-ADJ_FEMALE) %>% ungroup#Deduct the populaiton estimates of Kemmerer to create a "Other Lincoln" data set diff --git a/Scripts/Data_Load.r b/Scripts/1_Download_and_Process_Population_Data.r similarity index 54% rename from Scripts/Data_Load.r rename to Scripts/1_Download_and_Process_Population_Data.r index 658b7ff..9ad04bb 100644 --- a/Scripts/Data_Load.r +++ b/Scripts/1_Download_and_Process_Population_Data.r @@ -1,12 +1,14 @@ -#################IMPORTANT!!!! CLEAN UP DATA SCRIPT FOR USE -#############################Clean up scirpt folders, simulations of deaths should be seperated from this file. Death rate simulation is complicated and should be commented, and turned into a seperate script. -##############WORKING ON REGRESSION OF BIRTHS FOR PREDICTIONS OF GROWTH, THINK ABOUT GOING FROM COUNTY TO CITY, SEPERATE REGRESION FROM DATA COLLOECTION +#############################Clean up script folders, simulations of deaths should be separated from this file. Death rate simulation is complicated and should be commented, and turned into a separate script. library(rvest) library(tidyverse) library(readxl) #setwd("../") +###Create Location to Save raw data sets + if(!exists("SAVE_LOC_RAW")){SAVE_LOC_RAW <-"./Data/Raw_Data/"} + dir.create(SAVE_LOC_RAW, recursive = TRUE, showWarnings = FALSE) ########County, Death, Birth and Migration Data #Data found on the page http://eadiv.state.wy.us/pop/ + #Website States: Wyoming Economic Analysis Division based on U.S. Census Bureau's population estimation and vital stats above PAGE <- read_html("http://eadiv.state.wy.us/pop/BirthDeathMig.htm") NODE <- html_element(PAGE ,"table") TBL <- html_table(NODE) @@ -30,7 +32,7 @@ TBL <- TBL %>% filter(!is.na(Type)) %>% select(County,Type,everything()) GROUP <- colnames(TBL)[-1:-2] Data <- pivot_longer(TBL,all_of(GROUP),names_to="Year",values_to="Pop_Change") Data$County <- ifelse(toupper(Data$County)=="TOTAL","Wyoming",Data$County) -WY_COUNTY_DATA_SET <- pivot_wider(Data,names_from=Type,values_from=Pop_Change) %>% rename("Migration"=`Net Migration`) %>% mutate(Year=as.integer(Year),Births=parse_number(Births),Deaths=parse_number(Deaths),Migration=parse_number(Migration)) %>% mutate(Year=Year-1) #Data apears to be one off from populaiton +WY_COUNTY_DATA_SET <- pivot_wider(Data,names_from=Type,values_from=Pop_Change) %>% rename("Migration"=`Net Migration`) %>% mutate(Year=as.integer(Year),Births=parse_number(Births),Deaths=parse_number(Deaths),Migration=parse_number(Migration)) %>% mutate(Year=Year-1) #Data appears to be one off from population WY_COUNTY_DATA_SET[,"County"] <- gsub(" ","_",WY_COUNTY_DATA_SET %>% pull(County)) ########################City and County Population Data 2020 to 2024 @@ -116,12 +118,15 @@ CITY_POP <- rbind(CITY_TBL,CITY_POP) CITY_POP$City <- gsub("LaGrange","La_Grange",CITY_POP$City) COUNTY_POP <- rbind(COUNTY_TBL,COUNTY_POP) ####################County and City Population Data for 1990-2000 -if(!file.exists("./Data/Pop_1990s.xls")){download.file('http://eadiv.state.wy.us/pop/c&sc90_00.xls',"./Data/Pop_1990s.xls")} -TEMP <- read_xls("Data/Pop_1990s.xls",skip=2)[-1:-4,] -colnames(TEMP)[1] <- "County" -tail(TEMP) -TEMP <- TEMP[1:which(TEMP[,1]=="Wind River Res."),] +#Location to save any raw population files. Most files are not saved since they are pulled from a html and not a excel file, but older files are only available as excel files +SAVE_LOC_RAW_POP <- paste0(SAVE_LOC_RAW,"/Population") +dir.create(SAVE_LOC_RAW_POP , recursive = TRUE, showWarnings = FALSE) +POP_FILE_1990 <- paste0(SAVE_LOC_RAW_POP,"/Pop_1990s.xls") +if(!file.exists(POP_FILE_1990)){download.file('http://eadiv.state.wy.us/pop/c&sc90_00.xls',POP_FILE_1990)} +TEMP <- read_xls(POP_FILE_1990,skip=2)[-1:-4,] +colnames(TEMP)[1] <- "County" +TEMP <- TEMP[1:which(TEMP[,1]=="Wind River Res."),] TEMP <- pivot_longer(TEMP,all_of(colnames(TEMP)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=as.numeric(Population)) TEMP <- TEMP %>% filter(Year!=2000) TEMP_COUNTY <- TEMP[grepl("Cnty",TEMP %>% pull(County),ignore.case=TRUE),] @@ -138,8 +143,10 @@ TEMP_CITY <- TEMP_CITY %>% filter(Year!=2000) try(rm(TEMP_CITY,TEMP_COUNTY,TEMP)) ####################County and City Population Data for 1980-1990 -if(!file.exists("./Data/Pop_1980s.xls")){download.file('http://eadiv.state.wy.us/pop/C&SC8090.xls',"./Data/Pop_1980s.xls")} -TEMP <- read_xls("Data/Pop_1980s.xls",skip=2)[-1:-4,] +POP_FILE_1980 <- paste0(SAVE_LOC_RAW_POP ,"/Pop_1980s.xls") +if(!file.exists(POP_FILE_1980)){download.file('http://eadiv.state.wy.us/pop/C&SC8090.xls',POP_FILE_1980)} + +TEMP <- read_xls(POP_FILE_1980,skip=2)[-1:-4,] colnames(TEMP)[1] <- "County" TEMP <- TEMP[2:which(TEMP[,1]=="Upton"),1:(min(which(is.na(TEMP[2,])))-1)] TEMP <- pivot_longer(TEMP,all_of(colnames(TEMP)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=as.numeric(Population)) @@ -162,8 +169,10 @@ COUNTY_POP <- rbind(TEMP_COUNTY,COUNTY_POP) try(rm(TEMP_CITY,TEMP_COUNTY,TEMP)) ####################County Population Data for 1970-1980 -if(!file.exists("./Data/Pop_1970s.xls")){download.file('http://eadiv.state.wy.us/pop/Cnty7080.xls',"./Data/Pop_1970s.xls")} -TEMP <- read_xls("Data/Pop_1970s.xls",skip=2)[-1:-4,] +POP_FILE_1970 <- paste0(SAVE_LOC_RAW_POP ,"/Pop_1970s.xls") +if(!file.exists(POP_FILE_1970)){download.file('http://eadiv.state.wy.us/pop/Cnty7080.xls',POP_FILE_1970)} + +TEMP <- read_xls(POP_FILE_1970,skip=2)[-1:-4,] colnames(TEMP)[1] <- "County" TEMP <- TEMP[1:which(TEMP[,1]=="Weston"),] TEMP <- pivot_longer(TEMP,all_of(colnames(TEMP)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=as.numeric(Population)) @@ -197,123 +206,23 @@ try(rm(TEMP)) colnames(TEMP ) <- c("Year","City","Population") TEMP <- as_tibble(TEMP) CITY_POP <- rbind(TEMP,CITY_POP) %>% arrange(City,Year) + #Remove empty values, ensure all numeric values are not saved as characters + CITY_POP <- CITY_POP %>% filter(!is.na(Population) ) %>% mutate(Population=parse_number(Population),Year=parse_number(Year)) #Add Other Data -COUNTY_POP <- COUNTY_POP %>% mutate(Year=as.integer(Year)) %>% unique +COUNTY_POP <- COUNTY_POP %>% mutate(Year=as.numeric(Year)) %>% unique WY_COUNTY_DATA_SET <- COUNTY_POP %>% left_join(WY_COUNTY_DATA_SET ) %>% mutate(Population=as.numeric(Population)) %>% unique ###Save Population Results -write_csv(CITY_POP,file="./Data/Cleaned_Data/Wyoming_City_Population.csv") -write_csv(WY_COUNTY_DATA_SET ,file="./Data/Cleaned_Data/Wyoming_County_Population.csv") -saveRDS(CITY_POP,file="./Data/Cleaned_Data/Wyoming_City_Population.Rds") -saveRDS(WY_COUNTY_DATA_SET ,file="./Data/Cleaned_Data/Wyoming_County_Population.Rds") + if(!exists("SAVE_LOC_POP")){SAVE_LOC_POP <-"./Data/Cleaned_Data/Population_Data"} + CSV_SAVE <- paste0(SAVE_LOC_POP,"/CSV") + RDS_SAVE <- paste0(SAVE_LOC_POP,"/RDS") + dir.create(CSV_SAVE, recursive = TRUE, showWarnings = FALSE) + dir.create(RDS_SAVE, recursive = TRUE, showWarnings = FALSE) + +saveRDS(CITY_POP,paste0(RDS_SAVE,"/All_Wyoming_City_Populations.Rds" )) +write_csv(CITY_POP,paste0(CSV_SAVE,"/All_Wyoming_City_Populations.csv" )) + +saveRDS(WY_COUNTY_DATA_SET,paste0(RDS_SAVE,"/All_Wyoming_County_Populations.Rds" )) +write_csv(WY_COUNTY_DATA_SET,paste0(CSV_SAVE,"/All_Wyoming_County_Populations.csv" )) -###################Demographics -if(!file.exists("./Data/Demo_Single_Year_2020s.xls")){download.file('http://eadiv.state.wy.us/Pop/CO_SYASEX24.xlsx',"./Data/Demo_Single_Year_2020s.xls")} -TEMP <- read_xlsx("./Data/Demo_Single_Year_2020s.xls",skip=2)[,-1] -TEMP <- TEMP[1:(min(which(is.na(TEMP[,1])))-1),] -TEMP <- TEMP[!grepl("Base",TEMP$YEAR,ignore.case=TRUE),] #There are two population values provided. I believe one is the census baseline, and one is a estimate in July. Keep the later estimate, to line up with the same seasonal collection pattern of the rest of the data -TEMP$YEAR <- year(as.Date(substr((TEMP$YEAR),1,8),format="%m/%d/%Y")) -colnames(TEMP) <- c("County","Year","Age","Number","Num_Male","Num_Female") -TEMP$County <- gsub(" County","",TEMP$County,ignore.case=TRUE) -DEM_2020 <- TEMP %>% select(-Number) -###Demogrpahics all -DEM_DATA <- read_delim("Data/County_Demographics_Census/wy.1969_2023.singleages.through89.90plus.txt",delim=" ",col_names=c("ID","VALUES"),col_types=list('c','c')) -DEM_DATA$Year <- as.integer(substr(DEM_DATA$ID,1,4)) -DEM_DATA$fips<- substr(DEM_DATA$ID,7,11) -COUNTY_LIST <- read_csv("https://github.com/kjhealy/fips-codes/raw/refs/heads/master/county_fips_master.csv",col_types=list('c','c')) %>% filter(state_abbr=="WY") %>% select(fips,County=county_name) %>% mutate(County=gsub(" ","_",gsub(" County","",County,ignore.case=TRUE))) -DEM_DATA <- DEM_DATA %>% left_join(COUNTY_LIST) %>% select(-fips) -#16=3 -DEM_DATA$Sex <- ifelse(substr(DEM_DATA$VALUES,3,3)==1,"Male","Female") -DEM_DATA$Age <- parse_number(substr(DEM_DATA$VALUES,4,5)) -DEM_DATA$Number <- parse_number(substr(DEM_DATA$VALUES,6,14)) -DEM_DATA <- DEM_DATA %>% select(-ID,-VALUES) - - -DEM_DATA <- DEM_DATA %>% group_by(Year,County,Sex,Age) %>% summarize(Number=sum(Number)) %>% ungroup()#Aggregate to sex and age level - -#The Wyoming census data seems newer than this data set from SEER cancer data source. Drop any of these records that overlap with the Wyoming data before merging. Arrange so the column order is the same between the two data sets, so they can be easily bound together. -DEM_DATA <- pivot_wider(DEM_DATA,names_from=Sex,values_from=Number) %>% rename(Num_Female=Female,Num_Male=Male) %>% select(colnames(DEM_2020)) %>% filter(Year% unique -DEM_DATA <- rbind(DEM_2020,DEM_DATA) %>% ungroup %>% arrange(Year,Age) %>% unique - -###Save demographic data set - LIN_DEM <- DEM_DATA %>% filter(County=='Lincoln') - write_csv(LIN_DEM,file="./Data/Cleaned_Data/Lincoln_Demographic_Data.csv") - write_csv(DEM_DATA,file="./Data/Cleaned_Data/Wyoming_County_Demographic_Data.csv") - saveRDS(LIN_DEM,file="./Data/Cleaned_Data/Lincoln_Demographic_Data.Rds") - saveRDS(DEM_DATA,file="./Data/Cleaned_Data/Wyoming_County_Demographic_Data.Rds") - -#########################################Mortality Rate -GET_MORTALITY_DATA <- function(FILE,SEX,LOWER_AGE,UPPER_AGE){ - #Create clean mortality rate data - #Data gathered from https://hdpulse.nimhd.nih.gov/data-portal/mortality/table?cod=247&cod_options=cod_15&ratetype=aa&ratetype_options=ratetype_2&race=00&race_options=race_6&sex=2&sex_options=sex_3&age=177&age_options=age_11&ruralurban=0&ruralurban_options=ruralurban_3&yeargroup=5&yeargroup_options=year5yearmort_1&statefips=56&statefips_options=area_states&county=56000&county_options=counties_wyoming&comparison=counties_to_us&comparison_options=comparison_counties&radio_comparison=areas&radio_comparison_options=cods_or_areas - - NAMES <- c("County","FIPS","Death_Rate","Lower_Rate","Upper_Rate","Deaths","Trend_Category","Trend","Lower_Trend","Upper_Trend") - DF <- read_csv(FILE,skip=5,col_names=NAMES,col_types=list('c',"i",'d','d','d','d','c','d','d','d')) %>% filter(grepl("County|Wyoming",County)|County=="United States") %>% mutate(Rate_SD=(Upper_Rate-Lower_Rate)/(2*1.96),Trend_SD=(Upper_Trend-Lower_Trend)/(2*1.96)) %>% select(County,Death_Rate,Rate_SD,Trend,Trend_SD) - DF$County <- gsub(" County","",DF$County,ignore.case=TRUE) - DF[,-1] <- DF[,-1]/100000 - WYOMING_TREND <- pull(DF[DF$County=="Wyoming",],"Trend") - US_TREND <- pull(DF[DF$County=="United States",],"Trend") - - WYOMING_RATE <- pull(DF[DF$County=="Wyoming",],"Death_Rate") - US_RATE <- pull(DF[DF$County=="United States",],"Death_Rate") - - DF$Imparted_Trend <- FALSE - if(is.na(WYOMING_TREND)){ - DF[1,4:5] <- DF[2,4:5] - DF[1,6] <- TRUE - } - - DF$Imparted_Rate <- FALSE - if(is.na(WYOMING_RATE)){ - DF[1,4:5] <- DF[2,2:3] - DF[1,6] <- TRUE - } - WYOMING_BASELINE_TREND <-cbind (DF[1,4:5] ,TRUE) - WYOMING_BASELINE_RATE <-DF[1,2:3] - - for(i in 3:nrow(DF)){ - #Impart any missing trends based on higher levels - if(is.na(pull(DF[i,],"Trend"))){ DF[i,4:6] <- WYOMING_BASELINE_TREND} - #Impart any missing death rates based on higher levels - if(is.na(pull(DF[i,],"Death_Rate"))){ - DF[i,2:3] <- WYOMING_BASELINE_RATE - DF[i,"Imparted_Rate"] <- TRUE - } - } - DF$Sex <- SEX - DF$Min_Age <- LOWER_AGE - DF$Max_Age <- UPPER_AGE - DF <- DF %>% select(County,Sex,Min_Age,Max_Age,Death_Rate,Rate_SD,Imparted_Rate,everything()) - return(DF) -} -MORTALITY_DATA_ALL <- rbind( -GET_MORTALITY_DATA("Data/Mortality_Rates/Female/A_Under1.csv","Female",0,0), -GET_MORTALITY_DATA("Data/Mortality_Rates/Female/B_1_9.csv","Female",1,9), -GET_MORTALITY_DATA("Data/Mortality_Rates/Female/C_10_19.csv","Female",10,19), -GET_MORTALITY_DATA("Data/Mortality_Rates/Female/D_20_39.csv","Female",20,39), -GET_MORTALITY_DATA("Data/Mortality_Rates/Female/E_40_64.csv","Female",40,64), -GET_MORTALITY_DATA("Data/Mortality_Rates/Female/G_65_74.csv","Female",65,74), -GET_MORTALITY_DATA("Data/Mortality_Rates/Female/H_75_84.csv","Female",75,84), -GET_MORTALITY_DATA("Data/Mortality_Rates/Female/I_85+.csv","Female",85,Inf), -GET_MORTALITY_DATA("Data/Mortality_Rates/Male/A_Under1.csv","Male",0,0), -GET_MORTALITY_DATA("Data/Mortality_Rates/Male/B_1_9.csv","Male",1,9), -GET_MORTALITY_DATA("Data/Mortality_Rates/Male/C_10_19.csv","Male",10,19), -GET_MORTALITY_DATA("Data/Mortality_Rates/Male/D_20_39.csv","Male",20,39), -GET_MORTALITY_DATA("Data/Mortality_Rates/Male/E_40_64.csv","Male",40,64), -GET_MORTALITY_DATA("Data/Mortality_Rates/Male/G_65_74.csv","Male",65,74), -GET_MORTALITY_DATA("Data/Mortality_Rates/Male/H_75_84.csv","Male",75,84), -GET_MORTALITY_DATA("Data/Mortality_Rates/Male/I_85+.csv","Male",85,Inf) - -) -LIN_MORTALITY <- MORTALITY_DATA_ALL %>% filter(County=="Lincoln") -###Save Mortality Rate data set - write_csv(LIN_MORTALITY,file="./Data/Cleaned_Data/Lincoln_Mortality_Rate.csv") - write_csv(MORTALITY_DATA_ALL,file="./Data/Cleaned_Data/Not_Used/Wyoming_County_Mortality_Rate.csv") - saveRDS(LIN_MORTALITY,file="./Data/Cleaned_Data/Lincoln_Mortality_Rate.Rds") - saveRDS(MORTALITY_DATA_ALL,file="./Data/Cleaned_Data/Not_Used/Wyoming_County_Mortality_Rate.Rds") - -#Clean all data from memory. Will be loaded in any paticular script as needed -rm(list = ls()) -gc() - diff --git a/Scripts/2_Download_and_Process_Demographic_Data.r b/Scripts/2_Download_and_Process_Demographic_Data.r new file mode 100644 index 0000000..b7e111d --- /dev/null +++ b/Scripts/2_Download_and_Process_Demographic_Data.r @@ -0,0 +1,44 @@ +library(tidyverse) +library(readxl) + +###################Demographics +if(!file.exists("./Data/Demo_Single_Year_2020s.xls")){download.file('http://eadiv.state.wy.us/Pop/CO_SYASEX24.xlsx',"./Data/Demo_Single_Year_2020s.xls")} +TEMP <- read_xlsx("./Data/Demo_Single_Year_2020s.xls",skip=2)[,-1] +TEMP <- TEMP[1:(min(which(is.na(TEMP[,1])))-1),] +TEMP <- TEMP[!grepl("Base",TEMP$YEAR,ignore.case=TRUE),] #There are two population values provided. I believe one is the census baseline, and one is a estimate in July. Keep the later estimate, to line up with the same seasonal collection pattern of the rest of the data +TEMP$YEAR <- year(as.Date(substr((TEMP$YEAR),1,8),format="%m/%d/%Y")) +colnames(TEMP) <- c("County","Year","Age","Number","Num_Male","Num_Female") +TEMP$County <- gsub(" County","",TEMP$County,ignore.case=TRUE) +DEM_2020 <- TEMP %>% select(-Number) +###Demographics all +DEM_DATA <- read_delim("Data/County_Demographics_Census/wy.1969_2023.singleages.through89.90plus.txt",delim=" ",col_names=c("ID","VALUES"),col_types=list('c','c')) +DEM_DATA$Year <- as.integer(substr(DEM_DATA$ID,1,4)) +DEM_DATA$fips<- substr(DEM_DATA$ID,7,11) +COUNTY_LIST <- read_csv("https://github.com/kjhealy/fips-codes/raw/refs/heads/master/county_fips_master.csv",col_types=list('c','c')) %>% filter(state_abbr=="WY") %>% select(fips,County=county_name) %>% mutate(County=gsub(" ","_",gsub(" County","",County,ignore.case=TRUE))) +DEM_DATA <- DEM_DATA %>% left_join(COUNTY_LIST) %>% select(-fips) +#16=3 +DEM_DATA$Sex <- ifelse(substr(DEM_DATA$VALUES,3,3)==1,"Male","Female") +DEM_DATA$Age <- parse_number(substr(DEM_DATA$VALUES,4,5)) +DEM_DATA$Number <- parse_number(substr(DEM_DATA$VALUES,6,14)) +DEM_DATA <- DEM_DATA %>% select(-ID,-VALUES) +DEM_DATA <- DEM_DATA %>% group_by(Year,County,Sex,Age) %>% summarize(Number=sum(Number)) %>% ungroup()#Aggregate to sex and age level + +#The Wyoming census data seems newer than this data set from SEER cancer data source. Drop any of these records that overlap with the Wyoming data before merging. Arrange so the column order is the same between the two data sets, so they can be easily bound together. +DEM_DATA <- pivot_wider(DEM_DATA,names_from=Sex,values_from=Number) %>% rename(Num_Female=Female,Num_Male=Male) %>% select(colnames(DEM_2020)) %>% filter(Year% unique +DEM_DATA <- rbind(DEM_2020,DEM_DATA) %>% ungroup %>% arrange(Year,Age) %>% unique %>% mutate(Region=County) %>% select(County,Region,Year,Age,Num_Male,Num_Female) #Add a region which in this case is just the county. For Lincoln cities and other areas are analysed +###Save demographic data set + if(!exists("SAVE_DEMO_LOC")){SAVE_DEMO_LOC <-"./Data/Cleaned_Data/Demographic_Sex_Age_Data"} + CSV_SAVE <- paste0(SAVE_DEMO_LOC,"/CSV") + RDS_SAVE <- paste0(SAVE_DEMO_LOC,"/RDS") + + #Save files for all county demographics + dir.create(CSV_SAVE , recursive = TRUE, showWarnings = FALSE) + dir.create(RDS_SAVE , recursive = TRUE, showWarnings = FALSE) + saveRDS(DEM_DATA,paste0(RDS_SAVE,"/All_Wyoming_Counties_Demographics.Rds" )) + write_csv(DEM_DATA,paste0(CSV_SAVE,"/All_Wyoming_Counties_Demographics.csv" )) + + #Save a file that is just Lincoln County, this just speeds up code so you do not always need to filter for 'Lincoln' + LIN_DEM <- DEM_DATA %>% filter(County=='Lincoln') + saveRDS(LIN_DEM,paste0(RDS_SAVE,"/Full_Lincoln_County_Demographics.Rds" )) + write_csv(LIN_DEM,paste0(CSV_SAVE,"/Full_Lincoln_County_Demographics.csv" )) + diff --git a/Scripts/3_Process_Existing_NIH_Mortality_Data.r b/Scripts/3_Process_Existing_NIH_Mortality_Data.r new file mode 100644 index 0000000..49e0397 --- /dev/null +++ b/Scripts/3_Process_Existing_NIH_Mortality_Data.r @@ -0,0 +1,99 @@ +#########################################Mortality Rate +library(tidyverse) +###Create Location to Save raw data sets + if(!exists("SAVE_LOC_RAW")){SAVE_LOC_RAW <-"./Data/Raw_Data/"} + dir.create(SAVE_LOC_RAW, recursive = TRUE, showWarnings = FALSE) +GET_MORTALITY_DATA <- function(FILE,SEX,LOWER_AGE,UPPER_AGE){ + #Create clean mortality rate data + #Data gathered from https://hdpulse.nimhd.nih.gov/data-portal/mortality/table?cod=247&cod_options=cod_15&ratetype=aa&ratetype_options=ratetype_2&race=00&race_options=race_6&sex=2&sex_options=sex_3&age=177&age_options=age_11&ruralurban=0&ruralurban_options=ruralurban_3&yeargroup=5&yeargroup_options=year5yearmort_1&statefips=56&statefips_options=area_states&county=56000&county_options=counties_wyoming&comparison=counties_to_us&comparison_options=comparison_counties&radio_comparison=areas&radio_comparison_options=cods_or_areas + + NAMES <- c("County","FIPS","Death_Rate","Lower_Rate","Upper_Rate","Deaths","Trend_Category","Trend","Lower_Trend","Upper_Trend") + DF <- read_csv(FILE,skip=5,col_names=NAMES,col_types=list('c',"i",'d','d','d','d','c','d','d','d')) %>% filter(grepl("County|Wyoming",County)|County=="United States") %>% mutate(Rate_SD=(Upper_Rate-Lower_Rate)/(2*1.96),Trend_SD=(Upper_Trend-Lower_Trend)/(2*1.96)) %>% select(County,Death_Rate,Rate_SD,Trend,Trend_SD) + DF$County <- gsub(" County","",DF$County,ignore.case=TRUE) + DF[,-1] <- DF[,-1]/100000 + WYOMING_TREND <- pull(DF[DF$County=="Wyoming",],"Trend") + US_TREND <- pull(DF[DF$County=="United States",],"Trend") + + WYOMING_RATE <- pull(DF[DF$County=="Wyoming",],"Death_Rate") + US_RATE <- pull(DF[DF$County=="United States",],"Death_Rate") + + DF$Imparted_Trend <- FALSE + if(is.na(WYOMING_TREND)){ + DF[1,4:5] <- DF[2,4:5] + DF[1,6] <- TRUE + } + + DF$Imparted_Rate <- FALSE + if(is.na(WYOMING_RATE)){ + DF[1,4:5] <- DF[2,2:3] + DF[1,6] <- TRUE + } + WYOMING_BASELINE_TREND <-cbind (DF[1,4:5] ,TRUE) + WYOMING_BASELINE_RATE <-DF[1,2:3] + + for(i in 3:nrow(DF)){ + #Impart any missing trends based on higher levels + if(is.na(pull(DF[i,],"Trend"))){ DF[i,4:6] <- WYOMING_BASELINE_TREND} + #Impart any missing death rates based on higher levels + if(is.na(pull(DF[i,],"Death_Rate"))){ + DF[i,2:3] <- WYOMING_BASELINE_RATE + DF[i,"Imparted_Rate"] <- TRUE + } + } + DF$Sex <- SEX + DF$Min_Age <- LOWER_AGE + DF$Max_Age <- UPPER_AGE + DF <- DF %>% select(County,Sex,Min_Age,Max_Age,Death_Rate,Rate_SD,Imparted_Rate,everything()) + return(DF) +} +SAVE_LOC_RAW_MORT <- paste0(SAVE_LOC_RAW,"Mortality_Rates/") +RAW_MORTALITY_LOC_FEMALE <- paste0(SAVE_LOC_RAW_MORT,"Female/") +RAW_MORTALITY_LOC_MALE <- paste0(SAVE_LOC_RAW_MORT,"Male/") +#Gather all mortality Records +MORTALITY_DATA_ALL <- rbind( + GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"A_Under1.csv"),"Female",0,0), + GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"B_1_9.csv"),"Female",1,9), + GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"C_10_19.csv"),"Female",10,19), + GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"D_20_39.csv"),"Female",20,39), + GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"E_40_64.csv"),"Female",40,64), + GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"G_65_74.csv"),"Female",65,74), + GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"H_75_84.csv"),"Female",75,84), + GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"I_85+.csv"),"Female",85,Inf), + GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"A_Under1.csv"),"Male",0,0), + GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"B_1_9.csv"),"Male",1,9), + GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"C_10_19.csv"),"Male",10,19), + GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"D_20_39.csv"),"Male",20,39), + GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"E_40_64.csv"),"Male",40,64), + GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"G_65_74.csv"),"Male",65,74), + GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"H_75_84.csv"),"Male",75,84), + GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"I_85+.csv"),"Male",85,Inf) +) +LIN_MORTALITY <- MORTALITY_DATA_ALL %>% filter(County=="Lincoln") +##Save the mortality data + if(!exists("SAVE_MORT_LOC")){SAVE_MORT_LOC <-"./Data/Cleaned_Data/Mortality_Rate_Data"} + CSV_SAVE <- paste0(SAVE_MORT_LOC ,"/CSV") + RDS_SAVE <- paste0(SAVE_MORT_LOC,"/RDS") + #Save files for all county demographics + dir.create(CSV_SAVE , recursive = TRUE, showWarnings = FALSE) + dir.create(RDS_SAVE , recursive = TRUE, showWarnings = FALSE) + saveRDS(MORTALITY_DATA_ALL,paste0(RDS_SAVE,"/All_Wyoming_Counties_Mortality_Rates.Rds" )) + write_csv(MORTALITY_DATA_ALL,paste0(CSV_SAVE,"/All_Wyoming_Counties_Mortality_Rates.csv" )) + saveRDS(LIN_MORTALITY,paste0(RDS_SAVE,"/Lincoln_County_Mortality_Rates.Rds" )) + write_csv(LIN_MORTALITY,paste0(CSV_SAVE,"/Lincoln_County_Mortality_Rates.csv" )) +#Create a short readme files to make the data sources more clear + #Save a raw data readme +sink(file=paste0(SAVE_LOC_RAW,"/README_MORTALITY_DATA.txt"),append=FALSE) + cat("Data files gathered manually from:\n") + cat("https://hdpulse.nimhd.nih.gov/data-portal/mortality/table?cod=247&cod_options=cod_15&ratetype=aa&ratetype_options=ratetype_2&race=00&race_options=race_6&sex=2&sex_options=sex_3&age=177&age_options=age_11&ruralurban=0&ruralurban_options=ruralurban_3&yeargroup=5&yeargroup_options=year5yearmort_1&statefips=56&statefips_options=area_states&county=56000&county_options=counties_wyoming&comparison=counties_to_us&comparison_options=comparison_counties&radio_comparison=areas&radio_comparison_options=cods_or_areas\n") + cat("\nEach file is single age group, so age weighting does not apply despite the variable names\n") +sink() + + + + #Save a processed raw data readme +sink(file=paste0(SAVE_MORT_LOC,"/README_MORTALITY_DATA.txt"),append=FALSE) + cat("This is a processed file of NIH death rates by age and county. Data files first gathered manually from:\n") + cat("\nhttps://hdpulse.nimhd.nih.gov/data-portal/mortality/table?cod=247&cod_options=cod_15&ratetype=aa&ratetype_options=ratetype_2&race=00&race_options=race_6&sex=2&sex_options=sex_3&age=177&age_options=age_11&ruralurban=0&ruralurban_options=ruralurban_3&yeargroup=5&yeargroup_options=year5yearmort_1&statefips=56&statefips_options=area_states&county=56000&county_options=counties_wyoming&comparison=counties_to_us&comparison_options=comparison_counties&radio_comparison=areas&radio_comparison_options=cods_or_areas\n") + cat("\nThese manually saved files are in the raw data directory. Each file is single age group, so age weighting does not apply despite the variable names\n") +sink() + diff --git a/Scripts/Get_Sim_Intial_Demographic_Data.r b/Scripts/Get_Sim_Intial_Demographic_Data.r new file mode 100644 index 0000000..ccbc34e --- /dev/null +++ b/Scripts/Get_Sim_Intial_Demographic_Data.r @@ -0,0 +1,44 @@ +##Extract Lincoln county ACS data including separate indicators for if a census tract is in Kemmerer, or any other part of the county. Population by Age-Sex in each year for each tract. +GET_ACS_LIN_DATA <- function(ACS_YEAR,ACS_CODES=CODES,CENSUS_TRACTS=PROJ_TRACTS){ + AGE_DATA <- get_acs(geography="tract",year=ACS_YEAR,variables=ACS_CODES$variable,state='WY',county='lincoln') %>% mutate(se=moe/1.64) %>% left_join(ACS_CODES %>% mutate(variable=gsub('E','',variable))) %>% select(-NAME,-moe) %>% left_join(CENSUS_TRACTS) %>% mutate(IN_KEM=ifelse(is.na(IN_KEM),0,1)) %>% rename(Population=estimate) %>% select(-variable,-GEOID) %>% select(Sex,Min_Age,Max_Age,Med_Age,IN_KEM,Population,se) %>% filter(!(Min_Age==0& Max_Age==Inf)) + AGE_DATA <- AGE_DATA %>% group_by(Sex,Min_Age,Max_Age,Med_Age,IN_KEM)%>% summarize(Population=sum(Population)) %>% ungroup + #Add Descriptive age category to a clean graph + AGE_DATA$Ages <- paste(AGE_DATA$Min_Age,"to",AGE_DATA$Max_Age) + AGE_DATA[AGE_DATA$Max_Age==Inf,"Ages"] <- "85+" + AGE_DATA[AGE_DATA$Med_Age==18,"Ages"] <- "18" + AGE_DATA[AGE_DATA$Med_Age==20,"Ages"] <- "20" + AGE_DATA[AGE_DATA$Med_Age==21,"Ages"] <- "21" + AGE_DATA[AGE_DATA$Min_Age==0,"Ages"] <- "Under 5" + #Turn the ages into factors to keep the correct order in graphs + #Add the percent of total relative population in the region + ORD <- AGE_DATA %>% select(Min_Age,Ages) %>% unique %>% arrange(Min_Age) %>% pull(Ages) %>% unique + AGE_DATA$Ages <- factor(AGE_DATA$Ages,levels=ORD) + AGE_DATA$Year <- ACS_YEAR + AGE_DATA <-AGE_DATA %>% select(Year,Sex,Ages,IN_KEM,everything()) + return(AGE_DATA) +} +####################Loop to create data for each year. Projecting distribution of ages into Kemmerer, and returning a demographic distribution for Kemmerer/Diamondville in each year of the ACS (currently 2009 to 2023) +MAKE_KEM_DEMO_DATA_YEAR <- function(ACS_YEAR){ + LIN_DEMOGRAPHICS <- readRDS("Data/Cleaned_Data/Lincoln_Demographic_Data.Rds") %>% filter(Year==ACS_YEAR) + AGE_DATA <- GET_ACS_LIN_DATA(ACS_YEAR) + for(i in 1:nrow(AGE_DATA)){ + if(i==1 & exists("RES")){rm(RES)} + C_DEMO <- AGE_DATA[i,] + C_SEX <- C_DEMO$Sex + if(C_DEMO$Min_Age==C_DEMO$Max_Age){C_AGE_YEARLY_DATA <- LIN_DEMOGRAPHICS %>% filter(Age==C_DEMO$Min_Age)}else{ + C_AGE_YEARLY_DATA <- LIN_DEMOGRAPHICS %>% filter(Age>=C_DEMO$Min_Age,Age<=C_DEMO$Max_Age) + } + #Extract only the current sex being applied + if(C_SEX=='Female'){C_AGE_YEARLY_DATA <- C_AGE_YEARLY_DATA %>% select(Age,Num_Female) %>% rename(POP=Num_Female)}else{C_AGE_YEARLY_DATA <- C_AGE_YEARLY_DATA %>% select(Age,Num_Male) %>% rename(POP=Num_Male)} + C_AGE_YEARLY_DATA$Sex <- C_SEX + C_AGE_YEARLY_DATA$PER <- C_AGE_YEARLY_DATA$POP/sum(C_AGE_YEARLY_DATA$POP) + C_RES <- C_AGE_YEARLY_DATA%>% left_join(C_DEMO) %>% mutate(Population=PER*Population) %>% select(Age,Sex,IN_KEM,Population) + if(i==1){RES <- C_RES}else{RES <- rbind(RES,C_RES)} + } + RES$Year <- ACS_YEAR + RES$County <- "Lincoln" + RES <- RES %>% select(County,IN_KEM,Year,Age,Sex,Population) + return(RES) +} + + diff --git a/Scripts/Load_ACS_Census_Data.r b/Scripts/Load_ACS_Census_Data.r index 494adaa..d2a07d7 100644 --- a/Scripts/Load_ACS_Census_Data.r +++ b/Scripts/Load_ACS_Census_Data.r @@ -1,9 +1,11 @@ +#library(tidyverse);setwd("../") library(tidycensus) library(zipcodeR) +source("Scripts/Get_Sim_Intial_Demographic_Data.r") +if(!exists("SAVE_DEMO_LOC")){SAVE_DEMO_LOC <-"./Data/Cleaned_Data/Demographic_Sex_Age_Data"} +ACS_END_YEAR <- 2023 #most recent in package as of Nov 4 2025 #Pull the relevant median age variables the value moe (margine of error) can be converted to standard error, following the link below #https://www.census.gov/content/dam/Census/library/publications/2018/acs/acs_general_handbook_2018_ch08.pdf -ACS_YEAR <- 2023 #most recent as of Nov 4 2025 -source("Scripts/Downshift_Population_Functions.r") #Packages to instal on computer if zipcodeR won't install #Sudo apt install libssl-dev libudunits2-dev libabsl-dev libcurl4-openssl-dev libgdal-dev cmake libfontconfig1-dev libharfbuzz-dev libfribidi-dev #install.packages("zipcodeR") @@ -13,28 +15,32 @@ source("Scripts/Downshift_Population_Functions.r") #census_api_key(KEY, install = TRUE) PROJ_TRACTS <- get_tracts(search_city('Kemmerer','WY')$zipcode) %>% full_join(get_tracts(search_city('Diamondville','WY')$zipcode)) PROJ_TRACTS <- PROJ_TRACTS %>% select(GEOID) %>% mutate('IN_KEM'=1) %>% mutate(GEOID=as.character(GEOID)) - -MED_AGE_VAR <- cbind(c('B01002_001E','B01002_002E','B01002_003E'),c('Median_Age','Median_Age_Male','Median_Age_Female')) %>% as_tibble %>% rename(variable=V1,Data_Type=V2) - ###Load data manually created which links vairable names to sex-age census data CODES <- read_csv("Data/API_CENSUS_CODES.csv",skip=1) %>% mutate(Med_Age=(Min_Age+Max_Age)/2) %>% rename(variable=Code) #Testing age Comparison between the two ###Extract census data for all tracts in Lincoln county, clean up the data, and indicate if the tract is in Kemmerer/Diamondvile or not. -AGE_DATA <- get_acs(geography="tract",year=ACS_YEAR,variables=CODES$variable,state='WY',county='lincoln') %>% mutate(se=moe/1.64) %>% left_join(CODES %>% mutate(variable=gsub('E','',variable))) %>% select(-NAME,-moe) %>% left_join(PROJ_TRACTS) %>% mutate(IN_KEM=ifelse(is.na(IN_KEM),0,1)) %>% rename(Population=estimate) %>% select(-variable,-GEOID) %>% select(Sex,Min_Age,Max_Age,Med_Age,IN_KEM,Population,se) %>% filter(!(Min_Age==0& Max_Age==Inf)) -AGE_DATA <- AGE_DATA %>% group_by(Sex,Min_Age,Max_Age,Med_Age,IN_KEM)%>% summarize(Population=sum(Population)) %>% ungroup -#Add Descriptive age category to a clean graph -AGE_DATA$Ages <- paste(AGE_DATA$Min_Age,"to",AGE_DATA$Max_Age) -AGE_DATA[AGE_DATA$Max_Age==Inf,"Ages"] <- "85+" -AGE_DATA[AGE_DATA$Med_Age==18,"Ages"] <- "18" -AGE_DATA[AGE_DATA$Med_Age==20,"Ages"] <- "20" -AGE_DATA[AGE_DATA$Med_Age==21,"Ages"] <- "21" -AGE_DATA[AGE_DATA$Min_Age==0,"Ages"] <- "Under 5" -#Turn the ages into factors to keep the correct order in graphs -#Add the percent of total relative population in the region -ORD <- AGE_DATA %>% select(Min_Age,Ages) %>% unique %>% arrange(Min_Age) %>% pull(Ages) %>% unique -AGE_DATA$Ages <- factor(AGE_DATA$Ages,levels=ORD) -AGE_DATA$Year <- ACS_YEAR -AGE_DATA <-AGE_DATA %>% select(Year,Sex,Ages,IN_KEM,everything()) -if(!exists('ACS_FILE_LOC')){ACS_FILE_LOC <- "Data/Cleaned_Data/ACS_Census_Demographic_Data.Rds"} -saveRDS(AGE_DATA,ACS_FILE_LOC) +DEMO_DATA_ALL <- do.call(rbind,lapply(2009:ACS_END_YEAR,MAKE_KEM_DEMO_DATA_YEAR)) +ORIG_DEMO_DATA_ALL <- DEMO_DATA_ALL +#Merge Age of 85+ into one group, because NIH death spastics end in this age category so it is extra overhead to keep each year +YOUNG <- DEMO_DATA_ALL %>% filter(Age<85) +OLD <- DEMO_DATA_ALL %>% filter(Age>=85) %>% group_by(County,IN_KEM,Year,Sex) %>% summarize(Age=85,Population=sum(Population)) %>% ungroup +DEMO_DATA_ALL <- rbind(YOUNG,OLD) %>% arrange(County,Year,IN_KEM,Sex) +DEMO_DATA_ALL <- DEMO_DATA_ALL %>% mutate(Population=round(Population)) %>% pivot_wider(values_from=Population,names_from=Sex,names_prefix="Num_") +OTHER_LIN_DEMO_DATA <- DEMO_DATA_ALL %>% filter(IN_KEM==0) %>% rename(Region=IN_KEM) %>% mutate(Region='Lincoln_Other') +KEM_DEMO_DATA <- DEMO_DATA_ALL %>% filter(IN_KEM==1) %>% rename(Region=IN_KEM) %>% mutate(Region='Kemmerer') + + + +####Save results +CSV_SAVE <- paste0(SAVE_DEMO_LOC,"/CSV") +RDS_SAVE <- paste0(SAVE_DEMO_LOC,"/RDS") +dir.create(CSV_SAVE , recursive = TRUE, showWarnings = FALSE) +dir.create(RDS_SAVE , recursive = TRUE, showWarnings = FALSE) +#Kemmerer Save +saveRDS(KEM_DEMO_DATA,paste0(RDS_SAVE,"/Kemmerer_Diamondville_Demographics.Rds" )) +write_csv(KEM_DEMO_DATA,paste0(CSV_SAVE,"/Kemmerer_Diamondville_Demographics.csv" )) +#Other Lincoln area (not Kemmerer) save +saveRDS(OTHER_LIN_DEMO_DATA,paste0(RDS_SAVE,"/Other_Lincoln_Demographics.Rds" )) +write_csv(OTHER_LIN_DEMO_DATA,paste0(CSV_SAVE,"/Other_Lincoln_Demographics.csv" )) +