115 lines
8.0 KiB
R
115 lines
8.0 KiB
R
library(tidyverse)
|
|
library(readxl)
|
|
#setwd("../")
|
|
###################Demographics
|
|
#Set up saving locations
|
|
if(!exists("SAVE_LOC_RAW")){SAVE_LOC_RAW <-"./Data/Raw_Data/"}
|
|
RAW_DEMO_LOC <- paste0(SAVE_LOC_RAW,"Demographics/")
|
|
dir.create(RAW_DEMO_LOC, recursive = TRUE, showWarnings = FALSE)
|
|
if(!exists("SAVE_LOC_RAW_POP")){SAVE_LOC_RAW_POP <-"./Data/Raw_Data/Population"}
|
|
dir.create(SAVE_LOC_RAW_POP, recursive = TRUE, showWarnings = FALSE)
|
|
|
|
#Demographic Reference data
|
|
if(!exists("SAVE_LOC_REF")){SAVE_LOC_REF <-paste0(RAW_DEMO_LOC,"Reference_Material_for_Demographics/")}
|
|
dir.create(SAVE_LOC_REF, recursive = TRUE, showWarnings = FALSE)
|
|
|
|
|
|
#Start a README file for the raw downloaded demographic data
|
|
sink(file=paste0(RAW_DEMO_LOC,"/README_DEMOGRAPHIC_DATA.txt"),append=FALSE)
|
|
cat("Demographic data used to find age and sex distribution of county populations\n")
|
|
sink()
|
|
#####Gather data
|
|
C_FILE_PATH <- paste0(RAW_DEMO_LOC,"Wyoming_County_Sex_by_Year_of_Age_Demographic_Data_2020_2024.xls")
|
|
try(if(!file.exists(C_FILE_PATH)){download.file("http://eadiv.state.wy.us/Pop/CO_SYASEX24.xlsx",C_FILE_PATH)})
|
|
#Append to the README for clarity of data sources
|
|
sink(file=paste0(RAW_DEMO_LOC,"/README_DEMOGRAPHIC_DATA.txt"),append=TRUE)
|
|
cat("\n\n 1) Annual County Resident Population Estimates by Single Year of Age and Sex: April 1, 2020 to July 1, 2024
|
|
Wyoming_County_Sex_by_Year_of_Age_Demographic_Data_2020_2024.xls comes from http://eadiv.state.wy.us/Pop/CO_SYASEX24.xlsx
|
|
Data Type: Excel table
|
|
Data Source: Wyoming Department of Information and Economic Development (WIEAD)
|
|
Original Source: Census Bureau, Population Division, June 2025")
|
|
sink()
|
|
|
|
TEMP <- read_xlsx(C_FILE_PATH,skip=2)[,-1]
|
|
TEMP <- TEMP[1:(min(which(is.na(TEMP[,1])))-1),]
|
|
TEMP <- TEMP[!grepl("Base",TEMP$YEAR,ignore.case=TRUE),] #There are two population values provided. I believe one is the census baseline, and one is a estimate in July. Keep the later estimate, to line up with the same seasonal collection pattern of the rest of the data
|
|
TEMP$YEAR <- year(as.Date(substr((TEMP$YEAR),1,8),format="%m/%d/%Y"))
|
|
colnames(TEMP) <- c("County","Year","Age","Number","Num_Male","Num_Female")
|
|
TEMP$County <- gsub(" County","",TEMP$County,ignore.case=TRUE)
|
|
DEM_2020 <- TEMP %>% select(-Number)
|
|
###Demographics all
|
|
try(DEM_DATA <- read_delim('https://seer.cancer.gov/popdata/yr1969_2023.singleages.through89.90plus/wy.1969_2023.singleages.through89.90plus.txt.gz',delim=" ",col_names=c("ID","VALUES"),col_types=list('c','c')))
|
|
SEER_DATA_LOC <- paste0(RAW_DEMO_LOC,"Wyoming_County_Sex_by_Year_of_Age_Demographic_Data_1969_2023.csv")
|
|
if(!exists("DEM_DATA")){DEM_DATA <- read_csv(SEER_DATA_LOC )}else{ write_csv(DEM_DATA,SEER_DATA_LOC)}
|
|
|
|
#Append to the README for clarity of data sources
|
|
sink(file=paste0(RAW_DEMO_LOC,"/README_DEMOGRAPHIC_DATA.txt"),append=TRUE)
|
|
cat("\n\n 2) Wyoming County-Level Population Files - Single-year Age Groups 1969 to 2023
|
|
Wyoming_County_Sex_by_Year_of_Age_Demographic_Data_1969_2023.csv comes from https://seer.cancer.gov/popdata/yr1969_2023.singleages.through89.90plus/wy.1969_2023.singleages.through89.90plus.txt.gz
|
|
Data Type: gunzip (gz) file with coded data
|
|
Data Source: The National Cancer Institute surveillance, Epidemiology, and End Results Program
|
|
Original Source: Census Bureau (data processed for yearly estimates)
|
|
Note: See https://seer.cancer.gov/popdata/download.html for more data information and other State data. Raw data is parsed using the described data format (first number year, then fips code etc.).
|
|
The required fips codes are provided in the reference folder from https://github.com/kjhealy/fips-codes/raw/refs/heads/master/county_fips_master.csv.")
|
|
sink()
|
|
|
|
DEM_DATA$Year <- as.numeric(substr(DEM_DATA$ID,1,4))
|
|
DEM_DATA$fips<- substr(DEM_DATA$ID,7,11)
|
|
COUNTY_LIST <- read_csv("https://github.com/kjhealy/fips-codes/raw/refs/heads/master/county_fips_master.csv",col_types=list('c','c')) %>% filter(state_abbr=="WY") %>% select(fips,County=county_name) %>% mutate(County=gsub(" ","_",gsub(" County","",County,ignore.case=TRUE)))
|
|
FIPS_LOC <- paste0(SAVE_LOC_REF,"fips_codes.csv")
|
|
if(!exists("COUNTY_LIST")){ COUNTY_LIST <- read_csv(FIPS_LOC)}else{ write_csv(COUNTY_LIST,FIPS_LOC)}
|
|
|
|
DEM_DATA <- DEM_DATA %>% left_join(COUNTY_LIST) %>% select(-fips)
|
|
#16=3
|
|
DEM_DATA$Sex <- ifelse(substr(DEM_DATA$VALUES,3,3)==1,"Male","Female")
|
|
DEM_DATA$Age <- parse_number(substr(DEM_DATA$VALUES,4,5))
|
|
DEM_DATA$Number <- parse_number(substr(DEM_DATA$VALUES,6,14))
|
|
DEM_DATA <- DEM_DATA %>% select(-ID,-VALUES)
|
|
DEM_DATA <- DEM_DATA %>% group_by(Year,County,Sex,Age) %>% summarize(Number=sum(Number)) %>% ungroup()#Aggregate to sex and age level
|
|
|
|
#The Wyoming census data seems newer than this data set from SEER cancer data source. Drop any of these records that overlap with the Wyoming data before merging. Arrange so the column order is the same between the two data sets, so they can be easily bound together.
|
|
DEM_DATA <- pivot_wider(DEM_DATA,names_from=Sex,values_from=Number) %>% rename(Num_Female=Female,Num_Male=Male) %>% select(colnames(DEM_2020)) %>% filter(Year<min(DEM_2020$Year)) %>% unique
|
|
DEM_DATA <- rbind(DEM_2020,DEM_DATA) %>% ungroup %>% arrange(Year,Age) %>% unique %>% mutate(Region=County) %>% select(County,Region,Year,Age,Num_Male,Num_Female) #Add a region which in this case is just the county. For Lincoln cities and other areas are analysed
|
|
###Save demographic data set
|
|
if(!exists("SAVE_DEMO_LOC")){SAVE_DEMO_LOC <-"./Data/Cleaned_Data/Demographic_Sex_Age_Data"}
|
|
CSV_SAVE <- paste0(SAVE_DEMO_LOC,"/CSV")
|
|
RDS_SAVE <- paste0(SAVE_DEMO_LOC,"/RDS")
|
|
#Location for starting demographic data that initiates a simulation (different periods included for robustness checks)
|
|
if(!exists("STARTING_SIM_DATA_SAVE_LOC")){STARTING_SIM_DATA_SAVE_LOC <-"./Data/Intermediate_Inputs/Starting_Demographic_Data_Sets_of_Monte_Carlo/"}
|
|
dir.create(STARTING_SIM_DATA_SAVE_LOC, recursive = TRUE, showWarnings = FALSE)
|
|
|
|
|
|
#Save files for all county demographic cleaned data
|
|
dir.create(CSV_SAVE , recursive = TRUE, showWarnings = FALSE)
|
|
dir.create(RDS_SAVE , recursive = TRUE, showWarnings = FALSE)
|
|
saveRDS(DEM_DATA,paste0(RDS_SAVE,"/All_Wyoming_Counties_Demographics.Rds" ))
|
|
write_csv(DEM_DATA,paste0(CSV_SAVE,"/All_Wyoming_Counties_Demographics.csv" ))
|
|
|
|
#Save a file that is just Lincoln County, this just speeds up code so you do not always need to filter for 'Lincoln'
|
|
LIN_DEM <- DEM_DATA %>% filter(County=='Lincoln')
|
|
MOST_RECENT_YEAR <- max(LIN_DEM$Year)
|
|
LIN_MAT <- LIN_DEM %>% filter(Year==MOST_RECENT_YEAR) %>% select(Num_Male,Num_Female) %>% as.matrix
|
|
rownames(LIN_MAT) <- 0:85
|
|
saveRDS(LIN_MAT,paste0(STARTING_SIM_DATA_SAVE_LOC,MOST_RECENT_YEAR,"_Starting_Lincoln_County_Demographics_Matrix.Rds") )
|
|
|
|
LIN_MAT_OLD <- LIN_DEM %>% filter(Year==2016) %>% select(Num_Male,Num_Female) %>% as.matrix #2016 to match the mid value of Kemmerer data years
|
|
LIN_MAT_OLD[86,] <-colSums(LIN_MAT_OLD[ 86:nrow(LIN_MAT_OLD ),])
|
|
LIN_MAT_OLD <- LIN_MAT_OLD[1:86,] #Combine 85+ into one group to match the death stastics data
|
|
rownames(LIN_MAT_OLD) <- 0:85
|
|
saveRDS(LIN_MAT_OLD,paste0(STARTING_SIM_DATA_SAVE_LOC,"2016_Starting_Lincoln_County_Demographics_Matrix.Rds" ))
|
|
|
|
LIN_MAT_VERY_OLD <- LIN_DEM %>% filter(Year==2025-40) %>% select(Num_Male,Num_Female) %>% as.matrix #1985 to match 40 years of estiamtes or the end date.
|
|
LIN_MAT_VERY_OLD[86,] <-colSums(LIN_MAT_VERY_OLD[ 86:nrow(LIN_MAT_VERY_OLD ),])
|
|
LIN_MAT_VERY_OLD <- LIN_MAT_VERY_OLD[1:86,] #Combine 85+ into one group to match the death stastics data
|
|
rownames(LIN_MAT_VERY_OLD) <- 0:85
|
|
saveRDS(LIN_MAT_VERY_OLD,paste0(STARTING_SIM_DATA_SAVE_LOC,"1985_Starting_Lincoln_County_Demographics_Matrix.Rds" ))
|
|
|
|
|
|
saveRDS(LIN_DEM,paste0(RDS_SAVE,"/Full_Lincoln_County_Demographics.Rds" ))
|
|
write_csv(LIN_DEM,paste0(CSV_SAVE,"/Full_Lincoln_County_Demographics.csv" ))
|
|
run_datetime <- format(Sys.time(), "%Y-%m-%d %H:%M:%S")
|
|
sink(file=paste0(SAVE_LOC_RAW_POP,"/README_POPULATION_DATA.txt"),append=TRUE)
|
|
cat(paste0("\n--- Run Date: ", run_datetime, " ---\n"))
|
|
sink()
|
|
|