Population_Study/Scripts/1C_Download_and_Process_Demographic_Data.r

library(tidyverse)
library(readxl)
#setwd("../")
###################Demographics
	#Set up saving locations
	if(!exists("SAVE_LOC_RAW")){SAVE_LOC_RAW <-"./Data/Raw_Data/"}
	RAW_DEMO_LOC <- paste0(SAVE_LOC_RAW,"Demographics/")
	dir.create(RAW_DEMO_LOC, recursive = TRUE, showWarnings = FALSE)
	if(!exists("SAVE_LOC_RAW_POP")){SAVE_LOC_RAW_POP <-"./Data/Raw_Data/Population"}
	dir.create(SAVE_LOC_RAW_POP, recursive = TRUE, showWarnings = FALSE)

	#Demographic Reference data
	if(!exists("SAVE_LOC_REF")){SAVE_LOC_REF <-paste0(RAW_DEMO_LOC,"Reference_Material_for_Demographics/")}
	dir.create(SAVE_LOC_REF, recursive = TRUE, showWarnings = FALSE)


	#Start a README file for the raw downloaded demographic data
		sink(file=paste0(RAW_DEMO_LOC,"/README_DEMOGRAPHIC_DATA.txt"),append=FALSE)
			cat("Demographic data used to find age and sex distribution of county populations\n")
		sink()
#####Gather data
	C_FILE_PATH <- paste0(RAW_DEMO_LOC,"Wyoming_County_Sex_by_Year_of_Age_Demographic_Data_2020_2024.xls")
	try(if(!file.exists(C_FILE_PATH)){download.file("http://eadiv.state.wy.us/Pop/CO_SYASEX24.xlsx",C_FILE_PATH)})
	#Append to the README for clarity of data sources
sink(file=paste0(RAW_DEMO_LOC,"/README_DEMOGRAPHIC_DATA.txt"),append=TRUE)
	cat("\n\n	 1) Annual County Resident Population Estimates by Single Year of Age and Sex: April 1, 2020 to July 1, 2024
	 	   Wyoming_County_Sex_by_Year_of_Age_Demographic_Data_2020_2024.xls comes from http://eadiv.state.wy.us/Pop/CO_SYASEX24.xlsx
	       	   Data Type: Excel table
	       	   Data Source: Wyoming Department of Information and Economic Development (WIEAD)
	     	   Original Source: Census Bureau, Population Division, June 2025")
sink()

TEMP <- read_xlsx(C_FILE_PATH,skip=2)[,-1]
TEMP <- TEMP[1:(min(which(is.na(TEMP[,1])))-1),]
TEMP <- TEMP[!grepl("Base",TEMP$YEAR,ignore.case=TRUE),] #There are two population values provided. I believe one is the census baseline, and one is a estimate in July. Keep the later estimate, to line up with the same seasonal collection pattern of the rest of the data
TEMP$YEAR <- year(as.Date(substr((TEMP$YEAR),1,8),format="%m/%d/%Y"))
colnames(TEMP) <- c("County","Year","Age","Number","Num_Male","Num_Female")
TEMP$County <- gsub(" County","",TEMP$County,ignore.case=TRUE)
DEM_2020 <- TEMP %>% select(-Number)
###Demographics all
try(DEM_DATA <- read_delim('https://seer.cancer.gov/popdata/yr1969_2023.singleages.through89.90plus/wy.1969_2023.singleages.through89.90plus.txt.gz',delim="  ",col_names=c("ID","VALUES"),col_types=list('c','c')))
SEER_DATA_LOC <- paste0(RAW_DEMO_LOC,"Wyoming_County_Sex_by_Year_of_Age_Demographic_Data_1969_2023.csv")
if(!exists("DEM_DATA")){DEM_DATA <- read_csv(SEER_DATA_LOC )}else{ write_csv(DEM_DATA,SEER_DATA_LOC)}

	#Append to the README for clarity of data sources
sink(file=paste0(RAW_DEMO_LOC,"/README_DEMOGRAPHIC_DATA.txt"),append=TRUE)
	cat("\n\n	 2) Wyoming County-Level Population Files - Single-year Age Groups 1969 to 2023
	 	   Wyoming_County_Sex_by_Year_of_Age_Demographic_Data_1969_2023.csv comes from https://seer.cancer.gov/popdata/yr1969_2023.singleages.through89.90plus/wy.1969_2023.singleages.through89.90plus.txt.gz
	       	   Data Type: gunzip (gz) file with coded data
	       	   Data Source: The National Cancer Institute surveillance, Epidemiology, and End Results Program
	     	   Original Source: Census Bureau (data processed for yearly estimates)
		   Note: See https://seer.cancer.gov/popdata/download.html for more data information and other State data. Raw data is parsed using the described data format (first number year, then fips code etc.).
		   The required fips codes are provided in the reference folder from https://github.com/kjhealy/fips-codes/raw/refs/heads/master/county_fips_master.csv.")
sink()

DEM_DATA$Year <- as.numeric(substr(DEM_DATA$ID,1,4))
DEM_DATA$fips<- substr(DEM_DATA$ID,7,11)
COUNTY_LIST <- read_csv("https://github.com/kjhealy/fips-codes/raw/refs/heads/master/county_fips_master.csv",col_types=list('c','c')) %>% filter(state_abbr=="WY") %>% select(fips,County=county_name) %>% mutate(County=gsub(" ","_",gsub(" County","",County,ignore.case=TRUE)))
FIPS_LOC <- paste0(SAVE_LOC_REF,"fips_codes.csv")
if(!exists("COUNTY_LIST")){ COUNTY_LIST <- read_csv(FIPS_LOC)}else{ write_csv(COUNTY_LIST,FIPS_LOC)}

DEM_DATA <- DEM_DATA %>% left_join(COUNTY_LIST) %>% select(-fips)
#16=3
DEM_DATA$Sex <- ifelse(substr(DEM_DATA$VALUES,3,3)==1,"Male","Female")
DEM_DATA$Age <- parse_number(substr(DEM_DATA$VALUES,4,5))
DEM_DATA$Number <- parse_number(substr(DEM_DATA$VALUES,6,14))
DEM_DATA <- DEM_DATA %>% select(-ID,-VALUES)
DEM_DATA <- DEM_DATA %>% group_by(Year,County,Sex,Age) %>% summarize(Number=sum(Number)) %>% ungroup()#Aggregate to sex and age level

#The Wyoming census data seems newer than this data set from SEER cancer data source. Drop any of these records that overlap with the Wyoming data before merging. Arrange so the column order is the same between the two data sets, so they can be easily bound together.
DEM_DATA <- pivot_wider(DEM_DATA,names_from=Sex,values_from=Number) %>% rename(Num_Female=Female,Num_Male=Male) %>% select(colnames(DEM_2020)) %>% filter(Year<min(DEM_2020$Year)) %>% unique
DEM_DATA <- rbind(DEM_2020,DEM_DATA) %>% ungroup %>% arrange(Year,Age)  %>% unique %>% mutate(Region=County) %>% select(County,Region,Year,Age,Num_Male,Num_Female) #Add a region which in this case is just the county. For Lincoln cities and other areas are analysed
###Save demographic data set
	if(!exists("SAVE_DEMO_LOC")){SAVE_DEMO_LOC <-"./Data/Cleaned_Data/Demographic_Sex_Age_Data"}
	CSV_SAVE <- paste0(SAVE_DEMO_LOC,"/CSV")
	RDS_SAVE <- paste0(SAVE_DEMO_LOC,"/RDS")

	#Save files for all county demographics
	dir.create(CSV_SAVE  , recursive = TRUE, showWarnings = FALSE)
	dir.create(RDS_SAVE , recursive = TRUE, showWarnings = FALSE)
	saveRDS(DEM_DATA,paste0(RDS_SAVE,"/All_Wyoming_Counties_Demographics.Rds" ))
	write_csv(DEM_DATA,paste0(CSV_SAVE,"/All_Wyoming_Counties_Demographics.csv" ))

	#Save a file that is just Lincoln County, this just speeds up code so you do not always need to filter for 'Lincoln'
	LIN_DEM <- DEM_DATA %>% filter(County=='Lincoln')
	MOST_RECENT_YEAR <- max(LIN_DEM$Year)
	LIN_MAT <- LIN_DEM %>% filter(Year==MOST_RECENT_YEAR) %>% select(Num_Male,Num_Female) %>% as.matrix
	rownames(LIN_MAT)      	<- 0:85
	saveRDS(LIN_MAT,paste0(RDS_SAVE,paste0("/",MOST_RECENT_YEAR,"_Starting_Lincoln_County_Demographics_Matrix.Rds") ))

	LIN_MAT_OLD <- 	LIN_DEM %>% filter(Year==2016) %>% select(Num_Male,Num_Female) %>% as.matrix #2016 to match the mid value of Kemmerer data years
	LIN_MAT_OLD[86,] <-colSums(LIN_MAT_OLD[ 86:nrow(LIN_MAT_OLD ),])
	LIN_MAT_OLD <- LIN_MAT_OLD[1:86,] #Combine 85+ into one group to match the death stastics data
	rownames(LIN_MAT_OLD)      	<- 0:85
	saveRDS(LIN_MAT_OLD,paste0(RDS_SAVE,"/2016_Starting_Lincoln_County_Demographics_Matrix.Rds" ))

	LIN_MAT_VERY_OLD <- LIN_DEM %>% filter(Year==2025-40) %>% select(Num_Male,Num_Female) %>% as.matrix #1985 to match 40 years of estiamtes or the end date.
	LIN_MAT_VERY_OLD[86,] <-colSums(LIN_MAT_VERY_OLD[ 86:nrow(LIN_MAT_VERY_OLD ),])
	LIN_MAT_VERY_OLD <- LIN_MAT_VERY_OLD[1:86,] #Combine 85+ into one group to match the death stastics data
	rownames(LIN_MAT_VERY_OLD)      	<- 0:85
	saveRDS(LIN_MAT_VERY_OLD,paste0(RDS_SAVE,"/1985_Starting_Lincoln_County_Demographics_Matrix.Rds" ))


	saveRDS(LIN_DEM,paste0(RDS_SAVE,"/Full_Lincoln_County_Demographics.Rds" ))
	write_csv(LIN_DEM,paste0(CSV_SAVE,"/Full_Lincoln_County_Demographics.csv" ))
run_datetime <- format(Sys.time(), "%Y-%m-%d %H:%M:%S")
sink(file=paste0(SAVE_LOC_RAW_POP,"/README_POPULATION_DATA.txt"),append=TRUE)
	cat(paste0("\n--- Run Date: ", run_datetime, " ---\n"))
sink()