93 lines
6.4 KiB
R
93 lines
6.4 KiB
R
library(tidyverse)
|
|
library(readxl)
|
|
#setwd("../")
|
|
################### Demographics

# Set up saving locations.
# Each location keeps a value that already exists in the session (for example
# one set by a calling script) and only falls back to the default otherwise.
if (!exists("SAVE_LOC_RAW")) {
  SAVE_LOC_RAW <- "./Data/Raw_Data/"
}
RAW_DEMO_LOC <- paste0(SAVE_LOC_RAW, "Demographics/")
dir.create(RAW_DEMO_LOC, recursive = TRUE, showWarnings = FALSE)

if (!exists("SAVE_LOC_RAW_POP")) {
  SAVE_LOC_RAW_POP <- "./Data/Raw_Data/Population"
}
dir.create(SAVE_LOC_RAW_POP, recursive = TRUE, showWarnings = FALSE)

# Demographic reference data
if (!exists("SAVE_LOC_REF")) {
  SAVE_LOC_REF <- paste0(RAW_DEMO_LOC, "Reference_Material_for_Demographics/")
}
dir.create(SAVE_LOC_REF, recursive = TRUE, showWarnings = FALSE)

# Start a README file for the raw downloaded demographic data.
# append = FALSE truncates any README left over from a previous run.
# cat() writes directly to the file, so console output is never redirected
# (a sink() left open by an error would silently swallow later output).
cat(
  "Demographic data used to find age and sex distribution of county populations\n",
  file = paste0(RAW_DEMO_LOC, "/README_DEMOGRAPHIC_DATA.txt"),
  append = FALSE
)
|
|
##### Gather data

# Wyoming state workbook: county population by single year of age and sex.
# The local copy keeps its ".xls" name (the README below refers to it by that
# name) even though the download is an .xlsx workbook; read_xlsx() is called
# explicitly, so the extension is not used to pick the reader.
WY_DEMO_URL <- "http://eadiv.state.wy.us/Pop/CO_SYASEX24.xlsx"
C_FILE_PATH <- paste0(RAW_DEMO_LOC, "Wyoming_County_Sex_by_Year_of_Age_Demographic_Data_2020_2024.xls")

# Download only when no local copy exists.
if (!file.exists(C_FILE_PATH)) {
  download_status <- tryCatch(
    # mode = "wb": an .xlsx is a binary (zip) file; the default text mode
    # corrupts it on Windows.
    download.file(WY_DEMO_URL, C_FILE_PATH, mode = "wb"),
    error = function(e) {
      message("Download of ", WY_DEMO_URL, " failed: ", conditionMessage(e))
      1L
    }
  )
  # Remove a partial file so a later run does not mistake it for a valid cache.
  if (!identical(as.integer(download_status), 0L)) {
    unlink(C_FILE_PATH)
  }
}
if (!file.exists(C_FILE_PATH)) {
  stop(
    "No local copy of the Wyoming demographic workbook at ", C_FILE_PATH,
    " and the download did not succeed.",
    call. = FALSE
  )
}

# Append to the README for clarity of data sources
cat(
  "\n\n 1) Annual County Resident Population Estimates by Single Year of Age and Sex: April 1, 2020 to July 1, 2024
Wyoming_County_Sex_by_Year_of_Age_Demographic_Data_2020_2024.xls comes from http://eadiv.state.wy.us/Pop/CO_SYASEX24.xlsx
Data Type: Excel table
Data Source: Wyoming Department of Information and Economic Development (WIEAD)
Original Source: Census Bureau, Population Division, June 2025",
  file = paste0(RAW_DEMO_LOC, "/README_DEMOGRAPHIC_DATA.txt"),
  append = TRUE
)

# Skip the first two rows of the sheet and drop the first column.
TEMP <- read_xlsx(C_FILE_PATH, skip = 2)[, -1]

# Keep only the rows above the first missing value in the first column
# (presumably trailing notes below the table -- TODO confirm against the file).
# When there is no missing value every row is kept.
blank_rows <- which(is.na(TEMP[[1]]))
if (length(blank_rows) > 0) {
  TEMP <- TEMP[seq_len(blank_rows[1] - 1), ]
}

# There are two population values provided. I believe one is the census
# baseline, and one is an estimate in July. Keep the later estimate, to line up
# with the same seasonal collection pattern of the rest of the data.
TEMP <- TEMP[!grepl("Base", TEMP$YEAR, ignore.case = TRUE), ]

# The first 8 characters are parsed as a month/day/year date and reduced to
# the year. lubridate is called by namespace so this does not depend on
# tidyverse attaching it.
# NOTE(review): a two-digit month or day would not fit in 8 characters --
# confirm the workbook only uses dates such as "7/1/2020".
TEMP$YEAR <- lubridate::year(as.Date(substr(TEMP$YEAR, 1, 8), format = "%m/%d/%Y"))

colnames(TEMP) <- c("County", "Year", "Age", "Number", "Num_Male", "Num_Female")

# Strip the " County" suffix and replace spaces with underscores, matching the
# county naming applied to the SEER data later in this script, so each county
# has a single spelling once the two sources are combined.
TEMP$County <- gsub(" ", "_", gsub(" County", "", TEMP$County, ignore.case = TRUE))

# Drop the combined total; only the per-sex counts are carried forward.
DEM_2020 <- TEMP %>% select(-Number)
|
|
### Demographics all

# SEER single-year-of-age county population file for Wyoming, with a local CSV
# copy used as a fallback when the download is not possible.
SEER_DATA_URL <- "https://seer.cancer.gov/popdata/yr1969_2023.singleages.through89.90plus/wy.1969_2023.singleages.through89.90plus.txt.gz"
SEER_DATA_LOC <- paste0(RAW_DEMO_LOC, "Wyoming_County_Sex_by_Year_of_Age_Demographic_Data_1969_2023.csv")

# tryCatch() returns NULL on failure. This replaces try() + exists("DEM_DATA"),
# which could pick up a DEM_DATA object left over from an earlier run in the
# same session and write it over the cached raw file.
DEM_DATA <- tryCatch(
  read_delim(
    SEER_DATA_URL,
    delim = " ",
    col_names = c("ID", "VALUES"),
    col_types = list("c", "c")
  ),
  error = function(e) {
    message(
      "SEER download failed (", conditionMessage(e), "); using local copy ",
      SEER_DATA_LOC
    )
    NULL
  }
)

if (is.null(DEM_DATA)) {
  if (!file.exists(SEER_DATA_LOC)) {
    stop(
      "SEER data could not be downloaded and no local copy exists at ",
      SEER_DATA_LOC,
      call. = FALSE
    )
  }
  # Read every column as character, like the fresh download, because the coded
  # fields are sliced by character position later in the script.
  DEM_DATA <- read_csv(SEER_DATA_LOC, col_types = cols(.default = col_character()))
} else {
  # Refresh the local copy with the newly downloaded raw records.
  write_csv(DEM_DATA, SEER_DATA_LOC)
}

# Append to the README for clarity of data sources
cat(
  "\n\n 2) Wyoming County-Level Population Files - Single-year Age Groups 1969 to 2023
Wyoming_County_Sex_by_Year_of_Age_Demographic_Data_1969_2023.csv comes from https://seer.cancer.gov/popdata/yr1969_2023.singleages.through89.90plus/wy.1969_2023.singleages.through89.90plus.txt.gz
Data Type: gunzip (gz) file with coded data
Data Source: The National Cancer Institute surveillance, Epidemiology, and End Results Program
Original Source: Census Bureau (data processed for yearly estimates)
Note: See https://seer.cancer.gov/popdata/download.html for more data information and other State data. Raw data is parsed using the described data format (first number year, then fips code etc.).
The required fips codes are provided in the reference folder from https://github.com/kjhealy/fips-codes/raw/refs/heads/master/county_fips_master.csv.",
  file = paste0(RAW_DEMO_LOC, "/README_DEMOGRAPHIC_DATA.txt"),
  append = TRUE
)
|
|
|
|
# Decode the fixed-position SEER records.
# Characters 1-4 of ID are the year; characters 7-11 are the county FIPS code.
DEM_DATA$Year <- as.numeric(substr(DEM_DATA$ID, 1, 4))
DEM_DATA$fips <- substr(DEM_DATA$ID, 7, 11)

# FIPS -> county name lookup for Wyoming, with a local CSV copy as fallback.
# The download is wrapped in tryCatch() so the fallback is actually reachable;
# previously a failed download stopped the script before the fallback ran.
FIPS_LOC <- paste0(SAVE_LOC_REF, "fips_codes.csv")
COUNTY_LIST <- tryCatch(
  read_csv(
    "https://github.com/kjhealy/fips-codes/raw/refs/heads/master/county_fips_master.csv",
    col_types = cols(.default = col_character())
  ) %>%
    filter(state_abbr == "WY") %>%
    select(fips, County = county_name) %>%
    # Drop the " County" suffix and use underscores instead of spaces.
    mutate(County = gsub(" ", "_", gsub(" County", "", County, ignore.case = TRUE))),
  error = function(e) {
    message(
      "FIPS code download failed (", conditionMessage(e), "); using local copy ",
      FIPS_LOC
    )
    NULL
  }
)

if (is.null(COUNTY_LIST)) {
  # fips must be read as character so it can be joined to DEM_DATA$fips.
  COUNTY_LIST <- read_csv(FIPS_LOC, col_types = cols(.default = col_character()))
} else {
  write_csv(COUNTY_LIST, FIPS_LOC)
}

# Attach county names; fips is only needed as the join key.
DEM_DATA <- DEM_DATA %>%
  left_join(COUNTY_LIST, by = "fips") %>%
  select(-fips)

# Original author note: 16=3 (meaning not evident from this script -- TODO confirm)
# Character 3 of VALUES is the sex code ("1" is treated as Male, anything else
# as Female), characters 4-5 the age and characters 6-14 the population count.
DEM_DATA$Sex <- ifelse(substr(DEM_DATA$VALUES, 3, 3) == "1", "Male", "Female")
DEM_DATA$Age <- parse_number(substr(DEM_DATA$VALUES, 4, 5))
DEM_DATA$Number <- parse_number(substr(DEM_DATA$VALUES, 6, 14))
DEM_DATA <- DEM_DATA %>% select(-ID, -VALUES)

# Aggregate to sex and age level: the leading characters of VALUES are
# discarded above, so several records can share one Year/County/Sex/Age key.
DEM_DATA <- DEM_DATA %>%
  group_by(Year, County, Sex, Age) %>%
  summarize(Number = sum(Number), .groups = "drop")
|
|
|
|
# The Wyoming census data seems newer than the SEER data set, so any SEER
# records that overlap with the Wyoming years are dropped before merging.
# Columns are put in the same order as DEM_2020 so the two tables can be bound
# together directly.
DEM_DATA <- DEM_DATA %>%
  pivot_wider(names_from = Sex, values_from = Number) %>%
  rename(Num_Female = Female, Num_Male = Male) %>%
  select(colnames(DEM_2020)) %>%
  filter(Year < min(DEM_2020$Year)) %>%
  unique()

# Stack both sources and order by year and age. A Region column is added which
# in this case is just the county; for Lincoln, cities and other areas are
# analysed.
DEM_DATA <- rbind(DEM_2020, DEM_DATA) %>%
  ungroup() %>%
  arrange(Year, Age) %>%
  unique() %>%
  select(County, Year, Age, Num_Male, Num_Female) %>%
  mutate(Region = County, .after = County)
|
|
### Save demographic data set

# Output root; a value already defined in the session takes precedence.
if (!exists("SAVE_DEMO_LOC")) {
  SAVE_DEMO_LOC <- "./Data/Cleaned_Data/Demographic_Sex_Age_Data"
}
CSV_SAVE <- file.path(SAVE_DEMO_LOC, "CSV")
RDS_SAVE <- file.path(SAVE_DEMO_LOC, "RDS")
dir.create(CSV_SAVE, recursive = TRUE, showWarnings = FALSE)
dir.create(RDS_SAVE, recursive = TRUE, showWarnings = FALSE)

# Save files for all county demographics
saveRDS(DEM_DATA, file.path(RDS_SAVE, "All_Wyoming_Counties_Demographics.Rds"))
write_csv(DEM_DATA, file.path(CSV_SAVE, "All_Wyoming_Counties_Demographics.csv"))

# Save a file that is just Lincoln County. This just speeds up code so you do
# not always need to filter for 'Lincoln'.
LIN_DEM <- DEM_DATA %>% filter(County == "Lincoln")
saveRDS(LIN_DEM, file.path(RDS_SAVE, "Full_Lincoln_County_Demographics.Rds"))
write_csv(LIN_DEM, file.path(CSV_SAVE, "Full_Lincoln_County_Demographics.csv"))

# Stamp this run in the population README (appended, so earlier stamps remain).
# NOTE(review): the stamp goes to the Population README rather than the
# demographic README written earlier in this script -- confirm that is intended.
# cat() writes directly to the file, so console output is never redirected.
run_datetime <- format(Sys.time(), "%Y-%m-%d %H:%M:%S")
cat(
  paste0("\n--- Run Date: ", run_datetime, " ---\n"),
  file = file.path(SAVE_LOC_RAW_POP, "README_POPULATION_DATA.txt"),
  append = TRUE
)
|
|
|