45 lines
3.4 KiB
R
45 lines
3.4 KiB
R
library(tidyverse)
|
|
library(readxl)
|
|
|
|
###################Demographics
|
|
if(!file.exists("./Data/Demo_Single_Year_2020s.xls")){download.file('http://eadiv.state.wy.us/Pop/CO_SYASEX24.xlsx',"./Data/Demo_Single_Year_2020s.xls")}
|
|
TEMP <- read_xlsx("./Data/Demo_Single_Year_2020s.xls",skip=2)[,-1]
|
|
TEMP <- TEMP[1:(min(which(is.na(TEMP[,1])))-1),]
|
|
TEMP <- TEMP[!grepl("Base",TEMP$YEAR,ignore.case=TRUE),] #There are two population values provided. I believe one is the census baseline, and one is a estimate in July. Keep the later estimate, to line up with the same seasonal collection pattern of the rest of the data
|
|
TEMP$YEAR <- year(as.Date(substr((TEMP$YEAR),1,8),format="%m/%d/%Y"))
|
|
colnames(TEMP) <- c("County","Year","Age","Number","Num_Male","Num_Female")
|
|
TEMP$County <- gsub(" County","",TEMP$County,ignore.case=TRUE)
|
|
DEM_2020 <- TEMP %>% select(-Number)
|
|
###Demographics all
|
|
DEM_DATA <- read_delim("Data/County_Demographics_Census/wy.1969_2023.singleages.through89.90plus.txt",delim=" ",col_names=c("ID","VALUES"),col_types=list('c','c'))
|
|
DEM_DATA$Year <- as.integer(substr(DEM_DATA$ID,1,4))
|
|
DEM_DATA$fips<- substr(DEM_DATA$ID,7,11)
|
|
COUNTY_LIST <- read_csv("https://github.com/kjhealy/fips-codes/raw/refs/heads/master/county_fips_master.csv",col_types=list('c','c')) %>% filter(state_abbr=="WY") %>% select(fips,County=county_name) %>% mutate(County=gsub(" ","_",gsub(" County","",County,ignore.case=TRUE)))
|
|
DEM_DATA <- DEM_DATA %>% left_join(COUNTY_LIST) %>% select(-fips)
|
|
#16=3
|
|
DEM_DATA$Sex <- ifelse(substr(DEM_DATA$VALUES,3,3)==1,"Male","Female")
|
|
DEM_DATA$Age <- parse_number(substr(DEM_DATA$VALUES,4,5))
|
|
DEM_DATA$Number <- parse_number(substr(DEM_DATA$VALUES,6,14))
|
|
DEM_DATA <- DEM_DATA %>% select(-ID,-VALUES)
|
|
DEM_DATA <- DEM_DATA %>% group_by(Year,County,Sex,Age) %>% summarize(Number=sum(Number)) %>% ungroup()#Aggregate to sex and age level
|
|
|
|
#The Wyoming census data seems newer than this data set from SEER cancer data source. Drop any of these records that overlap with the Wyoming data before merging. Arrange so the column order is the same between the two data sets, so they can be easily bound together.
|
|
DEM_DATA <- pivot_wider(DEM_DATA,names_from=Sex,values_from=Number) %>% rename(Num_Female=Female,Num_Male=Male) %>% select(colnames(DEM_2020)) %>% filter(Year<min(DEM_2020$Year)) %>% unique
|
|
DEM_DATA <- rbind(DEM_2020,DEM_DATA) %>% ungroup %>% arrange(Year,Age) %>% unique %>% mutate(Region=County) %>% select(County,Region,Year,Age,Num_Male,Num_Female) #Add a region which in this case is just the county. For Lincoln cities and other areas are analysed
|
|
###Save demographic data set
|
|
if(!exists("SAVE_DEMO_LOC")){SAVE_DEMO_LOC <-"./Data/Cleaned_Data/Demographic_Sex_Age_Data"}
|
|
CSV_SAVE <- paste0(SAVE_DEMO_LOC,"/CSV")
|
|
RDS_SAVE <- paste0(SAVE_DEMO_LOC,"/RDS")
|
|
|
|
#Save files for all county demographics
|
|
dir.create(CSV_SAVE , recursive = TRUE, showWarnings = FALSE)
|
|
dir.create(RDS_SAVE , recursive = TRUE, showWarnings = FALSE)
|
|
saveRDS(DEM_DATA,paste0(RDS_SAVE,"/All_Wyoming_Counties_Demographics.Rds" ))
|
|
write_csv(DEM_DATA,paste0(CSV_SAVE,"/All_Wyoming_Counties_Demographics.csv" ))
|
|
|
|
#Save a file that is just Lincoln County, this just speeds up code so you do not always need to filter for 'Lincoln'
|
|
LIN_DEM <- DEM_DATA %>% filter(County=='Lincoln')
|
|
saveRDS(LIN_DEM,paste0(RDS_SAVE,"/Full_Lincoln_County_Demographics.Rds" ))
|
|
write_csv(LIN_DEM,paste0(CSV_SAVE,"/Full_Lincoln_County_Demographics.csv" ))
|
|
|