Population_Study/Scripts/1B_Process_Existing_NIH_Mortality_Data.r

99 lines
6.3 KiB
R

#########################################Mortality Rate
library(tidyverse)
###Create Location to Save raw data sets
if(!exists("SAVE_LOC_RAW")){SAVE_LOC_RAW <-"./Data/Raw_Data/"}
dir.create(SAVE_LOC_RAW, recursive = TRUE, showWarnings = FALSE)
GET_MORTALITY_DATA <- function(FILE,SEX,LOWER_AGE,UPPER_AGE){
#Create clean mortality rate data
#Data gathered from https://hdpulse.nimhd.nih.gov/data-portal/mortality/table?cod=247&cod_options=cod_15&ratetype=aa&ratetype_options=ratetype_2&race=00&race_options=race_6&sex=2&sex_options=sex_3&age=177&age_options=age_11&ruralurban=0&ruralurban_options=ruralurban_3&yeargroup=5&yeargroup_options=year5yearmort_1&statefips=56&statefips_options=area_states&county=56000&county_options=counties_wyoming&comparison=counties_to_us&comparison_options=comparison_counties&radio_comparison=areas&radio_comparison_options=cods_or_areas
NAMES <- c("County","FIPS","Death_Rate","Lower_Rate","Upper_Rate","Deaths","Trend_Category","Trend","Lower_Trend","Upper_Trend")
DF <- read_csv(FILE,skip=5,col_names=NAMES,col_types=list('c',"i",'d','d','d','d','c','d','d','d')) %>% filter(grepl("County|Wyoming",County)|County=="United States") %>% mutate(Rate_SD=(Upper_Rate-Lower_Rate)/(2*1.96),Trend_SD=(Upper_Trend-Lower_Trend)/(2*1.96)) %>% select(County,Death_Rate,Rate_SD,Trend,Trend_SD)
DF$County <- gsub(" County","",DF$County,ignore.case=TRUE)
DF[,-1] <- DF[,-1]/100000
WYOMING_TREND <- pull(DF[DF$County=="Wyoming",],"Trend")
US_TREND <- pull(DF[DF$County=="United States",],"Trend")
WYOMING_RATE <- pull(DF[DF$County=="Wyoming",],"Death_Rate")
US_RATE <- pull(DF[DF$County=="United States",],"Death_Rate")
DF$Imparted_Trend <- FALSE
if(is.na(WYOMING_TREND)){
DF[1,4:5] <- DF[2,4:5]
DF[1,6] <- TRUE
}
DF$Imparted_Rate <- FALSE
if(is.na(WYOMING_RATE)){
DF[1,4:5] <- DF[2,2:3]
DF[1,6] <- TRUE
}
WYOMING_BASELINE_TREND <-cbind (DF[1,4:5] ,TRUE)
WYOMING_BASELINE_RATE <-DF[1,2:3]
for(i in 3:nrow(DF)){
#Impart any missing trends based on higher levels
if(is.na(pull(DF[i,],"Trend"))){ DF[i,4:6] <- WYOMING_BASELINE_TREND}
#Impart any missing death rates based on higher levels
if(is.na(pull(DF[i,],"Death_Rate"))){
DF[i,2:3] <- WYOMING_BASELINE_RATE
DF[i,"Imparted_Rate"] <- TRUE
}
}
DF$Sex <- SEX
DF$Min_Age <- LOWER_AGE
DF$Max_Age <- UPPER_AGE
DF <- DF %>% select(County,Sex,Min_Age,Max_Age,Death_Rate,Rate_SD,Imparted_Rate,everything())
return(DF)
}
SAVE_LOC_RAW_MORT <- paste0(SAVE_LOC_RAW,"Mortality_Rates/")
RAW_MORTALITY_LOC_FEMALE <- paste0(SAVE_LOC_RAW_MORT,"Female/")
RAW_MORTALITY_LOC_MALE <- paste0(SAVE_LOC_RAW_MORT,"Male/")
#Gather all mortality Records
MORTALITY_DATA_ALL <- rbind(
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"A_Under1.csv"),"Female",0,0),
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"B_1_9.csv"),"Female",1,9),
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"C_10_19.csv"),"Female",10,19),
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"D_20_39.csv"),"Female",20,39),
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"E_40_64.csv"),"Female",40,64),
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"G_65_74.csv"),"Female",65,74),
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"H_75_84.csv"),"Female",75,84),
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"I_85+.csv"),"Female",85,Inf),
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"A_Under1.csv"),"Male",0,0),
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"B_1_9.csv"),"Male",1,9),
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"C_10_19.csv"),"Male",10,19),
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"D_20_39.csv"),"Male",20,39),
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"E_40_64.csv"),"Male",40,64),
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"G_65_74.csv"),"Male",65,74),
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"H_75_84.csv"),"Male",75,84),
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"I_85+.csv"),"Male",85,Inf)
)
LIN_MORTALITY <- MORTALITY_DATA_ALL %>% filter(County=="Lincoln")
##Save the mortality data
if(!exists("SAVE_MORT_LOC")){SAVE_MORT_LOC <-"./Data/Cleaned_Data/Mortality_Rate_Data"}
CSV_SAVE <- paste0(SAVE_MORT_LOC ,"/CSV")
RDS_SAVE <- paste0(SAVE_MORT_LOC,"/RDS")
#Save files for all county demographics
dir.create(CSV_SAVE , recursive = TRUE, showWarnings = FALSE)
dir.create(RDS_SAVE , recursive = TRUE, showWarnings = FALSE)
saveRDS(MORTALITY_DATA_ALL,paste0(RDS_SAVE,"/All_Wyoming_Counties_Mortality_Rates.Rds" ))
write_csv(MORTALITY_DATA_ALL,paste0(CSV_SAVE,"/All_Wyoming_Counties_Mortality_Rates.csv" ))
saveRDS(LIN_MORTALITY,paste0(RDS_SAVE,"/Lincoln_County_Mortality_Rates.Rds" ))
write_csv(LIN_MORTALITY,paste0(CSV_SAVE,"/Lincoln_County_Mortality_Rates.csv" ))
#Create a short readme files to make the data sources more clear
#Save a raw data readme
sink(file=paste0(SAVE_LOC_RAW_MORT ,"/README_MORTALITY_DATA.txt"),append=FALSE)
cat("Data files gathered manually from:
National Institute of Health HDPules: An Ecosystem of Health Disparities and Minority Health Resources at
https://hdpulse.nimhd.nih.gov/data-portal/mortality/table?cod=247&cod_options=cod_15&ratetype=aa&ratetype_options=ratetype_2&race=00&race_options=race_6&sex=2&sex_options=sex_3&age=177&age_options=age_11&ruralurban=0&ruralurban_options=ruralurban_3&yeargroup=5&yeargroup_options=year5yearmort_1&statefips=56&statefips_options=area_states&county=56000&county_options=counties_wyoming&comparison=counties_to_us&comparison_options=comparison_counties&radio_comparison=areas&radio_comparison_options=cods_or_areas
Each file is single age group, so age weighting does not apply despite the variable names. Each age group file is named sequentially with a prefix “A_” for the first age group and a prefix “I_” for the oldest. There are separate directories for each sex. The cleaning script uses this directory structure to extract the right files and merge them into one data set.
These files must be manually downloaded because there is a filter feature on the web page that is used to select the county and age. While there may be a way to scrape the data with code the trade off on my time was not worth it. Future runs will need to check these records, and can download the files to match this directory structure, in order to process a update in death rates.
Valid data as of Nov 6 2025 Alex Gebben")
sink()