Cleaning up mortality loading

This commit is contained in:
Alex 2025-11-23 16:02:00 -07:00
parent 16cf5be64c
commit 3bf85ce48b
18 changed files with 85 additions and 31 deletions

View File

@ -0,0 +1,13 @@
Data is manually gathered from CDC WONDER data queries.
https://wonder.cdc.gov/. Each file Contains additional information about the sources
A secondary source of data comes from the World Pandemic Uncertainty Index Ahir, Hites, Bloom, Nick and Furceri, Davide, World Pandemic Uncertainty Index [WUPI], retrieved from FRED, Federal Reserve Bank of St. Louis; https://fred.stlouisfed.org/series/WUPI, November 23, 2025.
The data sets available in November 2025, are combined into a single file. Data sets include:
1) The age adjusted (weighted) mortality rates of Lincoln County, Wyoming and the US from three data sets starting in 1979, 2018, and 2020
2) The single year age-sex mortality rate of the US, starting in 2018 compiled yearly. These are suppressed for privacy at any level lower than the nation.
3) The 10 year age bin mortality rates with age adjustment for the U.S. and Wyoming in each year. These are used to append the to the yearly records which exclude 85+ values.
4) The world pandemic uncertainty index as collected from FRED which is used to account for pandemics in the regression, making the age time series stationary.
These are used to project mortality trends over time. In the case of the age adjusted data, this has local trends that can be compared to the national average. The single age-sex data is only at a national level but can be imparted to local levels as a general trend in the distribution of deaths
--- Run Date: 2025-11-23 15:59:28 ---

View File

@ -1,40 +1,14 @@
library(tidyverse)
LIN_1979 <- read_csv("Data/Raw_Data/Mortality_Rates_New/Lincoln_Age_Adjusted_1979-1998.csv") %>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`)%>% mutate(Region='Lincoln')
WY_1979 <- read_csv("Data/Raw_Data/Mortality_Rates_New/Wyoming_Age_Adjusted_1979-1998.csv") %>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`)%>% mutate(Region='Wyoming')
US_1979 <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_Age_Adjusted_1979-1998.csv")%>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`) %>% filter(!is.na(Sex),!is.na(Year)) %>% mutate(Region="US")%>% filter(Year<2018)
LIN_1999 <- read_csv("Data/Raw_Data/Mortality_Rates_New/Lincoln_Age_Adjusted_1999-2020.csv") %>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`)%>% mutate(Region='Lincoln')
WY_1999 <- read_csv("Data/Raw_Data/Mortality_Rates_New/Wyoming_Age_Adjusted_1999-2020.csv") %>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`)%>% mutate(Region='Wyoming') %>% filter(Year<2018)
US_1999 <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_Age_Adjusted_1999-2020.csv")%>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`) %>% filter(!is.na(Sex),!is.na(Year)) %>% mutate(Region="US")%>% filter(Year<2018)
WY_2018 <- read_csv("Data/Raw_Data/Mortality_Rates_New/Wyoming_Age_Adjusted_2018-2023.csv") %>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`) %>% mutate(Region='Wyoming')
US_2018 <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_Age_Adjusted_2018-2023.csv")%>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`) %>% mutate(Region="US")
##No adjustment for later data allowed
LIN_2018<- read_csv("Data/Raw_Data/Mortality_Rates_New/Lincoln_Not_Age_Adjusted_2018-2023.csv") %>% select(Year,Sex,Mort_Rate=`Crude Rate`)%>% mutate(Region='Lincoln')
ADJUST_TERM <- LIN_2018 %>% rename(UNADJUSTED=Mort_Rate) %>% inner_join(LIN_1999) %>% filter(!is.na(Year)) %>% mutate(Ratio=Mort_Rate/UNADJUSTED) %>% group_by(Sex) %>% summarize(Ratio=mean(Ratio))
LIN_2018 <- LIN_2018 %>% filter(Year>2020) %>% left_join(ADJUST_TERM) %>% mutate(Mort_Rate=Mort_Rate*Ratio) %>% select(-Ratio)
DF <- rbind(LIN_1999,LIN_2018,WY_1999,US_1999,US_2018,WY_2018,WY_1979,US_1979,LIN_1979) %>% filter(!is.na(Year),!is.na(Sex))
ggplot(DF,aes(x=Year,y=Mort_Rate,group=Region,color=Region,fill=Region))+geom_point()+geom_smooth(method="lm")+ facet_grid(. ~ Sex)
ggplot(DF,aes(x=Year,y=Mort_Rate,group=Region,color=Region,fill=Region))+geom_point()+geom_smooth(span=0.4)+ facet_grid(. ~ Sex)
ggplot(DF,aes(x=Year,y=Mort_Rate,group=Region,color=Region,fill=Region))+geom_point()+geom_line()+ facet_grid(. ~ Sex)
########################################################ARIMA
PANDIMIC_INDEX <- read_csv("https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23ebf3fb&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=off&txtcolor=%23444444&ts=12&tts=12&width=1320&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=WUPI&scale=left&cosd=1996-01-01&coed=2025-07-01&line_color=%230073e6&link_values=false&line_style=solid&mark_type=none&mw=3&lw=3&ost=-99999&oet=99999&mma=0&fml=a&fq=Annual&fam=sum&fgst=lin&fgsnd=2020-02-01&line_index=1&transformation=lin&vintage_date=2025-11-20&revision_date=2025-11-20&nd=1996-01-01") %>% mutate(Year=year(observation_date)) %>% select(Year,WUPI)%>% mutate(L_WUPI=lag(WUPI),L_TWO_WUPI=lag(WUPI,2))
DF <- DF %>% ungroup%>% group_by(Sex,Region) %>% arrange(Sex,Region,Year) %>% mutate(L_Mort_Rate=lag(Mort_Rate)) %>% ungroup
REG_DATA <- DF %>% pivot_wider(values_from=c(Mort_Rate,L_Mort_Rate),names_from=Region)
REG_DATA <- REG_DATA %>% left_join(PANDIMIC_INDEX) %>% mutate(WUPI=ifelse(is.na(WUPI),0,WUPI),L_WUPI=ifelse(is.na(L_WUPI),0,L_WUPI))
library(fixest)
library(forecast)
########################################################ARIMA
MOD_WOMEN <- feols(Mort_Rate_US~L_Mort_Rate_US+Year+WUPI,REG_DATA %>% filter(Sex=='Female'))
acf(resid(MOD_WOMEN))
MOD_WOMEN
MOD_MEN <- feols(Mort_Rate_US~L_Mort_Rate_US+Year+WUPI,REG_DATA %>% filter(Sex=='Male'))
###Lincoln
REG_DATA
MOD_WOMEN <- feols(Mort_Rate_Lincoln~Mort_Rate_US,REG_DATA %>% filter(Sex=='Female'))
acf(resid(MOD_WOMEN))
MOD_WOMEN
@ -43,7 +17,6 @@ MOD_MEN <- feols(Mort_Rate_US~L_Mort_Rate_US+Year+WUPI,REG_DATA %>% filter(Sex==
acf(resid(MOD_MEN))
plot(resid(MOD_MEN))
plot(predict(MOD_MEN))
library(forecast)
DATA_MEN <- REG_DATA %>% filter(Sex=='Male')
DATA_WOMEN <- REG_DATA %>% filter(Sex=='Female')

View File

@ -3,6 +3,7 @@ Rscript "./Scripts/1A_Download_and_Process_Population_Data.r"
Rscript "./Scripts/1B_Process_Existing_NIH_Mortality_Data.r"
Rscript "./Scripts/1C_Download_and_Process_Demographic_Data.r"
Rscript "./Scripts/1D_Use_ACS_Census_Data_to_Estimate_Kemmerer_Demographics.r"
Rscript "./Scripts/1E_Process_WONDER_Mortality_Data.r"
#Create data sets used in later simulations, produce some results for the report when related to this process.
Rscript "./Scripts/2A_Birth_Rate_Regression_and_Impart_Kemmerer_Births.r"
Rscript "./Scripts/2B_Impart_Deaths_and_Migration_to_Subregions.r"

View File

@ -0,0 +1,67 @@
library(tidyverse)
#setwd("../")
#Define and set all working directories for reading or saving files
if(!exists("DATA_LOC_RAW")){DATA_LOC_RAW <- "./Data/Raw_Data/Mortality_Rates_Over_Time/"}
if(!exists("FIGURE_SAVE_LOC")){FIGURE_SAVE_LOC <- "./Results/Mortality_Trends/"}
if(!exists("DATA_SAVE_LOC_RDS")){DATA_SAVE_LOC_RDS <- "./Data/Cleaned_Data/Mortality_Data/RDS/"}
if(!exists("DATA_SAVE_LOC_CSV")){DATA_SAVE_LOC_CSV <- "./Data/Cleaned_Data/Mortality_Data/CSV/"}
dir.create(FIGURE_SAVE_LOC, recursive = TRUE, showWarnings = FALSE)
dir.create(DATA_SAVE_LOC_RDS, recursive = TRUE, showWarnings = FALSE)
dir.create(DATA_SAVE_LOC_CSV, recursive = TRUE, showWarnings = FALSE)
##Write a log to help users identify the source of the data and make any future changes
sink(file=paste0(DATA_LOC_RAW,"README_MORTALITY_DATA.txt"),append=FALSE)
cat("Data is manually gathered from CDC WONDER data queries.\n https://wonder.cdc.gov/. Each file Contains additional information about the sources \n A secondary source of data comes from the World Pandemic Uncertainty Index Ahir, Hites, Bloom, Nick and Furceri, Davide, World Pandemic Uncertainty Index [WUPI], retrieved from FRED, Federal Reserve Bank of St. Louis; https://fred.stlouisfed.org/series/WUPI, November 23, 2025.
\n The data sets available in November 2025, are combined into a single file. Data sets include:\n 1) The age adjusted (weighted) mortality rates of Lincoln County, Wyoming and the US from three data sets starting in 1979, 2018, and 2020\n 2) The single year age-sex mortality rate of the US, starting in 2018 compiled yearly. These are suppressed for privacy at any level lower than the nation. \n 3) The 10 year age bin mortality rates with age adjustment for the U.S. and Wyoming in each year. These are used to append the to the yearly records which exclude 85+ values.\n 4) The world pandemic uncertainty index as collected from FRED which is used to account for pandemics in the regression, making the age time series stationary.\n \n These are used to project mortality trends over time. In the case of the age adjusted data, this has local trends that can be compared to the national average. The single age-sex data is only at a national level but can be imparted to local levels as a general trend in the distribution of deaths "
)
sink()
#Load data files which were generated using Wonder Data set manually
LIN_1979 <- read_csv(paste0(DATA_LOC_RAW,"Lincoln_Age_Adjusted_1979-1998.csv")) %>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`)%>% mutate(Region='Lincoln')
WY_1979 <- read_csv(paste0(DATA_LOC_RAW,"Wyoming_Age_Adjusted_1979-1998.csv")) %>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`)%>% mutate(Region='Wyoming')
US_1979 <- read_csv(paste0(DATA_LOC_RAW,"US_Age_Adjusted_1979-1998.csv")) %>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`) %>% filter(!is.na(Sex),!is.na(Year)) %>% mutate(Region="US")%>% filter(Year<2018)
LIN_1999 <- read_csv(paste0(DATA_LOC_RAW,"Lincoln_Age_Adjusted_1999-2020.csv")) %>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`)%>% mutate(Region='Lincoln')
WY_1999 <- read_csv(paste0(DATA_LOC_RAW,"Wyoming_Age_Adjusted_1999-2020.csv")) %>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`)%>% mutate(Region='Wyoming') %>% filter(Year<2018)
US_1999 <- read_csv(paste0(DATA_LOC_RAW,"US_Age_Adjusted_1999-2020.csv"))%>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`) %>% filter(!is.na(Sex),!is.na(Year)) %>% mutate(Region="US")%>% filter(Year<2018)
WY_2018 <- read_csv(paste0(DATA_LOC_RAW,"Wyoming_Age_Adjusted_2018-2023.csv")) %>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`) %>% mutate(Region='Wyoming')
US_2018 <- read_csv(paste0(DATA_LOC_RAW,"US_Age_Adjusted_2018-2023.csv"))%>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`) %>% mutate(Region="US")
##No adjustment for later data allowed in WONDER, so applying an average value for use in the graphs as a reasonable assumption.
LIN_2018<- read_csv(paste0(DATA_LOC_RAW,"Lincoln_Not_Age_Adjusted_2018-2023.csv")) %>% select(Year,Sex,Mort_Rate=`Crude Rate`)%>% mutate(Region='Lincoln')
ADJUST_TERM <- LIN_2018 %>% rename(UNADJUSTED=Mort_Rate) %>% inner_join(LIN_1999) %>% filter(!is.na(Year)) %>% mutate(Ratio=Mort_Rate/UNADJUSTED) %>% group_by(Sex) %>% summarize(Ratio=mean(Ratio))
LIN_2018 <- LIN_2018 %>% filter(Year>2020) %>% left_join(ADJUST_TERM) %>% mutate(Mort_Rate=Mort_Rate*Ratio) %>% select(-Ratio)
DF <- rbind(LIN_1999,LIN_2018,WY_1999,US_1999,US_2018,WY_2018,WY_1979,US_1979,LIN_1979) %>% filter(!is.na(Year),!is.na(Sex))
DF <- DF %>% ungroup%>% group_by(Sex,Region) %>% arrange(Sex,Region,Year) %>% mutate(L_Mort_Rate=lag(Mort_Rate)) %>% ungroup
#Make a regression table which includes lats of mortality rate for a stationary time series, and pivot such that each year is it's own row with each area of interest (US, Wyoming, Lincoln)
REG_DATA <- DF %>% pivot_wider(values_from=c(Mort_Rate,L_Mort_Rate),names_from=Region)
##Pull the World Pandemic Index from FRED for use in the regression data
PANDIMIC_INDEX <- read_csv("https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23ebf3fb&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=off&txtcolor=%23444444&ts=12&tts=12&width=1320&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=WUPI&scale=left&cosd=1996-01-01&coed=2025-07-01&line_color=%230073e6&link_values=false&line_style=solid&mark_type=none&mw=3&lw=3&ost=-99999&oet=99999&mma=0&fml=a&fq=Annual&fam=sum&fgst=lin&fgsnd=2020-02-01&line_index=1&transformation=lin&vintage_date=2025-11-20&revision_date=2025-11-20&nd=1996-01-01") %>% mutate(Year=year(observation_date)) %>% select(Year,WUPI)%>% mutate(L_WUPI=lag(WUPI),L_TWO_WUPI=lag(WUPI,2))
REG_DATA <- REG_DATA %>% left_join(PANDIMIC_INDEX) %>% mutate(WUPI=ifelse(is.na(WUPI),0,WUPI),L_WUPI=ifelse(is.na(L_WUPI),0,L_WUPI))
#ggplot(DF,aes(x=Year,y=Mort_Rate,group=Region,color=Region,fill=Region))+geom_point()+geom_smooth(method="lm")+ facet_grid(. ~ Sex)
png(paste0(FIGURE_SAVE_LOC,"Age_Adjusted_Mortality_Rate_Trends_by_Region.png"), res = 600, height = 7, width=11, units = "in")
ggplot(DF,aes(x=Year,y=Mort_Rate,group=Region,color=Region,fill=Region))+geom_point()+geom_smooth()+ facet_grid(.~ Sex)+theme_bw()+ylab("Age Adjusted Mortality Rate (Deaths per 100,000 people)")+ theme(legend.position = "top")
dev.off()
#ggplot(DF,aes(x=Year,y=Mort_Rate,group=Region,color=Region,fill=Region))+geom_point()+geom_line()+ facet_gri. ~ Sex)
####Save Data
#Simple long format data save
saveRDS(DF,paste0(DATA_SAVE_LOC_RDS,"Long_Mortality_Rate_Data.Rds" ))
write_csv(DF,paste0(DATA_SAVE_LOC_RDS,"Long_Mortality_Rate_Data.csv" ))
#Data combined and cleaned for regression save
saveRDS(REG_DATA,paste0(DATA_SAVE_LOC_RDS,"Mortality_Rate_and_Pandemic_Data_for_Regression.Rds" ))
write_csv(REG_DATA,paste0(DATA_SAVE_LOC_CSV,"Mortality_Rate_and_Pandemic_Data_for_Regression.csv" ))
run_datetime <- format(Sys.time(), "%Y-%m-%d %H:%M:%S")
sink(file=paste0(DATA_LOC_RAW,"README_MORTALITY_DATA.txt"),append=TRUE)
cat(paste0("\n--- Run Date: ", run_datetime, " ---\n"))
sink()