Cleaning up mortality loading

2025-11-23 16:02:00 -07:00 · 2025-11-23 16:02:00 -07:00 · 3bf85ce48b
commit 3bf85ce48b
parent 16cf5be64c
18 changed files with 85 additions and 31 deletions
--- a/Data/Raw_Data/Mortality_Rates_Over_Time/Lincoln_Age_Adjusted_1979-1998.csv
+++ b/Data/Raw_Data/Mortality_Rates_Over_Time/Lincoln_Age_Adjusted_1979-1998.csv
--- a/Data/Raw_Data/Mortality_Rates_Over_Time/Lincoln_Age_Adjusted_1999-2020.csv
+++ b/Data/Raw_Data/Mortality_Rates_Over_Time/Lincoln_Age_Adjusted_1999-2020.csv
--- a/Data/Raw_Data/Mortality_Rates_Over_Time/Lincoln_Not_Age_Adjusted_2018-2023.csv
+++ b/Data/Raw_Data/Mortality_Rates_Over_Time/Lincoln_Not_Age_Adjusted_2018-2023.csv
--- a/Data/Raw_Data/Mortality_Rates_Over_Time/Other/Wyoming_10_Year_Age_Groups_1999-2020.csv
+++ b/Data/Raw_Data/Mortality_Rates_Over_Time/Other/Wyoming_10_Year_Age_Groups_1999-2020.csv
--- a/Data/Raw_Data/Mortality_Rates_Over_Time/README_MORTALITY_DATA.txt
+++ b/Data/Raw_Data/Mortality_Rates_Over_Time/README_MORTALITY_DATA.txt
@ -0,0 +1,13 @@
+Data is manually gathered from CDC WONDER data queries.
+ https://wonder.cdc.gov/. Each file Contains additional information about the sources 
+ A secondary source of data comes from the World Pandemic Uncertainty Index Ahir, Hites, Bloom, Nick and Furceri, Davide, World Pandemic Uncertainty Index [WUPI], retrieved from FRED, Federal Reserve Bank of St. Louis; https://fred.stlouisfed.org/series/WUPI, November 23, 2025. 
+
+	    
+ The data sets available in November 2025, are combined into a single file. Data sets include:
+	   1) The age adjusted (weighted) mortality rates of Lincoln County, Wyoming and the US from three data sets starting in 1979, 2018, and 2020
+  2)  The single year age-sex mortality rate of the US, starting in 2018 compiled yearly. These are suppressed for privacy at any level lower than the nation. 
+ 3) The 10 year age bin mortality rates with age adjustment for the U.S. and Wyoming in each year. These are used to append the to the yearly records which exclude 85+ values.
+ 4) The world pandemic uncertainty index as collected from FRED which is used to account for pandemics in the regression, making the age time series stationary.
+ 
+ These are used to project mortality trends over time. In the case of the age adjusted data, this has local trends that can be compared to the national average. The single age-sex data is only at a national level but can be imparted to local levels as a general trend in the distribution of deaths 
+--- Run Date: 2025-11-23 15:59:28 ---
--- a/Data/Raw_Data/Mortality_Rates_Over_Time/US_10_Year_Age_Groups_1999-2020.csv
+++ b/Data/Raw_Data/Mortality_Rates_Over_Time/US_10_Year_Age_Groups_1999-2020.csv
--- a/Data/Raw_Data/Mortality_Rates_Over_Time/US_10_Year_Age_Groups_2018-2023.csv
+++ b/Data/Raw_Data/Mortality_Rates_Over_Time/US_10_Year_Age_Groups_2018-2023.csv
--- a/Data/Raw_Data/Mortality_Rates_Over_Time/US_Age_Adjusted_1979-1998.csv
+++ b/Data/Raw_Data/Mortality_Rates_Over_Time/US_Age_Adjusted_1979-1998.csv
--- a/Data/Raw_Data/Mortality_Rates_Over_Time/US_Age_Adjusted_1999-2020.csv
+++ b/Data/Raw_Data/Mortality_Rates_Over_Time/US_Age_Adjusted_1999-2020.csv
--- a/Data/Raw_Data/Mortality_Rates_Over_Time/US_Age_Adjusted_2018-2023.csv
+++ b/Data/Raw_Data/Mortality_Rates_Over_Time/US_Age_Adjusted_2018-2023.csv
--- a/Data/Raw_Data/Mortality_Rates_Over_Time/US_Single_Age_1999-2020.csv
+++ b/Data/Raw_Data/Mortality_Rates_Over_Time/US_Single_Age_1999-2020.csv
--- a/Data/Raw_Data/Mortality_Rates_Over_Time/US_Single_Age_2018-2023.csv
+++ b/Data/Raw_Data/Mortality_Rates_Over_Time/US_Single_Age_2018-2023.csv
--- a/Data/Raw_Data/Mortality_Rates_Over_Time/Wyoming_Age_Adjusted_1979-1998.csv
+++ b/Data/Raw_Data/Mortality_Rates_Over_Time/Wyoming_Age_Adjusted_1979-1998.csv
--- a/Data/Raw_Data/Mortality_Rates_Over_Time/Wyoming_Age_Adjusted_1999-2020.csv
+++ b/Data/Raw_Data/Mortality_Rates_Over_Time/Wyoming_Age_Adjusted_1999-2020.csv
--- a/Data/Raw_Data/Mortality_Rates_Over_Time/Wyoming_Age_Adjusted_2018-2023.csv
+++ b/Data/Raw_Data/Mortality_Rates_Over_Time/Wyoming_Age_Adjusted_2018-2023.csv
--- a/Mortality_Rate_Analysis.r
+++ b/Mortality_Rate_Analysis.r
@ -1,40 +1,14 @@
 library(tidyverse)
-LIN_1979 <- read_csv("Data/Raw_Data/Mortality_Rates_New/Lincoln_Age_Adjusted_1979-1998.csv") %>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`)%>% mutate(Region='Lincoln') 
-WY_1979 <- read_csv("Data/Raw_Data/Mortality_Rates_New/Wyoming_Age_Adjusted_1979-1998.csv") %>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`)%>% mutate(Region='Wyoming')
-US_1979 <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_Age_Adjusted_1979-1998.csv")%>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`) %>% filter(!is.na(Sex),!is.na(Year)) %>% mutate(Region="US")%>% filter(Year<2018)
-
-
-
-
-LIN_1999 <- read_csv("Data/Raw_Data/Mortality_Rates_New/Lincoln_Age_Adjusted_1999-2020.csv") %>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`)%>% mutate(Region='Lincoln') 
-WY_1999 <- read_csv("Data/Raw_Data/Mortality_Rates_New/Wyoming_Age_Adjusted_1999-2020.csv") %>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`)%>% mutate(Region='Wyoming') %>% filter(Year<2018)
-US_1999 <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_Age_Adjusted_1999-2020.csv")%>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`) %>% filter(!is.na(Sex),!is.na(Year)) %>% mutate(Region="US")%>% filter(Year<2018)
-
-
-WY_2018 <- read_csv("Data/Raw_Data/Mortality_Rates_New/Wyoming_Age_Adjusted_2018-2023.csv") %>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`) %>% mutate(Region='Wyoming') 
-US_2018 <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_Age_Adjusted_2018-2023.csv")%>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`)  %>% mutate(Region="US")
-##No adjustment for later data allowed
-LIN_2018<- read_csv("Data/Raw_Data/Mortality_Rates_New/Lincoln_Not_Age_Adjusted_2018-2023.csv") %>% select(Year,Sex,Mort_Rate=`Crude Rate`)%>% mutate(Region='Lincoln') 
-ADJUST_TERM <- LIN_2018 %>% rename(UNADJUSTED=Mort_Rate) %>% inner_join(LIN_1999) %>% filter(!is.na(Year)) %>% mutate(Ratio=Mort_Rate/UNADJUSTED) %>% group_by(Sex) %>% summarize(Ratio=mean(Ratio))
-LIN_2018 <- LIN_2018 %>% filter(Year>2020) %>% left_join(ADJUST_TERM) %>% mutate(Mort_Rate=Mort_Rate*Ratio) %>% select(-Ratio)
-
-DF <- rbind(LIN_1999,LIN_2018,WY_1999,US_1999,US_2018,WY_2018,WY_1979,US_1979,LIN_1979) %>% filter(!is.na(Year),!is.na(Sex))
-ggplot(DF,aes(x=Year,y=Mort_Rate,group=Region,color=Region,fill=Region))+geom_point()+geom_smooth(method="lm")+ facet_grid(. ~ Sex)
-ggplot(DF,aes(x=Year,y=Mort_Rate,group=Region,color=Region,fill=Region))+geom_point()+geom_smooth(span=0.4)+ facet_grid(. ~ Sex)
-ggplot(DF,aes(x=Year,y=Mort_Rate,group=Region,color=Region,fill=Region))+geom_point()+geom_line()+ facet_grid(. ~ Sex)
-########################################################ARIMA
-PANDIMIC_INDEX <- read_csv("https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23ebf3fb&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=off&txtcolor=%23444444&ts=12&tts=12&width=1320&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=WUPI&scale=left&cosd=1996-01-01&coed=2025-07-01&line_color=%230073e6&link_values=false&line_style=solid&mark_type=none&mw=3&lw=3&ost=-99999&oet=99999&mma=0&fml=a&fq=Annual&fam=sum&fgst=lin&fgsnd=2020-02-01&line_index=1&transformation=lin&vintage_date=2025-11-20&revision_date=2025-11-20&nd=1996-01-01") %>% mutate(Year=year(observation_date)) %>% select(Year,WUPI)%>% mutate(L_WUPI=lag(WUPI),L_TWO_WUPI=lag(WUPI,2))
-DF <- DF %>% ungroup%>% group_by(Sex,Region) %>% arrange(Sex,Region,Year)  %>% mutate(L_Mort_Rate=lag(Mort_Rate)) %>% ungroup
-REG_DATA <- DF %>% pivot_wider(values_from=c(Mort_Rate,L_Mort_Rate),names_from=Region) 
-REG_DATA <- REG_DATA %>% left_join(PANDIMIC_INDEX) %>% mutate(WUPI=ifelse(is.na(WUPI),0,WUPI),L_WUPI=ifelse(is.na(L_WUPI),0,L_WUPI))
-
 library(fixest)
+library(forecast)
+
+########################################################ARIMA
+
 MOD_WOMEN <- feols(Mort_Rate_US~L_Mort_Rate_US+Year+WUPI,REG_DATA %>% filter(Sex=='Female'))
 acf(resid(MOD_WOMEN))
 MOD_WOMEN 
 MOD_MEN <- feols(Mort_Rate_US~L_Mort_Rate_US+Year+WUPI,REG_DATA %>% filter(Sex=='Male'))
 ###Lincoln
-REG_DATA
 MOD_WOMEN <- feols(Mort_Rate_Lincoln~Mort_Rate_US,REG_DATA %>% filter(Sex=='Female'))
 acf(resid(MOD_WOMEN))
 MOD_WOMEN
@ -43,7 +17,6 @@ MOD_MEN <- feols(Mort_Rate_US~L_Mort_Rate_US+Year+WUPI,REG_DATA %>% filter(Sex==
 acf(resid(MOD_MEN))
 plot(resid(MOD_MEN))
 plot(predict(MOD_MEN))
-library(forecast)
 DATA_MEN <- REG_DATA %>% filter(Sex=='Male')
 DATA_WOMEN <- REG_DATA %>% filter(Sex=='Female')

--- a/Prelim_Process.sh
+++ b/Prelim_Process.sh
@ -3,6 +3,7 @@ Rscript "./Scripts/1A_Download_and_Process_Population_Data.r"
 Rscript "./Scripts/1B_Process_Existing_NIH_Mortality_Data.r"
 Rscript "./Scripts/1C_Download_and_Process_Demographic_Data.r"
 Rscript "./Scripts/1D_Use_ACS_Census_Data_to_Estimate_Kemmerer_Demographics.r"
+Rscript "./Scripts/1E_Process_WONDER_Mortality_Data.r"
 #Create data sets used in later simulations, produce some results for the report when related to this process.
 Rscript "./Scripts/2A_Birth_Rate_Regression_and_Impart_Kemmerer_Births.r"
 Rscript "./Scripts/2B_Impart_Deaths_and_Migration_to_Subregions.r"
--- a/Scripts/1E_Process_WONDER_Mortality_Data.r
+++ b/Scripts/1E_Process_WONDER_Mortality_Data.r
@ -0,0 +1,67 @@
+library(tidyverse)
+#setwd("../")
+#Define and set all working directories for reading or saving files
+if(!exists("DATA_LOC_RAW")){DATA_LOC_RAW <- "./Data/Raw_Data/Mortality_Rates_Over_Time/"}
+if(!exists("FIGURE_SAVE_LOC")){FIGURE_SAVE_LOC <- "./Results/Mortality_Trends/"}
+if(!exists("DATA_SAVE_LOC_RDS")){DATA_SAVE_LOC_RDS <- "./Data/Cleaned_Data/Mortality_Data/RDS/"}
+if(!exists("DATA_SAVE_LOC_CSV")){DATA_SAVE_LOC_CSV <- "./Data/Cleaned_Data/Mortality_Data/CSV/"}
+dir.create(FIGURE_SAVE_LOC, recursive = TRUE, showWarnings = FALSE)
+dir.create(DATA_SAVE_LOC_RDS, recursive = TRUE, showWarnings = FALSE)
+dir.create(DATA_SAVE_LOC_CSV, recursive = TRUE, showWarnings = FALSE)
+
+
+
+
+##Write a log to help users identify the source of the data and make any future changes
+sink(file=paste0(DATA_LOC_RAW,"README_MORTALITY_DATA.txt"),append=FALSE)
+	cat("Data is manually gathered from CDC WONDER data queries.\n https://wonder.cdc.gov/. Each file Contains additional information about the sources \n A secondary source of data comes from the World Pandemic Uncertainty Index Ahir, Hites, Bloom, Nick and Furceri, Davide, World Pandemic Uncertainty Index [WUPI], retrieved from FRED, Federal Reserve Bank of St. Louis; https://fred.stlouisfed.org/series/WUPI, November 23, 2025. 
+
+	    \n The data sets available in November 2025, are combined into a single file. Data sets include:\n 1) The age adjusted (weighted) mortality rates of Lincoln County, Wyoming and the US from three data sets starting in 1979, 2018, and 2020\n  2)  The single year age-sex mortality rate of the US, starting in 2018 compiled yearly. These are suppressed for privacy at any level lower than the nation. \n 3) The 10 year age bin mortality rates with age adjustment for the U.S. and Wyoming in each year. These are used to append the to the yearly records which exclude 85+ values.\n 4) The world pandemic uncertainty index as collected from FRED which is used to account for pandemics in the regression, making the age time series stationary.\n \n These are used to project mortality trends over time. In the case of the age adjusted data, this has local trends that can be compared to the national average. The single age-sex data is only at a national level but can be imparted to local levels as a general trend in the distribution of deaths "
+	)
+sink()
+
+
+#Load data files which were generated using Wonder Data set manually 
+LIN_1979 <- read_csv(paste0(DATA_LOC_RAW,"Lincoln_Age_Adjusted_1979-1998.csv")) %>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`)%>% mutate(Region='Lincoln') 
+WY_1979 <- read_csv(paste0(DATA_LOC_RAW,"Wyoming_Age_Adjusted_1979-1998.csv")) %>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`)%>% mutate(Region='Wyoming')
+US_1979 <- read_csv(paste0(DATA_LOC_RAW,"US_Age_Adjusted_1979-1998.csv")) %>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`) %>% filter(!is.na(Sex),!is.na(Year)) %>% mutate(Region="US")%>% filter(Year<2018)
+LIN_1999 <- read_csv(paste0(DATA_LOC_RAW,"Lincoln_Age_Adjusted_1999-2020.csv")) %>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`)%>% mutate(Region='Lincoln') 
+WY_1999 <- read_csv(paste0(DATA_LOC_RAW,"Wyoming_Age_Adjusted_1999-2020.csv")) %>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`)%>% mutate(Region='Wyoming') %>% filter(Year<2018)
+US_1999 <- read_csv(paste0(DATA_LOC_RAW,"US_Age_Adjusted_1999-2020.csv"))%>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`) %>% filter(!is.na(Sex),!is.na(Year)) %>% mutate(Region="US")%>% filter(Year<2018)
+
+
+WY_2018 <- read_csv(paste0(DATA_LOC_RAW,"Wyoming_Age_Adjusted_2018-2023.csv")) %>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`) %>% mutate(Region='Wyoming') 
+US_2018 <- read_csv(paste0(DATA_LOC_RAW,"US_Age_Adjusted_2018-2023.csv"))%>% select(Year,Sex,Mort_Rate=`Age Adjusted Rate`)  %>% mutate(Region="US")
+##No adjustment for later data allowed in WONDER, so applying an average value for use in the graphs as a reasonable assumption.
+LIN_2018<- read_csv(paste0(DATA_LOC_RAW,"Lincoln_Not_Age_Adjusted_2018-2023.csv")) %>% select(Year,Sex,Mort_Rate=`Crude Rate`)%>% mutate(Region='Lincoln') 
+ADJUST_TERM <- LIN_2018 %>% rename(UNADJUSTED=Mort_Rate) %>% inner_join(LIN_1999) %>% filter(!is.na(Year)) %>% mutate(Ratio=Mort_Rate/UNADJUSTED) %>% group_by(Sex) %>% summarize(Ratio=mean(Ratio))
+LIN_2018 <- LIN_2018 %>% filter(Year>2020) %>% left_join(ADJUST_TERM) %>% mutate(Mort_Rate=Mort_Rate*Ratio) %>% select(-Ratio)
+DF <- rbind(LIN_1999,LIN_2018,WY_1999,US_1999,US_2018,WY_2018,WY_1979,US_1979,LIN_1979) %>% filter(!is.na(Year),!is.na(Sex))
+DF <- DF %>% ungroup%>% group_by(Sex,Region) %>% arrange(Sex,Region,Year)  %>% mutate(L_Mort_Rate=lag(Mort_Rate)) %>% ungroup
+#Make a regression table which includes lats of mortality rate for a stationary time series, and pivot such that each year is it's own row with each area of interest (US, Wyoming, Lincoln)
+REG_DATA <- DF %>% pivot_wider(values_from=c(Mort_Rate,L_Mort_Rate),names_from=Region) 
+
+##Pull the World Pandemic Index from FRED for use in the regression data
+PANDIMIC_INDEX <- read_csv("https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23ebf3fb&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=off&txtcolor=%23444444&ts=12&tts=12&width=1320&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=WUPI&scale=left&cosd=1996-01-01&coed=2025-07-01&line_color=%230073e6&link_values=false&line_style=solid&mark_type=none&mw=3&lw=3&ost=-99999&oet=99999&mma=0&fml=a&fq=Annual&fam=sum&fgst=lin&fgsnd=2020-02-01&line_index=1&transformation=lin&vintage_date=2025-11-20&revision_date=2025-11-20&nd=1996-01-01") %>% mutate(Year=year(observation_date)) %>% select(Year,WUPI)%>% mutate(L_WUPI=lag(WUPI),L_TWO_WUPI=lag(WUPI,2))
+REG_DATA <- REG_DATA %>% left_join(PANDIMIC_INDEX) %>% mutate(WUPI=ifelse(is.na(WUPI),0,WUPI),L_WUPI=ifelse(is.na(L_WUPI),0,L_WUPI))
+
+
+
+#ggplot(DF,aes(x=Year,y=Mort_Rate,group=Region,color=Region,fill=Region))+geom_point()+geom_smooth(method="lm")+ facet_grid(. ~ Sex)
+png(paste0(FIGURE_SAVE_LOC,"Age_Adjusted_Mortality_Rate_Trends_by_Region.png"), res = 600, height = 7, width=11, units = "in")
+ggplot(DF,aes(x=Year,y=Mort_Rate,group=Region,color=Region,fill=Region))+geom_point()+geom_smooth()+ facet_grid(.~ Sex)+theme_bw()+ylab("Age Adjusted Mortality Rate (Deaths per 100,000 people)")+ theme(legend.position = "top")
+dev.off()
+#ggplot(DF,aes(x=Year,y=Mort_Rate,group=Region,color=Region,fill=Region))+geom_point()+geom_line()+ facet_gri. ~ Sex)
+####Save Data
+#Simple long format data save
+saveRDS(DF,paste0(DATA_SAVE_LOC_RDS,"Long_Mortality_Rate_Data.Rds" ))
+write_csv(DF,paste0(DATA_SAVE_LOC_RDS,"Long_Mortality_Rate_Data.csv" ))
+#Data combined and cleaned for regression save
+saveRDS(REG_DATA,paste0(DATA_SAVE_LOC_RDS,"Mortality_Rate_and_Pandemic_Data_for_Regression.Rds" ))
+write_csv(REG_DATA,paste0(DATA_SAVE_LOC_CSV,"Mortality_Rate_and_Pandemic_Data_for_Regression.csv" ))
+
+run_datetime <- format(Sys.time(), "%Y-%m-%d %H:%M:%S")
+sink(file=paste0(DATA_LOC_RAW,"README_MORTALITY_DATA.txt"),append=TRUE)
+	cat(paste0("\n--- Run Date: ", run_datetime, " ---\n"))
+sink()
+