Working on arima sim with age splits

This commit is contained in:
Alex Gebben Work 2025-11-25 16:03:08 -07:00
parent 3bf85ce48b
commit d1d4682b72
5 changed files with 106 additions and 173 deletions

View File

@ -4,10 +4,10 @@ Data is manually gathered from CDC WONDER data queries.
The data sets available in November 2025, are combined into a single file. Data sets include:
1) The age adjusted (weighted) mortality rates of Lincoln County, Wyoming and the US from three data sets starting in 1979, 2018, and 2020
1) The age adjusted (weighted) mortality rates of Lincoln County, Wyoming and the US from three data sets starting in 1979, 2018, and 2020
2) The single year age-sex mortality rate of the US, starting in 2018 compiled yearly. These are suppressed for privacy at any level lower than the nation.
3) The 10 year age bin mortality rates with age adjustment for the U.S. and Wyoming in each year. These are used to append the to the yearly records which exclude 85+ values.
4) The world pandemic uncertainty index as collected from FRED which is used to account for pandemics in the regression, making the age time series stationary.
These are used to project mortality trends over time. In the case of the age adjusted data, this has local trends that can be compared to the national average. The single age-sex data is only at a national level but can be imparted to local levels as a general trend in the distribution of deaths
--- Run Date: 2025-11-23 15:59:28 ---
--- Run Date: 2025-11-25 12:04:27 ---

View File

@ -3,183 +3,76 @@ library(fixest)
library(forecast)
########################################################ARIMA
DATA_WOMEN <- readRDS("Data/Cleaned_Data/Mortality_Data/RDS/Mortality_Rate_and_Pandemic_Data_for_Regression.Rds") %>% filter(Sex=='Female')
DATA_MEN <- readRDS("Data/Cleaned_Data/Mortality_Data/RDS/Mortality_Rate_and_Pandemic_Data_for_Regression.Rds") %>% filter(Sex=='Male')
MOD_WOMEN <- feols(Mort_Rate_US~L_Mort_Rate_US+Year+WUPI,REG_DATA %>% filter(Sex=='Female'))
acf(resid(MOD_WOMEN))
MOD_WOMEN
MOD_MEN <- feols(Mort_Rate_US~L_Mort_Rate_US+Year+WUPI,REG_DATA %>% filter(Sex=='Male'))
###Lincoln
MOD_WOMEN <- feols(Mort_Rate_Lincoln~Mort_Rate_US,REG_DATA %>% filter(Sex=='Female'))
acf(resid(MOD_WOMEN))
MOD_WOMEN
MOD_MEN <- feols(Mort_Rate_US~L_Mort_Rate_US+Year+WUPI,REG_DATA %>% filter(Sex=='Male'))
acf(resid(MOD_MEN))
plot(resid(MOD_MEN))
plot(predict(MOD_MEN))
DATA_MEN <- REG_DATA %>% filter(Sex=='Male')
DATA_WOMEN <- REG_DATA %>% filter(Sex=='Female')
#Create time series data
ST_YEAR <- DATA_MEN %>% pull(Year) %>% min
TS_MEN_US <- DATA_MEN %>% select(Mort_Rate_US) %>% ts(start=ST_YEAR,frequency=1)
TS_MEN_LIN <- DATA_MEN %>% select(Mort_Rate_Lincoln) %>% ts(start=ST_YEAR,frequency=1)
TS_PANDEMIC <- DATA_MEN %>% select(WUPI,L_WUPI) %>% ts(start=ST_YEAR,frequency=1)
TS_WOMEN_US <- DATA_WOMEN %>% select(Mort_Rate_US) %>% ts(start=ST_YEAR,frequency=1)
TS_WOMEN_LIN <- DATA_WOMEN %>% select(Mort_Rate_Lincoln) %>% ts(start=ST_YEAR,frequency=1)
TS_MEN_US_INV <- TS_MEN_US
TS_PANDEMIC <- DATA_MEN %>% select(WUPI,L_WUPI) %>% ts(start=ST_YEAR,frequency=1)
TS_WOMEN_US_INV <- TS_WOMEN_US
FORECAST_XREG <- TS_PANDEMIC
FORECAST_XREG[,] <- 0
MOD_US_MEN <- auto.arima(TS_MEN_US,lambda=0,biasadj=TRUE,xreg=TS_PANDEMIC)
checkresiduals(MOD_US_MEN)
#checkresiduals(MOD_US_MEN)
plot(forecast(MOD_US_MEN,xreg=FORECAST_XREG))
#plot(forecast(MOD_US_MEN,xreg=FORECAST_XREG))
MOD_US_WOMEN <- auto.arima(TS_WOMEN_US,lambda=0,biasadj=TRUE,xreg=TS_PANDEMIC)
checkresiduals(MOD_US_WOMEN)
plot(forecast(MOD_US_WOMEN,xreg=FORECAST_XREG))
#checkresiduals(MOD_US_WOMEN)
#plot(forecast(MOD_US_WOMEN,xreg=FORECAST_XREG))
MOD_LIN <- auto.arima(TS_MEN_LIN,biasadj=TRUE,xreg=TS_MEN_US)
simulate(MOD_US_MEN,xreg=FORECAST_XREG)
plot(simulate(MOD_LIN,xreg=simulate(MOD_US_MEN,xreg=FORECAST_XREG)))
plot(forecast(MOD_LIN,xreg=simulate(MOD_US_MEN,xreg=FORECAST_XREG)))
################################Other work
SINGLE_DATA <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_Single_Age_1999-2020.csv") %>% select(Year,Sex,Age=`Single-Year Ages Code`,Mortality_Rate=`Crude Rate`) %>% mutate(Mortality_Rate=parse_number(Mortality_Rate)) %>% filter(!is.na(Mortality_Rate))
OLDER <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_10_Year_Age_Groups_1999-2020.csv")%>% rename(Age=`Ten-Year Age Groups Code`,Mortality_Rate=`Crude Rate`) %>% filter(Age=='85+')%>% mutate(Age=85,Year=as.numeric(Year),Mortality_Rate=parse_number(Mortality_Rate)) %>% select(Year,Sex,Age,Mortality_Rate)%>% select(colnames(SINGLE_DATA))
SINGLE_DATA <- rbind(SINGLE_DATA,OLDER) %>% left_join(REG_DATA %>% select(Year,Sex,US_Rate=Mort_Rate_US))
SINGLE_DATA_PLAIN <- SINGLE_DATA
SINGLE_DATA <- SINGLE_DATA %>% group_by(Age,Sex) %>% mutate(INDEX=Mortality_Rate/sum(ifelse(Year==min(Year),Mortality_Rate,0)),US_INDEX=US_Rate/sum(ifelse(Year==min(Year),US_Rate,0))) %>% ungroup
ggplot(SINGLE_DATA,aes(x=Year,y=INDEX,group=Age,color=Age))+geom_point()+geom_smooth(aes(y=US_INDEX),se=FALSE,color="black",linetype=2,linewidth=2) + facet_grid(rows= vars(Sex))
ggplot(SINGLE_DATA,aes(x=Year,y=INDEX,group=Age,color=Age))+geom_point(size=0.5)+geom_line(aes(y=US_INDEX),color="black",linetype=2,linewidth=1) + facet_grid(rows= vars(Sex))
ggplot(SINGLE_DATA,aes(x=Year,y=Mortality_Rate,group=Age,color=Age))+geom_point() + scale_y_log10() + facet_grid(. ~ Sex)
ggplot(SINGLE_DATA,aes(x=Year,y=INDEX,group=Age,color=Age))+geom_smooth(se=FALSE,method="lm")+ scale_y_log10() + facet_grid(. ~ Sex)
COR_DAT <- SINGLE_DATA %>% arrange(Sex,Age) %>% select(-US_Rate,-US_INDEX,-INDEX) %>% pivot_wider(values_from=c(Mortality_Rate),names_from=c(Sex,Age)) %>% arrange(Year) %>% select(-Year) %>% as.matrix
rownames(COR_DAT) <- 1999:2020
TEST <- matrix(as.numeric(COR_DAT[1,]),nrow(COR_DAT),nrow=nrow(COR_DAT),ncol=ncol(COR_DAT))
COR_DAT <- COR_DAT/TEST
#COR_DAT <- t(COR_DAT)
#US_COR_DATA <- SINGLE_DATA %>% select(Year,Sex,US_Rate) %>% unique %>% pivot_wider(values_from=US_Rate,names_from=Sex) %>% arrange(Year)
#COR_DAT <- cbind(US_COR_DATA,COR_DAT) %>% as.matrix
library(factoextra)
COR_DAT
corrplot(cor(COR_DAT[,1:86]),type="lower")
corrplot(cor(COR_DAT[,87:172]),type="lower")
COR_DAT <- t(COR_DAT)
COR_DAT <- COR_DAT[,-1]
fviz_nbclust(COR_DAT,kmeans,"gap_stat")
fviz_nbclust(COR_DAT,kmeans,"wss",nboot=1000)
fviz_nbclust(COR_DAT,kmeans,nboot=1000)
km.res <- kmeans(COR_DAT, 3, nstart = 1)
fviz_cluster(km.res,COR_DAT)
print(km.res)
COR_DAT
SINGLE_DATA %>% filter(!is.numeric(Mortality_Rate))
SINGLE_DATA <- SINGLE_DATA %>% mutate(Age=as.numeric(Age),US_Rate=as.numeric(US_Rate),Year=as.numeric(Year),Mortality_Rate=as.numeric(Mortality_Rate))
SINGLE_DATA %>% filter(is.numeric(Age),is.numeric(Mortality_Rate),is.numeric(US_Rate))
DATA <- SINGLE_DATA %>% left_join(PANDIMIC_INDEX ) %>% ungroup %>% group_by(Sex,Year) %>% mutate(Rank=rank(Mortality_Rate)) %>% ungroup %>% group_by(Year,Sex) %>% mutate(PER_MORT_MAX=Mortality_Rate/max(Mortality_Rate))
feols(Mortality_Rate~factor(Rank)+Sex+Year,DATA)
MOD <- feols(log(Mortality_Rate)~log(US_Rate)*Sex+factor(Rank)+Sex|Year,DATA)
MOD
plot(resid(MOD))
DATA$RESID <- resid(MOD)
acf((DATA %>% filter(Age==85,Sex=='Female') %>% pull(RESID)))
DATA
ggplot(SINGLE_DATA,aes(x=Year,y=Mortality_Rate,group=Age,color=Age))+geom_point() + scale_y_log10() + facet_grid(. ~ Sex)
TEST <- SINGLE_DATA %>% group_by(Year,Sex) %>% mutate(TEST=rank(Mortality_Rate)) %>%ungroup
ggplot(TEST,aes(x=Year,y=TEST,group=Age,color=Age)) +geom_point()+ geom_smooth()
TEMP <- SINGLE_DATA %>% group_by(Sex,Year) %>% summarize(GAP=max(Mortality_Rate)-min(Mortality_Rate))
ggplot(TEMP,aes(x=Year,y=GAP,group=Sex,color=Sex))+geom_line()
MOD <- feols(log(Mortality_Rate)~factor(Age)*(log(US_Rate)+WUPI+L_WUPI)+Year,data=as_tibble(SINGLE_DATA))
SINGLE_DATA
plot(resid(MOD))
TEMP <- SINGLE_DATA
TEMP$RESID <- resid(MOD)
acf(TEMP %>% filter(Age==80) %>% pull(RESID))
etable(MOD,group=list("Single Age"="factor"))
SINGLE_DATA[87,]
resid(MOD)
predict
MOD_LIN_MEN <- auto.arima(TS_MEN_LIN,biasadj=TRUE,xreg=TS_MEN_US)
MOD_LIN_WOMEN <- auto.arima(TS_WOMEN_LIN,biasadj=TRUE,xreg=TS_WOMEN_US)
############################################Start Simualtion work
SINGLE_MODS <- readRDS("Data/Intermediate_Inputs/Mortality_Regression_Data/Single_Sex_Age_Time_Series_Regression.Rds")
MIN_VALUES <- readRDS("Data/Intermediate_Inputs/Mortality_Regression_Data/Single_Sex_Age_Min_Values_for_Bounding_Predictions.Rds")
MAX_VALUES <- readRDS("Data/Intermediate_Inputs/Mortality_Regression_Data/Single_Sex_Age_Max_Values_for_Bounding_Predictions.Rds")
BASELINE_AGE_ADJUST_MEN <- readRDS("Data/Cleaned_Data/Mortality_Data/RDS/Single_Sex_Age_Population_in_2000.Rds") %>% filter(Sex=='Male') %>% pull(Percent_of_Population)
BASELINE_AGE_ADJUST_WOMEN <- readRDS("Data/Cleaned_Data/Mortality_Data/RDS/Single_Sex_Age_Population_in_2000.Rds") %>% filter(Sex=='Female') %>% pull(Percent_of_Population)
#Adjust to just women popualtion (Not all population percent
BASELINE_AGE_ADJUST_WOMEN <- BASELINE_AGE_ADJUST_WOMEN/ sum(BASELINE_AGE_ADJUST_WOMEN )
BASELINE_AGE_ADJUST_MEN <- BASELINE_AGE_ADJUST_MEN/ sum(BASELINE_AGE_ADJUST_MEN )
REG_SINGLE_DATA <- SINGLE_DATA_PLAIN %>% mutate( %>% pivot_wider(values_from="Mortality_Rate",names_from=c("Age"))
MALE <- REG_SINGLE_DATA %>% filter(Sex=='Male')
FEMALE <- REG_SINGLE_DATA %>% filter(Sex=='Female')
library(corrplot)
corrplot(cor(MALE %>% select(-Sex)))
US_CAUSES <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_Cause_of_Death_1999-2020.csv") %>% select(Year,ICD=`ICD Sub-Chapter Code`,Death_Rate=`Crude Rate`) %>% filter(!is.na(Death_Rate)) %>% mutate(Death_Rate=ifelse(Death_Rate=='Suppressed' |Death_Rate=='Unreliable',NA,Death_Rate)) %>% rbind(read_csv("Data/Raw_Data/Mortality_Rates_New/US_Cause_of_Death_2018-2023.csv") %>% select(Year,ICD=`ICD Sub-Chapter Code`,Death_Rate=`Crude Rate`) %>% filter(!is.na(Death_Rate)) %>% mutate(Death_Rate=ifelse(Death_Rate=='Suppressed' |Death_Rate=='Unreliable',NA,Death_Rate))) %>% mutate(Death_Rate=parse_number(Death_Rate)) %>% arrange(Year,ICD) %>% group_by(ICD) %>% filter(max(is.na(Death_Rate))==0,min(Death_Rate)!=max(Death_Rate)) %>% ungroup %>% unique
BIND <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_Cause_of_Death_1999-2020.csv") %>% select(ICD=`ICD Sub-Chapter Code`,NAME=`ICD Sub-Chapter`) %>% unique
US_CAUSES <- US_CAUSES %>% left_join(BIND) %>% select(-ICD) %>% rename(ICD=NAME)
US_CAUSES %>% group_by(ICD) %>% summarize(Rate=mean(Death_Rate)) %>% summarize(ICD,Rate, Rank=rank(desc(Rate))) %>% arrange(Rank)
ggplot(US_CAUSES,aes(x=Year,y=scale(Death_Rate),group=ICD,color=ICD,fill=ICD)) +geom_point() +geom_smooth()+theme(legend.position="bottom")
US_CAUSES
parse_number(REG_SINGLE_DATA[,3:89])
US_CAUSES <- US_CAUSES %>% pivot_wider(values_from=Death_Rate,names_from=ICD)
CAUS_MAT <- US_CAUSES %>% select(-Year)
sd(t(CAUS_MAT)[,118])
ST_YEAR <- 2025
END_YEAR <- 2025+40
GAP <- END_YEAR-ST_YEAR
NUM_SIMS <- END_YEAR-ST_YEAR+1
CAUS_MAT <- scale(CAUS_MAT)
XREG <- cbind(rep(0,NUM_SIMS),rep(0,NUM_SIMS))
#colnames(XREG) <- c("WUPI","L_WUPI")
XREG <- ts(XREG,start=ST_YEAR,frequency=1)
fviz_nbclust(CAUS_MAT,kmeans,"gap_stat") #6
fviz_nbclust(CAUS_MAT,kmeans,"wss") #5
fviz_nbclust(CAUS_MAT,kmeans) #2
km.res <- kmeans(CAUS_MAT, 6, nstart = 1)
fviz_cluster(km.res,CAUS_MAT)
summary(km.res
print(km.res)
km.res$cluster
corrplot(cor(US_CAUSES))
MALE <- US_CAUSES %>% left_join(MALE) %>% select(Year,Sex,US_Rate,everything())
MALE %>% tail
COR_MALE <- MALE %>% select(-Sex) %>% as.matrix
corrplot(cor(COR_MALE,use="pairwise.complete"),type="lower",diag=FALSE,)
?corrplot
COR_MALE
corrplot(cor(cbind(MALE[,1],MALE[,4:ncol(MALE)]/t(MALE[,3]))))
corrplot(cor(log(FEMALE %>% select(-Sex))))
TEMP <- MALE %>% select(RATE=`36`,US_Rate,Year) %>% as.data.frame
TEST <- feols((RATE)~US_Rate+Year,TEMP)
TEST <- feols(RATE/US_Rate~Year,TEMP)
acf(as.numeric((TEMP[,'RATE']-predict(TEST)) %>% unlist))
acf
TEST <- lm(log(`22`)~log(US_Rate)+Year,data=MALE)
predict(TEST
resid(TEST)
lm(`20`~Year+
plot(as.vector(t(FEMALE %>% dplyr::select(`20`))))
%>% pull(`22`)
?corrplot
plot(cor(FEMALE %>% select(-Sex))[2,])
plot(cor(FEMALE %>% select(-Sex))[1,])
MAT <- (cbind(abs(cor(FEMALE %>% select(-Sex))[2,]),abs(cor(FEMALE %>% select(-Sex))[1,])))
plot(apply(MAT,1,min))
SIM_LIN_WOMEN <-simulate(MOD_LIN_WOMEN,xreg=simulate(MOD_US_WOMEN,xreg=XREG))
SIM_LIN_MEN <- simulate(MOD_LIN_MEN,xreg=simulate(MOD_US_MEN,xreg=XREG))
C_VAL <- rbind(cbind(ST_YEAR:END_YEAR,rep("Female",NUM_SIMS),as.vector(SIM_LIN_MEN)), cbind(ST_YEAR:END_YEAR,rep("Male",NUM_SIMS),as.vector(SIM_LIN_WOMEN))) %>% as_tibble
colnames(C_VAL) <- c("Year","Sex","US_Adj_Death_Rate")
C_VAL$Year <- as.numeric(pull(C_VAL,Year))
C_VAL$US_Adj_Death_Rate <- as.numeric(pull(C_VAL,US_Adj_Death_Rate))
C_VAL
###Pedict
RES <- do.call(rbind,lapply(1:86,function(x){return(predict(SINGLE_MODS[[x]],C_VAL))}))#For each data frame containing each year and sex combination of the forecast, predict the data for each age 0-85. Bind these by row to create a result with ages by row, and year by column
ncol(RES)
#RES1 <- RES
#Rows Year, Column Age
MIN_MAT <- matrix(rep(MIN_VALUES,ncol(RES)),ncol=ncol(RES))
RES <- ifelse(RES<MIN_VALUES,MIN_VALUES,RES)
RES <- ifelse(RES>MAX_VALUES,MIN_VALUES,RES)
FEMALE_RES <- t(RES[,1:NUM_SIMS])
MALE_RES <- t(RES[,(NUM_SIMS+1):(2*NUM_SIMS)])
PRED_ADJ_RATE_WOMEN <- rowSums(FEMALE_RES*BASELINE_AGE_ADJUST_WOMEN)
PRED_ADJ_RATE_MEN <- rowSums(MALE_RES*BASELINE_AGE_ADJUST_MEN)
MALE_RES <- MALE_RES*C_VAL[1:(nrow(C_VAL)/2),]$US_Adj_Death_Rate/PRED_ADJ_RATE_MEN
FEMALE_RES <- MALE_RES*C_VAL[(nrow(C_VAL)/2+1):(nrow(C_VAL)),]$US_Adj_Death_Rate/PRED_ADJ_RATE_WOMEN
#Testing looks good so far
FEMALE_RES[,20:30]
MALE_RES[,20:30]
MALE
library(fixest)
MOD
MOD <- feols(Age_85~US_Rate,data=MALE)
acf(MALE[,"Age_85"]-predict(MOD))
residuals(MOD)
MOD0
resid(MOD0)

View File

@ -1,9 +1,11 @@
#Clean and collect data sets used in later code.
Rscript "./Scripts/1A_Download_and_Process_Population_Data.r"
Rscript "./Scripts/1B_Process_Existing_NIH_Mortality_Data.r"
Rscript "./Scripts/1B_Process_Existing_NIH_Mortality_Data.r" #Somewhat outdated, could use only the data in 1E and 1F but speeds up completion time to not change
Rscript "./Scripts/1C_Download_and_Process_Demographic_Data.r"
Rscript "./Scripts/1D_Use_ACS_Census_Data_to_Estimate_Kemmerer_Demographics.r"
Rscript "./Scripts/1E_Process_WONDER_Mortality_Data.r"
Rscript "./Scripts/1F_Process_WONDER_Single_Age_Sex_Mortality_Data.r"
#Create data sets used in later simulations, produce some results for the report when related to this process.
Rscript "./Scripts/2A_Birth_Rate_Regression_and_Impart_Kemmerer_Births.r"
Rscript "./Scripts/2B_Impart_Deaths_and_Migration_to_Subregions.r"

View File

@ -2,25 +2,26 @@ library(tidyverse)
library(fixest)
####SPLIT OUT THE DATA MANAGEMENT PULL IN ARIMA
################################Create the data need to model the age-sex specific death rates
DF1999 <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_Single_Age_1999-2020.csv") %>% select(Year,Sex,Age=`Single-Year Ages Code`,Mortality_Rate=`Crude Rate`) %>% mutate(Mortality_Rate=parse_number(Mortality_Rate)) %>% filter(!is.na(Mortality_Rate)) %>% mutate(Mortality_Rate=as.numeric(Mortality_Rate))
DF2018 <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_Single_Age_2018-2023.csv") %>% select(Year,Sex,Age=`Single-Year Ages Code`,Mortality_Rate=`Crude Rate`) %>% filter(!is.na(Mortality_Rate))%>% mutate(Mortality_Rate=parse_number(Mortality_Rate)) %>% filter(!is.na(Mortality_Rate)) %>% mutate(Mortality_Rate=as.numeric(Mortality_Rate))
RAW_DATA_LOC <- "Data/Cleaned_Data/Mortality_Data/RDS/"
REG_DATA <- readRDS(paste0(RAW_DATA_LOC,"Single_Sex_Age_US_Mortality_Rate_Data_Wide.Rds"))
if(!exists("SAVE_DATA_LOC")){SAVE_DATA_LOC<- "Data/Intermediate_Inputs/Mortality_Regression_Data/"}
dir.create(SAVE_DATA_LOC, recursive = TRUE, showWarnings = FALSE)
OLDER1 <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_10_Year_Age_Groups_1999-2020.csv")%>% rename(Age=`Ten-Year Age Groups Code`,Mortality_Rate=`Crude Rate`) %>% filter(Age=='85+')%>% mutate(Age=85,Year=as.numeric(Year),Mortality_Rate=parse_number(Mortality_Rate)) %>% select(Year,Sex,Age,Mortality_Rate) %>% mutate(Mortality_Rate=as.numeric(Mortality_Rate),Age=as.numeric(Age))
OLDER2 <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_10_Year_Age_Groups_2018-2023.csv")%>% rename(Age=`Ten-Year Age Groups Code`,Mortality_Rate=`Crude Rate`) %>% filter(Age=='85+')%>% mutate(Age=85,Year=as.numeric(Year),Mortality_Rate=parse_number(Mortality_Rate)) %>% select(Year,Sex,Age,Mortality_Rate)%>% mutate(Mortality_Rate=as.numeric(Mortality_Rate),Age=as.numeric(Age))
DF <- rbind(DF1999,DF2018,OLDER1,OLDER2) %>% unique %>% group_by(Year,Sex,Age) %>% arrange(Year,Sex,Age) %>% mutate(Age=as.numeric(Age)) %>% ungroup
#hist(US_CAUSES$Death_Rate,breaks=150)
#Overall US death rates
US_AGE_ADJ <- rbind(read_csv("Data/Raw_Data/Mortality_Rates_New/US_Age_Adjusted_1979-1998.csv") %>% select(Year,Sex,US_Adj_Death_Rate=`Crude Rate`),read_csv("Data/Raw_Data/Mortality_Rates_New/US_Age_Adjusted_1999-2020.csv") %>% select(Year,Sex,US_Adj_Death_Rate=`Crude Rate`),read_csv("Data/Raw_Data/Mortality_Rates_New/US_Age_Adjusted_2018-2023.csv") %>% select(Year,Sex,US_Adj_Death_Rate=`Crude Rate`)) %>% unique
REG_DATA <- DF %>% left_join(US_AGE_ADJ) %>% pivot_wider(values_from=Mortality_Rate,names_from=Age,names_prefix="Age_")
#####################Model all ages and sex
MOD <- feols(Age_.[0:85]~US_Adj_Death_Rate+Sex*Year,REG_DATA)
###Simulate each age-sex death rate over time with the models
#########When project far into the future some death rate values become negative. Make bounds to limit the forecast to a reasonable range. In this case I select half of the historic minimum, or double the historic maximum as upper an lower bounds in the study period.
BOUNDS <- DF %>% group_by(Age) %>% summarize(MAX_RATE=2*max(Mortality_Rate),MIN_RATE=min(Mortality_Rate)/2)
BOUNDS <- readRDS("Data/Cleaned_Data/Mortality_Data/RDS/Single_Sex_Age_US_Mortality_Rate_Data_Long.Rds") %>% group_by(Age) %>% summarize(MAX_RATE=2*max(Mortality_Rate),MIN_RATE=min(Mortality_Rate)/2)
MAX_BOUND <- BOUNDS %>% pull(MAX_RATE)
MIN_BOUND <- BOUNDS %>% pull(MIN_RATE)
#Create a proxy data set to simulate with
saveRDS(MOD,paste0(SAVE_DATA_LOC,"Single_Sex_Age_Time_Series_Regression.Rds"))
saveRDS(MAX_BOUND,paste0(SAVE_DATA_LOC,"Single_Sex_Age_Max_Values_for_Bounding_Predictions.Rds"))
saveRDS(MIN_BOUND,paste0(SAVE_DATA_LOC,"Single_Sex_Age_Min_Values_for_Bounding_Predictions.Rds"))
#Create a proxy data set to simulate with
C_VAL <- REG_DATA %>% mutate(Year=Year+(2025-1999)) %>% select(Year,Sex,US_Adj_Death_Rate)
#################NOTE YOU NEED TO ADJUST THE SINGLE AGE DEATH RATE DOWN TO MATCH LINCOLN IN SOME WAY
###Mostly Working: Pass in a data frame, with year, sex, and US age adjusted mortality rate. The years should go from the simulation start 2025, to the end roughly 2045. WHAT IS MISSING is to pass the arima results of the US age adjusted mortality rates as applied in Lincoln to replace the age adjusted mortality term. Once that is done, a new simulation will give the age specific mortality rates based on the forecasted Lincoln average rates.

View File

@ -0,0 +1,37 @@
library(tidyverse)
#setwd("../")
#Define and set all working directories for reading or saving files
if(!exists("DATA_LOC_RAW")){DATA_LOC_RAW <- "./Data/Raw_Data/Mortality_Rates_Over_Time/"}
if(!exists("DATA_SAVE_LOC_RDS")){DATA_SAVE_LOC_RDS <- "./Data/Cleaned_Data/Mortality_Data/RDS/"}
if(!exists("DATA_SAVE_LOC_CSV")){DATA_SAVE_LOC_CSV <- "./Data/Cleaned_Data/Mortality_Data/CSV/"}
dir.create(DATA_SAVE_LOC_RDS, recursive = TRUE, showWarnings = FALSE)
dir.create(DATA_SAVE_LOC_CSV, recursive = TRUE, showWarnings = FALSE)
#get the 2000 age distribution for use to reverse engineer age adjusted values to later use when lining the age adjusted regression with single year-sex mortality estiamtes.
AGE_ADJUST_REF_DATA <- read_csv(paste0(DATA_LOC_RAW,"US_Single_Age_1999-2020.csv")) %>% select(Year,Sex,Age=`Single-Year Ages Code`,Population) %>% mutate(Population=parse_number(Population)) %>% filter(!is.na(Population)) %>% filter(Year==2000,!is.na(Population))
OLD_ADJUST_REF_DATA <- read_csv(paste0(DATA_LOC_RAW,"US_10_Year_Age_Groups_1999-2020.csv"))%>% rename(Age=`Ten-Year Age Groups Code`) %>% filter(Age=='85+',Year==2000)%>% mutate(Age=85,Year=as.numeric(Year),Population=parse_number(Population)) %>% select(Year,Sex,Age,Population) %>% mutate(Population=as.numeric(Population),Age=as.numeric(Age))
AGE_ADJUST_REF_DATA <- full_join(AGE_ADJUST_REF_DATA,OLD_ADJUST_REF_DATA) %>% arrange(Sex,Age) %>% ungroup %>% mutate(Population=as.numeric(Population))
AGE_ADJUST_REF_DATA$Percent_of_Population <- AGE_ADJUST_REF_DATA$Population/sum(AGE_ADJUST_REF_DATA$Population)
saveRDS(AGE_ADJUST_REF_DATA,paste0(DATA_SAVE_LOC_RDS,"Single_Sex_Age_Population_in_2000.Rds" ))
write_csv(AGE_ADJUST_REF_DATA,paste0(DATA_SAVE_LOC_RDS,"Single_Sex_Age_Population_in_2000.Rds.csv" ))
####
DF1999 <- read_csv(paste0(DATA_LOC_RAW,"US_Single_Age_1999-2020.csv")) %>% select(Year,Sex,Age=`Single-Year Ages Code`,Mortality_Rate=`Crude Rate`) %>% mutate(Mortality_Rate=parse_number(Mortality_Rate)) %>% filter(!is.na(Mortality_Rate)) %>% mutate(Mortality_Rate=as.numeric(Mortality_Rate))
sum(AGE_ADJUST_REF_DATA$Population,na.rm=TRUE)/10^6
DF2018 <- read_csv(paste0(DATA_LOC_RAW,"US_Single_Age_2018-2023.csv")) %>% select(Year,Sex,Age=`Single-Year Ages Code`,Mortality_Rate=`Crude Rate`) %>% filter(!is.na(Mortality_Rate))%>% mutate(Mortality_Rate=parse_number(Mortality_Rate)) %>% filter(!is.na(Mortality_Rate)) %>% mutate(Mortality_Rate=as.numeric(Mortality_Rate))
OLDER1 <- read_csv(paste0(DATA_LOC_RAW,"US_10_Year_Age_Groups_1999-2020.csv"))%>% rename(Age=`Ten-Year Age Groups Code`,Mortality_Rate=`Crude Rate`) %>% filter(Age=='85+')%>% mutate(Age=85,Year=as.numeric(Year),Mortality_Rate=parse_number(Mortality_Rate)) %>% select(Year,Sex,Age,Mortality_Rate) %>% mutate(Mortality_Rate=as.numeric(Mortality_Rate),Age=as.numeric(Age))
OLDER2 <- read_csv(paste0(DATA_LOC_RAW,"US_10_Year_Age_Groups_2018-2023.csv"))%>% rename(Age=`Ten-Year Age Groups Code`,Mortality_Rate=`Crude Rate`) %>% filter(Age=='85+')%>% mutate(Age=85,Year=as.numeric(Year),Mortality_Rate=parse_number(Mortality_Rate)) %>% select(Year,Sex,Age,Mortality_Rate)%>% mutate(Mortality_Rate=as.numeric(Mortality_Rate),Age=as.numeric(Age))
DF <- rbind(DF1999,DF2018,OLDER1,OLDER2) %>% unique %>% group_by(Year,Sex,Age) %>% arrange(Year,Sex,Age) %>% mutate(Age=as.numeric(Age)) %>% ungroup
#hist(US_CAUSES$Death_Rate,breaks=150)
#Overall US death rates
DF <- rbind(DF1999,DF2018,OLDER1,OLDER2) %>% unique %>% group_by(Year,Sex,Age) %>% arrange(Year,Sex,Age) %>% mutate(Age=as.numeric(Age)) %>% ungroup
US_AGE_ADJ <- rbind(read_csv(paste0(DATA_LOC_RAW,"US_Age_Adjusted_1979-1998.csv")) %>% select(Year,Sex,US_Adj_Death_Rate=`Crude Rate`),read_csv(paste0(DATA_LOC_RAW,"US_Age_Adjusted_1999-2020.csv")) %>% select(Year,Sex,US_Adj_Death_Rate=`Crude Rate`),read_csv(paste0(DATA_LOC_RAW,"US_Age_Adjusted_2018-2023.csv")) %>% select(Year,Sex,US_Adj_Death_Rate=`Crude Rate`)) %>% unique
US_AGE_ADJ <- rbind(read_csv(paste0(DATA_LOC_RAW,"US_Age_Adjusted_1979-1998.csv")) %>% select(Year,Sex,US_Adj_Death_Rate=`Crude Rate`),read_csv(paste0(DATA_LOC_RAW,"US_Age_Adjusted_1999-2020.csv")) %>% select(Year,Sex,US_Adj_Death_Rate=`Crude Rate`),read_csv(paste0(DATA_LOC_RAW,"US_Age_Adjusted_2018-2023.csv")) %>% select(Year,Sex,US_Adj_Death_Rate=`Crude Rate`)) %>% unique
REG_DATA <- DF %>% left_join(US_AGE_ADJ) %>% pivot_wider(values_from=Mortality_Rate,names_from=Age,names_prefix="Age_")
saveRDS(DF,paste0(DATA_SAVE_LOC_RDS,"Single_Sex_Age_US_Mortality_Rate_Data_Long.Rds" ))
write_csv(DF,paste0(DATA_SAVE_LOC_RDS,"Single_Sex_Age_US_Mortality_Rate_Data_Long.csv" ))
saveRDS(REG_DATA,paste0(DATA_SAVE_LOC_RDS,"Single_Sex_Age_US_Mortality_Rate_Data_Wide.Rds" ))
write_csv(REG_DATA,paste0(DATA_SAVE_LOC_RDS,"Single_Sex_Age_US_Mortality_Rate_Data_Wide.csv" ))