Cleaning up Mortality process

This commit is contained in:
Alex 2025-11-21 16:19:29 -07:00
parent 98463bc6d7
commit be73496390
2 changed files with 74 additions and 4 deletions

42
Causes.r Normal file
View File

@ -0,0 +1,42 @@
library(tidyverse)
library(factoextra)
library(fixest)
library(corrplot)
################################Other work
DF1999 <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_Single_Age_1999-2020.csv") %>% select(Year,Sex,Age=`Single-Year Ages Code`,Mortality_Rate=`Crude Rate`) %>% mutate(Mortality_Rate=parse_number(Mortality_Rate)) %>% filter(!is.na(Mortality_Rate)) %>% mutate(Mortality_Rate=as.numeric(Mortality_Rate))
DF2018 <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_Single_Age_2018-2023.csv") %>% select(Year,Sex,Age=`Single-Year Ages Code`,Mortality_Rate=`Crude Rate`) %>% filter(!is.na(Mortality_Rate))%>% mutate(Mortality_Rate=parse_number(Mortality_Rate)) %>% filter(!is.na(Mortality_Rate)) %>% mutate(Mortality_Rate=as.numeric(Mortality_Rate))
OLDER1 <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_10_Year_Age_Groups_1999-2020.csv")%>% rename(Age=`Ten-Year Age Groups Code`,Mortality_Rate=`Crude Rate`) %>% filter(Age=='85+')%>% mutate(Age=85,Year=as.numeric(Year),Mortality_Rate=parse_number(Mortality_Rate)) %>% select(Year,Sex,Age,Mortality_Rate) %>% mutate(Mortality_Rate=as.numeric(Mortality_Rate),Age=as.numeric(Age))
OLDER2 <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_10_Year_Age_Groups_2018-2023.csv")%>% rename(Age=`Ten-Year Age Groups Code`,Mortality_Rate=`Crude Rate`) %>% filter(Age=='85+')%>% mutate(Age=85,Year=as.numeric(Year),Mortality_Rate=parse_number(Mortality_Rate)) %>% select(Year,Sex,Age,Mortality_Rate)%>% mutate(Mortality_Rate=as.numeric(Mortality_Rate),Age=as.numeric(Age))
#NOTE should add 85+ for 2018-2023
DF <- rbind(DF1999,DF2018,OLDER1,OLDER2) %>% unique %>% group_by(Year,Sex,Age) %>% arrange(Year,Sex,Age) %>% mutate(Age=as.numeric(Age)) %>% ungroup
US_CAUSES <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_Cause_of_Death_1999-2020.csv") %>% select(Year,ICD=`ICD Sub-Chapter Code`,Death_Rate=`Crude Rate`) %>% filter(!is.na(Death_Rate)) %>% mutate(Death_Rate=ifelse(Death_Rate=='Suppressed' |Death_Rate=='Unreliable',NA,Death_Rate)) %>% rbind(read_csv("Data/Raw_Data/Mortality_Rates_New/US_Cause_of_Death_2018-2023.csv") %>% select(Year,ICD=`ICD Sub-Chapter Code`,Death_Rate=`Crude Rate`) %>% filter(!is.na(Death_Rate)) %>% mutate(Death_Rate=ifelse(Death_Rate=='Suppressed' |Death_Rate=='Unreliable',NA,Death_Rate))) %>% mutate(Death_Rate=parse_number(Death_Rate)) %>% arrange(Year,ICD) %>% group_by(ICD) %>% filter(max(is.na(Death_Rate))==0,min(Death_Rate)!=max(Death_Rate)) %>% ungroup %>% unique
US_CAUSES <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_Cause_of_Death_1999-2020.csv") %>% select(Year,ICD=`ICD Sub-Chapter Code`,Death_Rate=`Crude Rate`) %>% filter(!is.na(Death_Rate)) %>% mutate(Death_Rate=ifelse(Death_Rate=='Suppressed' |Death_Rate=='Unreliable',NA,Death_Rate)) %>% rbind(read_csv("Data/Raw_Data/Mortality_Rates_New/US_Cause_of_Death_2018-2023.csv") %>% select(Year,ICD=`ICD Sub-Chapter Code`,Death_Rate=`Crude Rate`) %>% filter(!is.na(Death_Rate)) %>% mutate(Death_Rate=ifelse(Death_Rate=='Suppressed' |Death_Rate=='Unreliable',NA,Death_Rate))) %>% mutate(Death_Rate=parse_number(Death_Rate)) %>% arrange(Year,ICD) %>% group_by(ICD) %>% filter(max(is.na(Death_Rate))==0,min(Death_Rate)!=max(Death_Rate)) %>% ungroup %>% unique
BIND <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_Cause_of_Death_1999-2020.csv") %>% select(ICD=`ICD Sub-Chapter Code`,NAME=`ICD Sub-Chapter`) %>% unique
US_CAUSES <- US_CAUSES %>% left_join(BIND) %>% select(-ICD) %>% rename(ICD=NAME)
#hist(US_CAUSES$Death_Rate,breaks=150)
US_CAUSES
CAUSE_SUMMARY <- US_CAUSES %>% group_by(ICD) %>% summarize(Rate=mean(Death_Rate)) %>% summarize(ICD,Rate, Rank=rank(desc(Rate))) %>% arrange(Rank) %>% ungroup %>% filter(Rank<=40)
CAUSE_SUMMARY %>% print(n=100)
ICD_WIDE <- US_CAUSES %>% inner_join(CAUSE_SUMMARY %>% print(n=40) %>% select(ICD_RANK=Rank,ICD)) %>% select(-ICD) %>% unique %>% pivot_wider(values_from=Death_Rate,names_from=ICD_RANK,names_prefix="ICD_")
ICD_WIDE <- ICD_WIDE %>% select(c("Year",sort(colnames(ICD_WIDE[,-1]))))
####
US_AGE_ADJ <- rbind(read_csv("Data/Raw_Data/Mortality_Rates_New/US_Age_Adjusted_1979-1998.csv") %>% select(Year,Sex,US_Adj_Death_Rate=`Crude Rate`),read_csv("Data/Raw_Data/Mortality_Rates_New/US_Age_Adjusted_1999-2020.csv") %>% select(Year,Sex,US_Adj_Death_Rate=`Crude Rate`),read_csv("Data/Raw_Data/Mortality_Rates_New/US_Age_Adjusted_2018-2023.csv") %>% select(Year,Sex,US_Adj_Death_Rate=`Crude Rate`)) %>% unique
####
REG_DATA <- DF %>% left_join(US_AGE_ADJ) %>% left_join(ICD_WIDE)
#REG_DATA <- DF %>% left_join(ICD_WIDE)
TEST <- REG_DATA %>% pivot_wider(values_from=Mortality_Rate,names_from=Age,names_prefix="Age_")
TEST[,4:129] <- TEST[,4:129]/t(TEST[,3])
REG_DATA %>% pivot_wider(values_from=Mortality_Rate,names_from=Age,names_prefix="Age_") %>% group_by(Sex)
REG_DATA
MOD <- feols(Age_.[0:85]~US_Adj_Death_Rate+Sex*Year,TEST %>% filter(Sex=="Male"))
summary(MOD[[1]])
acf(resid(MOD[[43]]))
predict(MOD[[2]],TEST[1,])

View File

@ -22,7 +22,6 @@ DF <- rbind(LIN_1999,LIN_2018,WY_1999,US_1999,US_2018,WY_2018,WY_1979,US_1979,LI
ggplot(DF,aes(x=Year,y=Mort_Rate,group=Region,color=Region,fill=Region))+geom_point()+geom_smooth(method="lm")+ facet_grid(. ~ Sex) ggplot(DF,aes(x=Year,y=Mort_Rate,group=Region,color=Region,fill=Region))+geom_point()+geom_smooth(method="lm")+ facet_grid(. ~ Sex)
ggplot(DF,aes(x=Year,y=Mort_Rate,group=Region,color=Region,fill=Region))+geom_point()+geom_smooth(span=0.4)+ facet_grid(. ~ Sex) ggplot(DF,aes(x=Year,y=Mort_Rate,group=Region,color=Region,fill=Region))+geom_point()+geom_smooth(span=0.4)+ facet_grid(. ~ Sex)
ggplot(DF,aes(x=Year,y=Mort_Rate,group=Region,color=Region,fill=Region))+geom_point()+geom_line()+ facet_grid(. ~ Sex) ggplot(DF,aes(x=Year,y=Mort_Rate,group=Region,color=Region,fill=Region))+geom_point()+geom_line()+ facet_grid(. ~ Sex)
ggplot(DF,aes(x=Year,y=Mort_Rate,group=Region,color=Region,fill=Region))+geom_point()+geom_smooth()
########################################################ARIMA ########################################################ARIMA
PANDIMIC_INDEX <- read_csv("https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23ebf3fb&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=off&txtcolor=%23444444&ts=12&tts=12&width=1320&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=WUPI&scale=left&cosd=1996-01-01&coed=2025-07-01&line_color=%230073e6&link_values=false&line_style=solid&mark_type=none&mw=3&lw=3&ost=-99999&oet=99999&mma=0&fml=a&fq=Annual&fam=sum&fgst=lin&fgsnd=2020-02-01&line_index=1&transformation=lin&vintage_date=2025-11-20&revision_date=2025-11-20&nd=1996-01-01") %>% mutate(Year=year(observation_date)) %>% select(Year,WUPI)%>% mutate(L_WUPI=lag(WUPI),L_TWO_WUPI=lag(WUPI,2)) PANDIMIC_INDEX <- read_csv("https://fred.stlouisfed.org/graph/fredgraph.csv?bgcolor=%23ebf3fb&chart_type=line&drp=0&fo=open%20sans&graph_bgcolor=%23ffffff&height=450&mode=fred&recession_bars=off&txtcolor=%23444444&ts=12&tts=12&width=1320&nt=0&thu=0&trc=0&show_legend=yes&show_axis_titles=yes&show_tooltip=yes&id=WUPI&scale=left&cosd=1996-01-01&coed=2025-07-01&line_color=%230073e6&link_values=false&line_style=solid&mark_type=none&mw=3&lw=3&ost=-99999&oet=99999&mma=0&fml=a&fq=Annual&fam=sum&fgst=lin&fgsnd=2020-02-01&line_index=1&transformation=lin&vintage_date=2025-11-20&revision_date=2025-11-20&nd=1996-01-01") %>% mutate(Year=year(observation_date)) %>% select(Year,WUPI)%>% mutate(L_WUPI=lag(WUPI),L_TWO_WUPI=lag(WUPI,2))
DF <- DF %>% ungroup%>% group_by(Sex,Region) %>% arrange(Sex,Region,Year) %>% mutate(L_Mort_Rate=lag(Mort_Rate)) %>% ungroup DF <- DF %>% ungroup%>% group_by(Sex,Region) %>% arrange(Sex,Region,Year) %>% mutate(L_Mort_Rate=lag(Mort_Rate)) %>% ungroup
@ -111,7 +110,6 @@ fviz_nbclust(COR_DAT,kmeans,nboot=1000)
km.res <- kmeans(COR_DAT, 3, nstart = 1) km.res <- kmeans(COR_DAT, 3, nstart = 1)
fviz_cluster(km.res,COR_DAT) fviz_cluster(km.res,COR_DAT)
print(km.res) print(km.res)
COR_DAT COR_DAT
SINGLE_DATA %>% filter(!is.numeric(Mortality_Rate)) SINGLE_DATA %>% filter(!is.numeric(Mortality_Rate))
SINGLE_DATA <- SINGLE_DATA %>% mutate(Age=as.numeric(Age),US_Rate=as.numeric(US_Rate),Year=as.numeric(Year),Mortality_Rate=as.numeric(Mortality_Rate)) SINGLE_DATA <- SINGLE_DATA %>% mutate(Age=as.numeric(Age),US_Rate=as.numeric(US_Rate),Year=as.numeric(Year),Mortality_Rate=as.numeric(Mortality_Rate))
@ -143,12 +141,42 @@ resid(MOD)
predict predict
REG_SINGLE_DATA <- SINGLE_DATA_PLAIN %>% pivot_wider(values_from="Mortality_Rate",names_from=c("Age")) REG_SINGLE_DATA <- SINGLE_DATA_PLAIN %>% mutate( %>% pivot_wider(values_from="Mortality_Rate",names_from=c("Age"))
MALE <- REG_SINGLE_DATA %>% filter(Sex=='Male') MALE <- REG_SINGLE_DATA %>% filter(Sex=='Male')
FEMALE <- REG_SINGLE_DATA %>% filter(Sex=='Female') FEMALE <- REG_SINGLE_DATA %>% filter(Sex=='Female')
library(corrplot) library(corrplot)
corrplot(cor(MALE %>% select(-Sex))) corrplot(cor(MALE %>% select(-Sex)))
MALE US_CAUSES <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_Cause_of_Death_1999-2020.csv") %>% select(Year,ICD=`ICD Sub-Chapter Code`,Death_Rate=`Crude Rate`) %>% filter(!is.na(Death_Rate)) %>% mutate(Death_Rate=ifelse(Death_Rate=='Suppressed' |Death_Rate=='Unreliable',NA,Death_Rate)) %>% rbind(read_csv("Data/Raw_Data/Mortality_Rates_New/US_Cause_of_Death_2018-2023.csv") %>% select(Year,ICD=`ICD Sub-Chapter Code`,Death_Rate=`Crude Rate`) %>% filter(!is.na(Death_Rate)) %>% mutate(Death_Rate=ifelse(Death_Rate=='Suppressed' |Death_Rate=='Unreliable',NA,Death_Rate))) %>% mutate(Death_Rate=parse_number(Death_Rate)) %>% arrange(Year,ICD) %>% group_by(ICD) %>% filter(max(is.na(Death_Rate))==0,min(Death_Rate)!=max(Death_Rate)) %>% ungroup %>% unique
BIND <- read_csv("Data/Raw_Data/Mortality_Rates_New/US_Cause_of_Death_1999-2020.csv") %>% select(ICD=`ICD Sub-Chapter Code`,NAME=`ICD Sub-Chapter`) %>% unique
US_CAUSES <- US_CAUSES %>% left_join(BIND) %>% select(-ICD) %>% rename(ICD=NAME)
US_CAUSES %>% group_by(ICD) %>% summarize(Rate=mean(Death_Rate)) %>% summarize(ICD,Rate, Rank=rank(desc(Rate))) %>% arrange(Rank)
ggplot(US_CAUSES,aes(x=Year,y=scale(Death_Rate),group=ICD,color=ICD,fill=ICD)) +geom_point() +geom_smooth()+theme(legend.position="bottom")
US_CAUSES
parse_number(REG_SINGLE_DATA[,3:89])
US_CAUSES <- US_CAUSES %>% pivot_wider(values_from=Death_Rate,names_from=ICD)
CAUS_MAT <- US_CAUSES %>% select(-Year)
sd(t(CAUS_MAT)[,118])
CAUS_MAT <- scale(CAUS_MAT)
fviz_nbclust(CAUS_MAT,kmeans,"gap_stat") #6
fviz_nbclust(CAUS_MAT,kmeans,"wss") #5
fviz_nbclust(CAUS_MAT,kmeans) #2
km.res <- kmeans(CAUS_MAT, 6, nstart = 1)
fviz_cluster(km.res,CAUS_MAT)
summary(km.res
print(km.res)
km.res$cluster
corrplot(cor(US_CAUSES))
MALE <- US_CAUSES %>% left_join(MALE) %>% select(Year,Sex,US_Rate,everything())
MALE %>% tail
COR_MALE <- MALE %>% select(-Sex) %>% as.matrix
corrplot(cor(COR_MALE,use="pairwise.complete"),type="lower",diag=FALSE,)
?corrplot
COR_MALE
corrplot(cor(cbind(MALE[,1],MALE[,4:ncol(MALE)]/t(MALE[,3])))) corrplot(cor(cbind(MALE[,1],MALE[,4:ncol(MALE)]/t(MALE[,3]))))
corrplot(cor(log(FEMALE %>% select(-Sex)))) corrplot(cor(log(FEMALE %>% select(-Sex))))
TEMP <- MALE %>% select(RATE=`36`,US_Rate,Year) %>% as.data.frame TEMP <- MALE %>% select(RATE=`36`,US_Rate,Year) %>% as.data.frame