Signficant Script Cleaning midsteam
This commit is contained in:
parent
d66fa06536
commit
285026e421
1
.gitignore
vendored
1
.gitignore
vendored
@ -2,6 +2,7 @@
|
||||
#
|
||||
*.png
|
||||
*.csv
|
||||
Data/Cleaned_Data/
|
||||
|
||||
Results/
|
||||
#Don't save any major data files on the server, can be regenerated after pulling
|
||||
|
||||
Binary file not shown.
Binary file not shown.
|
Can't render this file because it contains an unexpected character in line 5 and column 13.
|
|
Can't render this file because it contains an unexpected character in line 5 and column 13.
|
|
Can't render this file because it contains an unexpected character in line 5 and column 13.
|
|
Can't render this file because it contains an unexpected character in line 5 and column 13.
|
|
Can't render this file because it contains an unexpected character in line 5 and column 13.
|
|
Can't render this file because it contains an unexpected character in line 5 and column 13.
|
|
Can't render this file because it contains an unexpected character in line 5 and column 13.
|
|
Can't render this file because it contains an unexpected character in line 5 and column 13.
|
|
Can't render this file because it contains an unexpected character in line 5 and column 13.
|
|
Can't render this file because it contains an unexpected character in line 5 and column 13.
|
|
Can't render this file because it contains an unexpected character in line 5 and column 13.
|
|
Can't render this file because it contains an unexpected character in line 5 and column 13.
|
|
Can't render this file because it contains an unexpected character in line 5 and column 13.
|
|
Can't render this file because it contains an unexpected character in line 5 and column 13.
|
|
Can't render this file because it contains an unexpected character in line 5 and column 13.
|
|
Can't render this file because it contains an unexpected character in line 5 and column 13.
|
|
Can't render this file because it contains an unexpected character in line 5 and column 13.
|
5
Data/Raw_Data/Mortality_Rates/README_MORTALITY.txt
Normal file
5
Data/Raw_Data/Mortality_Rates/README_MORTALITY.txt
Normal file
@ -0,0 +1,5 @@
|
||||
Data files gathered manually from:
|
||||
|
||||
https://hdpulse.nimhd.nih.gov/data-portal/mortality/table?cod=247&cod_options=cod_15&ratetype=aa&ratetype_options=ratetype_2&race=00&race_options=race_6&sex=2&sex_options=sex_3&age=177&age_options=age_11&ruralurban=0&ruralurban_options=ruralurban_3&yeargroup=5&yeargroup_options=year5yearmort_1&statefips=56&statefips_options=area_states&county=56000&county_options=counties_wyoming&comparison=counties_to_us&comparison_options=comparison_counties&radio_comparison=areas&radio_comparison_options=cods_or_areas
|
||||
|
||||
Each file is single age group, so age weighting does not apply despite the variable names
|
||||
1
Data/Raw_Data/Mortality_Rates/README_MORTALITY_DATA.txt
Normal file
1
Data/Raw_Data/Mortality_Rates/README_MORTALITY_DATA.txt
Normal file
@ -0,0 +1 @@
|
||||
[1] "Data files gathered manually from:\n\n\n\n\nhttps://hdpulse.nimhd.nih.gov/data-portal/mortality/table?cod=247&cod_options=cod_15&ratetype=aa&ratetype_options=ratetype_2&race=00&race_options=race_6&sex=2&sex_options=sex_3&age=177&age_options=age_11&ruralurban=0&ruralurban_options=ruralurban_3&yeargroup=5&yeargroup_options=year5yearmort_1&statefips=56&statefips_options=area_states&county=56000&county_options=counties_wyoming&comparison=counties_to_us&comparison_options=comparison_counties&radio_comparison=areas&radio_comparison_options=cods_or_areas\n\n\n\n\nEach file is single age group, so age weighting does not apply despite the variable names"
|
||||
BIN
Data/Raw_Data/Population/Pop_1970s.xls
Normal file
BIN
Data/Raw_Data/Population/Pop_1970s.xls
Normal file
Binary file not shown.
BIN
Data/Raw_Data/Population/Pop_1980s.xls
Normal file
BIN
Data/Raw_Data/Population/Pop_1980s.xls
Normal file
Binary file not shown.
BIN
Data/Raw_Data/Population/Pop_1990s.xls
Normal file
BIN
Data/Raw_Data/Population/Pop_1990s.xls
Normal file
Binary file not shown.
@ -4,6 +4,32 @@ ACS_FILE_LOC <- "Data/Cleaned_Data/ACS_Census_Demographic_Data.Rds"
|
||||
ACS_YEAR <- 2023 #Year of the ACS data
|
||||
if(!file.exists(ACS_FILE_LOC)){source("Scripts/Load_ACS_Census_Data.r")}else{AGE_DATA <- readRDS(ACS_FILE_LOC)}
|
||||
LIN_DEMOGRAPHICS <- readRDS("Data/Cleaned_Data/Lincoln_Demographic_Data.Rds") %>% filter(Year==ACS_YEAR)
|
||||
LIN_DEMOGRAPHICS
|
||||
GET_INTIAL_DATA_SUMMARY <- function(DEMO_DATA,FILE_SAVE_NAME,ST_POP=NA,Area=NA,ST_YEAR=NA,SAVE_DIR='Data/Cleaned_Data/Intiate_Simulation/'){
|
||||
ST_BIRTH <- round(sum((DEMO_DATA%>% filter(Age==0))[4:5]))
|
||||
ST_BIRTH_TWO_PREV <- round(sum((DEMO_DATA%>% filter(Age==1))[4:5]))
|
||||
if(is.na(ST_YEAR)){ST_YEAR=max(POP_DATA$Year)}
|
||||
|
||||
if(is.na(ST_POP)){
|
||||
if(Area=='Kemmerer'){
|
||||
ST_POP<- POP_DATA %>% filter(City %in% c('Kemmerer','Diamondville'),Year==ST_YEAR) %>% mutate(Population=as.numeric(Population)) %>% pull(Population) %>% sum()
|
||||
} else if(Area=='Other Lincoln'){
|
||||
ST_POP_KEM<- POP_DATA %>% filter(City %in% c('Kemmerer','Diamondville'),Year==ST_YEAR) %>% mutate(Population=as.numeric(Population)) %>% pull(Population) %>% sum()
|
||||
|
||||
ST_POP_LIN <- readRDS("Data/Cleaned_Data/Wyoming_County_Population.Rds") %>% filter(Year==ST_YEAR,County==Area) %>% pull(Population) #Grab the year of data for the county
|
||||
ST_POP <- ST_POP_LIN-ST_POP_KEM
|
||||
|
||||
} else{
|
||||
ST_POP <- readRDS("Data/Cleaned_Data/Wyoming_County_Population.Rds") %>% filter(Year==ST_YEAR,County==Area) %>% pull(Population) #Grab the year of data for the county
|
||||
}
|
||||
}
|
||||
|
||||
INTIATE_DATA <- DEMO_DATA %>% mutate(Male_Window=Age>=18 & Age<=30,Female_Window=Age>=18 & Age<=28) %>% group_by(County,Year) %>% summarize(Female_Birth_Group=sum(Num_Female*Female_Window),Male_Birth_Group=sum(Num_Male*Male_Window),Min_Birth_Group=ifelse(Female_Birth_Group<Male_Birth_Group,Female_Birth_Group,Male_Birth_Group),PREV_BIRTH=ST_BIRTH,PREV_TWO_BIRTH=ST_BIRTH_TWO_PREV,Population=ST_POP) %>% mutate(Births=NA,Deaths=NA,Migration=NA) %>% select(Year,County,Population,Births,Deaths,Migration,Min_Birth_Group,PREV_BIRTH,PREV_TWO_BIRTH) %>% ungroup
|
||||
saveRDS(INTIATE_DATA ,paste0(SAVE_DIR,FILE_SAVE_NAME,".Rds"))
|
||||
write_csv(INTIATE_DATA ,paste0(SAVE_DIR,FILE_SAVE_NAME,".csv"))
|
||||
return(INTIATE_DATA)
|
||||
GET_INTIAL_DATA_SUMMARY(LIN_DEMOGRAPHICS,"test")
|
||||
|
||||
|
||||
for(i in 1:nrow(AGE_DATA)){
|
||||
if(i==1 & exists("RES")){rm(RES)}
|
||||
@ -31,6 +57,7 @@ KEM_DEMOGRAPHICS <- RES %>% filter(IN_KEM==1) %>% select(Age,Sex,Per_Pop) %>% mu
|
||||
INTIATE_KEMMER <- KEM_DEMOGRAPHICS %>% mutate(Male_Window=Age>=18 & Age<=30,Female_Window=Age>=18 & Age<=28) %>% group_by(County,Year) %>% summarize(Female_Birth_Group=sum(Num_Female*Female_Window),Male_Birth_Group=sum(Num_Male*Male_Window),Min_Birth_Group=ifelse(Female_Birth_Group<Male_Birth_Group,Female_Birth_Group,Male_Birth_Group),PREV_BIRTH=ST_BIRTH_KEM,PREV_TWO_BIRTH=ST_BIRTH_TWO_PREV_KEM,Population=CURRENT_KEM_POP) %>% mutate(Births=NA,Deaths=NA,Migration=NA) %>% select(Year,County,Population,Births,Deaths,Migration,Min_Birth_Group,PREV_BIRTH,PREV_TWO_BIRTH) %>% ungroup
|
||||
saveRDS(INTIATE_KEMMER ,"Data/Cleaned_Data/Intiate_Simulation/Kemmerer_Summary_Start_Data.Rds")
|
||||
write_csv(INTIATE_KEMMER ,"Data/Cleaned_Data/Intiate_Simulation/Kemmerer_Summary_Start_Data.csv")
|
||||
|
||||
###Create data sets that contain all people not in Loncoln but not Kemmerer or Diamondville, for a seperate comparison run
|
||||
LIN_DEMOGRAPHICS_RECENT <- readRDS("Data/Cleaned_Data/Lincoln_Demographic_Data.Rds") %>% filter(Year==max(Year)) #Grab the most recent year of data for all of the county
|
||||
LIN_OTHER_DEMOGRAPHICS <- LIN_DEMOGRAPHICS_RECENT %>% left_join(KEM_DEMOGRAPHICS %>% select(-County,ADJ_MALE=Num_Male,ADJ_FEMALE=Num_Female)) %>% mutate(Num_Male=Num_Male-ADJ_MALE,Num_Female=Num_Female-ADJ_FEMALE) %>% select(-ADJ_MALE,-ADJ_FEMALE) %>% ungroup#Deduct the populaiton estimates of Kemmerer to create a "Other Lincoln" data set
|
||||
|
||||
@ -1,12 +1,14 @@
|
||||
#################IMPORTANT!!!! CLEAN UP DATA SCRIPT FOR USE
|
||||
#############################Clean up scirpt folders, simulations of deaths should be seperated from this file. Death rate simulation is complicated and should be commented, and turned into a seperate script.
|
||||
##############WORKING ON REGRESSION OF BIRTHS FOR PREDICTIONS OF GROWTH, THINK ABOUT GOING FROM COUNTY TO CITY, SEPERATE REGRESION FROM DATA COLLOECTION
|
||||
#############################Clean up script folders, simulations of deaths should be separated from this file. Death rate simulation is complicated and should be commented, and turned into a separate script.
|
||||
library(rvest)
|
||||
library(tidyverse)
|
||||
library(readxl)
|
||||
#setwd("../")
|
||||
###Create Location to Save raw data sets
|
||||
if(!exists("SAVE_LOC_RAW")){SAVE_LOC_RAW <-"./Data/Raw_Data/"}
|
||||
dir.create(SAVE_LOC_RAW, recursive = TRUE, showWarnings = FALSE)
|
||||
########County, Death, Birth and Migration Data
|
||||
#Data found on the page http://eadiv.state.wy.us/pop/
|
||||
#Website States: Wyoming Economic Analysis Division based on U.S. Census Bureau's population estimation and vital stats above
|
||||
PAGE <- read_html("http://eadiv.state.wy.us/pop/BirthDeathMig.htm")
|
||||
NODE <- html_element(PAGE ,"table")
|
||||
TBL <- html_table(NODE)
|
||||
@ -30,7 +32,7 @@ TBL <- TBL %>% filter(!is.na(Type)) %>% select(County,Type,everything())
|
||||
GROUP <- colnames(TBL)[-1:-2]
|
||||
Data <- pivot_longer(TBL,all_of(GROUP),names_to="Year",values_to="Pop_Change")
|
||||
Data$County <- ifelse(toupper(Data$County)=="TOTAL","Wyoming",Data$County)
|
||||
WY_COUNTY_DATA_SET <- pivot_wider(Data,names_from=Type,values_from=Pop_Change) %>% rename("Migration"=`Net Migration`) %>% mutate(Year=as.integer(Year),Births=parse_number(Births),Deaths=parse_number(Deaths),Migration=parse_number(Migration)) %>% mutate(Year=Year-1) #Data apears to be one off from populaiton
|
||||
WY_COUNTY_DATA_SET <- pivot_wider(Data,names_from=Type,values_from=Pop_Change) %>% rename("Migration"=`Net Migration`) %>% mutate(Year=as.integer(Year),Births=parse_number(Births),Deaths=parse_number(Deaths),Migration=parse_number(Migration)) %>% mutate(Year=Year-1) #Data appears to be one off from population
|
||||
WY_COUNTY_DATA_SET[,"County"] <- gsub(" ","_",WY_COUNTY_DATA_SET %>% pull(County))
|
||||
|
||||
########################City and County Population Data 2020 to 2024
|
||||
@ -116,12 +118,15 @@ CITY_POP <- rbind(CITY_TBL,CITY_POP)
|
||||
CITY_POP$City <- gsub("LaGrange","La_Grange",CITY_POP$City)
|
||||
COUNTY_POP <- rbind(COUNTY_TBL,COUNTY_POP)
|
||||
####################County and City Population Data for 1990-2000
|
||||
if(!file.exists("./Data/Pop_1990s.xls")){download.file('http://eadiv.state.wy.us/pop/c&sc90_00.xls',"./Data/Pop_1990s.xls")}
|
||||
TEMP <- read_xls("Data/Pop_1990s.xls",skip=2)[-1:-4,]
|
||||
colnames(TEMP)[1] <- "County"
|
||||
tail(TEMP)
|
||||
TEMP <- TEMP[1:which(TEMP[,1]=="Wind River Res."),]
|
||||
#Location to save any raw population files. Most files are not saved since they are pulled from a html and not a excel file, but older files are only available as excel files
|
||||
SAVE_LOC_RAW_POP <- paste0(SAVE_LOC_RAW,"/Population")
|
||||
dir.create(SAVE_LOC_RAW_POP , recursive = TRUE, showWarnings = FALSE)
|
||||
|
||||
POP_FILE_1990 <- paste0(SAVE_LOC_RAW_POP,"/Pop_1990s.xls")
|
||||
if(!file.exists(POP_FILE_1990)){download.file('http://eadiv.state.wy.us/pop/c&sc90_00.xls',POP_FILE_1990)}
|
||||
TEMP <- read_xls(POP_FILE_1990,skip=2)[-1:-4,]
|
||||
colnames(TEMP)[1] <- "County"
|
||||
TEMP <- TEMP[1:which(TEMP[,1]=="Wind River Res."),]
|
||||
TEMP <- pivot_longer(TEMP,all_of(colnames(TEMP)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=as.numeric(Population))
|
||||
TEMP <- TEMP %>% filter(Year!=2000)
|
||||
TEMP_COUNTY <- TEMP[grepl("Cnty",TEMP %>% pull(County),ignore.case=TRUE),]
|
||||
@ -138,8 +143,10 @@ TEMP_CITY <- TEMP_CITY %>% filter(Year!=2000)
|
||||
|
||||
try(rm(TEMP_CITY,TEMP_COUNTY,TEMP))
|
||||
####################County and City Population Data for 1980-1990
|
||||
if(!file.exists("./Data/Pop_1980s.xls")){download.file('http://eadiv.state.wy.us/pop/C&SC8090.xls',"./Data/Pop_1980s.xls")}
|
||||
TEMP <- read_xls("Data/Pop_1980s.xls",skip=2)[-1:-4,]
|
||||
POP_FILE_1980 <- paste0(SAVE_LOC_RAW_POP ,"/Pop_1980s.xls")
|
||||
if(!file.exists(POP_FILE_1980)){download.file('http://eadiv.state.wy.us/pop/C&SC8090.xls',POP_FILE_1980)}
|
||||
|
||||
TEMP <- read_xls(POP_FILE_1980,skip=2)[-1:-4,]
|
||||
colnames(TEMP)[1] <- "County"
|
||||
TEMP <- TEMP[2:which(TEMP[,1]=="Upton"),1:(min(which(is.na(TEMP[2,])))-1)]
|
||||
TEMP <- pivot_longer(TEMP,all_of(colnames(TEMP)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=as.numeric(Population))
|
||||
@ -162,8 +169,10 @@ COUNTY_POP <- rbind(TEMP_COUNTY,COUNTY_POP)
|
||||
try(rm(TEMP_CITY,TEMP_COUNTY,TEMP))
|
||||
|
||||
####################County Population Data for 1970-1980
|
||||
if(!file.exists("./Data/Pop_1970s.xls")){download.file('http://eadiv.state.wy.us/pop/Cnty7080.xls',"./Data/Pop_1970s.xls")}
|
||||
TEMP <- read_xls("Data/Pop_1970s.xls",skip=2)[-1:-4,]
|
||||
POP_FILE_1970 <- paste0(SAVE_LOC_RAW_POP ,"/Pop_1970s.xls")
|
||||
if(!file.exists(POP_FILE_1970)){download.file('http://eadiv.state.wy.us/pop/Cnty7080.xls',POP_FILE_1970)}
|
||||
|
||||
TEMP <- read_xls(POP_FILE_1970,skip=2)[-1:-4,]
|
||||
colnames(TEMP)[1] <- "County"
|
||||
TEMP <- TEMP[1:which(TEMP[,1]=="Weston"),]
|
||||
TEMP <- pivot_longer(TEMP,all_of(colnames(TEMP)[-1]),names_to="Year",values_to="Population") %>% mutate(Year=as.integer(Year),Population=as.numeric(Population))
|
||||
@ -197,123 +206,23 @@ try(rm(TEMP))
|
||||
colnames(TEMP ) <- c("Year","City","Population")
|
||||
TEMP <- as_tibble(TEMP)
|
||||
CITY_POP <- rbind(TEMP,CITY_POP) %>% arrange(City,Year)
|
||||
#Remove empty values, ensure all numeric values are not saved as characters
|
||||
CITY_POP <- CITY_POP %>% filter(!is.na(Population) ) %>% mutate(Population=parse_number(Population),Year=parse_number(Year))
|
||||
#Add Other Data
|
||||
COUNTY_POP <- COUNTY_POP %>% mutate(Year=as.integer(Year)) %>% unique
|
||||
COUNTY_POP <- COUNTY_POP %>% mutate(Year=as.numeric(Year)) %>% unique
|
||||
WY_COUNTY_DATA_SET <- COUNTY_POP %>% left_join(WY_COUNTY_DATA_SET ) %>% mutate(Population=as.numeric(Population)) %>% unique
|
||||
|
||||
###Save Population Results
|
||||
write_csv(CITY_POP,file="./Data/Cleaned_Data/Wyoming_City_Population.csv")
|
||||
write_csv(WY_COUNTY_DATA_SET ,file="./Data/Cleaned_Data/Wyoming_County_Population.csv")
|
||||
saveRDS(CITY_POP,file="./Data/Cleaned_Data/Wyoming_City_Population.Rds")
|
||||
saveRDS(WY_COUNTY_DATA_SET ,file="./Data/Cleaned_Data/Wyoming_County_Population.Rds")
|
||||
if(!exists("SAVE_LOC_POP")){SAVE_LOC_POP <-"./Data/Cleaned_Data/Population_Data"}
|
||||
CSV_SAVE <- paste0(SAVE_LOC_POP,"/CSV")
|
||||
RDS_SAVE <- paste0(SAVE_LOC_POP,"/RDS")
|
||||
dir.create(CSV_SAVE, recursive = TRUE, showWarnings = FALSE)
|
||||
dir.create(RDS_SAVE, recursive = TRUE, showWarnings = FALSE)
|
||||
|
||||
saveRDS(CITY_POP,paste0(RDS_SAVE,"/All_Wyoming_City_Populations.Rds" ))
|
||||
write_csv(CITY_POP,paste0(CSV_SAVE,"/All_Wyoming_City_Populations.csv" ))
|
||||
|
||||
saveRDS(WY_COUNTY_DATA_SET,paste0(RDS_SAVE,"/All_Wyoming_County_Populations.Rds" ))
|
||||
write_csv(WY_COUNTY_DATA_SET,paste0(CSV_SAVE,"/All_Wyoming_County_Populations.csv" ))
|
||||
|
||||
|
||||
###################Demographics
|
||||
if(!file.exists("./Data/Demo_Single_Year_2020s.xls")){download.file('http://eadiv.state.wy.us/Pop/CO_SYASEX24.xlsx',"./Data/Demo_Single_Year_2020s.xls")}
|
||||
TEMP <- read_xlsx("./Data/Demo_Single_Year_2020s.xls",skip=2)[,-1]
|
||||
TEMP <- TEMP[1:(min(which(is.na(TEMP[,1])))-1),]
|
||||
TEMP <- TEMP[!grepl("Base",TEMP$YEAR,ignore.case=TRUE),] #There are two population values provided. I believe one is the census baseline, and one is a estimate in July. Keep the later estimate, to line up with the same seasonal collection pattern of the rest of the data
|
||||
TEMP$YEAR <- year(as.Date(substr((TEMP$YEAR),1,8),format="%m/%d/%Y"))
|
||||
colnames(TEMP) <- c("County","Year","Age","Number","Num_Male","Num_Female")
|
||||
TEMP$County <- gsub(" County","",TEMP$County,ignore.case=TRUE)
|
||||
DEM_2020 <- TEMP %>% select(-Number)
|
||||
###Demogrpahics all
|
||||
DEM_DATA <- read_delim("Data/County_Demographics_Census/wy.1969_2023.singleages.through89.90plus.txt",delim=" ",col_names=c("ID","VALUES"),col_types=list('c','c'))
|
||||
DEM_DATA$Year <- as.integer(substr(DEM_DATA$ID,1,4))
|
||||
DEM_DATA$fips<- substr(DEM_DATA$ID,7,11)
|
||||
COUNTY_LIST <- read_csv("https://github.com/kjhealy/fips-codes/raw/refs/heads/master/county_fips_master.csv",col_types=list('c','c')) %>% filter(state_abbr=="WY") %>% select(fips,County=county_name) %>% mutate(County=gsub(" ","_",gsub(" County","",County,ignore.case=TRUE)))
|
||||
DEM_DATA <- DEM_DATA %>% left_join(COUNTY_LIST) %>% select(-fips)
|
||||
#16=3
|
||||
DEM_DATA$Sex <- ifelse(substr(DEM_DATA$VALUES,3,3)==1,"Male","Female")
|
||||
DEM_DATA$Age <- parse_number(substr(DEM_DATA$VALUES,4,5))
|
||||
DEM_DATA$Number <- parse_number(substr(DEM_DATA$VALUES,6,14))
|
||||
DEM_DATA <- DEM_DATA %>% select(-ID,-VALUES)
|
||||
|
||||
|
||||
DEM_DATA <- DEM_DATA %>% group_by(Year,County,Sex,Age) %>% summarize(Number=sum(Number)) %>% ungroup()#Aggregate to sex and age level
|
||||
|
||||
#The Wyoming census data seems newer than this data set from SEER cancer data source. Drop any of these records that overlap with the Wyoming data before merging. Arrange so the column order is the same between the two data sets, so they can be easily bound together.
|
||||
DEM_DATA <- pivot_wider(DEM_DATA,names_from=Sex,values_from=Number) %>% rename(Num_Female=Female,Num_Male=Male) %>% select(colnames(DEM_2020)) %>% filter(Year<min(DEM_2020$Year)) %>% unique
|
||||
DEM_DATA <- rbind(DEM_2020,DEM_DATA) %>% ungroup %>% arrange(Year,Age) %>% unique
|
||||
|
||||
###Save demographic data set
|
||||
LIN_DEM <- DEM_DATA %>% filter(County=='Lincoln')
|
||||
write_csv(LIN_DEM,file="./Data/Cleaned_Data/Lincoln_Demographic_Data.csv")
|
||||
write_csv(DEM_DATA,file="./Data/Cleaned_Data/Wyoming_County_Demographic_Data.csv")
|
||||
saveRDS(LIN_DEM,file="./Data/Cleaned_Data/Lincoln_Demographic_Data.Rds")
|
||||
saveRDS(DEM_DATA,file="./Data/Cleaned_Data/Wyoming_County_Demographic_Data.Rds")
|
||||
|
||||
#########################################Mortality Rate
|
||||
GET_MORTALITY_DATA <- function(FILE,SEX,LOWER_AGE,UPPER_AGE){
|
||||
#Create clean mortality rate data
|
||||
#Data gathered from https://hdpulse.nimhd.nih.gov/data-portal/mortality/table?cod=247&cod_options=cod_15&ratetype=aa&ratetype_options=ratetype_2&race=00&race_options=race_6&sex=2&sex_options=sex_3&age=177&age_options=age_11&ruralurban=0&ruralurban_options=ruralurban_3&yeargroup=5&yeargroup_options=year5yearmort_1&statefips=56&statefips_options=area_states&county=56000&county_options=counties_wyoming&comparison=counties_to_us&comparison_options=comparison_counties&radio_comparison=areas&radio_comparison_options=cods_or_areas
|
||||
|
||||
NAMES <- c("County","FIPS","Death_Rate","Lower_Rate","Upper_Rate","Deaths","Trend_Category","Trend","Lower_Trend","Upper_Trend")
|
||||
DF <- read_csv(FILE,skip=5,col_names=NAMES,col_types=list('c',"i",'d','d','d','d','c','d','d','d')) %>% filter(grepl("County|Wyoming",County)|County=="United States") %>% mutate(Rate_SD=(Upper_Rate-Lower_Rate)/(2*1.96),Trend_SD=(Upper_Trend-Lower_Trend)/(2*1.96)) %>% select(County,Death_Rate,Rate_SD,Trend,Trend_SD)
|
||||
DF$County <- gsub(" County","",DF$County,ignore.case=TRUE)
|
||||
DF[,-1] <- DF[,-1]/100000
|
||||
WYOMING_TREND <- pull(DF[DF$County=="Wyoming",],"Trend")
|
||||
US_TREND <- pull(DF[DF$County=="United States",],"Trend")
|
||||
|
||||
WYOMING_RATE <- pull(DF[DF$County=="Wyoming",],"Death_Rate")
|
||||
US_RATE <- pull(DF[DF$County=="United States",],"Death_Rate")
|
||||
|
||||
DF$Imparted_Trend <- FALSE
|
||||
if(is.na(WYOMING_TREND)){
|
||||
DF[1,4:5] <- DF[2,4:5]
|
||||
DF[1,6] <- TRUE
|
||||
}
|
||||
|
||||
DF$Imparted_Rate <- FALSE
|
||||
if(is.na(WYOMING_RATE)){
|
||||
DF[1,4:5] <- DF[2,2:3]
|
||||
DF[1,6] <- TRUE
|
||||
}
|
||||
WYOMING_BASELINE_TREND <-cbind (DF[1,4:5] ,TRUE)
|
||||
WYOMING_BASELINE_RATE <-DF[1,2:3]
|
||||
|
||||
for(i in 3:nrow(DF)){
|
||||
#Impart any missing trends based on higher levels
|
||||
if(is.na(pull(DF[i,],"Trend"))){ DF[i,4:6] <- WYOMING_BASELINE_TREND}
|
||||
#Impart any missing death rates based on higher levels
|
||||
if(is.na(pull(DF[i,],"Death_Rate"))){
|
||||
DF[i,2:3] <- WYOMING_BASELINE_RATE
|
||||
DF[i,"Imparted_Rate"] <- TRUE
|
||||
}
|
||||
}
|
||||
DF$Sex <- SEX
|
||||
DF$Min_Age <- LOWER_AGE
|
||||
DF$Max_Age <- UPPER_AGE
|
||||
DF <- DF %>% select(County,Sex,Min_Age,Max_Age,Death_Rate,Rate_SD,Imparted_Rate,everything())
|
||||
return(DF)
|
||||
}
|
||||
MORTALITY_DATA_ALL <- rbind(
|
||||
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/A_Under1.csv","Female",0,0),
|
||||
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/B_1_9.csv","Female",1,9),
|
||||
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/C_10_19.csv","Female",10,19),
|
||||
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/D_20_39.csv","Female",20,39),
|
||||
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/E_40_64.csv","Female",40,64),
|
||||
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/G_65_74.csv","Female",65,74),
|
||||
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/H_75_84.csv","Female",75,84),
|
||||
GET_MORTALITY_DATA("Data/Mortality_Rates/Female/I_85+.csv","Female",85,Inf),
|
||||
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/A_Under1.csv","Male",0,0),
|
||||
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/B_1_9.csv","Male",1,9),
|
||||
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/C_10_19.csv","Male",10,19),
|
||||
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/D_20_39.csv","Male",20,39),
|
||||
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/E_40_64.csv","Male",40,64),
|
||||
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/G_65_74.csv","Male",65,74),
|
||||
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/H_75_84.csv","Male",75,84),
|
||||
GET_MORTALITY_DATA("Data/Mortality_Rates/Male/I_85+.csv","Male",85,Inf)
|
||||
|
||||
)
|
||||
LIN_MORTALITY <- MORTALITY_DATA_ALL %>% filter(County=="Lincoln")
|
||||
###Save Mortality Rate data set
|
||||
write_csv(LIN_MORTALITY,file="./Data/Cleaned_Data/Lincoln_Mortality_Rate.csv")
|
||||
write_csv(MORTALITY_DATA_ALL,file="./Data/Cleaned_Data/Not_Used/Wyoming_County_Mortality_Rate.csv")
|
||||
saveRDS(LIN_MORTALITY,file="./Data/Cleaned_Data/Lincoln_Mortality_Rate.Rds")
|
||||
saveRDS(MORTALITY_DATA_ALL,file="./Data/Cleaned_Data/Not_Used/Wyoming_County_Mortality_Rate.Rds")
|
||||
|
||||
#Clean all data from memory. Will be loaded in any paticular script as needed
|
||||
rm(list = ls())
|
||||
gc()
|
||||
|
||||
44
Scripts/2_Download_and_Process_Demographic_Data.r
Normal file
44
Scripts/2_Download_and_Process_Demographic_Data.r
Normal file
@ -0,0 +1,44 @@
|
||||
library(tidyverse)
|
||||
library(readxl)
|
||||
|
||||
###################Demographics
|
||||
if(!file.exists("./Data/Demo_Single_Year_2020s.xls")){download.file('http://eadiv.state.wy.us/Pop/CO_SYASEX24.xlsx',"./Data/Demo_Single_Year_2020s.xls")}
|
||||
TEMP <- read_xlsx("./Data/Demo_Single_Year_2020s.xls",skip=2)[,-1]
|
||||
TEMP <- TEMP[1:(min(which(is.na(TEMP[,1])))-1),]
|
||||
TEMP <- TEMP[!grepl("Base",TEMP$YEAR,ignore.case=TRUE),] #There are two population values provided. I believe one is the census baseline, and one is a estimate in July. Keep the later estimate, to line up with the same seasonal collection pattern of the rest of the data
|
||||
TEMP$YEAR <- year(as.Date(substr((TEMP$YEAR),1,8),format="%m/%d/%Y"))
|
||||
colnames(TEMP) <- c("County","Year","Age","Number","Num_Male","Num_Female")
|
||||
TEMP$County <- gsub(" County","",TEMP$County,ignore.case=TRUE)
|
||||
DEM_2020 <- TEMP %>% select(-Number)
|
||||
###Demographics all
|
||||
DEM_DATA <- read_delim("Data/County_Demographics_Census/wy.1969_2023.singleages.through89.90plus.txt",delim=" ",col_names=c("ID","VALUES"),col_types=list('c','c'))
|
||||
DEM_DATA$Year <- as.integer(substr(DEM_DATA$ID,1,4))
|
||||
DEM_DATA$fips<- substr(DEM_DATA$ID,7,11)
|
||||
COUNTY_LIST <- read_csv("https://github.com/kjhealy/fips-codes/raw/refs/heads/master/county_fips_master.csv",col_types=list('c','c')) %>% filter(state_abbr=="WY") %>% select(fips,County=county_name) %>% mutate(County=gsub(" ","_",gsub(" County","",County,ignore.case=TRUE)))
|
||||
DEM_DATA <- DEM_DATA %>% left_join(COUNTY_LIST) %>% select(-fips)
|
||||
#16=3
|
||||
DEM_DATA$Sex <- ifelse(substr(DEM_DATA$VALUES,3,3)==1,"Male","Female")
|
||||
DEM_DATA$Age <- parse_number(substr(DEM_DATA$VALUES,4,5))
|
||||
DEM_DATA$Number <- parse_number(substr(DEM_DATA$VALUES,6,14))
|
||||
DEM_DATA <- DEM_DATA %>% select(-ID,-VALUES)
|
||||
DEM_DATA <- DEM_DATA %>% group_by(Year,County,Sex,Age) %>% summarize(Number=sum(Number)) %>% ungroup()#Aggregate to sex and age level
|
||||
|
||||
#The Wyoming census data seems newer than this data set from SEER cancer data source. Drop any of these records that overlap with the Wyoming data before merging. Arrange so the column order is the same between the two data sets, so they can be easily bound together.
|
||||
DEM_DATA <- pivot_wider(DEM_DATA,names_from=Sex,values_from=Number) %>% rename(Num_Female=Female,Num_Male=Male) %>% select(colnames(DEM_2020)) %>% filter(Year<min(DEM_2020$Year)) %>% unique
|
||||
DEM_DATA <- rbind(DEM_2020,DEM_DATA) %>% ungroup %>% arrange(Year,Age) %>% unique %>% mutate(Region=County) %>% select(County,Region,Year,Age,Num_Male,Num_Female) #Add a region which in this case is just the county. For Lincoln cities and other areas are analysed
|
||||
###Save demographic data set
|
||||
if(!exists("SAVE_DEMO_LOC")){SAVE_DEMO_LOC <-"./Data/Cleaned_Data/Demographic_Sex_Age_Data"}
|
||||
CSV_SAVE <- paste0(SAVE_DEMO_LOC,"/CSV")
|
||||
RDS_SAVE <- paste0(SAVE_DEMO_LOC,"/RDS")
|
||||
|
||||
#Save files for all county demographics
|
||||
dir.create(CSV_SAVE , recursive = TRUE, showWarnings = FALSE)
|
||||
dir.create(RDS_SAVE , recursive = TRUE, showWarnings = FALSE)
|
||||
saveRDS(DEM_DATA,paste0(RDS_SAVE,"/All_Wyoming_Counties_Demographics.Rds" ))
|
||||
write_csv(DEM_DATA,paste0(CSV_SAVE,"/All_Wyoming_Counties_Demographics.csv" ))
|
||||
|
||||
#Save a file that is just Lincoln County, this just speeds up code so you do not always need to filter for 'Lincoln'
|
||||
LIN_DEM <- DEM_DATA %>% filter(County=='Lincoln')
|
||||
saveRDS(LIN_DEM,paste0(RDS_SAVE,"/Full_Lincoln_County_Demographics.Rds" ))
|
||||
write_csv(LIN_DEM,paste0(CSV_SAVE,"/Full_Lincoln_County_Demographics.csv" ))
|
||||
|
||||
99
Scripts/3_Process_Existing_NIH_Mortality_Data.r
Normal file
99
Scripts/3_Process_Existing_NIH_Mortality_Data.r
Normal file
@ -0,0 +1,99 @@
|
||||
#########################################Mortality Rate
|
||||
library(tidyverse)
|
||||
###Create Location to Save raw data sets
|
||||
if(!exists("SAVE_LOC_RAW")){SAVE_LOC_RAW <-"./Data/Raw_Data/"}
|
||||
dir.create(SAVE_LOC_RAW, recursive = TRUE, showWarnings = FALSE)
|
||||
GET_MORTALITY_DATA <- function(FILE,SEX,LOWER_AGE,UPPER_AGE){
|
||||
#Create clean mortality rate data
|
||||
#Data gathered from https://hdpulse.nimhd.nih.gov/data-portal/mortality/table?cod=247&cod_options=cod_15&ratetype=aa&ratetype_options=ratetype_2&race=00&race_options=race_6&sex=2&sex_options=sex_3&age=177&age_options=age_11&ruralurban=0&ruralurban_options=ruralurban_3&yeargroup=5&yeargroup_options=year5yearmort_1&statefips=56&statefips_options=area_states&county=56000&county_options=counties_wyoming&comparison=counties_to_us&comparison_options=comparison_counties&radio_comparison=areas&radio_comparison_options=cods_or_areas
|
||||
|
||||
NAMES <- c("County","FIPS","Death_Rate","Lower_Rate","Upper_Rate","Deaths","Trend_Category","Trend","Lower_Trend","Upper_Trend")
|
||||
DF <- read_csv(FILE,skip=5,col_names=NAMES,col_types=list('c',"i",'d','d','d','d','c','d','d','d')) %>% filter(grepl("County|Wyoming",County)|County=="United States") %>% mutate(Rate_SD=(Upper_Rate-Lower_Rate)/(2*1.96),Trend_SD=(Upper_Trend-Lower_Trend)/(2*1.96)) %>% select(County,Death_Rate,Rate_SD,Trend,Trend_SD)
|
||||
DF$County <- gsub(" County","",DF$County,ignore.case=TRUE)
|
||||
DF[,-1] <- DF[,-1]/100000
|
||||
WYOMING_TREND <- pull(DF[DF$County=="Wyoming",],"Trend")
|
||||
US_TREND <- pull(DF[DF$County=="United States",],"Trend")
|
||||
|
||||
WYOMING_RATE <- pull(DF[DF$County=="Wyoming",],"Death_Rate")
|
||||
US_RATE <- pull(DF[DF$County=="United States",],"Death_Rate")
|
||||
|
||||
DF$Imparted_Trend <- FALSE
|
||||
if(is.na(WYOMING_TREND)){
|
||||
DF[1,4:5] <- DF[2,4:5]
|
||||
DF[1,6] <- TRUE
|
||||
}
|
||||
|
||||
DF$Imparted_Rate <- FALSE
|
||||
if(is.na(WYOMING_RATE)){
|
||||
DF[1,4:5] <- DF[2,2:3]
|
||||
DF[1,6] <- TRUE
|
||||
}
|
||||
WYOMING_BASELINE_TREND <-cbind (DF[1,4:5] ,TRUE)
|
||||
WYOMING_BASELINE_RATE <-DF[1,2:3]
|
||||
|
||||
for(i in 3:nrow(DF)){
|
||||
#Impart any missing trends based on higher levels
|
||||
if(is.na(pull(DF[i,],"Trend"))){ DF[i,4:6] <- WYOMING_BASELINE_TREND}
|
||||
#Impart any missing death rates based on higher levels
|
||||
if(is.na(pull(DF[i,],"Death_Rate"))){
|
||||
DF[i,2:3] <- WYOMING_BASELINE_RATE
|
||||
DF[i,"Imparted_Rate"] <- TRUE
|
||||
}
|
||||
}
|
||||
DF$Sex <- SEX
|
||||
DF$Min_Age <- LOWER_AGE
|
||||
DF$Max_Age <- UPPER_AGE
|
||||
DF <- DF %>% select(County,Sex,Min_Age,Max_Age,Death_Rate,Rate_SD,Imparted_Rate,everything())
|
||||
return(DF)
|
||||
}
|
||||
SAVE_LOC_RAW_MORT <- paste0(SAVE_LOC_RAW,"Mortality_Rates/")
|
||||
RAW_MORTALITY_LOC_FEMALE <- paste0(SAVE_LOC_RAW_MORT,"Female/")
|
||||
RAW_MORTALITY_LOC_MALE <- paste0(SAVE_LOC_RAW_MORT,"Male/")
|
||||
#Gather all mortality Records
|
||||
MORTALITY_DATA_ALL <- rbind(
|
||||
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"A_Under1.csv"),"Female",0,0),
|
||||
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"B_1_9.csv"),"Female",1,9),
|
||||
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"C_10_19.csv"),"Female",10,19),
|
||||
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"D_20_39.csv"),"Female",20,39),
|
||||
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"E_40_64.csv"),"Female",40,64),
|
||||
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"G_65_74.csv"),"Female",65,74),
|
||||
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"H_75_84.csv"),"Female",75,84),
|
||||
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_FEMALE,"I_85+.csv"),"Female",85,Inf),
|
||||
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"A_Under1.csv"),"Male",0,0),
|
||||
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"B_1_9.csv"),"Male",1,9),
|
||||
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"C_10_19.csv"),"Male",10,19),
|
||||
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"D_20_39.csv"),"Male",20,39),
|
||||
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"E_40_64.csv"),"Male",40,64),
|
||||
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"G_65_74.csv"),"Male",65,74),
|
||||
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"H_75_84.csv"),"Male",75,84),
|
||||
GET_MORTALITY_DATA(paste0(RAW_MORTALITY_LOC_MALE,"I_85+.csv"),"Male",85,Inf)
|
||||
)
|
||||
LIN_MORTALITY <- MORTALITY_DATA_ALL %>% filter(County=="Lincoln")
|
||||
##Save the mortality data
|
||||
if(!exists("SAVE_MORT_LOC")){SAVE_MORT_LOC <-"./Data/Cleaned_Data/Mortality_Rate_Data"}
|
||||
CSV_SAVE <- paste0(SAVE_MORT_LOC ,"/CSV")
|
||||
RDS_SAVE <- paste0(SAVE_MORT_LOC,"/RDS")
|
||||
#Save files for all county demographics
|
||||
dir.create(CSV_SAVE , recursive = TRUE, showWarnings = FALSE)
|
||||
dir.create(RDS_SAVE , recursive = TRUE, showWarnings = FALSE)
|
||||
saveRDS(MORTALITY_DATA_ALL,paste0(RDS_SAVE,"/All_Wyoming_Counties_Mortality_Rates.Rds" ))
|
||||
write_csv(MORTALITY_DATA_ALL,paste0(CSV_SAVE,"/All_Wyoming_Counties_Mortality_Rates.csv" ))
|
||||
saveRDS(LIN_MORTALITY,paste0(RDS_SAVE,"/Lincoln_County_Mortality_Rates.Rds" ))
|
||||
write_csv(LIN_MORTALITY,paste0(CSV_SAVE,"/Lincoln_County_Mortality_Rates.csv" ))
|
||||
#Create a short readme files to make the data sources more clear
|
||||
#Save a raw data readme
|
||||
sink(file=paste0(SAVE_LOC_RAW,"/README_MORTALITY_DATA.txt"),append=FALSE)
|
||||
cat("Data files gathered manually from:\n")
|
||||
cat("https://hdpulse.nimhd.nih.gov/data-portal/mortality/table?cod=247&cod_options=cod_15&ratetype=aa&ratetype_options=ratetype_2&race=00&race_options=race_6&sex=2&sex_options=sex_3&age=177&age_options=age_11&ruralurban=0&ruralurban_options=ruralurban_3&yeargroup=5&yeargroup_options=year5yearmort_1&statefips=56&statefips_options=area_states&county=56000&county_options=counties_wyoming&comparison=counties_to_us&comparison_options=comparison_counties&radio_comparison=areas&radio_comparison_options=cods_or_areas\n")
|
||||
cat("\nEach file is single age group, so age weighting does not apply despite the variable names\n")
|
||||
sink()
|
||||
|
||||
|
||||
|
||||
#Save a processed raw data readme
|
||||
sink(file=paste0(SAVE_MORT_LOC,"/README_MORTALITY_DATA.txt"),append=FALSE)
|
||||
cat("This is a processed file of NIH death rates by age and county. Data files first gathered manually from:\n")
|
||||
cat("\nhttps://hdpulse.nimhd.nih.gov/data-portal/mortality/table?cod=247&cod_options=cod_15&ratetype=aa&ratetype_options=ratetype_2&race=00&race_options=race_6&sex=2&sex_options=sex_3&age=177&age_options=age_11&ruralurban=0&ruralurban_options=ruralurban_3&yeargroup=5&yeargroup_options=year5yearmort_1&statefips=56&statefips_options=area_states&county=56000&county_options=counties_wyoming&comparison=counties_to_us&comparison_options=comparison_counties&radio_comparison=areas&radio_comparison_options=cods_or_areas\n")
|
||||
cat("\nThese manually saved files are in the raw data directory. Each file is single age group, so age weighting does not apply despite the variable names\n")
|
||||
sink()
|
||||
|
||||
44
Scripts/Get_Sim_Intial_Demographic_Data.r
Normal file
44
Scripts/Get_Sim_Intial_Demographic_Data.r
Normal file
@ -0,0 +1,44 @@
|
||||
##Extract Lincoln county ACS data including separate indicators for if a census tract is in Kemmerer, or any other part of the county. Population by Age-Sex in each year for each tract.
|
||||
GET_ACS_LIN_DATA <- function(ACS_YEAR,ACS_CODES=CODES,CENSUS_TRACTS=PROJ_TRACTS){
|
||||
AGE_DATA <- get_acs(geography="tract",year=ACS_YEAR,variables=ACS_CODES$variable,state='WY',county='lincoln') %>% mutate(se=moe/1.64) %>% left_join(ACS_CODES %>% mutate(variable=gsub('E','',variable))) %>% select(-NAME,-moe) %>% left_join(CENSUS_TRACTS) %>% mutate(IN_KEM=ifelse(is.na(IN_KEM),0,1)) %>% rename(Population=estimate) %>% select(-variable,-GEOID) %>% select(Sex,Min_Age,Max_Age,Med_Age,IN_KEM,Population,se) %>% filter(!(Min_Age==0& Max_Age==Inf))
|
||||
AGE_DATA <- AGE_DATA %>% group_by(Sex,Min_Age,Max_Age,Med_Age,IN_KEM)%>% summarize(Population=sum(Population)) %>% ungroup
|
||||
#Add Descriptive age category to a clean graph
|
||||
AGE_DATA$Ages <- paste(AGE_DATA$Min_Age,"to",AGE_DATA$Max_Age)
|
||||
AGE_DATA[AGE_DATA$Max_Age==Inf,"Ages"] <- "85+"
|
||||
AGE_DATA[AGE_DATA$Med_Age==18,"Ages"] <- "18"
|
||||
AGE_DATA[AGE_DATA$Med_Age==20,"Ages"] <- "20"
|
||||
AGE_DATA[AGE_DATA$Med_Age==21,"Ages"] <- "21"
|
||||
AGE_DATA[AGE_DATA$Min_Age==0,"Ages"] <- "Under 5"
|
||||
#Turn the ages into factors to keep the correct order in graphs
|
||||
#Add the percent of total relative population in the region
|
||||
ORD <- AGE_DATA %>% select(Min_Age,Ages) %>% unique %>% arrange(Min_Age) %>% pull(Ages) %>% unique
|
||||
AGE_DATA$Ages <- factor(AGE_DATA$Ages,levels=ORD)
|
||||
AGE_DATA$Year <- ACS_YEAR
|
||||
AGE_DATA <-AGE_DATA %>% select(Year,Sex,Ages,IN_KEM,everything())
|
||||
return(AGE_DATA)
|
||||
}
|
||||
####################Loop to create data for each year. Projecting distribution of ages into Kemmerer, and returning a demographic distribution for Kemmerer/Diamondville in each year of the ACS (currently 2009 to 2023)
|
||||
MAKE_KEM_DEMO_DATA_YEAR <- function(ACS_YEAR){
|
||||
LIN_DEMOGRAPHICS <- readRDS("Data/Cleaned_Data/Lincoln_Demographic_Data.Rds") %>% filter(Year==ACS_YEAR)
|
||||
AGE_DATA <- GET_ACS_LIN_DATA(ACS_YEAR)
|
||||
for(i in 1:nrow(AGE_DATA)){
|
||||
if(i==1 & exists("RES")){rm(RES)}
|
||||
C_DEMO <- AGE_DATA[i,]
|
||||
C_SEX <- C_DEMO$Sex
|
||||
if(C_DEMO$Min_Age==C_DEMO$Max_Age){C_AGE_YEARLY_DATA <- LIN_DEMOGRAPHICS %>% filter(Age==C_DEMO$Min_Age)}else{
|
||||
C_AGE_YEARLY_DATA <- LIN_DEMOGRAPHICS %>% filter(Age>=C_DEMO$Min_Age,Age<=C_DEMO$Max_Age)
|
||||
}
|
||||
#Extract only the current sex being applied
|
||||
if(C_SEX=='Female'){C_AGE_YEARLY_DATA <- C_AGE_YEARLY_DATA %>% select(Age,Num_Female) %>% rename(POP=Num_Female)}else{C_AGE_YEARLY_DATA <- C_AGE_YEARLY_DATA %>% select(Age,Num_Male) %>% rename(POP=Num_Male)}
|
||||
C_AGE_YEARLY_DATA$Sex <- C_SEX
|
||||
C_AGE_YEARLY_DATA$PER <- C_AGE_YEARLY_DATA$POP/sum(C_AGE_YEARLY_DATA$POP)
|
||||
C_RES <- C_AGE_YEARLY_DATA%>% left_join(C_DEMO) %>% mutate(Population=PER*Population) %>% select(Age,Sex,IN_KEM,Population)
|
||||
if(i==1){RES <- C_RES}else{RES <- rbind(RES,C_RES)}
|
||||
}
|
||||
RES$Year <- ACS_YEAR
|
||||
RES$County <- "Lincoln"
|
||||
RES <- RES %>% select(County,IN_KEM,Year,Age,Sex,Population)
|
||||
return(RES)
|
||||
}
|
||||
|
||||
|
||||
@ -1,9 +1,11 @@
|
||||
#library(tidyverse);setwd("../")
|
||||
library(tidycensus)
|
||||
library(zipcodeR)
|
||||
source("Scripts/Get_Sim_Intial_Demographic_Data.r")
|
||||
if(!exists("SAVE_DEMO_LOC")){SAVE_DEMO_LOC <-"./Data/Cleaned_Data/Demographic_Sex_Age_Data"}
|
||||
ACS_END_YEAR <- 2023 #most recent in package as of Nov 4 2025
|
||||
#Pull the relevant median age variables the value moe (margine of error) can be converted to standard error, following the link below
|
||||
#https://www.census.gov/content/dam/Census/library/publications/2018/acs/acs_general_handbook_2018_ch08.pdf
|
||||
ACS_YEAR <- 2023 #most recent as of Nov 4 2025
|
||||
source("Scripts/Downshift_Population_Functions.r")
|
||||
#Packages to instal on computer if zipcodeR won't install
|
||||
#Sudo apt install libssl-dev libudunits2-dev libabsl-dev libcurl4-openssl-dev libgdal-dev cmake libfontconfig1-dev libharfbuzz-dev libfribidi-dev
|
||||
#install.packages("zipcodeR")
|
||||
@ -13,28 +15,32 @@ source("Scripts/Downshift_Population_Functions.r")
|
||||
#census_api_key(KEY, install = TRUE)
|
||||
PROJ_TRACTS <- get_tracts(search_city('Kemmerer','WY')$zipcode) %>% full_join(get_tracts(search_city('Diamondville','WY')$zipcode))
|
||||
PROJ_TRACTS <- PROJ_TRACTS %>% select(GEOID) %>% mutate('IN_KEM'=1) %>% mutate(GEOID=as.character(GEOID))
|
||||
|
||||
MED_AGE_VAR <- cbind(c('B01002_001E','B01002_002E','B01002_003E'),c('Median_Age','Median_Age_Male','Median_Age_Female')) %>% as_tibble %>% rename(variable=V1,Data_Type=V2)
|
||||
|
||||
###Load data manually created which links vairable names to sex-age census data
|
||||
CODES <- read_csv("Data/API_CENSUS_CODES.csv",skip=1) %>% mutate(Med_Age=(Min_Age+Max_Age)/2) %>% rename(variable=Code)
|
||||
#Testing age Comparison between the two
|
||||
###Extract census data for all tracts in Lincoln county, clean up the data, and indicate if the tract is in Kemmerer/Diamondvile or not.
|
||||
AGE_DATA <- get_acs(geography="tract",year=ACS_YEAR,variables=CODES$variable,state='WY',county='lincoln') %>% mutate(se=moe/1.64) %>% left_join(CODES %>% mutate(variable=gsub('E','',variable))) %>% select(-NAME,-moe) %>% left_join(PROJ_TRACTS) %>% mutate(IN_KEM=ifelse(is.na(IN_KEM),0,1)) %>% rename(Population=estimate) %>% select(-variable,-GEOID) %>% select(Sex,Min_Age,Max_Age,Med_Age,IN_KEM,Population,se) %>% filter(!(Min_Age==0& Max_Age==Inf))
|
||||
AGE_DATA <- AGE_DATA %>% group_by(Sex,Min_Age,Max_Age,Med_Age,IN_KEM)%>% summarize(Population=sum(Population)) %>% ungroup
|
||||
#Add Descriptive age category to a clean graph
|
||||
AGE_DATA$Ages <- paste(AGE_DATA$Min_Age,"to",AGE_DATA$Max_Age)
|
||||
AGE_DATA[AGE_DATA$Max_Age==Inf,"Ages"] <- "85+"
|
||||
AGE_DATA[AGE_DATA$Med_Age==18,"Ages"] <- "18"
|
||||
AGE_DATA[AGE_DATA$Med_Age==20,"Ages"] <- "20"
|
||||
AGE_DATA[AGE_DATA$Med_Age==21,"Ages"] <- "21"
|
||||
AGE_DATA[AGE_DATA$Min_Age==0,"Ages"] <- "Under 5"
|
||||
#Turn the ages into factors to keep the correct order in graphs
|
||||
#Add the percent of total relative population in the region
|
||||
ORD <- AGE_DATA %>% select(Min_Age,Ages) %>% unique %>% arrange(Min_Age) %>% pull(Ages) %>% unique
|
||||
AGE_DATA$Ages <- factor(AGE_DATA$Ages,levels=ORD)
|
||||
AGE_DATA$Year <- ACS_YEAR
|
||||
AGE_DATA <-AGE_DATA %>% select(Year,Sex,Ages,IN_KEM,everything())
|
||||
if(!exists('ACS_FILE_LOC')){ACS_FILE_LOC <- "Data/Cleaned_Data/ACS_Census_Demographic_Data.Rds"}
|
||||
saveRDS(AGE_DATA,ACS_FILE_LOC)
|
||||
DEMO_DATA_ALL <- do.call(rbind,lapply(2009:ACS_END_YEAR,MAKE_KEM_DEMO_DATA_YEAR))
|
||||
ORIG_DEMO_DATA_ALL <- DEMO_DATA_ALL
|
||||
#Merge Age of 85+ into one group, because NIH death spastics end in this age category so it is extra overhead to keep each year
|
||||
YOUNG <- DEMO_DATA_ALL %>% filter(Age<85)
|
||||
OLD <- DEMO_DATA_ALL %>% filter(Age>=85) %>% group_by(County,IN_KEM,Year,Sex) %>% summarize(Age=85,Population=sum(Population)) %>% ungroup
|
||||
DEMO_DATA_ALL <- rbind(YOUNG,OLD) %>% arrange(County,Year,IN_KEM,Sex)
|
||||
DEMO_DATA_ALL <- DEMO_DATA_ALL %>% mutate(Population=round(Population)) %>% pivot_wider(values_from=Population,names_from=Sex,names_prefix="Num_")
|
||||
OTHER_LIN_DEMO_DATA <- DEMO_DATA_ALL %>% filter(IN_KEM==0) %>% rename(Region=IN_KEM) %>% mutate(Region='Lincoln_Other')
|
||||
KEM_DEMO_DATA <- DEMO_DATA_ALL %>% filter(IN_KEM==1) %>% rename(Region=IN_KEM) %>% mutate(Region='Kemmerer')
|
||||
|
||||
|
||||
|
||||
####Save results
|
||||
CSV_SAVE <- paste0(SAVE_DEMO_LOC,"/CSV")
|
||||
RDS_SAVE <- paste0(SAVE_DEMO_LOC,"/RDS")
|
||||
dir.create(CSV_SAVE , recursive = TRUE, showWarnings = FALSE)
|
||||
dir.create(RDS_SAVE , recursive = TRUE, showWarnings = FALSE)
|
||||
#Kemmerer Save
|
||||
saveRDS(KEM_DEMO_DATA,paste0(RDS_SAVE,"/Kemmerer_Diamondville_Demographics.Rds" ))
|
||||
write_csv(KEM_DEMO_DATA,paste0(CSV_SAVE,"/Kemmerer_Diamondville_Demographics.csv" ))
|
||||
#Other Lincoln area (not Kemmerer) save
|
||||
saveRDS(OTHER_LIN_DEMO_DATA,paste0(RDS_SAVE,"/Other_Lincoln_Demographics.Rds" ))
|
||||
write_csv(OTHER_LIN_DEMO_DATA,paste0(CSV_SAVE,"/Other_Lincoln_Demographics.csv" ))
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user