diff --git a/Prelim_Process.sh b/Prelim_Process.sh index 1d4b9ac..cdf4291 100644 --- a/Prelim_Process.sh +++ b/Prelim_Process.sh @@ -4,9 +4,10 @@ Rscript "./Scripts/1B_Process_Existing_NIH_Mortality_Data.r" Rscript "./Scripts/1C_Download_and_Process_Demographic_Data.r" Rscript "./Scripts/1D_Use_ACS_Census_Data_to_Estimate_Kemmerer_Demographics.r" #Create data sets used in later simulations, produce some results for the report when related to this process. -Rscript "./Scripts/2A_Birth_Rate_Regression.r" -Rscript "./Scripts/2B_Migration_by_Age_Regression.r" -Rscript "./Scripts/2C_Overall_Migration_ARIMA.r" +Rscript "./Scripts/2A_Birth_Rate_Regression_and_Impart_Kemmerer_Births.r" +Rscript "./Scripts/2B_Impart_Deaths_and_Migration_to_Subregions.r" +Rscript "./Scripts/2C_Migration_by_Age_Regression.r" +Rscript "./Scripts/2D_Overall_Migration_ARIMA.r" #Produce final results for either the simulation, or information for the report, but not anything used in later stages of the simulation Rscript "./Scripts/3A_Population_Pyramid.r" diff --git a/Scripts/1D_Use_ACS_Census_Data_to_Estimate_Kemmerer_Demographics.r b/Scripts/1D_Use_ACS_Census_Data_to_Estimate_Kemmerer_Demographics.r index a974a04..889856d 100644 --- a/Scripts/1D_Use_ACS_Census_Data_to_Estimate_Kemmerer_Demographics.r +++ b/Scripts/1D_Use_ACS_Census_Data_to_Estimate_Kemmerer_Demographics.r @@ -22,7 +22,6 @@ PROJ_TRACTS <- PROJ_TRACTS %>% select(-ZCTA5) %>% filter(GEOID!=56023978200) %>% PROJ_TRACTS <- PROJ_TRACTS %>% select(GEOID) %>% mutate('IN_KEM'=1) %>% mutate(GEOID=as.character(GEOID)) ###Load data manually created which links vairable names to sex-age census data CODES <- read_csv("./Data/Raw_Data/ACS_Demographics/API_CENSUS_CODES.csv",skip=1) %>% mutate(Med_Age=(Min_Age+Max_Age)/2) %>% rename(variable=Code) -#Testing age Comparison between the two ###Extract census data for all tracts in Lincoln county, clean up the data, and indicate if the tract is in Kemmerer/Diamondvile or not. DEMO_DATA_ALL <- do.call(rbind,lapply(2009:ACS_END_YEAR,MAKE_KEM_DEMO_DATA_YEAR)) ORIG_DEMO_DATA_ALL <- DEMO_DATA_ALL @@ -36,13 +35,13 @@ KEM_DEMO_DATA <- DEMO_DATA_ALL %>% filter(IN_KEM==1) %>% rename(Region=IN_KEM) #Ajust the populations to match the total population stastics from the other data sources, since the tracts may spill into other areas POST_ADJUST_DATA <- KEM_DEMO_DATA %>% group_by(Year) %>% summarize(Kem_Demo_Population=sum(Num_Female)+sum(Num_Male)) %>% left_join(OTHER_LIN_DEMO_DATA %>% group_by(Year) %>% summarize(Other_Demo_Population=sum(Num_Female)+sum(Num_Male))) %>% mutate(Total_Lincoln_Demo_Population=Kem_Demo_Population+Other_Demo_Population) DIRECT_POP <- readRDS("Data/Cleaned_Data/Population_Data/RDS/All_Wyoming_County_Populations.Rds") %>% filter(County=='Lincoln') %>% select(Year,Lin_Direct_Population=Population) %>% full_join(readRDS("Data/Cleaned_Data/Population_Data/RDS/All_Wyoming_City_Populations.Rds") %>% filter(City %in% c('Kemmerer','Diamondville')) %>% group_by(Year) %>% summarize(Kem_Direct_Population=sum(Population,na.rm=TRUE))) %>% mutate(Other_Direct_Population=Lin_Direct_Population-Kem_Direct_Population) -ADJUST_TABLE <- DIRECT_POP %>% inner_join(POST_ADJUST_DATA) %>% mutate(KEM_ADJ=Kem_Direct_Population/Kem_Demo_Population,OTHER_ADJ=Lin_Direct_Population/Other_Demo_Population,LIN_ADJ=Lin_Direct_Population/Total_Lincoln_Demo_Population) %>% select(Year,LIN_ADJ,KEM_ADJ,OTHER_ADJ) +ADJUST_TABLE <- DIRECT_POP %>% inner_join(POST_ADJUST_DATA) %>% mutate(KEM_ADJ=Kem_Direct_Population/Kem_Demo_Population,OTHER_ADJ=Other_Direct_Population/Other_Demo_Population,LIN_ADJ=Lin_Direct_Population/Total_Lincoln_Demo_Population) %>% select(Year,LIN_ADJ,KEM_ADJ,OTHER_ADJ) +##Checking Total KEM_DEMO_DATA <- KEM_DEMO_DATA %>% left_join(ADJUST_TABLE %>% select(Year,KEM_ADJ)) %>% mutate(Num_Female=round(KEM_ADJ*Num_Female),Num_Male=round(KEM_ADJ*Num_Male)) %>% select(-KEM_ADJ) + OTHER_LIN_DEMO_DATA <- OTHER_LIN_DEMO_DATA%>% left_join(ADJUST_TABLE %>% select(Year,OTHER_ADJ)) %>% mutate(Num_Female=round(OTHER_ADJ*Num_Female),Num_Male=round(OTHER_ADJ*Num_Male)) %>% select(-OTHER_ADJ) - - #Find the most recent data year MAX_YEAR <- max(KEM_DEMO_DATA$Year) KEM_DEMO_MAT <- KEM_DEMO_DATA %>% filter(Year==MAX_YEAR) %>% select(Num_Male,Num_Female) %>% as.matrix diff --git a/Scripts/2A_Birth_Rate_Regression.r b/Scripts/2A_Birth_Rate_Regression_and_Impart_Kemmerer_Births.r similarity index 84% rename from Scripts/2A_Birth_Rate_Regression.r rename to Scripts/2A_Birth_Rate_Regression_and_Impart_Kemmerer_Births.r index 93242dc..1466b3d 100644 --- a/Scripts/2A_Birth_Rate_Regression.r +++ b/Scripts/2A_Birth_Rate_Regression_and_Impart_Kemmerer_Births.r @@ -36,10 +36,8 @@ KEM_BIRTH_DATA <- KEM_BIRTH_DATA %>% left_join(ADD_BIRTH_GROUP_DATA(KEM_RAW_DEMO ####################Create same data set but for only parts of Lincoln not in the Kemmerer Diamondville area OTHER_RAW_DEMOGRAPHIC_DATA <- readRDS(DEMOGRAPHIC_OTHER_LIN_LOC) -OTHER_DEMO_DATA <- OTHER_RAW_DEMOGRAPHIC_DATA %>% mutate(POP=Num_Male+Num_Female,Births=ifelse(Age==0,POP,0)) %>% group_by(Year,County,Region) %>% summarize(Births=sum(Births)) -OTHER_POP_DATA <- readRDS(POPULATION_CITY_LOC)%>% rename(Region=City) %>% filter(!(Region %in% c("Kemmerer","Diamondville")),County=='Lincoln') %>% group_by(Year) %>% mutate(Population=sum(Population,na.rm=TRUE)) %>% ungroup %>% mutate(Region='Lincoln_Other') %>% unique %>% ungroup -OTHER_BIRTH_DATA <- OTHER_POP_DATA %>% left_join(OTHER_DEMO_DATA) -OTHER_BIRTH_DATA <- OTHER_BIRTH_DATA %>% left_join(ADD_BIRTH_GROUP_DATA(OTHER_RAW_DEMOGRAPHIC_DATA)) +OTHER_BIRTH_DATA<- OTHER_RAW_DEMOGRAPHIC_DATA %>% mutate(Population=Num_Male+Num_Female,Births=ifelse(Age==0,Population,0)) %>% group_by(Year,County,Region) %>% summarize(Population=sum(Population),Births=sum(Births)) %>% ungroup %>% arrange(desc(Year)) +OTHER_BIRTH_DATA <- OTHER_BIRTH_DATA %>% left_join(ADD_BIRTH_GROUP_DATA(OTHER_RAW_DEMOGRAPHIC_DATA)) ####################################################3 ###Because the total of the Kemmerer and Other Lincoln should sum to the births in the entire county, we assume that the ratio of children under the age of 1, in each area is the same percentage as the total births in the county. The total births in the county is then taken from the Wyoming eadiv estimates at the county level. This makes the two sum to the proper amount @@ -49,7 +47,10 @@ ADJUSTED_KEM_STUB_DATA <- COUNTY_BIRTH_DATA %>% filter(Region=='Lincoln') %>% s KEM_BIRTH_DATA <- KEM_BIRTH_DATA %>% select(-Births) %>% left_join(ADJUSTED_KEM_STUB_DATA ) OTHER_BIRTH_DATA <- OTHER_BIRTH_DATA %>% select(-Births) %>% left_join(ADJUSTED_KEM_STUB_DATA) REG_DATA <- rbind(COUNTY_BIRTH_DATA,rbind(KEM_BIRTH_DATA,OTHER_BIRTH_DATA) %>% mutate(Deaths=NA,Migration=NA) %>% select(colnames(COUNTY_BIRTH_DATA))) + + REG_DATA <- REG_DATA %>% group_by(Region) %>% arrange(Year) %>% mutate(Lag_Births=lag(Births),Lag_Two_Births=lag(Births,2)) %>% ungroup %>% arrange(County,Region,Year) + REG_REDUCED_DATA <- REG_DATA %>% filter(!is.na(Births),!is.na(Lag_Two_Births),!is.na(Min_Birth_Group),!is.na(Lag_Births),!is.na(Region)) %>% select(Year,Region,Min_Birth_Group,Births,Lag_Births,Lag_Two_Births) @@ -140,4 +141,27 @@ ST_REG_DATA <- REG_REDUCED_DATA %>% filter(Region=='Lincoln') %>% filter(Year==m saveRDS(REG_REDUCED_DATA,paste0(SAVE_DATA_LOC,"/Birth_Regression_Input_Data.Rds")) write_csv(REG_REDUCED_DATA,paste0(SAVE_DATA_LOC,"/Birth_Regression_Input_Data.csv")) +##########Update existing demogrpahic data with new births +KEM_DATA_NEW <- REG_DATA %>% filter(Region=='Kemmerer & Diamondville') %>% select(Year,County,Region,Population,Births,Deaths,Migration) +PRED_KEM_BIRTH_2024 <- REG_DATA %>% filter(Region=='Kemmerer & Diamondville',Year>=2023) %>% mutate(Min_Birth_Group=lag(Min_Birth_Group)) %>% filter(Year==2024) +KEM_DATA_NEW[KEM_DATA_NEW$Year==2024,"Births"] <- round(exp(predict(MOD_BIRTHS,PRED_KEM_BIRTH_2024))) + +OTHER_DATA_NEW <- REG_DATA %>% filter(Region=='Lincoln_Other') %>% select(Year,County,Region,Population,Births,Deaths,Migration) +PRED_OTHER_BIRTH_2024 <- REG_DATA %>% filter(Region=='Lincoln_Other',Year>=2023) %>% mutate(Min_Birth_Group=lag(Min_Birth_Group)) %>% filter(Year==2024) +OTHER_DATA_NEW[OTHER_DATA_NEW$Year==2024,"Births"] <- round(exp(predict(MOD_BIRTHS,PRED_OTHER_BIRTH_2024))) + +LIN_DATA_NEW <- REG_DATA %>% filter(Region=='Lincoln') %>% select(Year,County,Region,Population,Births,Deaths,Migration) +# +if(!exists("POPULATION_SAVE_RDS")){POPULATION_SAVE_RDS <- "./Data/Cleaned_Data/Population_Data/RDS/"} + dir.create(POPULATION_SAVE_RDS, recursive = TRUE, showWarnings = FALSE) + +if(!exists("POPULATION_SAVE_CSV")){POPULATION_SAVE_CSV <- "./Data/Cleaned_Data/Population_Data/CSV/"} + dir.create(POPULATION_SAVE_CSV, recursive = TRUE, showWarnings = FALSE) +saveRDS(LIN_DATA_NEW,paste0(POPULATION_SAVE_RDS,"Full_Lincoln_County_Population_Data.Rds")) +write_csv(LIN_DATA_NEW,paste0(POPULATION_SAVE_CSV,"Full_Lincoln_County_Population_Data.csv")) +saveRDS(KEM_DATA_NEW,paste0(POPULATION_SAVE_RDS,"Kemmerer_Diamondville_Population_Data.Rds")) +write_csv(KEM_DATA_NEW,paste0(POPULATION_SAVE_CSV,"Kemmerer_Diamondville_Population_Data.csv")) +saveRDS(OTHER_DATA_NEW,paste0(POPULATION_SAVE_RDS,"Other_Lincoln_Population_Data.Rds")) +write_csv(OTHER_DATA_NEW,paste0(POPULATION_SAVE_CSV,"Other_Lincoln_Population_Data.csv")) + diff --git a/Scripts/2B_Impart_Deaths_and_Migration_to_Subregions.r b/Scripts/2B_Impart_Deaths_and_Migration_to_Subregions.r new file mode 100644 index 0000000..1904950 --- /dev/null +++ b/Scripts/2B_Impart_Deaths_and_Migration_to_Subregions.r @@ -0,0 +1,50 @@ +###May want to check rounding of demographic data as migration does not line up perfectly (off by about 9 on average) +library(tidyverse) +#setwd("../") +KEM_POP_LOC <- "Data/Cleaned_Data/Population_Data/RDS/Kemmerer_Diamondville_Population_Data.Rds" +OTHER_POP_LOC <- "Data/Cleaned_Data/Population_Data/RDS/Other_Lincoln_Population_Data.Rds" +KEM_POP_LOC_CSV <- "Data/Cleaned_Data/Population_Data/RDS/Kemmerer_Diamondville_Population_Data.csv" +OTHER_POP_LOC_CSV <- "Data/Cleaned_Data/Population_Data/RDS/Other_Lincoln_Population_Data.csv" + +MORT <- readRDS("Data/Cleaned_Data/Mortality_Rate_Data/RDS/Lincoln_County_Mortality_Rates.Rds") +KEM_DEM <-readRDS("Data/Cleaned_Data/Demographic_Sex_Age_Data/RDS/Kemmerer_Diamondville_Demographics.Rds") +OTHER_DEM <- readRDS("Data/Cleaned_Data/Demographic_Sex_Age_Data/RDS/Other_Lincoln_Demographics.Rds") +LIN_DEM <- readRDS("Data/Cleaned_Data/Demographic_Sex_Age_Data/RDS/Full_Lincoln_County_Demographics.Rds") +MORTALITY <- MORT +#Estimate split of deaths in county (share of Kemmerer vs other Lincoln) +PRED_DEATHS <- function(DATA,MORTALITY){ + MORTALITY <- MORTALITY%>% pivot_wider(values_from=Death_Rate,names_from=Sex) %>% group_by(County,Min_Age,Max_Age) %>% summarize(Rate_Male=mean(Male,na.rm=TRUE),Rate_Female=mean(Female,na.rm=TRUE)) %>% ungroup + RES <- DATA[DATA$Age >= MORTALITY$Min_Age[1] & MORTALITY$Max_Age[1]>= DATA$Age,] %>% group_by(County,Region,Year,Age) %>% summarize(Num_Female=sum(Num_Female),Num_Male=sum(Num_Male)) %>% ungroup %>% mutate(Min_Age=MORTALITY$Min_Age[1],Max_Age= MORTALITY$Max_Age[1]) + for(i in 2:nrow(MORTALITY)){ + RES <- rbind(RES,DATA[DATA$Age >= MORTALITY$Min_Age[i] & MORTALITY$Max_Age[i]>= DATA$Age,] %>% group_by(County,Region,Year,Age) %>% summarize(Num_Female=sum(Num_Female),Num_Male=sum(Num_Male)) %>% ungroup %>% mutate(Min_Age=MORTALITY$Min_Age[i],Max_Age= MORTALITY$Max_Age[i])) + } +RES <- RES%>% arrange(Year) %>% left_join(MORTALITY)%>% group_by(County,Region,Year) %>% summarize(Predicted_Deaths=sum(Rate_Male*Num_Male)+sum(Rate_Female*Num_Female) ) %>% ungroup %>% select(County,Region,Year,Predicted_Deaths) + return(RES) +} +##Predict all deaths an merge into a table to find the ratios of predicted deaths +LIN_PRED <- PRED_DEATHS(LIN_DEM,MORT) %>% select(Year,Lin_Pred_Deaths=Predicted_Deaths) +KEM_PRED <- PRED_DEATHS(KEM_DEM,MORT) %>% select(Year,Kem_Pred_Deaths=Predicted_Deaths) +OTHER_PRED <- PRED_DEATHS(OTHER_DEM,MORT) %>% select(Year,Other_Pred_Deaths=Predicted_Deaths) +PRED_DATA<- LIN_PRED %>% left_join(KEM_PRED) %>% left_join(OTHER_PRED) +Death_Adj <- PRED_DATA %>% filter(!is.na(Kem_Pred_Deaths)) %>% mutate(Kem_Death_Ratio=Kem_Pred_Deaths/(Kem_Pred_Deaths+Other_Pred_Deaths),Other_Death_Ratio=1-Kem_Death_Ratio) %>% select(Year,Kem_Death_Ratio,Other_Death_Ratio) +#Add deaths to Lincoln were missing (lacking data) +LIN_POP <- readRDS("Data/Cleaned_Data/Population_Data/RDS/Full_Lincoln_County_Population_Data.Rds") +LIN_POP <- LIN_POP %>% left_join(LIN_PRED) %>% mutate(Deaths=ifelse(is.na(Deaths) & !is.na(Lin_Pred_Deaths),Lin_Pred_Deaths,Deaths)) %>% select(-Lin_Pred_Deaths) %>% mutate(BD=Population+Births-Deaths) %>% mutate(Missing=Population-lag(BD),Missing=lead(Missing),Migration=ifelse(is.na(Migration),Missing,Migration)) %>% mutate(NEXT=Population+Births-Deaths+Migration) %>% arrange(desc(Year)) %>% select(colnames(LIN_POP)) +###Estimate the number of deaths in Kemmerer based on the total deaths, and the predicted share of deaths + #Find migration based on the remaining missing population (after deaths,and births) +KEM_POP <- readRDS(KEM_POP_LOC) +KEM_DEATHS <- LIN_POP %>% select(Year,Deaths) %>% left_join(Death_Adj) %>% left_join(KEM_PRED) %>% mutate(Deaths=round(Deaths*Kem_Death_Ratio)) %>% select(Year,Deaths) +KEM_POP <- KEM_POP%>% select(-Deaths) %>% left_join(KEM_DEATHS) %>% mutate(BD=Population+Births-Deaths) %>% mutate(Missing=Population-lag(BD),Missing=lead(Missing),Migration=ifelse(is.na(Migration),Missing,Migration))%>% mutate(NEXT=Population+Births-Deaths+Migration)%>% arrange(desc(Year)) %>% select(colnames(LIN_POP)) + +###Estimate the number of deaths in the other parts of Lincoln (not Kemmerer) based on the total deaths, and the predicted share of deaths + #Find migration based on the remaining missing population (after deaths,and births) +OTHER_POP <- readRDS(OTHER_POP_LOC) +OTHER_DEATHS <- LIN_POP %>% select(Year,Deaths) %>% left_join(Death_Adj) %>% left_join(OTHER_PRED) %>% mutate(Deaths=round(Deaths*Other_Death_Ratio)) %>% select(Year,Deaths) +OTHER_POP <- OTHER_POP %>% select(-Deaths) %>% left_join(OTHER_DEATHS) %>% mutate(BD=Population+Births-Deaths) %>% mutate(Missing=Population-lag(BD),Missing=lead(Missing),Migration=ifelse(is.na(Migration),Missing,Migration))%>% arrange(desc(Year)) %>% mutate(NEXT=Population+Births-Deaths+Migration)%>% select(colnames(LIN_POP)) %>% arrange(desc(Year)) +#Save outputs +saveRDS(KEM_POP,KEM_POP_LOC) +saveRDS(OTHER_POP,OTHER_POP_LOC) +write_csv(KEM_POP,KEM_POP_LOC_CSV) +write_csv(OTHER_POP,OTHER_POP_LOC_CSV) + + diff --git a/Scripts/2B_Migration_by_Age_Regression.r b/Scripts/2C_Migration_by_Age_Regression.r similarity index 100% rename from Scripts/2B_Migration_by_Age_Regression.r rename to Scripts/2C_Migration_by_Age_Regression.r diff --git a/Scripts/2C_Overall_Migration_ARIMA.r b/Scripts/2C_Overall_Migration_ARIMA.r deleted file mode 100644 index c354442..0000000 --- a/Scripts/2C_Overall_Migration_ARIMA.r +++ /dev/null @@ -1,30 +0,0 @@ -library(forecast) -####Work on overall migration trends -#Could use code cleanup after trying things, but have but I have a working ARIMA to model Lincoln county migration -TS_DATA <- POP_DATA %>% mutate(In_Migration=ifelse(Migration>0,1,0)) %>% group_by(County) %>% arrange(County,Year) %>% mutate(Prev_Pop=lag(Population)) -ST_YEAR <- min(pull(TS_DATA %>% filter(!is.na(Migration)),Year)) -END_YEAR <- max(pull(TS_DATA %>% filter(!is.na(Migration)),Year)) -#GRAPH_DATA <- TS_DATA %>% filter(!is.na(Migration)) -#GRAPH_DATA_LN <- TS_DATA %>% filter(!is.na(Migration),County=="Lincoln") -#ggplot(GRAPH_DATA,aes(x=Year,y=Migration/Prev_Pop,group=County,color=County))+geom_point()+geom_line(data=GRAPH_DATA_LN) -TS_WIDE <- TS_DATA %>% dplyr::select(Year,County,Migration) %>% pivot_wider(values_from=Migration,names_from=County) %>% arrange(Year) %>% filter(Year>ST_YEAR+1,Year<=END_YEAR) %>%ts(start=c(ST_YEAR+1),frequency=1) -LN <- TS_DATA %>% dplyr::select(Year,County,Migration) %>% pivot_wider(values_from=Migration,names_from=County) %>% arrange(Year) %>% dplyr::select(Lincoln,Year) %>% filter(Year>=ST_YEAR,Year<=END_YEAR) %>% dplyr::select(-Year) %>%ts(start=c(ST_YEAR),frequency=1) -#Create an ARIMA of Migration so the number of people migrating can be simulated -#Time series tests -#library(tseries) -#adf.test(LN,k=1) #Stationary with one lag, otherwise not stationary -#kpss.test(LN) #Stationary,default of program and has some model fit improvements -MOD <- auto.arima(LN,stationary=TRUE) -#summary(MOD) -#Validity tests -#autoplot(MOD) -#acf(resid(MOD)) -#pacf(resid(MOD)) -# adf.test(resid(MOD)) -#checkresiduals(MOD) -#Save the resulting model outputs, will need to be changed if looking at other counties -saveRDS(MOD,"Data/Regression_Results/LN_ARIMA_MODEL.Rds") - -MIGRATION_ARIMA_SIMS <- (do.call(cbind,mclapply(1:NUM_SIMULATIONS,function(x){as.numeric(round(simulate(MOD,future=TRUE, nsim=NUM_YEARS_PROJECTED)))},mc.cores =detectCores()-1)))#testing a multiple run simulation could use parallel process -saveRDS(MIGRATION_ARIMA_SIMS,"Data/Simulated_Data_Sets/Migration_ARIMA.Rds") -write.csv(MIGRATION_ARIMA_SIMS,row.names=FALSE,"Data/Simulated_Data_Sets/Migration_ARIMA.csv") diff --git a/Scripts/2D_Overall_Migration_ARIMA.r b/Scripts/2D_Overall_Migration_ARIMA.r new file mode 100644 index 0000000..309bbf9 --- /dev/null +++ b/Scripts/2D_Overall_Migration_ARIMA.r @@ -0,0 +1,72 @@ +library(forecast) +library(tidyverse) +#setwd("../") +####Work on overall migration trends +#Could use code cleanup after trying things, but have but I have a working ARIMA to model Lincoln county migration +POP_DATA <- readRDS("Data/Cleaned_Data/Population_Data/RDS/Full_Lincoln_County_Population_Data.Rds") %>% mutate(Migration=Migration/Population) +POP_DATA_TEST <- readRDS("Data/Cleaned_Data/Population_Data/RDS/Full_Lincoln_County_Population_Data.Rds") %>% mutate(Migration=Migration/Population) + +POP_KEM_DATA <- readRDS("Data/Cleaned_Data/Population_Data/RDS/Kemmerer_Diamondville_Population_Data.Rds") +POP_OTHER_DATA <- readRDS("Data/Cleaned_Data/Population_Data/RDS/Other_Lincoln_Population_Data.Rds") +hist(POP_OTHER_DATA$Migration/POP_OTHER_DATA$Population) +hist(POP_KEM_DATA$Migration/POP_KEM_DATA$Population) + + +TS_DATA <- POP_DATA %>% mutate(In_Migration=ifelse(Migration>0,1,0)) %>% group_by(County) %>% arrange(County,Year) %>% mutate(Prev_Pop=lag(Population)) %>% ungroup +TS_DATA_TEST <- POP_DATA_TEST %>% mutate(In_Migration=ifelse(Migration>0,1,0)) %>% group_by(County) %>% arrange(County,Year) %>% mutate(Prev_Pop=lag(Population)) %>% ungroup + +TS_KEM_DATA <- POP_KEM_DATA %>% mutate(In_Migration=ifelse(Migration>0,1,0)) %>% group_by(County) %>% arrange(County,Year) %>% mutate(Prev_Pop=lag(Population)) %>% ungroup +TS_OTHER_DATA <- POP_OTHER_DATA %>% mutate(In_Migration=ifelse(Migration>0,1,0)) %>% group_by(County) %>% arrange(County,Year) %>% mutate(Prev_Pop=lag(Population)) %>% ungroup + + +ST_YEAR <- min(pull(TS_DATA %>% filter(!is.na(Migration)),Year)) +END_YEAR <- max(pull(TS_DATA %>% filter(!is.na(Migration)),Year)) + +ST_YEAR_KEM <- min(pull(TS_KEM_DATA %>% filter(!is.na(Migration)),Year)) +END_YEAR_KEM <- max(pull(TS_KEM_DATA %>% filter(!is.na(Migration)),Year)) + +ST_YEAR_OTHER <- min(pull(TS_OTHER_DATA %>% filter(!is.na(Migration)),Year)) +END_YEAR_OTHER <- max(pull(TS_OTHER_DATA %>% filter(!is.na(Migration)),Year)) + + +#GRAPH_DATA <- TS_DATA %>% filter(!is.na(Migration)) +#GRAPH_DATA_LN <- TS_DATA %>% filter(!is.na(Migration),County=="Lincoln") +#ggplot(GRAPH_DATA,aes(x=Year,y=Migration/Prev_Pop,group=County,color=County))+geom_point()+geom_line(data=GRAPH_DATA_LN) +TS_WIDE <- TS_DATA %>% dplyr::select(Year,County,Migration) %>% pivot_wider(values_from=Migration,names_from=County) %>% arrange(Year) %>% filter(Year>ST_YEAR+1,Year<=END_YEAR) %>%ts(start=c(ST_YEAR+1),frequency=1) + +TS_KEM_WIDE <- TS_KEM_DATA %>% dplyr::select(Year,County,Migration) %>% pivot_wider(values_from=Migration,names_from=County) %>% arrange(Year) %>% filter(Year>ST_YEAR+1,Year<=END_YEAR) %>%ts(start=c(ST_YEAR+1),frequency=1) +TS_OTHER_WIDE <- TS_OTHER_DATA %>% dplyr::select(Year,County,Migration) %>% pivot_wider(values_from=Migration,names_from=County) %>% arrange(Year) %>% filter(Year>ST_YEAR+1,Year<=END_YEAR) %>%ts(start=c(ST_YEAR+1),frequency=1) + + +LN <- TS_DATA %>% dplyr::select(Year,County,Migration) %>% pivot_wider(values_from=Migration,names_from=County) %>% arrange(Year) %>% dplyr::select(Lincoln,Year) %>% filter(Year>=ST_YEAR,Year<=END_YEAR) %>% dplyr::select(-Year) %>%ts(start=c(ST_YEAR),frequency=1) + +KEM <- TS_KEM_DATA %>% dplyr::select(Year,Region,Migration) %>% pivot_wider(values_from=Migration,names_from=Region) %>% arrange(Year) %>% dplyr::select('Kemmerer & Diamondville',Year) %>% filter(Year>=ST_YEAR_KEM,Year<=END_YEAR_KEM) %>% dplyr::select(-Year) %>%ts(start=c(ST_YEAR_KEM),frequency=1) +TS_OTHER_DATA +OTHER <- TS_OTHER_DATA %>% dplyr::select(Year,Region,Migration) %>% pivot_wider(values_from=Migration,names_from=Region) %>% arrange(Year) %>% dplyr::select('Lincoln Other'=Lincoln_Other,Year) %>% filter(Year>=ST_YEAR_OTHER,Year<=END_YEAR_OTHER) %>% dplyr::select(-Year) %>%ts(start=c(ST_YEAR_OTHER),frequency=1) + +#Create an ARIMA of Migration so the number of people migrating can be simulated +#Time series tests +#library(tseries) +#adf.test(LN,k=1) #Stationary with one lag, otherwise not stationary +#kpss.test(LN) #Stationary,default of program and has some model fit improvements +MOD <- auto.arima(LN,stationary=TRUE) +MOD_KEM <- auto.arima(KEM) +MOD_OTHER <- auto.arima(OTHER) +plot(forecast(MOD )) +plot(forecast(MOD_KEM )) +plot(forecast(MOD_OTHER )) +plot(forecast(MOD,abs(KEM)) + +#summary(MOD) +#Validity tests +#autoplot(MOD) +#acf(resid(MOD)) +#pacf(resid(MOD)) +# adf.test(resid(MOD)) +#checkresiduals(MOD) +#Save the resulting model outputs, will need to be changed if looking at other counties +#saveRDS(MOD,"Data/Regression_Results/LN_ARIMA_MODEL.Rds") + +MIGRATION_ARIMA_SIMS <- (do.call(cbind,mclapply(1:NUM_SIMULATIONS,function(x){as.numeric(round(simulate(MOD,future=TRUE, nsim=NUM_YEARS_PROJECTED)))},mc.cores =detectCores()-1)))#testing a multiple run simulation could use parallel process +saveRDS(MIGRATION_ARIMA_SIMS,"Data/Simulated_Data_Sets/Migration_ARIMA.Rds") +write.csv(MIGRATION_ARIMA_SIMS,row.names=FALSE,"Data/Simulated_Data_Sets/Migration_ARIMA.csv")