First combination of migration, births and deaths

2025-10-29 16:57:18 -06:00 · 2025-10-29 16:57:18 -06:00 · c66d260d41
commit c66d260d41
parent 60dff9e4da
5 changed files with 133 additions and 126 deletions
--- a/1_Run_Full_Simulation.r
+++ b/1_Run_Full_Simulation.r
@ -1,13 +1,17 @@
 #####Packages
 library(tidyverse) #Cleaning data
 library(fixest) #Estimating a model of birth rates, to provide variance in the birth rate Monte Carlo using a fixed effect model.
+library(parallel)
 #Load custom functions needed for the simulation
 source("Scripts/Birth_Simulation_Functions.r")
 source("Scripts/Monte_Carlo_Functions.r")
+source("Scripts/Migration_Simulation_Functions.r")

 #####User Configuration Values
-	NUM_SIMULATIONS <- 10^4 #Number of Monte Carlo Simulations to run
+	NUM_SIMULATIONS <- 10^6 #Number of Monte Carlo Simulations to run
 	RERUN_MORTALITY_SIMULATION <- TRUE #Rerun the Monte Carlo simulation of future mortality rates (not actual deaths) even if a Rds file of a mortality rates exists. This can be used to speed up runs when FALSE
+	RERUN_MIGRATION_SIMULATION <- TRUE #Rerun the ARIMA simulations that predict total migration in any year even if a Rds file of a mortality rates exists. This can be used to speed up runs when FALSE
+
 	NUM_YEARS_PROJECTED <-  50 #How many years into the future should each Monte Carlo run project to. For example 25 years if starting from 2025 and ending in 2050.
 	BIRTH_RATE_REG_RESULTS <- "Data/Regression_Results/Birth_Rate_Model.Rds" #Location of the regression used to model variance in birth rates in each Monte Carlo simulation.
 	START_DEMOGRAPHIC_DATA <- "Data/Cleaned_Data/Start_Year_Demographic_Data_With_Fertility_Groups.Rds" #Location of the data for the first year needing a forecasted birth rate, which aggregates the yearly splits of populations, into a single, year-county data set with variables for key birth prediction (total number of men and women in ages with high fertility rates), and then combines with the data set including births, deaths, migration and population.
@ -17,20 +21,24 @@ source("Scripts/Monte_Carlo_Functions.r")
 	#A script contains the code needed to create a feols (fixest) regression of the birth rate given age distribution. Load this saved result or else create it to use in each simulation for gathering variance of births in any given age distribution path of the Monte Carlo.
 	if(file.exists(BIRTH_RATE_REG_RESULTS)){MOD_BIRTHS <- readRDS(BIRTH_RATE_REG_RESULTS);FIRST_PREDICT_YEAR_POPULATION_DATA <- readRDS(START_DEMOGRAPHIC_DATA)} else{source("Birth_Rate_Regression.r")}
 	if(file.exists(AGE_OF_MIGRANT_PROBABILITY)){MIG_AGE_DIST <- read.csv(AGE_OF_MIGRANT_PROBABILITY)} else{source("Migration_Regression.r")}
+
+	#Rerun the migration simulation if requested
+if(RERUN_MIGRATION_SIMULATION ){source("Migration_Regression.r")}
+
 #######################################################Main Monte Carlo
 START_DEM_DATA <- readRDS("Data/Cleaned_Data/Lincoln_Demographic_Data.Rds") %>% group_by(County) %>% filter(Year==2023) %>% ungroup %>% select(-County)
 MORTALITY_SIMULATION <- readRDS("./Data/Simulated_Data_Sets/MORTALITY_MONTE_CARLO.Rds") #Load the Mortality simulation to speed up simulation
+MIGRATION_ARIMA_SIMULATION <- readRDS("./Data/Simulated_Data_Sets/Migration_ARIMA.Rds") #Load the Migration simulation to speed up simulation

 #First run
-C_RES <-RUN_SINGLE_SIM(MOD_BIRTHS,FIRST_PREDICT_YEAR_POPULATION_DATA,START_DEM_DATA,MORTALITY_SIMULATION,SIM_NUMBER=1,START_OF_SIM=2023)
+C_RES <-RUN_SINGLE_SIM(MOD_BIRTHS,FIRST_PREDICT_YEAR_POPULATION_DATA,START_DEM_DATA,MORTALITY_SIMULATION,SIM_NUMBER=1,START_OF_SIM=2023,MIGRATION_ARIMA_SIMULATION,MIG_AGE_DIST )
 # Year County  Population Births Deaths Migration Min_Birth_Group PREV_BIRTH PREV_TWO_BIRTH Male_Birth_Group
 #Second run, work on making into loop and saving the output to file (check if that will slow the results). Maybe use a predifined matrix, so that the results can be stored quirckly
-
 SINGLE_PATH_SIM <- function(j){
-  C_RES <- RUN_SINGLE_SIM(MOD_BIRTHS,FIRST_PREDICT_YEAR_POPULATION_DATA,START_DEM_DATA,MORTALITY_SIMULATION,SIM_NUMBER=j,START_OF_SIM=2023)
+  C_RES <- RUN_SINGLE_SIM(MOD_BIRTHS,FIRST_PREDICT_YEAR_POPULATION_DATA,START_DEM_DATA,MORTALITY_SIMULATION,SIM_NUMBER=j,START_OF_SIM=2023,MIGRATION_ARIMA_SIMULATION,MIG_AGE_DIST)
  RES <- C_RES[[1]]
    for(i in 1:NUM_YEARS_PROJECTED){
-      C_RES <- RUN_SINGLE_SIM(MOD_BIRTHS,C_RES[[3]],C_RES[[2]],MORTALITY_SIMULATION,SIM_NUMBER=j,START_OF_SIM=2023)
+      C_RES <- RUN_SINGLE_SIM(MOD_BIRTHS,C_RES[[3]],C_RES[[2]],MORTALITY_SIMULATION,SIM_NUMBER=j,START_OF_SIM=2023,MIGRATION_ARIMA_SIMULATION,MIG_AGE_DIST)
      RES <- rbind(RES,C_RES[[1]])
    }
  return(RES)
@ -38,8 +46,7 @@ SINGLE_PATH_SIM <- function(j){
 #Run the full simulation across all simulations simulating changes in demographic, and mortality data.
 system.time({
 FULL_RESULTS <- mclapply(1:NUM_SIMULATIONS,SINGLE_PATH_SIM,mc.cores = detectCores()-1)
-})
-plot(FULL_RESULTS[[2000]]$Population)
-TEMP <- sapply(1:length(FULL_RESULTS),function(x){(FULL_RESULTS[[x]] %>% pull(Population))[25] })
-hist(TEMP)
+
+})
+plot(FULL_RESULTS[[1]][3])

--- a/Data/Other_Intermediate_Outputs/Migreation_Age_Probablity_One_to_Ninety.csv
+++ b/Data/Other_Intermediate_Outputs/Migreation_Age_Probablity_One_to_Ninety.csv
@ -83,9 +83,4 @@
 5.13038157978179e-05
 4.82137214190365e-06
 4.82137214190365e-06
-4.82137214190365e-06
-4.82137214190365e-06
-4.82137214190365e-06
-4.82137214190365e-06
-4.82137214190365e-06
-3.31534934593599e-05
+9.54339236147969e-06
--- a/Migration_Regression.r
+++ b/Migration_Regression.r
@ -1,10 +1,9 @@
 ##########################Model Migration Trends for use in the Monte Carlo. This is important because a 18 year old is more likely to move than 75 year old.
-library(tidyverse)
-library(fixest)
-library(corrplot)
+#library(tidyverse)
+#library(fixest)
+library(forecast)
 ######Checking correlations with migration rates
  DEMOGRAPHIC_DATA <- readRDS("Data/Cleaned_Data/Wyoming_County_Demographic_Data.Rds")
-
  #Extract the population trend data to connect with demographics (Population,births,deaths)
  POP_DATA <- readRDS("Data/Cleaned_Data/Wyoming_County_Population.Rds")
 #Identify births, deaths an migration from existing data.
@ -12,14 +11,12 @@ library(corrplot)
 	DEMO2 <- DEMOGRAPHIC_DATA %>% mutate(Year=Year+1,Age=Age+1) %>% rename(PREV_MALE=Num_Male,PREV_FEMALE=Num_Female)
 	#Combine into a usable data set. Calculate the change in the age-sex population from year to year. This change will include the effect of migration, where absolute values are a mix of factors unrelated to migration. This first-difference approach is preferred to absolute values.
 DEMO_DATA <- inner_join(DEMO1,DEMO2) %>% mutate(Male=Num_Male-PREV_MALE,Female=Num_Female-PREV_FEMALE,Pop_Change=Male+Female) %>% select(County,Year,Age,Male,Female,Pop_Change) %>% arrange(County,Year,Age)
-DEMO_DATA 


 #############################Observed that men and women behave similarly allowing for the groups to be combined
-#COR_MAT_DATA_FULL <- pivot_wider(DEMO_DATA,values_from=c(Male,Female),names_from=Age) 
-#COR_MAT_DATA_FULL <- POP_DATA %>% left_join(COR_MAT_DATA_FULL )
+COR_MAT_DATA_FULL <- pivot_wider(DEMO_DATA,values_from=c(Male,Female),names_from=Age) 
+COR_MAT_DATA_FULL <- POP_DATA %>% left_join(COR_MAT_DATA_FULL )

-#TEST_DATA <- COR_MAT_DATA_FULL %>% filter(!is.na(Migration),!is.na(Deaths),!is.na(Population),!is.na(Male_34),!is.na(Year),!is.na(County))
 #RES <- cbind(c(rep("Male",90),rep("Female",90)),c(rep(1:90,2)),rep(NA,180)) %>% as_tibble
 #colnames(RES ) <- c("Sex","Age","MIGRATION_COEF")
 #RES$MIGRATION_COEF <- as.numeric(RES$MIGRATION_COEF)
@ -37,9 +34,8 @@ DEMO_DATA
 #Create a wide data set with ages in each column so that each regression of age can be predicted one by one.
 	#Use the previous years population data as the starting point, so that the regression does not use data already including migration.
 DEMO_DATA <- DEMO_DATA %>% select(-Male,-Female)
-COR_MAT_DATA_FULL <- pivot_wider(DEMO_DATA,values_from=Pop_Change,names_from=Age,names_prefix="Age_") 
-TEST_DATA <- POP_DATA %>% mutate(Population=Population-Migration-Births+Deaths) %>% left_join( COR_MAT_DATA_FULL) %>% filter(!is.na(Migration),!is.na(Deaths),!is.na(Population),!is.na(Year),!is.na(County)) 
-
+COR_MAT_DATA_FULL <- pivot_wider(DEMO_DATA,values_from=Pop_Change,names_from=Age,names_prefix="Age_")
+AGE_WIDE_DATA <- POP_DATA %>% mutate(Population=Population-Migration-Births+Deaths) %>% left_join( COR_MAT_DATA_FULL) %>% filter(!is.na(Migration),!is.na(Deaths),!is.na(Population),!is.na(Year),!is.na(County))

 #Create a table to store the resulting coefficients in.
 RES <- cbind(1:90,c(rep("Child",17),"18",rep("Adult",90-18)),rep(NA,90),rep(NA,90)) %>% as_tibble
@ -53,13 +49,12 @@ RES <- cbind(1:90,c(rep("Child",17),"18",rep("Adult",90-18)),rep(NA,90),rep(NA,9
 #Predicating the effect of migration on population in any one age group, so that trends over age can be observed. Less when old, more when 18-19.
 	#Loop over all age groups, predict number of people in the age group, from previous population, deaths, and Migrations. Extract the Migration Coefficient for use in a trend analysis.
 for(x in 1:90){
-	TEST_DATA$Y_VAL <- as.numeric(t(TEST_DATA[,6+x]))#Extract the change
-		TEST <- feols(Y_VAL~Deaths+Migration+Population|Year+County,data=TEST_DATA)
-	RES[x,3] <- as.numeric(coef(TEST)["Migration"])
-	RES[x,4] <- as.numeric(coef(TEST)["Deaths"])
-	#	RES[x,3] <- mean(predict(TEST,TEST_DATA %>% mutate(Migration=Migration+1))-predict(TEST),na.rm=TRUE)
-	#	RES[x,4] <- mean(predict(TEST,TEST_DATA %>% mutate(Deaths=Deaths+1))-predict(TEST),na.rm=TRUE)
+	AGE_WIDE_DATA$Y_VAL <- as.numeric(t(AGE_WIDE_DATA[,6+x]))#Extract the change
+		C_REG<- feols(Y_VAL~Deaths+Migration+Population|Year+County,data=AGE_WIDE_DATA)
+	RES[x,3] <- as.numeric(coef(C_REG)["Migration"])
+	RES[x,4] <- as.numeric(coef(C_REG)["Deaths"])
 }
+rm(C_REG)
 #Create data to create graphs and analyze. Remove some observed outliers
 GRAPH_DATA  <- RES %>% filter(abs(MIGRATION_COEF)<Inf,Age<100) %>% filter(Age!=25,Age!=35,Age!=85)
 ##Graph when using log scales and grouping by child/adult. Looks pretty linear
@ -90,109 +85,41 @@ GRAPH_DATA  <- RES %>% filter(abs(MIGRATION_COEF)<Inf,Age<100) %>% filter(Age!=2
 	#Convert the absolute coefficient values to a percentage chance that any one immigrant will be in the given age. This wont line up perfectly with the coefficients if using them to predict immigration, because the age-sex data set uses different totals than the population/migration data. However, the distribution should be the same, so we divide each estimate by the total. The results is the percent probability that a single independent immigrant will be of the given age.
 	#If this is run from the main script MIG_AGE_DIST will be the key variable and should not be changed.
 MIG_AGE_DIST <- PRED_DATA$MIGRATION_COEF/sum(PRED_DATA$MIGRATION_COEF)
+#Condense down to 85+ group because death rates only use 85+
+MIG_AGE_DIST[85] <- mean(MIG_AGE_DIST[85:90])
+MIG_AGE_DIST <- MIG_AGE_DIST[1:85] 
 write.csv(MIG_AGE_DIST ,"Data/Other_Intermediate_Outputs/Migreation_Age_Probablity_One_to_Ninety.csv",row.names=FALSE)

 ####Work on overall migration trends
 	#Could use code cleanup after trying things, but have but I have a working ARIMA to model lincoln county migration
-TS_DATA <- POP_DATA
-TS_DATA <- TS_DATA %>% mutate(In_Migration=ifelse(Migration>0,1,0)) %>% group_by(County) %>% arrange(County,Year) %>% mutate(Prev_Pop=lag(Population),ABS_PER_MIG=abs(Migration)/Prev_Pop,L_MIG=lag(Migration))
-ST_YEAR <- min(pull(TS_DATA,Year))
-END_YEAR <- max(pull(TS_DATA,Year))
-TS_DATA %>% filter(County=='Lincoln') %>% pull(Migration) %>% plot()
-TS_DATA %>% pull(Migration) %>% plot()
-GRAPH_DATA <- TS_DATA %>% filter(!is.na(Migration))
-GRAPH_DATA_LN <- TS_DATA %>% filter(!is.na(Migration),County=="Lincoln")
-
-ggplot(GRAPH_DATA,aes(x=Year,y=Migration/Prev_Pop,group=County,color=County))+geom_point()+geom_line(data=GRAPH_DATA_LN)
-TS_WIDE <- TS_DATA %>% mutate(Migration=Migration) %>% dplyr::select(Year,County,Migration) %>% pivot_wider(values_from=Migration,names_from=County) %>% arrange(Year) %>% filter(Year>1990,Year<=2021) %>%ts(start=c(1991),frequency=1)
-TS_DATA %>% filter(County=='Lincoln')
-ST_YEAR <- 1970 
-LN <- TS_DATA %>% mutate(Migration=Migration) %>% dplyr::select(Year,County,Migration) %>% pivot_wider(values_from=Migration,names_from=County) %>% arrange(Year) %>% dplyr::select(Lincoln,Year) %>% filter(Year>=ST_YEAR,Year<=2021) %>% dplyr::select(-Year) %>%ts(start=c(ST_YEAR),frequency=1)
+TS_DATA <- POP_DATA %>% mutate(In_Migration=ifelse(Migration>0,1,0)) %>% group_by(County) %>% arrange(County,Year) %>% mutate(Prev_Pop=lag(Population))
+ST_YEAR <- min(pull(TS_DATA %>% filter(!is.na(Migration)),Year))
+END_YEAR <- max(pull(TS_DATA %>% filter(!is.na(Migration)),Year))
+	#GRAPH_DATA <- TS_DATA %>% filter(!is.na(Migration))
+	#GRAPH_DATA_LN <- TS_DATA %>% filter(!is.na(Migration),County=="Lincoln")
+	#ggplot(GRAPH_DATA,aes(x=Year,y=Migration/Prev_Pop,group=County,color=County))+geom_point()+geom_line(data=GRAPH_DATA_LN)
+TS_WIDE <- TS_DATA %>% dplyr::select(Year,County,Migration) %>% pivot_wider(values_from=Migration,names_from=County) %>% arrange(Year) %>% filter(Year>ST_YEAR+1,Year<=END_YEAR) %>%ts(start=c(ST_YEAR+1),frequency=1)
+LN <- TS_DATA %>%  dplyr::select(Year,County,Migration) %>% pivot_wider(values_from=Migration,names_from=County) %>% arrange(Year) %>% dplyr::select(Lincoln,Year) %>% filter(Year>=ST_YEAR,Year<=END_YEAR) %>% dplyr::select(-Year) %>%ts(start=c(ST_YEAR),frequency=1)
 #Create an ARIMA of Migration so the number of people migrating can be simulated
-library("forecast")
 	#Time series tests
-	adf.test(LN,k=1) #Stationary with one lag, otherwise not stationary
-	kpss.test(LN) #Stationary,default of program and has some model fit improvements
-MOD <- auto.arima(LN,test="kpss",stationary=TRUE)
-	summary(MOD)
+	#library(tseries)
+	#adf.test(LN,k=1) #Stationary with one lag, otherwise not stationary
+	#kpss.test(LN) #Stationary,default of program and has some model fit improvements
+MOD <- auto.arima(LN,stationary=TRUE)
+	#summary(MOD)
 	#Validity tests
-	autoplot(MOD)
-	acf(resid(MOD))
-	pacf(resid(MOD))
+	#autoplot(MOD)
+	#acf(resid(MOD))
+	#pacf(resid(MOD))
 #	adf.test(resid(MOD))
-	checkresiduals(MOD)
+	#checkresiduals(MOD)
 #Save the resulting model outputs, will need to be changed if looking at other counties
 saveRDS(MOD,"Data/Regression_Results/LN_ARIMA_MODEL.Rds")

-lapply(1:10^6,function(x){round(simulate(MOD,future=TRUE, nsim=50))})#testing a multiple run simulation could use parallel process
+MIGRATION_ARIMA_SIMS <- (do.call(cbind,mclapply(1:NUM_SIMULATIONS,function(x){as.numeric(round(simulate(MOD,future=TRUE, nsim=NUM_YEARS_PROJECTED)))},mc.cores =detectCores()-1)))#testing a multiple run simulation could use parallel process
+saveRDS(MIGRATION_ARIMA_SIMS,"Data/Simulated_Data_Sets/Migration_ARIMA.Rds")
+write.csv(MIGRATION_ARIMA_SIMS,row.names=FALSE,"Data/Simulated_Data_Sets/Migration_ARIMA.csv")


-#################################Simulate data
-MIG_AGE_DIST
-NUM_SIMS<-34 
-PROB <- MIG_AGE_DIST
-
-MIG_SIM <- round(simulate(MOD,future=TRUE, nsim=50)) #Round up for whole numbers
-NUM_SIMS <- abs(MIG_SIM[[1]])
-INCREASE <- MIG_SIM[[1]]/abs(MIG_SIM[[1]]) #Check if positive or negative migration, as these are handled differently
-if(INCREASE==1){MF_SAMPLE <- sample(x=c("Male","Female"),NUM_SIMS,replace=TRUE)}
-sample(x=1:90,size=NUM_SIMS,prob=PROB,replace=TRUE)
-C_DEMO_DATA <-  DEMOGRAPHIC_DATA %>% filter(County=='Lincoln',Year==max(Year))
-NUM_MALE <- pull(C_DEMO_DATA ,"Num_Male")
-NUM_FEMALE <- pull(C_DEMO_DATA,"Num_Female")
-####WORKING ON THE CASE WHEN WE ARE REMOVING INDIVIDUALS. IF THERE ARE NONE THEY SHOULD NOT MOVE. ON THE OTHER HAND IF MOVING IN THE AVERAGE VALUES WORK
-
-PROB <- MIG_AGE_DIST
-LN_DEMO_DATA <- DEMOGRAPHIC_DATA %>% filter(County=='Lincoln',Year==max(Year)) 
-LN_DEMO_DATA$Age
-MAKE_SET <- function(PROB_AGE_DIST,DEMOGRAPHIC_DATA){
-	SINGLE_AGE_RET <- function(x,PROB,DEMO_DATA){
-		C_PROB<- PROB[x]
-		C_LOOP <- DEMO_DATA[x+1,]
-		NUM_MALE <- pull(C_LOOP,Num_Male)
-		NUM_FEMALE <- pull(C_LOOP,Num_Female)
-		C_AGE <- C_LOOP$Age
-		MEN <- cbind(rep(C_AGE,NUM_MALE),rep("Male",NUM_MALE),rep(C_PROB,NUM_MALE)) 
-		WOMEN <-	cbind(rep(C_AGE,NUM_FEMALE),rep("Female",NUM_FEMALE),rep(C_PROB,NUM_FEMALE)) 
-	RES <- rbind(MEN,WOMEN)
-#	colnames(RES) <- c("Age","Sex","Probability")
-	return(RES)
-
-	}
-
-	FINAL_OUT <-do.call(rbind,sapply(1:84,function(x){SINGLE_AGE_RET(x,PROB_AGE_DIST,DEMOGRAPHIC_DATA) })) %>% as_tibble
-	colnames(FINAL_OUT ) <- c("Age","Sex","Probability")
-	FINAL_OUT$Age <- as.numeric(FINAL_OUT$Age)
-	return(FINAL_OUT)
-}
-#A function to find the number of migrants leaving the county (net out), accounting for the fact that fewer or more people in any one age-sex bracket will decrease the odds of being the person to leave even if they are 18-19 and likely to leave.
-OUT_MIGRATION_SIMULATION <- function(NUM_MIGRATED,MIG_AGE_DIST,DEMO_DATA){
-	CURRENT_POP <- MAKE_SET(MIG_AGE_DIST,DEMO_DATA)
-		NUM_POP <- nrow(CURRENT_POP)
-	#Rank all individuals to easily combine with the ordinal data set
-	ORD <- sample(1:NUM_POP,prob=pull(CURRENT_POP,Probablity),size=NUM_POP,replace=FALSE) 
-	#Set the migration out status of all individuals to zero (staying in the county)
-	CURRENT_POP$Migrated <- 0 
-	#The people drawn first are assumed to migrate up to the point where all migration is filled.
-	CURRENT_POP[ORD[1:NUM_MIGRATED],"Migrated"] <- -1
-	MIG_STATUS <- CURRENT_POP %>% group_by(Age,Sex) %>% summarize(Migrated=sum(Migrated)) %>% arrange(desc(Sex),Age) %>% ungroup
-	return(MIG_STATUS)
-}
-
-OUT_MIGRATION_SIMULATION(100,MIG_AGE_DIST,LN_DEMO_DATA)
-#Function to find the number of migrants to a county, by age-sex when coming from outside the county.
-IN_MIGRATION_SIMULATION <- function(NUM_MIGRATED,MIG_AGE_DIST,DEMO_DATA){
-	EMPTY_SET <- 	cbind(rep(1:90,2), c(rep("Male",90),rep("Female",90)),rep(0,180))
-		MIGRATED_AGE <- sample(1:90,prob=MIG_AGE_DIST,size=NUM_MIGRATED,replace=TRUE)
-		MIGRATED_SEX <- sample(c("Male","Female"),size=NUM_MIGRATED,replace=TRUE)
-	MIGRATED_GROUP <-   cbind(MIGRATED_AGE,MIGRATED_SEX,rep(1,NUM_MIGRATED))
-	MIGRATED_DATA <- rbind(EMPTY_SET,MIGRATED_GROUP) %>% as_tibble
-	colnames(MIGRATED_DATA) <- c("Age","Sex","Migration")
-	MIGRATED_DATA$Age <-  as.numeric(MIGRATED_DATA$Age)
-	MIGRATED_DATA$Migration <-  as.numeric(MIGRATED_DATA$Migration)
-	MIGRATED_DATA <- MIGRATED_DATA %>% group_by(Age,Sex) %>% summarize(Migration=sum(Migration)) %>% ungroup %>% arrange(desc(Sex),Age)
-	return(MIGRATED_DATA)
-}
-IN_MIGRATION_SIMULATION(100,MIG_AGE_DIST,LN_DEMO_DATA)


--- a/Scripts/Migration_Simulation_Functions.r
+++ b/Scripts/Migration_Simulation_Functions.r
@ -0,0 +1,71 @@
+#################################Simulate data
+#A function used only inside the OUT_MIGRATION_SIMULATION function (see below). This creates a data set which has one entry for every individual in the county, with a assigned probability of moving out of the county. This allows the migration simulation to extract individuals based on true population distribution. When migration is net positive this does not matter, as anyone age-sex group can move in.
+#DEMOGRAPHIC_DATA <- DEMO_DATA 
+#DEMOGRAPHIC_DATA <- START_DEM_DATA
+MAKE_SET <- function(PROB_AGE_DIST,DEMOGRAPHIC_DATA){
+
+#		if(nrow(DEMOGRAPHIC_DATA)==86){DEMOGRAPHIC_DATA<- DEMOGRAPHIC_DATA[-1,]} #Drop age zero if it was included
+	SINGLE_AGE_RET <- function(x,PROB,DEMO_DATA){
+		C_PROB<- PROB[x]
+		C_LOOP <- DEMO_DATA[x+1,]
+		NUM_MALE <- pull(C_LOOP,Num_Male)
+		NUM_FEMALE <- pull(C_LOOP,Num_Female)
+		C_AGE <- C_LOOP$Age
+		MEN <- cbind(rep(C_AGE,NUM_MALE),rep("Male",NUM_MALE),rep(C_PROB,NUM_MALE)) 
+		WOMEN <-	cbind(rep(C_AGE,NUM_FEMALE),rep("Female",NUM_FEMALE),rep(C_PROB,NUM_FEMALE)) 
+	RES <- rbind(MEN,WOMEN)
+#	colnames(RES) <- c("Age","Sex","Probability")
+	return(RES)
+
+	}
+	FINAL_OUT <-do.call(rbind,sapply(1:85,function(x){SINGLE_AGE_RET(x,PROB_AGE_DIST,DEMOGRAPHIC_DATA) })) %>% as_tibble
+	colnames(FINAL_OUT ) <- c("Age","Sex","Probability")
+	FINAL_OUT$Age <- as.numeric(FINAL_OUT$Age)
+	return(FINAL_OUT)
+}
+#A function to find the number of migrants leaving the county (net out), accounting for the fact that fewer or more people in any one age-sex bracket will decrease the odds of being the person to leave even if they are 18-19 and likely to leave.
+OUT_MIGRATION_SIMULATION <- function(NUM_MIGRATED,MIG_AGE_DIST,DEMO_DATA){
+	CURRENT_POP <- MAKE_SET(MIG_AGE_DIST,DEMO_DATA)
+	NUM_POP <- nrow(CURRENT_POP)
+	#Rank all individuals to easily combine with the ordinal data set
+	ORD <- sample(1:NUM_POP,prob=pull(CURRENT_POP,Probability),size=NUM_POP,replace=FALSE) 
+	#Set the migration out status of all individuals to zero (staying in the county)
+	CURRENT_POP$Migrated <- 0 
+	#The people drawn first are assumed to migrate up to the point where all migration is filled.
+	CURRENT_POP[ORD[1:NUM_MIGRATED],"Migrated"] <- -1
+	MIG_STATUS <- CURRENT_POP %>% group_by(Age,Sex) %>% summarize(Migrated=sum(Migrated)) %>% arrange(desc(Sex),Age) %>% ungroup
+	return(MIG_STATUS)
+}
+
+#Function to find the number of migrants to a county, by age-sex when coming from outside the county.
+#NUM_MIGRATED <- TOTAL_MIGRATION;
+IN_MIGRATION_SIMULATION <- function(NUM_MIGRATED,MIG_AGE_DIST){
+	NUM_MIGRATED <-	abs(NUM_MIGRATED)
+	NUM_AGES <- 85
+	EMPTY_SET <- 	cbind(rep(1:NUM_AGES,2), c(rep("Male",NUM_AGES),rep("Female",NUM_AGES)),rep(0,2*NUM_AGES))
+	colnames(EMPTY_SET) <- c("Age","Sex","Migrated")
+	if(NUM_MIGRATED==0){return(EMPTY_SET)}
+		MIGRATED_AGE <- sample(1:NUM_AGES,prob=MIG_AGE_DIST,size=NUM_MIGRATED,replace=TRUE)
+		MIGRATED_SEX <- sample(c("Male","Female"),size=NUM_MIGRATED,replace=TRUE)
+	MIGRATED_GROUP <-   cbind(MIGRATED_AGE,MIGRATED_SEX,rep(1,NUM_MIGRATED))
+	MIGRATED_DATA <- rbind(EMPTY_SET,MIGRATED_GROUP) %>% as_tibble
+	colnames(MIGRATED_DATA) <- c("Age","Sex","Migrated")
+	MIGRATED_DATA$Age <-  as.numeric(MIGRATED_DATA$Age)
+	MIGRATED_DATA$Migrated <-  as.numeric(MIGRATED_DATA$Migrated)
+	MIGRATED_DATA <- MIGRATED_DATA %>% group_by(Age,Sex) %>% summarize(Migrated=sum(Migrated)) %>% ungroup %>% arrange(desc(Sex),Age)
+	return(MIGRATED_DATA)
+}
+#	MIG_AGE_DIST <- Migration_Age_Distribution;DEMO_DATA <- START_DETAILED_DATA;NET_MIGRATION <- TOTAL_MIGRATION
+MIGRATION_SIMULATION <- function(MIG_AGE_DIST,DEMO_DATA,NET_MIGRATION){
+#MIG_AGE_DIST: A distribution of probabilities of any one age of person to migrate to or out of a county used to grab a reasonable rate of in/out migration by age from a total migration number
+#DEMO_DATA: A data set which contains the number of people in each sex-age category (ex Male 30, or Female 82) 
+#TOTAL_MIGRATION_ARIMA_SIM: A single simulated path of a ARIMA simulation of net migration in the county.
+#YEARS_AHEAD_OF_SIMULATION: The number of years forward from the start of the ARIMA to extract as total migration.
+if(NET_MIGRATION<0){
+	RES <- OUT_MIGRATION_SIMULATION(abs(NET_MIGRATION),MIG_AGE_DIST,DEMO_DATA)
+} else{
+	RES <- IN_MIGRATION_SIMULATION(NET_MIGRATION,MIG_AGE_DIST)}
+	return(RES %>% pivot_wider(values_from=Migrated,names_from=Sex,names_prefix="Num_"))
+}
+
+
--- a/Scripts/Monte_Carlo_Functions.r
+++ b/Scripts/Monte_Carlo_Functions.r
@ -1,13 +1,15 @@
 #Uncomment to check the function line by line
-	#REG_BIRTH_MODEL=MOD_BIRTHS;START_BASIC_DATA=FIRST_PREDICT_YEAR_POPULATION_DATA;START_DETAILED_DATA=START_DEM_DATA;Mortality_Rate_Sim=MORTALITY_SIMULATION;SIM_NUMBER=1;START_OF_SIM_YEAR=2023
+#	REG_BIRTH_MODEL=MOD_BIRTHS;START_BASIC_DATA=FIRST_PREDICT_YEAR_POPULATION_DATA;START_DETAILED_DATA=START_DEM_DATA;Mortality_Rate_Sim=MORTALITY_SIMULATION;SIM_NUMBER=1;START_OF_SIM_YEAR=2023;Migration_ARIMA_Sim=MIGRATION_ARIMA_SIMULATION;Migration_Age_Distribution=MIG_AGE_DIST

-RUN_SINGLE_SIM <- function(REG_BIRTH_MODEL,START_BASIC_DATA,START_DETAILED_DATA,Mortality_Rate_Sim,SIM_NUMBER,START_OF_SIM_YEAR=2023){
+RUN_SINGLE_SIM <- function(REG_BIRTH_MODEL,START_BASIC_DATA,START_DETAILED_DATA,Mortality_Rate_Sim,SIM_NUMBER,START_OF_SIM_YEAR=2023,Migration_ARIMA_Sim,Migration_Age_Distribution ){
 	#REG_BIRTH_MODEL: Feols regression object of population model.
 	#START_BASIC_DATA: A single row of data, with information for the birth regression (Male_Birth_Group,PREV_BIRTH etc.)
 	#START_DETAILED_DATA: A data set, with the number of men and women at each individual age (zero to 85+)
 	#Mortality_Rate_Sim: A list object with a set of project future mortality rates by age. See ./Scripts/Mortality_Rate_Over_Time_Simulation_Function.r. By passing this in the simulation speed is increased significantly.
 	#SIM_NUMBER: The current Monte Carlo simulation being applied. This extracts the correct index of Mortality_Rate_Sim Object for the present simulation.
 	#START_OF_SIM_YEAR: This is the first year of data which requires a simulation. This allows for the time trend to be properly estimated as this depends on the number of years since the forecast began
+	#The simulates paths of of net migration using an ARIMA model, from the start year until the end year.
+	#Migration_Age_Distribution: A vector which has entries from age 1 to 90 of the assessed probability of migrating in a single year, for each net migrant in the county.
 		NEXT_BASIC_DATA <- START_BASIC_DATA #Create a data set for the data to feed into the next run.
 		C_BIRTHS <- BIRTH_SIM(REG_BIRTH_MODEL,START_BASIC_DATA)
 		NEXT_BASIC_DATA[,"PREV_TWO_BIRTH"] <- START_BASIC_DATA[,"PREV_BIRTH"]
@ -24,7 +26,12 @@ RUN_SINGLE_SIM <- function(REG_BIRTH_MODEL,START_BASIC_DATA,START_DETAILED_DATA,
 	#Run a preliminary Monte Carlo which estimates the future mortality rate, for each simulation and year of of Monte Carlo Simulation

 		YEARS_AHEAD <- max(START_BASIC_DATA[,'Year']-START_OF_SIM_YEAR,1) #Define the number of years forward from the simulation start based on the current year of analysis, and the user provided first year.
-	MORTALITY_C_RES <- MORTALITY_SIM(YEARS_AHEAD,SIM_NUMBER,START_DETAILED_DATA,FALSE,Mortality_Rate_Sim )
+	TOTAL_MIGRATION <- Migration_ARIMA_Sim[YEARS_AHEAD,SIM_NUMBER]
+	START_BASIC_DATA[,"Migration"] <- TOTAL_MIGRATION
+	MIG <- MIGRATION_SIMULATION(Migration_Age_Distribution,START_DETAILED_DATA,TOTAL_MIGRATION)
+	MORTALITY_INPUT_DETAILED_DATA <- START_DETAILED_DATA
+	MORTALITY_INPUT_DETAILED_DATA[-1,c("Num_Male","Num_Female")] <- MORTALITY_INPUT_DETAILED_DATA[-1,c("Num_Male","Num_Female")]+MIG[,c("Num_Male","Num_Female")]
+	MORTALITY_C_RES <- MORTALITY_SIM(YEARS_AHEAD,SIM_NUMBER,MORTALITY_INPUT_DETAILED_DATA,FALSE,Mortality_Rate_Sim )
 	#Update number of deaths in the current run (which should be blank when supplied to the function)
        START_BASIC_DATA[,"Deaths"] <- sum(MORTALITY_C_RES[,c("Male_Deaths","Female_Deaths")] )
 	#