diff --git a/Migration_Regression.r b/Migration_Regression.r index cdbf477..e34081a 100644 --- a/Migration_Regression.r +++ b/Migration_Regression.r @@ -93,20 +93,9 @@ MIG_AGE_DIST <- PRED_DATA$MIGRATION_COEF/sum(PRED_DATA$MIGRATION_COEF) write.csv(MIG_AGE_DIST ,"Data/Other_Intermediate_Outputs/Migreation_Age_Probablity_One_to_Ninety.csv",row.names=FALSE) ####Work on overall migration trends + #Could use code cleanup after trying things, but have but I have a working ARIMA to model lincoln county migration TS_DATA <- POP_DATA TS_DATA <- TS_DATA %>% mutate(In_Migration=ifelse(Migration>0,1,0)) %>% group_by(County) %>% arrange(County,Year) %>% mutate(Prev_Pop=lag(Population),ABS_PER_MIG=abs(Migration)/Prev_Pop,L_MIG=lag(Migration)) -TS_DATA -feols(Migration~L_MIG+Prev_Pop|Year+County,data=TS_DATA) -(feols(ABS_PER_MIG~In_Migration+log(Prev_Pop)+Year*County,TS_DATA)) -LN_TS_DATA <- TS_DATA %>% filter(County=='Lincoln') - -MOD <- feols(Migration~L_MIG+Prev_Pop+Year,data=LN_TS_DATA) - -MOD <- feols(Migration~L_MIG+Prev_Pop+Year*County,data=TS_DATA) - -acf(resid(MOD)) -pacf(resid(MOD)) -library("forecast") ST_YEAR <- min(pull(TS_DATA,Year)) END_YEAR <- max(pull(TS_DATA,Year)) TS_DATA %>% filter(County=='Lincoln') %>% pull(Migration) %>% plot() @@ -115,39 +104,62 @@ GRAPH_DATA <- TS_DATA %>% filter(!is.na(Migration)) GRAPH_DATA_LN <- TS_DATA %>% filter(!is.na(Migration),County=="Lincoln") ggplot(GRAPH_DATA,aes(x=Year,y=Migration/Prev_Pop,group=County,color=County))+geom_point()+geom_line(data=GRAPH_DATA_LN) -TS_DATA -TS_WIDE <- TS_DATA %>% mutate(Migration=Migration/Prev_Pop) %>% select(Year,County,Migration) %>% pivot_wider(values_from=Migration,names_from=County) %>% arrange(Year) %>% filter(Year>1990,Year<=2021) %>%ts(start=c(1991),frequency=1) +TS_WIDE <- TS_DATA %>% mutate(Migration=Migration) %>% dplyr::select(Year,County,Migration) %>% pivot_wider(values_from=Migration,names_from=County) %>% arrange(Year) %>% filter(Year>1990,Year<=2021) %>%ts(start=c(1991),frequency=1) +TS_DATA %>% filter(County=='Lincoln') +ST_YEAR <- 1970 +LN <- TS_DATA %>% mutate(Migration=Migration) %>% dplyr::select(Year,County,Migration) %>% pivot_wider(values_from=Migration,names_from=County) %>% arrange(Year) %>% dplyr::select(Lincoln,Year) %>% filter(Year>=ST_YEAR,Year<=2021) %>% dplyr::select(-Year) %>%ts(start=c(ST_YEAR),frequency=1) +#Create an ARIMA of Migration so the number of people migrating can be simulated +library("forecast") + #Time series tests + adf.test(LN,k=1) #Stationary with one lag, otherwise not stationary + kpss.test(LN) #Stationary,default of program and has some model fit improvements +MOD <- auto.arima(LN,test="kpss",stationary=TRUE) + summary(MOD) + #Validity tests + autoplot(MOD) + acf(resid(MOD)) + pacf(resid(MOD)) +# adf.test(resid(MOD)) + checkresiduals(MOD) +#Save the resulting model outputs, will need to be changed if looking at other counties +saveRDS(MOD,"Data/Regression_Results/LN_ARIMA_MODEL.Rds") -LN <- TS_DATA %>% mutate(Migration=Migration) %>% select(Year,County,Migration) %>% pivot_wider(values_from=Migration,names_from=County) %>% arrange(Year) %>% select(Lincoln,Year) %>% filter(Year>1990,Year<=2021) %>% select(-Year) %>%ts(start=c(1991),frequency=1) - -TS_WIDE[,'Lincoln'] - -LN <- TS_WIDE[,"Lincoln"] -plot(LN ) -library(tseries) -adf.test(LN) -TEST <- auto.arima(LN ) -plot(TEST) -forecast(TEST) -acf(resid(TEST)) -arima.sim(TEST,n=1) -arima.sim(arima(LN,order=c(2,4,3)),n=1) - -summary(TEST) -plot(forecast(TEST)) -resid(TEST) -acf(resid(TEST)) - auto.arima(LN ) - -plot(forecast(TEST)) -TS_WIDE %>% filter(Year>1970) %>% pull(Year) -TS_WIDE +lapply(1:10^6,function(x){round(simulate(MOD,future=TRUE, nsim=50))})#testing a multiple run simulation could use parallel process -summary(feols(ABS_PER_MIG~Year,LN_TS_DATA)) +#################################Simulate data +MIG_AGE_DIST +NUM_SIMS<-34 +PROB <- MIG_AGE_DIST + +MIG_SIM <- round(simulate(MOD,future=TRUE, nsim=50)) #Round up for whole numbers +NUM_SIMS <- abs(MIG_SIM[[1]]) +INCREASE <- MIG_SIM[[1]]/abs(MIG_SIM[[1]]) #Check if positive or negative migration, as these are handled diffrently +if(INCREASE==1){MF_SAMPLE <- sample(x=c("Male","Female"),NUM_SIMS,replace=TRUE)} +sample(x=1:90,size=NUM_SIMS,prob=PROB,replace=TRUE) +C_DEMO_DATA <- DEMOGRAPHIC_DATA %>% filter(County=='Lincoln',Year==max(Year)) +NUM_MALE <- pull(C_DEMO_DATA ,"Num_Male") +NUM_FEMALE <- pull(C_DEMO_DATA,"Num_Female") +####WORKING ON THE CASE WHEN WE ARE REMOVING INDIVIDUALS. IF THERE ARE NONE THEY SHOULD NOT MOVE. ON THE OTHER HAND IF MOVING IN THE AVERAGE VALUES WORK +MAKE_SET <- function(x){ + C_PROB <- MIG_AGE_DIST[x] + C_LOOP <- C_DEMO_DATA[x+1,] + NUM_MALE <- C_LOOP$Num_Male + NUM_FEMALE <- C_LOOP$Num_Female + C_AGE <- C_LOOP$Age + return(rbind(cbind(rep(C_AGE,NUM_MALE),rep("Male",NUM_MALE),rep(C_PROB,NUM_MALE)), + cbind(rep(C_AGE,NUM_FEMALE),rep("Female",NUM_FEMALE),rep(C_PROB,NUM_FEMALE)))) +} +FULL_SET <- sapply(1:85,MAKE_SET) %>% as_tibble +FULL_SET[1] + +sample(1:length(FULL_SET),prob=FULL_SET[,3],size=1) + +sample(x=0,size=NUM_SIMS,prob=c(NUM_MALE,NUM_FEMALE)*rep(PROB,2),replace=TRUE) +sample(x=c('a','a','b'),size=2,prob=c(2,2,0),replace=FALSE) + + + +mean(sample(x=c(1,0),2000000,prob=c(3,1),replace=TRUE)) + -#Start of a simulation method with the inputs. - #plot(100*PROB) - #NUM_SIMS <- 1000 - #sample(x=1:90,size=NUM_SIMS,prob=PROB,replace=TRUE) - #sample(x=c("Male","Female"),size=NUM_SIMS,replace=TRUE)